author     Gadi Haber <gadi.haber@intel.com>  2017-08-28 10:04:16 +0000
committer  Gadi Haber <gadi.haber@intel.com>  2017-08-28 10:04:16 +0000
commit     d76f7b824e57b40fb87d8401d36af5b54d304542 (patch)
tree       547e4ab05720649f25f1613cbfb6d0c302c5c3fd
parent     60608a8ae55041cc47f0458570af897bda8a3123 (diff)
[X86][Haswell] Updating HSW instruction scheduling information
This patch completely replaces the instruction scheduling information for the Haswell architecture target by modifying the file X86SchedHaswell.td located under the X86 Target.

We used the scheduling information retrieved from the Haswell architects in order to replace and modify the existing scheduling.
The patch continues the scheduling replacement effort started with the SNB target in r307529 and r310792.
Information includes latency, number of micro-ops, and ports used by each HSW instruction.

Please expect some performance fluctuations due to code alignment effects.

Reviewers: RKSimon, zvi, aymanmus, craig.topper, m_zuckerman, igorb, dim, chandlerc, aaboud

Differential Revision: https://reviews.llvm.org/D36663

llvm-svn: 311879
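For context, the overrides in X86SchedHaswell.td follow one TableGen pattern throughout: a SchedWriteRes lists the execution ports an instruction occupies together with its latency, micro-op count, and per-port busy cycles, and an InstRW binds that description to the instructions matched by a regular expression. A minimal sketch of the pattern (the def name and the numbers below are illustrative only, not values taken from this patch):

// Illustrative only: a hypothetical 3-uop load-op with 8-cycle latency,
// issuing 1 uop to port 1 and keeping the load ports (23) busy for 2 cycles.
def HWWriteExampleLd : SchedWriteRes<[HWPort1, HWPort23]> {
  let Latency = 8;             // cycles until the result is available
  let NumMicroOps = 3;         // micro-ops issued for the instruction
  let ResourceCycles = [1, 2]; // busy cycles per listed port
}
def : InstRW<[HWWriteExampleLd], (instregex "IMUL32m")>;

Most of the bulk below binds instructions to shared groups of exactly this form (HWWriteResGroup0, HWWriteResGroup1, ...), one group per unique combination of ports, latency, micro-op count, and resource cycles, with one InstRW line per instruction.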
-rw-r--r--  llvm/lib/Target/X86/X86SchedHaswell.td  4679
-rw-r--r--  llvm/test/CodeGen/X86/aes-schedule.ll  16
-rw-r--r--  llvm/test/CodeGen/X86/avx-schedule.ll  386
-rw-r--r--  llvm/test/CodeGen/X86/avx2-schedule.ll  66
-rw-r--r--  llvm/test/CodeGen/X86/avx512-cmp.ll  2
-rw-r--r--  llvm/test/CodeGen/X86/avx512-cvt.ll  54
-rw-r--r--  llvm/test/CodeGen/X86/avx512-ext.ll  4
-rw-r--r--  llvm/test/CodeGen/X86/avx512-insert-extract.ll  47
-rw-r--r--  llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll  4
-rw-r--r--  llvm/test/CodeGen/X86/avx512-mask-op.ll  554
-rw-r--r--  llvm/test/CodeGen/X86/avx512-vec-cmp.ll  54
-rw-r--r--  llvm/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll  1512
-rw-r--r--  llvm/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll  44
-rw-r--r--  llvm/test/CodeGen/X86/avx512dq-intrinsics-upgrade.ll  2
-rw-r--r--  llvm/test/CodeGen/X86/avx512vl-vec-cmp.ll  4
-rw-r--r--  llvm/test/CodeGen/X86/avx512vl-vec-masked-cmp.ll  16242
-rw-r--r--  llvm/test/CodeGen/X86/bmi-schedule.ll  56
-rw-r--r--  llvm/test/CodeGen/X86/bmi2-schedule.ll  60
-rw-r--r--  llvm/test/CodeGen/X86/f16c-schedule.ll  24
-rw-r--r--  llvm/test/CodeGen/X86/lea32-schedule.ll  22
-rw-r--r--  llvm/test/CodeGen/X86/lea64-schedule.ll  22
-rw-r--r--  llvm/test/CodeGen/X86/lzcnt-schedule.ll  18
-rw-r--r--  llvm/test/CodeGen/X86/movbe-schedule.ll  10
-rw-r--r--  llvm/test/CodeGen/X86/mul-constant-i32.ll  178
-rw-r--r--  llvm/test/CodeGen/X86/mul-constant-i64.ll  136
-rw-r--r--  llvm/test/CodeGen/X86/popcnt-schedule.ll  12
-rw-r--r--  llvm/test/CodeGen/X86/pr32329.ll  68
-rw-r--r--  llvm/test/CodeGen/X86/recip-fastmath.ll  192
-rw-r--r--  llvm/test/CodeGen/X86/recip-fastmath2.ll  392
-rw-r--r--  llvm/test/CodeGen/X86/sha-schedule.ll  14
-rw-r--r--  llvm/test/CodeGen/X86/sse-schedule.ll  196
-rw-r--r--  llvm/test/CodeGen/X86/sse2-schedule.ll  484
-rw-r--r--  llvm/test/CodeGen/X86/sse3-schedule.ll  46
-rw-r--r--  llvm/test/CodeGen/X86/sse41-schedule.ll  210
-rw-r--r--  llvm/test/CodeGen/X86/sse42-schedule.ll  38
-rw-r--r--  llvm/test/CodeGen/X86/ssse3-schedule.ll  64
-rw-r--r--  llvm/test/CodeGen/X86/vector-shift-ashr-512.ll  12
-rw-r--r--  llvm/test/CodeGen/X86/vector-shift-lshr-256.ll  2
-rw-r--r--  llvm/test/CodeGen/X86/vector-shift-shl-256.ll  2
-rw-r--r--  llvm/test/CodeGen/X86/vector-shuffle-512-v32.ll  12
40 files changed, 13484 insertions, 12456 deletions
diff --git a/llvm/lib/Target/X86/X86SchedHaswell.td b/llvm/lib/Target/X86/X86SchedHaswell.td
index 03c8ccb53af..3523601a4bd 100644
--- a/llvm/lib/Target/X86/X86SchedHaswell.td
+++ b/llvm/lib/Target/X86/X86SchedHaswell.td
@@ -23,8 +23,8 @@ def HaswellModel : SchedMachineModel {
// Based on the LSD (loop-stream detector) queue size and benchmarking data.
let LoopMicroOpBufferSize = 50;
- // FIXME: SSE4 and AVX are unimplemented. This flag is set to allow
- // the scheduler to assign a default model to unrecognized opcodes.
+ // This flag is set to allow the scheduler to assign a default model to
+ // unrecognized opcodes.
let CompleteModel = 0;
}
@@ -436,30 +436,6 @@ def : InstRW<[WriteALULd], (instregex "MOV16rm")>;
// r,m.
def : InstRW<[WriteLoad], (instregex "MOV(S|Z)X32rm(8|16)")>;
-// CMOVcc.
-// r,r.
-def : InstRW<[Write2P0156_Lat2],
- (instregex "CMOV(O|NO|B|AE|E|NE|BE|A|S|NS|P|NP|L|GE|LE|G)(16|32|64)rr")>;
-// r,m.
-def : InstRW<[Write2P0156_Lat2Ld, ReadAfterLd],
- (instregex "CMOV(O|NO|B|AE|E|NE|BE|A|S|NS|P|NP|L|GE|LE|G)(16|32|64)rm")>;
-
-// XCHG.
-// r,r.
-def WriteXCHG : SchedWriteRes<[HWPort0156]> {
- let Latency = 2;
- let ResourceCycles = [3];
-}
-
-def : InstRW<[WriteXCHG], (instregex "XCHG(8|16|32|64)rr", "XCHG(16|32|64)ar")>;
-
-// r,m.
-def WriteXCHGrm : SchedWriteRes<[]> {
- let Latency = 21;
- let NumMicroOps = 8;
-}
-def : InstRW<[WriteXCHGrm], (instregex "XCHG(8|16|32|64)rm")>;
-
// XLAT.
def WriteXLAT : SchedWriteRes<[]> {
let Latency = 7;
@@ -471,12 +447,6 @@ def : InstRW<[WriteXLAT], (instregex "XLAT")>;
// m.
def : InstRW<[Write2P237_P4], (instregex "PUSH(16|32)rmm")>;
-// PUSHF.
-def WritePushF : SchedWriteRes<[HWPort1, HWPort4, HWPort237, HWPort06]> {
- let NumMicroOps = 4;
-}
-def : InstRW<[WritePushF], (instregex "PUSHF(16|32)")>;
-
// PUSHA.
def WritePushA : SchedWriteRes<[]> {
let NumMicroOps = 19;
@@ -487,178 +457,14 @@ def : InstRW<[WritePushA], (instregex "PUSHA(16|32)")>;
// m.
def : InstRW<[Write2P237_P4], (instregex "POP(16|32)rmm")>;
-// POPF.
-def WritePopF : SchedWriteRes<[]> {
- let NumMicroOps = 9;
-}
-def : InstRW<[WritePopF], (instregex "POPF(16|32)")>;
-
// POPA.
def WritePopA : SchedWriteRes<[]> {
let NumMicroOps = 18;
}
def : InstRW<[WritePopA], (instregex "POPA(16|32)")>;
-// LAHF SAHF.
-def : InstRW<[WriteP06], (instregex "(S|L)AHF")>;
-
-// BSWAP.
-// r32.
-def WriteBSwap32 : SchedWriteRes<[HWPort15]>;
-def : InstRW<[WriteBSwap32], (instregex "BSWAP32r")>;
-
-// r64.
-def WriteBSwap64 : SchedWriteRes<[HWPort06, HWPort15]> {
- let NumMicroOps = 2;
-}
-def : InstRW<[WriteBSwap64], (instregex "BSWAP64r")>;
-
-// MOVBE.
-// r16,m16 / r64,m64.
-def : InstRW<[Write2P0156_Lat2Ld], (instregex "MOVBE(16|64)rm")>;
-
-// r32, m32.
-def WriteMoveBE32rm : SchedWriteRes<[HWPort15, HWPort23]> {
- let NumMicroOps = 2;
-}
-def : InstRW<[WriteMoveBE32rm], (instregex "MOVBE32rm")>;
-
-// m16,r16.
-def WriteMoveBE16mr : SchedWriteRes<[HWPort06, HWPort237, HWPort4]> {
- let NumMicroOps = 3;
-}
-def : InstRW<[WriteMoveBE16mr], (instregex "MOVBE16mr")>;
-
-// m32,r32.
-def WriteMoveBE32mr : SchedWriteRes<[HWPort15, HWPort237, HWPort4]> {
- let NumMicroOps = 3;
-}
-def : InstRW<[WriteMoveBE32mr], (instregex "MOVBE32mr")>;
-
-// m64,r64.
-def WriteMoveBE64mr : SchedWriteRes<[HWPort06, HWPort15, HWPort237, HWPort4]> {
- let NumMicroOps = 4;
-}
-def : InstRW<[WriteMoveBE64mr], (instregex "MOVBE64mr")>;
-
//-- Arithmetic instructions --//
-// ADD SUB.
-// m,r/i.
-def : InstRW<[Write2P0156_2P237_P4],
- (instregex "(ADD|SUB)(8|16|32|64)m(r|i)",
- "(ADD|SUB)(8|16|32|64)mi8", "(ADD|SUB)64mi32")>;
-
-// ADC SBB.
-// r,r/i.
-def : InstRW<[Write2P0156_Lat2], (instregex "(ADC|SBB)(8|16|32|64)r(r|i)",
- "(ADC|SBB)(16|32|64)ri8",
- "(ADC|SBB)64ri32",
- "(ADC|SBB)(8|16|32|64)rr_REV")>;
-
-// r,m.
-def : InstRW<[Write2P0156_Lat2Ld, ReadAfterLd], (instregex "(ADC|SBB)(8|16|32|64)rm")>;
-
-// m,r/i.
-def : InstRW<[Write3P0156_2P237_P4],
- (instregex "(ADC|SBB)(8|16|32|64)m(r|i)",
- "(ADC|SBB)(16|32|64)mi8",
- "(ADC|SBB)64mi32")>;
-
-// INC DEC NOT NEG.
-// m.
-def : InstRW<[WriteP0156_2P237_P4],
- (instregex "(INC|DEC|NOT|NEG)(8|16|32|64)m",
- "(INC|DEC)64(16|32)m")>;
-
-// MUL IMUL.
-// r16.
-def WriteMul16 : SchedWriteRes<[HWPort1, HWPort0156]> {
- let Latency = 4;
- let NumMicroOps = 4;
-}
-def : InstRW<[WriteMul16], (instregex "IMUL16r", "MUL16r")>;
-
-// m16.
-def WriteMul16Ld : SchedWriteRes<[HWPort1, HWPort0156, HWPort23]> {
- let Latency = 8;
- let NumMicroOps = 5;
-}
-def : InstRW<[WriteMul16Ld], (instregex "IMUL16m", "MUL16m")>;
-
-// r32.
-def WriteMul32 : SchedWriteRes<[HWPort1, HWPort0156]> {
- let Latency = 4;
- let NumMicroOps = 3;
-}
-def : InstRW<[WriteMul32], (instregex "IMUL32r", "MUL32r")>;
-
-// m32.
-def WriteMul32Ld : SchedWriteRes<[HWPort1, HWPort0156, HWPort23]> {
- let Latency = 8;
- let NumMicroOps = 4;
-}
-def : InstRW<[WriteMul32Ld], (instregex "IMUL32m", "MUL32m")>;
-
-// r64.
-def WriteMul64 : SchedWriteRes<[HWPort1, HWPort6]> {
- let Latency = 3;
- let NumMicroOps = 2;
-}
-def : InstRW<[WriteMul64], (instregex "IMUL64r", "MUL64r")>;
-
-// m64.
-def WriteMul64Ld : SchedWriteRes<[HWPort1, HWPort6, HWPort23]> {
- let Latency = 7;
- let NumMicroOps = 3;
-}
-def : InstRW<[WriteMul64Ld], (instregex "IMUL64m", "MUL64m")>;
-
-// r16,r16.
-def WriteMul16rri : SchedWriteRes<[HWPort1, HWPort0156]> {
- let Latency = 4;
- let NumMicroOps = 2;
-}
-def : InstRW<[WriteMul16rri], (instregex "IMUL16rri", "IMUL16rri8")>;
-
-// r16,m16.
-def WriteMul16rmi : SchedWriteRes<[HWPort1, HWPort0156, HWPort23]> {
- let Latency = 8;
- let NumMicroOps = 3;
-}
-def : InstRW<[WriteMul16rmi], (instregex "IMUL16rmi", "IMUL16rmi8")>;
-
-// MULX.
-// r32,r32,r32.
-def WriteMulX32 : SchedWriteRes<[HWPort1, HWPort056]> {
- let Latency = 4;
- let NumMicroOps = 3;
- let ResourceCycles = [1, 2];
-}
-def : InstRW<[WriteMulX32], (instregex "MULX32rr")>;
-
-// r32,r32,m32.
-def WriteMulX32Ld : SchedWriteRes<[HWPort1, HWPort056, HWPort23]> {
- let Latency = 8;
- let NumMicroOps = 4;
- let ResourceCycles = [1, 2, 1];
-}
-def : InstRW<[WriteMulX32Ld], (instregex "MULX32rm")>;
-
-// r64,r64,r64.
-def WriteMulX64 : SchedWriteRes<[HWPort1, HWPort6]> {
- let Latency = 4;
- let NumMicroOps = 2;
-}
-def : InstRW<[WriteMulX64], (instregex "MULX64rr")>;
-
-// r64,r64,m64.
-def WriteMulX64Ld : SchedWriteRes<[HWPort1, HWPort6, HWPort23]> {
- let Latency = 8;
- let NumMicroOps = 3;
-}
-def : InstRW<[WriteMulX64Ld], (instregex "MULX64rm")>;
-
// DIV.
// r8.
def WriteDiv8 : SchedWriteRes<[HWPort0, HWPort1, HWPort5, HWPort6]> {
@@ -667,27 +473,6 @@ def WriteDiv8 : SchedWriteRes<[HWPort0, HWPort1, HWPort5, HWPort6]> {
}
def : InstRW<[WriteDiv8], (instregex "DIV8r")>;
-// r16.
-def WriteDiv16 : SchedWriteRes<[HWPort0, HWPort1, HWPort5, HWPort6]> {
- let Latency = 23;
- let NumMicroOps = 10;
-}
-def : InstRW<[WriteDiv16], (instregex "DIV16r")>;
-
-// r32.
-def WriteDiv32 : SchedWriteRes<[HWPort0, HWPort1, HWPort5, HWPort6]> {
- let Latency = 22;
- let NumMicroOps = 10;
-}
-def : InstRW<[WriteDiv32], (instregex "DIV32r")>;
-
-// r64.
-def WriteDiv64 : SchedWriteRes<[HWPort0, HWPort1, HWPort5, HWPort6]> {
- let Latency = 32;
- let NumMicroOps = 36;
-}
-def : InstRW<[WriteDiv64], (instregex "DIV64r")>;
-
// IDIV.
// r8.
def WriteIDiv8 : SchedWriteRes<[HWPort0, HWPort1, HWPort5, HWPort6]> {
@@ -696,259 +481,23 @@ def WriteIDiv8 : SchedWriteRes<[HWPort0, HWPort1, HWPort5, HWPort6]> {
}
def : InstRW<[WriteIDiv8], (instregex "IDIV8r")>;
-// r16.
-def WriteIDiv16 : SchedWriteRes<[HWPort0, HWPort1, HWPort5, HWPort6]> {
- let Latency = 23;
- let NumMicroOps = 10;
-}
-def : InstRW<[WriteIDiv16], (instregex "IDIV16r")>;
-
-// r32.
-def WriteIDiv32 : SchedWriteRes<[HWPort0, HWPort1, HWPort5, HWPort6]> {
- let Latency = 22;
- let NumMicroOps = 9;
-}
-def : InstRW<[WriteIDiv32], (instregex "IDIV32r")>;
-
-// r64.
-def WriteIDiv64 : SchedWriteRes<[HWPort0, HWPort1, HWPort5, HWPort6]> {
- let Latency = 39;
- let NumMicroOps = 59;
-}
-def : InstRW<[WriteIDiv64], (instregex "IDIV64r")>;
-
-//-- Logic instructions --//
-
-// AND OR XOR.
-// m,r/i.
-def : InstRW<[Write2P0156_2P237_P4],
- (instregex "(AND|OR|XOR)(8|16|32|64)m(r|i)",
- "(AND|OR|XOR)(8|16|32|64)mi8", "(AND|OR|XOR)64mi32")>;
-
-// SHR SHL SAR.
-// m,i.
-def WriteShiftRMW : SchedWriteRes<[HWPort06, HWPort237, HWPort4]> {
- let NumMicroOps = 4;
- let ResourceCycles = [2, 1, 1];
-}
-def : InstRW<[WriteShiftRMW], (instregex "S(A|H)(R|L)(8|16|32|64)m(i|1)")>;
-
-// r,cl.
-def : InstRW<[Write3P06_Lat2], (instregex "S(A|H)(R|L)(8|16|32|64)rCL")>;
-
-// m,cl.
-def WriteShiftClLdRMW : SchedWriteRes<[HWPort06, HWPort23, HWPort4]> {
- let NumMicroOps = 6;
- let ResourceCycles = [3, 2, 1];
-}
-def : InstRW<[WriteShiftClLdRMW], (instregex "S(A|H)(R|L)(8|16|32|64)mCL")>;
-
-// ROR ROL.
-// r,1.
-def : InstRW<[Write2P06], (instregex "RO(R|L)(8|16|32|64)r1")>;
-
-// m,i.
-def WriteRotateRMW : SchedWriteRes<[HWPort06, HWPort237, HWPort4]> {
- let NumMicroOps = 5;
- let ResourceCycles = [2, 2, 1];
-}
-def : InstRW<[WriteRotateRMW], (instregex "RO(R|L)(8|16|32|64)mi")>;
-
-// r,cl.
-def : InstRW<[Write3P06_Lat2], (instregex "RO(R|L)(8|16|32|64)rCL")>;
-
-// m,cl.
-def WriteRotateRMWCL : SchedWriteRes<[]> {
- let NumMicroOps = 6;
-}
-def : InstRW<[WriteRotateRMWCL], (instregex "RO(R|L)(8|16|32|64)mCL")>;
-
-// RCR RCL.
-// r,1.
-def WriteRCr1 : SchedWriteRes<[HWPort06, HWPort0156]> {
- let Latency = 2;
- let NumMicroOps = 3;
- let ResourceCycles = [2, 1];
-}
-def : InstRW<[WriteRCr1], (instregex "RC(R|L)(8|16|32|64)r1")>;
-
-// m,1.
-def WriteRCm1 : SchedWriteRes<[]> {
- let NumMicroOps = 6;
-}
-def : InstRW<[WriteRCm1], (instregex "RC(R|L)(8|16|32|64)m1")>;
-
-// r,i.
-def WriteRCri : SchedWriteRes<[HWPort0156]> {
- let Latency = 6;
- let NumMicroOps = 8;
-}
-def : InstRW<[WriteRCri], (instregex "RC(R|L)(8|16|32|64)r(i|CL)")>;
-
-// m,i.
-def WriteRCmi : SchedWriteRes<[]> {
- let NumMicroOps = 11;
-}
-def : InstRW<[WriteRCmi], (instregex "RC(R|L)(8|16|32|64)m(i|CL)")>;
-
-// SHRD SHLD.
-// r,r,i.
-def WriteShDrr : SchedWriteRes<[HWPort1]> {
- let Latency = 3;
-}
-def : InstRW<[WriteShDrr], (instregex "SH(R|L)D(16|32|64)rri8")>;
-
-// m,r,i.
-def WriteShDmr : SchedWriteRes<[]> {
- let NumMicroOps = 5;
-}
-def : InstRW<[WriteShDmr], (instregex "SH(R|L)D(16|32|64)mri8")>;
-
-// r,r,cl.
-def WriteShlDCL : SchedWriteRes<[HWPort0156]> {
- let Latency = 3;
- let NumMicroOps = 4;
-}
-def : InstRW<[WriteShlDCL], (instregex "SHLD(16|32|64)rrCL")>;
-
-// r,r,cl.
-def WriteShrDCL : SchedWriteRes<[HWPort0156]> {
- let Latency = 4;
- let NumMicroOps = 4;
-}
-def : InstRW<[WriteShrDCL], (instregex "SHRD(16|32|64)rrCL")>;
-
-// m,r,cl.
-def WriteShDmrCL : SchedWriteRes<[]> {
- let NumMicroOps = 7;
-}
-def : InstRW<[WriteShDmrCL], (instregex "SH(R|L)D(16|32|64)mrCL")>;
-
// BT.
-// r,r/i.
-def : InstRW<[WriteShift], (instregex "BT(16|32|64)r(r|i8)")>;
-
// m,r.
def WriteBTmr : SchedWriteRes<[]> {
let NumMicroOps = 10;
}
def : InstRW<[WriteBTmr], (instregex "BT(16|32|64)mr")>;
-// m,i.
-def : InstRW<[WriteShiftLd], (instregex "BT(16|32|64)mi8")>;
-
// BTR BTS BTC.
-// r,r,i.
-def : InstRW<[WriteShift], (instregex "BT(R|S|C)(16|32|64)r(r|i8)")>;
-
// m,r.
def WriteBTRSCmr : SchedWriteRes<[]> {
let NumMicroOps = 11;
}
def : InstRW<[WriteBTRSCmr], (instregex "BT(R|S|C)(16|32|64)mr")>;
-// m,i.
-def : InstRW<[WriteShiftLd], (instregex "BT(R|S|C)(16|32|64)mi8")>;
-
-// BSF BSR.
-// r,r.
-def : InstRW<[WriteP1_Lat3], (instregex "BS(R|F)(16|32|64)rr")>;
-// r,m.
-def : InstRW<[WriteP1_Lat3Ld], (instregex "BS(R|F)(16|32|64)rm")>;
-
-// SETcc.
-// r.
-def : InstRW<[WriteShift],
- (instregex "SET(O|NO|B|AE|E|NE|BE|A|S|NS|P|NP|L|GE|LE|G)r")>;
-// m.
-def WriteSetCCm : SchedWriteRes<[HWPort06, HWPort237, HWPort4]> {
- let NumMicroOps = 3;
-}
-def : InstRW<[WriteSetCCm],
- (instregex "SET(O|NO|B|AE|E|NE|BE|A|S|NS|P|NP|L|GE|LE|G)m")>;
-
-// CLD STD.
-def WriteCldStd : SchedWriteRes<[HWPort15, HWPort6]> {
- let NumMicroOps = 3;
-}
-def : InstRW<[WriteCldStd], (instregex "STD", "CLD")>;
-
-// LZCNT TZCNT.
-// r,r.
-def : InstRW<[WriteP1_Lat3], (instregex "(L|TZCNT)(16|32|64)rr")>;
-// r,m.
-def : InstRW<[WriteP1_Lat3Ld], (instregex "(L|TZCNT)(16|32|64)rm")>;
-
-// ANDN.
-// r,r.
-def : InstRW<[WriteP15], (instregex "ANDN(32|64)rr")>;
-// r,m.
-def : InstRW<[WriteP15Ld], (instregex "ANDN(32|64)rm")>;
-
-// BLSI BLSMSK BLSR.
-// r,r.
-def : InstRW<[WriteP15], (instregex "BLS(I|MSK|R)(32|64)rr")>;
-// r,m.
-def : InstRW<[WriteP15Ld], (instregex "BLS(I|MSK|R)(32|64)rm")>;
-
-// BEXTR.
-// r,r,r.
-def : InstRW<[Write2P0156_Lat2], (instregex "BEXTR(32|64)rr")>;
-// r,m,r.
-def : InstRW<[Write2P0156_Lat2Ld], (instregex "BEXTR(32|64)rm")>;
-
-// BZHI.
-// r,r,r.
-def : InstRW<[WriteP15], (instregex "BZHI(32|64)rr")>;
-// r,m,r.
-def : InstRW<[WriteP15Ld], (instregex "BZHI(32|64)rm")>;
-
-// PDEP PEXT.
-// r,r,r.
-def : InstRW<[WriteP1_Lat3], (instregex "PDEP(32|64)rr", "PEXT(32|64)rr")>;
-// r,m,r.
-def : InstRW<[WriteP1_Lat3Ld], (instregex "PDEP(32|64)rm", "PEXT(32|64)rm")>;
-
//-- Control transfer instructions --//
-// J(E|R)CXZ.
-def WriteJCXZ : SchedWriteRes<[HWPort0156, HWPort6]> {
- let NumMicroOps = 2;
-}
-def : InstRW<[WriteJCXZ], (instregex "JCXZ", "JECXZ_(32|64)", "JRCXZ")>;
-
-// LOOP.
-def WriteLOOP : SchedWriteRes<[]> {
- let NumMicroOps = 7;
-}
-def : InstRW<[WriteLOOP], (instregex "LOOP")>;
-
-// LOOP(N)E
-def WriteLOOPE : SchedWriteRes<[]> {
- let NumMicroOps = 11;
-}
-def : InstRW<[WriteLOOPE], (instregex "LOOPE", "LOOPNE")>;
-
// CALL.
-// r.
-def WriteCALLr : SchedWriteRes<[HWPort237, HWPort4, HWPort6]> {
- let NumMicroOps = 3;
-}
-def : InstRW<[WriteCALLr], (instregex "CALL(16|32)r")>;
-
-// m.
-def WriteCALLm : SchedWriteRes<[HWPort237, HWPort4, HWPort6]> {
- let NumMicroOps = 4;
- let ResourceCycles = [2, 1, 1];
-}
-def : InstRW<[WriteCALLm], (instregex "CALL(16|32)m")>;
-
-// RET.
-def WriteRET : SchedWriteRes<[HWPort237, HWPort6]> {
- let NumMicroOps = 2;
-}
-def : InstRW<[WriteRET], (instregex "RET(L|Q|W)", "LRET(L|Q|W)")>;
-
// i.
def WriteRETI : SchedWriteRes<[HWPort23, HWPort6, HWPort015]> {
let NumMicroOps = 4;
@@ -977,12 +526,6 @@ def : InstRW<[Write2P0156_P23], (instregex "LODS(B|W)")>;
// LODSD/Q.
def : InstRW<[WriteP0156_P23], (instregex "LODS(L|Q)")>;
-// STOS.
-def WriteSTOS : SchedWriteRes<[HWPort23, HWPort0156, HWPort4]> {
- let NumMicroOps = 3;
-}
-def : InstRW<[WriteSTOS], (instregex "STOS(B|L|Q|W)")>;
-
// MOVS.
def WriteMOVS : SchedWriteRes<[HWPort23, HWPort4, HWPort0156]> {
let Latency = 4;
@@ -1002,57 +545,9 @@ def WriteCMPS : SchedWriteRes<[HWPort23, HWPort0156]> {
}
def : InstRW<[WriteCMPS], (instregex "CMPS(B|L|Q|W)")>;
-//-- Synchronization instructions --//
-
-// XADD.
-def WriteXADD : SchedWriteRes<[]> {
- let NumMicroOps = 5;
-}
-def : InstRW<[WriteXADD], (instregex "XADD(8|16|32|64)rm")>;
-
-// CMPXCHG.
-def WriteCMPXCHG : SchedWriteRes<[]> {
- let NumMicroOps = 6;
-}
-def : InstRW<[WriteCMPXCHG], (instregex "CMPXCHG(8|16|32|64)rm")>;
-
-// CMPXCHG8B.
-def WriteCMPXCHG8B : SchedWriteRes<[]> {
- let NumMicroOps = 15;
-}
-def : InstRW<[WriteCMPXCHG8B], (instregex "CMPXCHG8B")>;
-
-// CMPXCHG16B.
-def WriteCMPXCHG16B : SchedWriteRes<[]> {
- let NumMicroOps = 22;
-}
-def : InstRW<[WriteCMPXCHG16B], (instregex "CMPXCHG16B")>;
-
//-- Other --//
-// PAUSE.
-def WritePAUSE : SchedWriteRes<[HWPort05, HWPort6]> {
- let NumMicroOps = 5;
- let ResourceCycles = [1, 3];
-}
-def : InstRW<[WritePAUSE], (instregex "PAUSE")>;
-
-// LEAVE.
-def : InstRW<[Write2P0156_P23], (instregex "LEAVE")>;
-
-// XGETBV.
-def WriteXGETBV : SchedWriteRes<[]> {
- let NumMicroOps = 8;
-}
-def : InstRW<[WriteXGETBV], (instregex "XGETBV")>;
-
-// RDTSC.
-def WriteRDTSC : SchedWriteRes<[]> {
- let NumMicroOps = 15;
-}
-def : InstRW<[WriteRDTSC], (instregex "RDTSC")>;
-
-// RDPMC.
+// RDPMC.
def WriteRDPMC : SchedWriteRes<[]> {
let NumMicroOps = 34;
}
@@ -1072,13 +567,6 @@ def : InstRW<[WriteRDRAND], (instregex "RDRAND(16|32|64)r")>;
// m80.
def : InstRW<[WriteP01], (instregex "LD_Frr")>;
-def WriteLD_F80m : SchedWriteRes<[HWPort01, HWPort23]> {
- let Latency = 4;
- let NumMicroOps = 4;
- let ResourceCycles = [2, 2];
-}
-def : InstRW<[WriteLD_F80m], (instregex "LD_F80m")>;
-
// FBLD.
// m80.
def WriteFBLD : SchedWriteRes<[]> {
@@ -1091,84 +579,12 @@ def : InstRW<[WriteFBLD], (instregex "FBLDm")>;
// r.
def : InstRW<[WriteP01], (instregex "ST_(F|FP)rr")>;
-// m80.
-def WriteST_FP80m : SchedWriteRes<[HWPort0156, HWPort23, HWPort4]> {
- let NumMicroOps = 7;
- let ResourceCycles = [3, 2, 2];
-}
-def : InstRW<[WriteST_FP80m], (instregex "ST_FP80m")>;
-
-// FBSTP.
-// m80.
-def WriteFBSTP : SchedWriteRes<[]> {
- let NumMicroOps = 226;
-}
-def : InstRW<[WriteFBSTP], (instregex "FBSTPm")>;
-
-// FXCHG.
-def : InstRW<[WriteNop], (instregex "XCH_F")>;
-
-// FILD.
-def WriteFILD : SchedWriteRes<[HWPort01, HWPort23]> {
- let Latency = 6;
- let NumMicroOps = 2;
-}
-def : InstRW<[WriteFILD], (instregex "ILD_F(16|32|64)m")>;
-
-// FIST(P) FISTTP.
-def WriteFIST : SchedWriteRes<[HWPort1, HWPort23, HWPort4]> {
- let Latency = 7;
- let NumMicroOps = 3;
-}
-def : InstRW<[WriteFIST], (instregex "IST_(F|FP)(16|32)m")>;
-
// FLDZ.
def : InstRW<[WriteP01], (instregex "LD_F0")>;
-// FLD1.
-def : InstRW<[Write2P01], (instregex "LD_F1")>;
-
// FLDPI FLDL2E etc.
def : InstRW<[Write2P01], (instregex "FLDPI", "FLDL2(T|E)", "FLDL(G|N)2")>;
-// FCMOVcc.
-def WriteFCMOVcc : SchedWriteRes<[HWPort0, HWPort5]> {
- let Latency = 2;
- let NumMicroOps = 3;
- let ResourceCycles = [2, 1];
-}
-def : InstRW<[WriteFCMOVcc], (instregex "CMOV(B|BE|P|NB|NBE|NE|NP)_F")>;
-
-// FNSTSW.
-// AX.
-def WriteFNSTSW : SchedWriteRes<[HWPort0, HWPort0156]> {
- let NumMicroOps = 2;
-}
-def : InstRW<[WriteFNSTSW], (instregex "FNSTSW16r")>;
-
-// m16.
-def WriteFNSTSWm : SchedWriteRes<[HWPort0, HWPort4, HWPort237]> {
- let Latency = 6;
- let NumMicroOps = 3;
-}
-def : InstRW<[WriteFNSTSWm], (instregex "FNSTSWm")>;
-
-// FLDCW.
-def WriteFLDCW : SchedWriteRes<[HWPort01, HWPort23, HWPort6]> {
- let Latency = 7;
- let NumMicroOps = 3;
-}
-def : InstRW<[WriteFLDCW], (instregex "FLDCW16m")>;
-
-// FNSTCW.
-def WriteFNSTCW : SchedWriteRes<[HWPort237, HWPort4, HWPort6]> {
- let NumMicroOps = 3;
-}
-def : InstRW<[WriteFNSTCW], (instregex "FNSTCW16m")>;
-
-// FINCSTP FDECSTP.
-def : InstRW<[WriteP01], (instregex "FINCSTP", "FDECSTP")>;
-
// FFREE.
def : InstRW<[WriteP01], (instregex "FFREE")>;
@@ -1192,13 +608,6 @@ def : InstRW<[WriteP0], (instregex "ABS_F")>;
// FCHS.
def : InstRW<[WriteP0], (instregex "CHS_F")>;
-// FCOM(P) FUCOM(P).
-// r.
-def : InstRW<[WriteP1], (instregex "COM_FST0r", "COMP_FST0r", "UCOM_Fr",
- "UCOM_FPr")>;
-// m.
-def : InstRW<[WriteP1_P23], (instregex "FCOM(32|64)m", "FCOMP(32|64)m")>;
-
// FCOMPP FUCOMPP.
// r.
def : InstRW<[Write2P01], (instregex "FCOMPP", "UCOM_FPPr")>;
@@ -1208,9 +617,6 @@ def : InstRW<[Write2P01], (instregex "FCOMPP", "UCOM_FPPr")>;
def : InstRW<[Write3P01], (instregex "COM_FIr", "COM_FIPr", "UCOM_FIr",
"UCOM_FIPr")>;
-// FICOM(P).
-def : InstRW<[Write2P1_P23], (instregex "FICOM(16|32)m", "FICOMP(16|32)m")>;
-
// FTST.
def : InstRW<[WriteP1], (instregex "TST_F")>;
@@ -1272,66 +678,6 @@ def WriteFNINIT : SchedWriteRes<[]> {
def : InstRW<[WriteFNINIT], (instregex "FNINIT")>;
//=== Integer MMX and XMM Instructions ===//
-//-- Move instructions --//
-
-// MOVD.
-// r32/64 <- (x)mm.
-def : InstRW<[WriteP0], (instregex "MMX_MOVD64grr", "MMX_MOVD64from64rr",
- "VMOVPDI2DIrr", "MOVPDI2DIrr")>;
-
-// (x)mm <- r32/64.
-def : InstRW<[WriteP5], (instregex "MMX_MOVD64rr", "MMX_MOVD64to64rr",
- "VMOVDI2PDIrr", "MOVDI2PDIrr")>;
-
-// MOVQ.
-// r64 <- (x)mm.
-def : InstRW<[WriteP0], (instregex "VMOVPQIto64rr")>;
-
-// (x)mm <- r64.
-def : InstRW<[WriteP5], (instregex "VMOV64toPQIrr", "VMOVZQI2PQIrr")>;
-
-// (x)mm <- (x)mm.
-def : InstRW<[WriteP015], (instregex "MMX_MOVQ64rr")>;
-
-// (V)MOVDQA/U.
-// x <- x.
-def : InstRW<[WriteP015], (instregex "MOVDQ(A|U)rr", "VMOVDQ(A|U)rr",
- "MOVDQ(A|U)rr_REV", "VMOVDQ(A|U)rr_REV",
- "VMOVDQ(A|U)Yrr", "VMOVDQ(A|U)Yrr_REV")>;
-
-// MOVDQ2Q.
-def : InstRW<[WriteP01_P5], (instregex "MMX_MOVDQ2Qrr")>;
-
-// MOVQ2DQ.
-def : InstRW<[WriteP015], (instregex "MMX_MOVQ2DQrr")>;
-
-
-// PACKSSWB/DW.
-// mm <- mm.
-def WriteMMXPACKSSrr : SchedWriteRes<[HWPort5]> {
- let Latency = 2;
- let NumMicroOps = 3;
- let ResourceCycles = [3];
-}
-def : InstRW<[WriteMMXPACKSSrr], (instregex "MMX_PACKSSDWirr",
- "MMX_PACKSSWBirr", "MMX_PACKUSWBirr")>;
-
-// mm <- m64.
-def WriteMMXPACKSSrm : SchedWriteRes<[HWPort23, HWPort5]> {
- let Latency = 4;
- let NumMicroOps = 3;
- let ResourceCycles = [1, 3];
-}
-def : InstRW<[WriteMMXPACKSSrm], (instregex "MMX_PACKSSDWirm",
- "MMX_PACKSSWBirm", "MMX_PACKUSWBirm")>;
-
-// VPMOVSX/ZX BW BD BQ DW DQ.
-// y <- x.
-def WriteVPMOVSX : SchedWriteRes<[HWPort5]> {
- let Latency = 3;
- let NumMicroOps = 1;
-}
-def : InstRW<[WriteVPMOVSX], (instregex "VPMOV(SX|ZX)(BW|BQ|DW|DQ)Yrr")>;
// PBLENDW.
// x,x,i / v,v,v,i
@@ -1346,94 +692,12 @@ def WritePBLENDWm : SchedWriteRes<[HWPort5, HWPort23]> {
}
def : InstRW<[WritePBLENDWm, ReadAfterLd], (instregex "(V?)PBLENDW(Y?)rmi")>;
-// VPBLENDD.
-// v,v,v,i.
-def WriteVPBLENDDr : SchedWriteRes<[HWPort015]>;
-def : InstRW<[WriteVPBLENDDr], (instregex "VPBLENDD(Y?)rri")>;
-
-// v,v,m,i
-def WriteVPBLENDDm : SchedWriteRes<[HWPort015, HWPort23]> {
- let NumMicroOps = 2;
- let Latency = 4;
- let ResourceCycles = [1, 1];
-}
-def : InstRW<[WriteVPBLENDDm, ReadAfterLd], (instregex "VPBLENDD(Y?)rmi")>;
-
-// MASKMOVQ.
-def WriteMASKMOVQ : SchedWriteRes<[HWPort0, HWPort4, HWPort23]> {
- let Latency = 13;
- let NumMicroOps = 4;
- let ResourceCycles = [1, 1, 2];
-}
-def : InstRW<[WriteMASKMOVQ], (instregex "MMX_MASKMOVQ(64)?")>;
-
-// MASKMOVDQU.
-def WriteMASKMOVDQU : SchedWriteRes<[HWPort04, HWPort56, HWPort23]> {
- let Latency = 14;
- let NumMicroOps = 10;
- let ResourceCycles = [4, 2, 4];
-}
-def : InstRW<[WriteMASKMOVDQU], (instregex "(V?)MASKMOVDQU(64)?")>;
-
-// VPMASKMOV D/Q.
-// v,v,m.
-def WriteVPMASKMOVr : SchedWriteRes<[HWPort5, HWPort23]> {
- let Latency = 4;
- let NumMicroOps = 3;
- let ResourceCycles = [2, 1];
-}
-def : InstRW<[WriteVPMASKMOVr, ReadAfterLd],
- (instregex "VPMASKMOV(D|Q)(Y?)rm")>;
-
-// m, v,v.
-def WriteVPMASKMOVm : SchedWriteRes<[HWPort0, HWPort1, HWPort4, HWPort23]> {
- let Latency = 13;
- let NumMicroOps = 4;
- let ResourceCycles = [1, 1, 1, 1];
-}
-def : InstRW<[WriteVPMASKMOVm], (instregex "VPMASKMOV(D|Q)(Y?)mr")>;
-
// PMOVMSKB.
def WritePMOVMSKB : SchedWriteRes<[HWPort0]> {
let Latency = 3;
}
def : InstRW<[WritePMOVMSKB], (instregex "(V|MMX_)?PMOVMSKB(Y?)rr")>;
-// PEXTR B/W/D/Q.
-// r32,x,i.
-def WritePEXTRr : SchedWriteRes<[HWPort0, HWPort5]> {
- let Latency = 2;
- let NumMicroOps = 2;
- let ResourceCycles = [1, 1];
-}
-def : InstRW<[WritePEXTRr], (instregex "PEXTR(B|W|D|Q)rr", "MMX_PEXTRWirri")>;
-
-// m8,x,i.
-def WritePEXTRm : SchedWriteRes<[HWPort23, HWPort4, HWPort5]> {
- let NumMicroOps = 3;
- let ResourceCycles = [1, 1, 1];
-}
-def : InstRW<[WritePEXTRm], (instregex "PEXTR(B|W|D|Q)mr")>;
-
-// VPBROADCAST B/W.
-// x, m8/16.
-def WriteVPBROADCAST128Ld : SchedWriteRes<[HWPort01, HWPort23, HWPort5]> {
- let Latency = 5;
- let NumMicroOps = 3;
- let ResourceCycles = [1, 1, 1];
-}
-def : InstRW<[WriteVPBROADCAST128Ld, ReadAfterLd],
- (instregex "VPBROADCAST(B|W)rm")>;
-
-// y, m8/16
-def WriteVPBROADCAST256Ld : SchedWriteRes<[HWPort01, HWPort23, HWPort5]> {
- let Latency = 7;
- let NumMicroOps = 3;
- let ResourceCycles = [1, 1, 1];
-}
-def : InstRW<[WriteVPBROADCAST256Ld, ReadAfterLd],
- (instregex "VPBROADCAST(B|W)Yrm")>;
-
// VPGATHERDD.
// x.
def WriteVPGATHERDD128 : SchedWriteRes<[]> {
@@ -1521,660 +785,3667 @@ def : WriteRes<WritePHAddLd, [HWPort1, HWPort5, HWPort23]> {
let ResourceCycles = [1, 2, 1];
}
-// PHADD|PHSUB (S) W/D.
-// v <- v,v.
-def WritePHADDSUBr : SchedWriteRes<[HWPort1, HWPort5]> {
- let Latency = 3;
+//=== Floating Point XMM and YMM Instructions ===//
+
+// VGATHERDPS.
+// x.
+def WriteVGATHERDPS128 : SchedWriteRes<[]> {
+ let NumMicroOps = 20;
+}
+def : InstRW<[WriteVGATHERDPS128, ReadAfterLd], (instregex "VGATHERDPSrm")>;
+
+// y.
+def WriteVGATHERDPS256 : SchedWriteRes<[]> {
+ let NumMicroOps = 34;
+}
+def : InstRW<[WriteVGATHERDPS256, ReadAfterLd], (instregex "VGATHERDPSYrm")>;
+
+// VGATHERQPS.
+// x.
+def WriteVGATHERQPS128 : SchedWriteRes<[]> {
+ let NumMicroOps = 15;
+}
+def : InstRW<[WriteVGATHERQPS128, ReadAfterLd], (instregex "VGATHERQPSrm")>;
+
+// y.
+def WriteVGATHERQPS256 : SchedWriteRes<[]> {
+ let NumMicroOps = 22;
+}
+def : InstRW<[WriteVGATHERQPS256, ReadAfterLd], (instregex "VGATHERQPSYrm")>;
+
+// VGATHERDPD.
+// x.
+def WriteVGATHERDPD128 : SchedWriteRes<[]> {
+ let NumMicroOps = 12;
+}
+def : InstRW<[WriteVGATHERDPD128, ReadAfterLd], (instregex "VGATHERDPDrm")>;
+
+// y.
+def WriteVGATHERDPD256 : SchedWriteRes<[]> {
+ let NumMicroOps = 20;
+}
+def : InstRW<[WriteVGATHERDPD256, ReadAfterLd], (instregex "VGATHERDPDYrm")>;
+
+// VGATHERQPD.
+// x.
+def WriteVGATHERQPD128 : SchedWriteRes<[]> {
+ let NumMicroOps = 14;
+}
+def : InstRW<[WriteVGATHERQPD128, ReadAfterLd], (instregex "VGATHERQPDrm")>;
+
+// y.
+def WriteVGATHERQPD256 : SchedWriteRes<[]> {
+ let NumMicroOps = 22;
+}
+def : InstRW<[WriteVGATHERQPD256, ReadAfterLd], (instregex "VGATHERQPDYrm")>;
+
+// Remaining instrs.
+
+def HWWriteResGroup0 : SchedWriteRes<[HWPort23]> {
+ let Latency = 1;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[HWWriteResGroup0], (instregex "LDDQUrm")>;
+def: InstRW<[HWWriteResGroup0], (instregex "LD_F32m")>;
+def: InstRW<[HWWriteResGroup0], (instregex "LD_F64m")>;
+def: InstRW<[HWWriteResGroup0], (instregex "LD_F80m")>;
+def: InstRW<[HWWriteResGroup0], (instregex "MMX_MOVD64from64rm")>;
+def: InstRW<[HWWriteResGroup0], (instregex "MMX_MOVD64rm")>;
+def: InstRW<[HWWriteResGroup0], (instregex "MMX_MOVD64to64rm")>;
+def: InstRW<[HWWriteResGroup0], (instregex "MMX_MOVQ64rm")>;
+def: InstRW<[HWWriteResGroup0], (instregex "MOV(16|32|64)rm")>;
+def: InstRW<[HWWriteResGroup0], (instregex "MOV64toPQIrm")>;
+def: InstRW<[HWWriteResGroup0], (instregex "MOV8rm")>;
+def: InstRW<[HWWriteResGroup0], (instregex "MOVAPDrm")>;
+def: InstRW<[HWWriteResGroup0], (instregex "MOVAPSrm")>;
+def: InstRW<[HWWriteResGroup0], (instregex "MOVDDUPrm")>;
+def: InstRW<[HWWriteResGroup0], (instregex "MOVDI2PDIrm")>;
+def: InstRW<[HWWriteResGroup0], (instregex "MOVDQArm")>;
+def: InstRW<[HWWriteResGroup0], (instregex "MOVDQUrm")>;
+def: InstRW<[HWWriteResGroup0], (instregex "MOVNTDQArm")>;
+def: InstRW<[HWWriteResGroup0], (instregex "MOVSHDUPrm")>;
+def: InstRW<[HWWriteResGroup0], (instregex "MOVSLDUPrm")>;
+def: InstRW<[HWWriteResGroup0], (instregex "MOVSSrm")>;
+def: InstRW<[HWWriteResGroup0], (instregex "MOVSX(16|32|64)rm16")>;
+def: InstRW<[HWWriteResGroup0], (instregex "MOVSX(16|32|64)rm32")>;
+def: InstRW<[HWWriteResGroup0], (instregex "MOVSX(16|32|64)rm8")>;
+def: InstRW<[HWWriteResGroup0], (instregex "MOVUPDrm")>;
+def: InstRW<[HWWriteResGroup0], (instregex "MOVUPSrm")>;
+def: InstRW<[HWWriteResGroup0], (instregex "MOVZX(16|32|64)rm16")>;
+def: InstRW<[HWWriteResGroup0], (instregex "MOVZX(16|32|64)rm8")>;
+def: InstRW<[HWWriteResGroup0], (instregex "PREFETCHNTA")>;
+def: InstRW<[HWWriteResGroup0], (instregex "PREFETCHT0")>;
+def: InstRW<[HWWriteResGroup0], (instregex "PREFETCHT1")>;
+def: InstRW<[HWWriteResGroup0], (instregex "PREFETCHT2")>;
+def: InstRW<[HWWriteResGroup0], (instregex "VBROADCASTF128")>;
+def: InstRW<[HWWriteResGroup0], (instregex "VBROADCASTI128")>;
+def: InstRW<[HWWriteResGroup0], (instregex "VBROADCASTSDYrm")>;
+def: InstRW<[HWWriteResGroup0], (instregex "VBROADCASTSSYrm")>;
+def: InstRW<[HWWriteResGroup0], (instregex "VBROADCASTSSrm")>;
+def: InstRW<[HWWriteResGroup0], (instregex "VLDDQUYrm")>;
+def: InstRW<[HWWriteResGroup0], (instregex "VLDDQUrm")>;
+def: InstRW<[HWWriteResGroup0], (instregex "VMOV64toPQIrm")>;
+def: InstRW<[HWWriteResGroup0], (instregex "VMOVAPDYrm")>;
+def: InstRW<[HWWriteResGroup0], (instregex "VMOVAPDrm")>;
+def: InstRW<[HWWriteResGroup0], (instregex "VMOVAPSYrm")>;
+def: InstRW<[HWWriteResGroup0], (instregex "VMOVAPSrm")>;
+def: InstRW<[HWWriteResGroup0], (instregex "VMOVDDUPYrm")>;
+def: InstRW<[HWWriteResGroup0], (instregex "VMOVDDUPrm")>;
+def: InstRW<[HWWriteResGroup0], (instregex "VMOVDI2PDIrm")>;
+def: InstRW<[HWWriteResGroup0], (instregex "VMOVDQAYrm")>;
+def: InstRW<[HWWriteResGroup0], (instregex "VMOVDQArm")>;
+def: InstRW<[HWWriteResGroup0], (instregex "VMOVDQUYrm")>;
+def: InstRW<[HWWriteResGroup0], (instregex "VMOVDQUrm")>;
+def: InstRW<[HWWriteResGroup0], (instregex "VMOVNTDQAYrm")>;
+def: InstRW<[HWWriteResGroup0], (instregex "VMOVNTDQArm")>;
+def: InstRW<[HWWriteResGroup0], (instregex "VMOVQI2PQIrm")>;
+def: InstRW<[HWWriteResGroup0], (instregex "VMOVSDrm")>;
+def: InstRW<[HWWriteResGroup0], (instregex "VMOVSHDUPYrm")>;
+def: InstRW<[HWWriteResGroup0], (instregex "VMOVSHDUPrm")>;
+def: InstRW<[HWWriteResGroup0], (instregex "VMOVSLDUPYrm")>;
+def: InstRW<[HWWriteResGroup0], (instregex "VMOVSLDUPrm")>;
+def: InstRW<[HWWriteResGroup0], (instregex "VMOVSSrm")>;
+def: InstRW<[HWWriteResGroup0], (instregex "VMOVUPDYrm")>;
+def: InstRW<[HWWriteResGroup0], (instregex "VMOVUPDrm")>;
+def: InstRW<[HWWriteResGroup0], (instregex "VMOVUPSYrm")>;
+def: InstRW<[HWWriteResGroup0], (instregex "VMOVUPSrm")>;
+def: InstRW<[HWWriteResGroup0], (instregex "VPBROADCASTDYrm")>;
+def: InstRW<[HWWriteResGroup0], (instregex "VPBROADCASTDrm")>;
+def: InstRW<[HWWriteResGroup0], (instregex "VPBROADCASTQYrm")>;
+def: InstRW<[HWWriteResGroup0], (instregex "VPBROADCASTQrm")>;
+
+def HWWriteResGroup1 : SchedWriteRes<[HWPort4,HWPort237]> {
+ let Latency = 1;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[HWWriteResGroup1], (instregex "FBSTPm")>;
+def: InstRW<[HWWriteResGroup1], (instregex "MMX_MOVD64from64rm")>;
+def: InstRW<[HWWriteResGroup1], (instregex "MMX_MOVD64mr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "MMX_MOVNTQmr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "MMX_MOVQ64mr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "MOV(16|32|64)mr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "MOV8mi")>;
+def: InstRW<[HWWriteResGroup1], (instregex "MOV8mr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "MOVAPDmr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "MOVAPSmr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "MOVDQAmr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "MOVDQUmr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "MOVHPDmr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "MOVHPSmr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "MOVLPDmr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "MOVLPSmr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "MOVNTDQmr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "MOVNTI_64mr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "MOVNTImr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "MOVNTPDmr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "MOVNTPSmr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "MOVPDI2DImr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "MOVPQI2QImr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "MOVPQIto64mr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "MOVSSmr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "MOVUPDmr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "MOVUPSmr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "ST_FP32m")>;
+def: InstRW<[HWWriteResGroup1], (instregex "ST_FP64m")>;
+def: InstRW<[HWWriteResGroup1], (instregex "ST_FP80m")>;
+def: InstRW<[HWWriteResGroup1], (instregex "VEXTRACTF128mr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "VEXTRACTI128mr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "VMOVAPDYmr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "VMOVAPDmr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "VMOVAPSYmr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "VMOVAPSmr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "VMOVDQAYmr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "VMOVDQAmr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "VMOVDQUYmr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "VMOVDQUmr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "VMOVHPDmr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "VMOVHPSmr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "VMOVLPDmr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "VMOVLPSmr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "VMOVNTDQYmr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "VMOVNTDQmr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "VMOVNTPDYmr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "VMOVNTPDmr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "VMOVNTPSYmr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "VMOVNTPSmr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "VMOVPDI2DImr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "VMOVPQI2QImr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "VMOVPQIto64mr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "VMOVSDmr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "VMOVSSmr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "VMOVUPDYmr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "VMOVUPDmr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "VMOVUPSYmr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "VMOVUPSmr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "VMPTRSTm")>;
+
+def HWWriteResGroup2 : SchedWriteRes<[HWPort0]> {
+ let Latency = 1;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[HWWriteResGroup2], (instregex "MMX_MOVD64from64rr")>;
+def: InstRW<[HWWriteResGroup2], (instregex "MMX_MOVD64grr")>;
+def: InstRW<[HWWriteResGroup2], (instregex "MMX_PMOVMSKBrr")>;
+def: InstRW<[HWWriteResGroup2], (instregex "MMX_PSLLDri")>;
+def: InstRW<[HWWriteResGroup2], (instregex "MMX_PSLLDrr")>;
+def: InstRW<[HWWriteResGroup2], (instregex "MMX_PSLLQri")>;
+def: InstRW<[HWWriteResGroup2], (instregex "MMX_PSLLQrr")>;
+def: InstRW<[HWWriteResGroup2], (instregex "MMX_PSLLWri")>;
+def: InstRW<[HWWriteResGroup2], (instregex "MMX_PSLLWrr")>;
+def: InstRW<[HWWriteResGroup2], (instregex "MMX_PSRADri")>;
+def: InstRW<[HWWriteResGroup2], (instregex "MMX_PSRADrr")>;
+def: InstRW<[HWWriteResGroup2], (instregex "MMX_PSRAWri")>;
+def: InstRW<[HWWriteResGroup2], (instregex "MMX_PSRAWrr")>;
+def: InstRW<[HWWriteResGroup2], (instregex "MMX_PSRLDri")>;
+def: InstRW<[HWWriteResGroup2], (instregex "MMX_PSRLDrr")>;
+def: InstRW<[HWWriteResGroup2], (instregex "MMX_PSRLQri")>;
+def: InstRW<[HWWriteResGroup2], (instregex "MMX_PSRLQrr")>;
+def: InstRW<[HWWriteResGroup2], (instregex "MMX_PSRLWri")>;
+def: InstRW<[HWWriteResGroup2], (instregex "MMX_PSRLWrr")>;
+def: InstRW<[HWWriteResGroup2], (instregex "MOVPDI2DIrr")>;
+def: InstRW<[HWWriteResGroup2], (instregex "MOVPQIto64rr")>;
+def: InstRW<[HWWriteResGroup2], (instregex "PSLLDri")>;
+def: InstRW<[HWWriteResGroup2], (instregex "PSLLQri")>;
+def: InstRW<[HWWriteResGroup2], (instregex "PSLLWri")>;
+def: InstRW<[HWWriteResGroup2], (instregex "PSRADri")>;
+def: InstRW<[HWWriteResGroup2], (instregex "PSRAWri")>;
+def: InstRW<[HWWriteResGroup2], (instregex "PSRLDri")>;
+def: InstRW<[HWWriteResGroup2], (instregex "PSRLQri")>;
+def: InstRW<[HWWriteResGroup2], (instregex "PSRLWri")>;
+def: InstRW<[HWWriteResGroup2], (instregex "VMOVPDI2DIrr")>;
+def: InstRW<[HWWriteResGroup2], (instregex "VMOVPQIto64rr")>;
+def: InstRW<[HWWriteResGroup2], (instregex "VPSLLDYri")>;
+def: InstRW<[HWWriteResGroup2], (instregex "VPSLLDri")>;
+def: InstRW<[HWWriteResGroup2], (instregex "VPSLLQYri")>;
+def: InstRW<[HWWriteResGroup2], (instregex "VPSLLQri")>;
+def: InstRW<[HWWriteResGroup2], (instregex "VPSLLVQYrr")>;
+def: InstRW<[HWWriteResGroup2], (instregex "VPSLLVQrr")>;
+def: InstRW<[HWWriteResGroup2], (instregex "VPSLLWYri")>;
+def: InstRW<[HWWriteResGroup2], (instregex "VPSLLWri")>;
+def: InstRW<[HWWriteResGroup2], (instregex "VPSRADYri")>;
+def: InstRW<[HWWriteResGroup2], (instregex "VPSRADri")>;
+def: InstRW<[HWWriteResGroup2], (instregex "VPSRAWYri")>;
+def: InstRW<[HWWriteResGroup2], (instregex "VPSRAWri")>;
+def: InstRW<[HWWriteResGroup2], (instregex "VPSRLDYri")>;
+def: InstRW<[HWWriteResGroup2], (instregex "VPSRLDri")>;
+def: InstRW<[HWWriteResGroup2], (instregex "VPSRLQYri")>;
+def: InstRW<[HWWriteResGroup2], (instregex "VPSRLQri")>;
+def: InstRW<[HWWriteResGroup2], (instregex "VPSRLVQYrr")>;
+def: InstRW<[HWWriteResGroup2], (instregex "VPSRLVQrr")>;
+def: InstRW<[HWWriteResGroup2], (instregex "VPSRLWYri")>;
+def: InstRW<[HWWriteResGroup2], (instregex "VPSRLWri")>;
+def: InstRW<[HWWriteResGroup2], (instregex "VTESTPDYrr")>;
+def: InstRW<[HWWriteResGroup2], (instregex "VTESTPDrr")>;
+def: InstRW<[HWWriteResGroup2], (instregex "VTESTPSYrr")>;
+def: InstRW<[HWWriteResGroup2], (instregex "VTESTPSrr")>;
+
+def HWWriteResGroup3 : SchedWriteRes<[HWPort1]> {
+ let Latency = 1;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[HWWriteResGroup3], (instregex "COMP_FST0r")>;
+def: InstRW<[HWWriteResGroup3], (instregex "COM_FST0r")>;
+def: InstRW<[HWWriteResGroup3], (instregex "MMX_MASKMOVQ64")>;
+def: InstRW<[HWWriteResGroup3], (instregex "UCOM_FPr")>;
+def: InstRW<[HWWriteResGroup3], (instregex "UCOM_Fr")>;
+def: InstRW<[HWWriteResGroup3], (instregex "VMASKMOVDQU")>;
+
+def HWWriteResGroup4 : SchedWriteRes<[HWPort5]> {
+ let Latency = 1;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[HWWriteResGroup4], (instregex "ANDNPDrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "ANDNPSrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "ANDPDrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "ANDPSrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "INSERTPSrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "MMX_MOVD64rr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "MMX_MOVD64to64rr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "MMX_MOVQ2DQrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "MMX_PALIGNR64irr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "MMX_PSHUFBrr64")>;
+def: InstRW<[HWWriteResGroup4], (instregex "MMX_PSHUFWri")>;
+def: InstRW<[HWWriteResGroup4], (instregex "MMX_PUNPCKHBWirr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "MMX_PUNPCKHDQirr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "MMX_PUNPCKHWDirr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "MMX_PUNPCKLBWirr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "MMX_PUNPCKLDQirr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "MMX_PUNPCKLWDirr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "MOV64toPQIrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "MOVAPDrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "MOVAPSrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "MOVDDUPrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "MOVDI2PDIrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "MOVHLPSrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "MOVLHPSrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "MOVSDrr(_REV)?")>;
+def: InstRW<[HWWriteResGroup4], (instregex "MOVSHDUPrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "MOVSLDUPrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "MOVSSrr(_REV)?")>;
+def: InstRW<[HWWriteResGroup4], (instregex "MOVUPDrr(_REV)?")>;
+def: InstRW<[HWWriteResGroup4], (instregex "MOVUPSrr(_REV)?")>;
+def: InstRW<[HWWriteResGroup4], (instregex "ORPDrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "ORPSrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "PACKSSDWrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "PACKSSWBrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "PACKUSDWrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "PACKUSWBrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "PALIGNRrri")>;
+def: InstRW<[HWWriteResGroup4], (instregex "PBLENDWrri")>;
+def: InstRW<[HWWriteResGroup4], (instregex "PMOVSXBDrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "PMOVSXBQrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "PMOVSXBWrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "PMOVSXDQrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "PMOVSXWDrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "PMOVSXWQrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "PMOVZXBDrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "PMOVZXBQrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "PMOVZXBWrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "PMOVZXDQrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "PMOVZXWDrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "PMOVZXWQrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "PSHUFBrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "PSHUFDri")>;
+def: InstRW<[HWWriteResGroup4], (instregex "PSHUFHWri")>;
+def: InstRW<[HWWriteResGroup4], (instregex "PSHUFLWri")>;
+def: InstRW<[HWWriteResGroup4], (instregex "PSLLDQri")>;
+def: InstRW<[HWWriteResGroup4], (instregex "PSRLDQri")>;
+def: InstRW<[HWWriteResGroup4], (instregex "PUNPCKHBWrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "PUNPCKHDQrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "PUNPCKHQDQrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "PUNPCKHWDrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "PUNPCKLBWrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "PUNPCKLDQrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "PUNPCKLQDQrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "PUNPCKLWDrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "SHUFPDrri")>;
+def: InstRW<[HWWriteResGroup4], (instregex "SHUFPSrri")>;
+def: InstRW<[HWWriteResGroup4], (instregex "UNPCKHPDrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "UNPCKHPSrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "UNPCKLPDrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "UNPCKLPSrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VANDNPDYrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VANDNPDrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VANDNPSYrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VANDNPSrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VANDPDYrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VANDPDrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VANDPSYrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VANDPSrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VBROADCASTSSrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VINSERTPSrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VMOV64toPQIrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VMOVAPDYrr(_REV)?")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VMOVAPDrr(_REV)?")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VMOVAPSYrr(_REV)?")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VMOVAPSrr(_REV)?")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VMOVDDUPYrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VMOVDDUPrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VMOVDI2PDIrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VMOVHLPSrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VMOVLHPSrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VMOVSDrr(_REV)?")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VMOVSHDUPYrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VMOVSHDUPrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VMOVSLDUPYrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VMOVSLDUPrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VMOVSSrr(_REV)?")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VMOVUPDYrr(_REV)?")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VMOVUPDrr(_REV)?")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VMOVUPSYrr(_REV)?")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VMOVUPSrr(_REV)?")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VORPDYrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VORPDrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VORPSYrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VORPSrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPACKSSDWYrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPACKSSDWrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPACKSSWBYrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPACKSSWBrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPACKUSDWYrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPACKUSDWrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPACKUSWBYrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPACKUSWBrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPALIGNRYrri")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPALIGNRrri")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPBLENDWYrri")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPBLENDWrri")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPBROADCASTDrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPBROADCASTQrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPERMILPDYri")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPERMILPDYrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPERMILPDri")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPERMILPDrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPERMILPSYri")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPERMILPSYrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPERMILPSri")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPERMILPSrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPMOVSXBDrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPMOVSXBQrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPMOVSXBWrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPMOVSXDQrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPMOVSXWDrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPMOVSXWQrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPMOVZXBDrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPMOVZXBQrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPMOVZXBWrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPMOVZXDQrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPMOVZXWDrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPMOVZXWQrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPSHUFBYrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPSHUFBrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPSHUFDYri")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPSHUFDri")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPSHUFHWYri")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPSHUFHWri")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPSHUFLWYri")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPSHUFLWri")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPSLLDQYri")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPSLLDQri")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPSRLDQYri")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPSRLDQri")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPUNPCKHBWYrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPUNPCKHBWrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPUNPCKHDQYrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPUNPCKHDQrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPUNPCKHQDQYrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPUNPCKHQDQrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPUNPCKHWDYrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPUNPCKHWDrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPUNPCKLBWYrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPUNPCKLBWrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPUNPCKLDQYrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPUNPCKLDQrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPUNPCKLQDQYrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPUNPCKLQDQrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPUNPCKLWDYrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPUNPCKLWDrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VSHUFPDYrri")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VSHUFPDrri")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VSHUFPSYrri")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VSHUFPSrri")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VUNPCKHPDYrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VUNPCKHPDrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VUNPCKHPSYrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VUNPCKHPSrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VUNPCKLPDYrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VUNPCKLPDrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VUNPCKLPSYrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VUNPCKLPSrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VXORPDYrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VXORPDrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VXORPSYrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VXORPSrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "XORPDrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "XORPSrr")>;
+
+def HWWriteResGroup5 : SchedWriteRes<[HWPort6]> {
+ let Latency = 1;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[HWWriteResGroup5], (instregex "JMP(16|32|64)r")>;
+
+def HWWriteResGroup6 : SchedWriteRes<[HWPort01]> {
+ let Latency = 1;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[HWWriteResGroup6], (instregex "FINCSTP")>;
+def: InstRW<[HWWriteResGroup6], (instregex "FNOP")>;
+
+def HWWriteResGroup7 : SchedWriteRes<[HWPort06]> {
+ let Latency = 1;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[HWWriteResGroup7], (instregex "BT(16|32|64)ri8")>;
+def: InstRW<[HWWriteResGroup7], (instregex "BT(16|32|64)rr")>;
+def: InstRW<[HWWriteResGroup7], (instregex "BTC(16|32|64)ri8")>;
+def: InstRW<[HWWriteResGroup7], (instregex "BTC(16|32|64)rr")>;
+def: InstRW<[HWWriteResGroup7], (instregex "BTR(16|32|64)ri8")>;
+def: InstRW<[HWWriteResGroup7], (instregex "BTR(16|32|64)rr")>;
+def: InstRW<[HWWriteResGroup7], (instregex "BTS(16|32|64)ri8")>;
+def: InstRW<[HWWriteResGroup7], (instregex "BTS(16|32|64)rr")>;
+def: InstRW<[HWWriteResGroup7], (instregex "CDQ")>;
+def: InstRW<[HWWriteResGroup7], (instregex "CQO")>;
+def: InstRW<[HWWriteResGroup7], (instregex "JAE_1")>;
+def: InstRW<[HWWriteResGroup7], (instregex "JAE_4")>;
+def: InstRW<[HWWriteResGroup7], (instregex "JA_1")>;
+def: InstRW<[HWWriteResGroup7], (instregex "JA_4")>;
+def: InstRW<[HWWriteResGroup7], (instregex "JBE_1")>;
+def: InstRW<[HWWriteResGroup7], (instregex "JBE_4")>;
+def: InstRW<[HWWriteResGroup7], (instregex "JB_1")>;
+def: InstRW<[HWWriteResGroup7], (instregex "JB_4")>;
+def: InstRW<[HWWriteResGroup7], (instregex "JE_1")>;
+def: InstRW<[HWWriteResGroup7], (instregex "JE_4")>;
+def: InstRW<[HWWriteResGroup7], (instregex "JGE_1")>;
+def: InstRW<[HWWriteResGroup7], (instregex "JGE_4")>;
+def: InstRW<[HWWriteResGroup7], (instregex "JG_1")>;
+def: InstRW<[HWWriteResGroup7], (instregex "JG_4")>;
+def: InstRW<[HWWriteResGroup7], (instregex "JLE_1")>;
+def: InstRW<[HWWriteResGroup7], (instregex "JLE_4")>;
+def: InstRW<[HWWriteResGroup7], (instregex "JL_1")>;
+def: InstRW<[HWWriteResGroup7], (instregex "JL_4")>;
+def: InstRW<[HWWriteResGroup7], (instregex "JMP_1")>;
+def: InstRW<[HWWriteResGroup7], (instregex "JMP_4")>;
+def: InstRW<[HWWriteResGroup7], (instregex "JNE_1")>;
+def: InstRW<[HWWriteResGroup7], (instregex "JNE_4")>;
+def: InstRW<[HWWriteResGroup7], (instregex "JNO_1")>;
+def: InstRW<[HWWriteResGroup7], (instregex "JNO_4")>;
+def: InstRW<[HWWriteResGroup7], (instregex "JNP_1")>;
+def: InstRW<[HWWriteResGroup7], (instregex "JNP_4")>;
+def: InstRW<[HWWriteResGroup7], (instregex "JNS_1")>;
+def: InstRW<[HWWriteResGroup7], (instregex "JNS_4")>;
+def: InstRW<[HWWriteResGroup7], (instregex "JO_1")>;
+def: InstRW<[HWWriteResGroup7], (instregex "JO_4")>;
+def: InstRW<[HWWriteResGroup7], (instregex "JP_1")>;
+def: InstRW<[HWWriteResGroup7], (instregex "JP_4")>;
+def: InstRW<[HWWriteResGroup7], (instregex "JS_1")>;
+def: InstRW<[HWWriteResGroup7], (instregex "JS_4")>;
+def: InstRW<[HWWriteResGroup7], (instregex "RORX32ri")>;
+def: InstRW<[HWWriteResGroup7], (instregex "RORX64ri")>;
+def: InstRW<[HWWriteResGroup7], (instregex "SAR(16|32|64)r1")>;
+def: InstRW<[HWWriteResGroup7], (instregex "SAR(16|32|64)ri")>;
+def: InstRW<[HWWriteResGroup7], (instregex "SAR8r1")>;
+def: InstRW<[HWWriteResGroup7], (instregex "SAR8ri")>;
+def: InstRW<[HWWriteResGroup7], (instregex "SARX32rr")>;
+def: InstRW<[HWWriteResGroup7], (instregex "SARX64rr")>;
+def: InstRW<[HWWriteResGroup7], (instregex "SETAEr")>;
+def: InstRW<[HWWriteResGroup7], (instregex "SETBr")>;
+def: InstRW<[HWWriteResGroup7], (instregex "SETEr")>;
+def: InstRW<[HWWriteResGroup7], (instregex "SETGEr")>;
+def: InstRW<[HWWriteResGroup7], (instregex "SETGr")>;
+def: InstRW<[HWWriteResGroup7], (instregex "SETLEr")>;
+def: InstRW<[HWWriteResGroup7], (instregex "SETLr")>;
+def: InstRW<[HWWriteResGroup7], (instregex "SETNEr")>;
+def: InstRW<[HWWriteResGroup7], (instregex "SETNOr")>;
+def: InstRW<[HWWriteResGroup7], (instregex "SETNPr")>;
+def: InstRW<[HWWriteResGroup7], (instregex "SETNSr")>;
+def: InstRW<[HWWriteResGroup7], (instregex "SETOr")>;
+def: InstRW<[HWWriteResGroup7], (instregex "SETPr")>;
+def: InstRW<[HWWriteResGroup7], (instregex "SETSr")>;
+def: InstRW<[HWWriteResGroup7], (instregex "SHL(16|32|64)r1")>;
+def: InstRW<[HWWriteResGroup7], (instregex "SHL(16|32|64)ri")>;
+def: InstRW<[HWWriteResGroup7], (instregex "SHL8r1")>;
+def: InstRW<[HWWriteResGroup7], (instregex "SHL8ri")>;
+def: InstRW<[HWWriteResGroup7], (instregex "SHLX32rr")>;
+def: InstRW<[HWWriteResGroup7], (instregex "SHLX64rr")>;
+def: InstRW<[HWWriteResGroup7], (instregex "SHR(16|32|64)r1")>;
+def: InstRW<[HWWriteResGroup7], (instregex "SHR(16|32|64)ri")>;
+def: InstRW<[HWWriteResGroup7], (instregex "SHR8r1")>;
+def: InstRW<[HWWriteResGroup7], (instregex "SHR8ri")>;
+def: InstRW<[HWWriteResGroup7], (instregex "SHRX32rr")>;
+def: InstRW<[HWWriteResGroup7], (instregex "SHRX64rr")>;
+
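+// HWWriteResGroup8: single-uop, 1-cycle integer ops that issue on port 1 or 5
+// (LEA, BMI bit manipulation, MMX/SSE/AVX integer ALU).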
+def HWWriteResGroup8 : SchedWriteRes<[HWPort15]> {
+ let Latency = 1;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[HWWriteResGroup8], (instregex "ANDN32rr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "ANDN64rr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "BLSI32rr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "BLSI64rr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "BLSMSK32rr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "BLSMSK64rr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "BLSR32rr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "BLSR64rr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "BZHI32rr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "BZHI64rr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "LEA(16|32|64)r")>;
+def: InstRW<[HWWriteResGroup8], (instregex "MMX_PABSBrr64")>;
+def: InstRW<[HWWriteResGroup8], (instregex "MMX_PABSDrr64")>;
+def: InstRW<[HWWriteResGroup8], (instregex "MMX_PABSWrr64")>;
+def: InstRW<[HWWriteResGroup8], (instregex "MMX_PADDBirr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "MMX_PADDDirr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "MMX_PADDQirr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "MMX_PADDSBirr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "MMX_PADDSWirr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "MMX_PADDUSBirr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "MMX_PADDUSWirr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "MMX_PADDWirr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "MMX_PAVGBirr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "MMX_PAVGWirr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "MMX_PCMPEQBirr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "MMX_PCMPEQDirr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "MMX_PCMPEQWirr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "MMX_PCMPGTBirr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "MMX_PCMPGTDirr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "MMX_PCMPGTWirr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "MMX_PMAXSWirr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "MMX_PMAXUBirr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "MMX_PMINSWirr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "MMX_PMINUBirr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "MMX_PSIGNBrr64")>;
+def: InstRW<[HWWriteResGroup8], (instregex "MMX_PSIGNDrr64")>;
+def: InstRW<[HWWriteResGroup8], (instregex "MMX_PSIGNWrr64")>;
+def: InstRW<[HWWriteResGroup8], (instregex "MMX_PSUBBirr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "MMX_PSUBDirr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "MMX_PSUBQirr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "MMX_PSUBSBirr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "MMX_PSUBSWirr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "MMX_PSUBUSBirr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "MMX_PSUBUSWirr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "MMX_PSUBWirr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "PABSBrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "PABSDrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "PABSWrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "PADDBrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "PADDDrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "PADDQrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "PADDSBrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "PADDSWrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "PADDUSBrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "PADDUSWrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "PADDWrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "PAVGBrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "PAVGWrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "PCMPEQBrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "PCMPEQDrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "PCMPEQQrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "PCMPEQWrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "PCMPGTBrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "PCMPGTDrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "PCMPGTWrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "PMAXSBrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "PMAXSDrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "PMAXSWrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "PMAXUBrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "PMAXUDrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "PMAXUWrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "PMINSBrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "PMINSDrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "PMINSWrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "PMINUBrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "PMINUDrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "PMINUWrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "PSIGNBrr128")>;
+def: InstRW<[HWWriteResGroup8], (instregex "PSIGNDrr128")>;
+def: InstRW<[HWWriteResGroup8], (instregex "PSIGNWrr128")>;
+def: InstRW<[HWWriteResGroup8], (instregex "PSUBBrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "PSUBDrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "PSUBQrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "PSUBSBrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "PSUBSWrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "PSUBUSBrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "PSUBUSWrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "PSUBWrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPABSBYrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPABSBrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPABSDYrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPABSDrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPABSWYrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPABSWrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPADDBYrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPADDBrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPADDDYrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPADDDrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPADDQYrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPADDQrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPADDSBYrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPADDSBrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPADDSWYrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPADDSWrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPADDUSBYrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPADDUSBrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPADDUSWYrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPADDUSWrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPADDWYrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPADDWrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPAVGBYrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPAVGBrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPAVGWYrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPAVGWrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPCMPEQBYrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPCMPEQBrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPCMPEQDYrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPCMPEQDrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPCMPEQQYrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPCMPEQQrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPCMPEQWYrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPCMPEQWrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPCMPGTBYrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPCMPGTBrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPCMPGTDYrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPCMPGTDrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPCMPGTWYrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPCMPGTWrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPMAXSBYrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPMAXSBrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPMAXSDYrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPMAXSDrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPMAXSWYrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPMAXSWrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPMAXUBYrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPMAXUBrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPMAXUDYrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPMAXUDrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPMAXUWYrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPMAXUWrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPMINSBYrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPMINSBrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPMINSDYrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPMINSDrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPMINSWYrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPMINSWrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPMINUBYrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPMINUBrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPMINUDYrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPMINUDrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPMINUWYrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPMINUWrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPSIGNBYrr256")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPSIGNBrr128")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPSIGNDYrr256")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPSIGNDrr128")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPSIGNWYrr256")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPSIGNWrr128")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPSUBBYrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPSUBBrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPSUBDYrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPSUBDrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPSUBQYrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPSUBQrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPSUBSBYrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPSUBSBrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPSUBSWYrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPSUBSWrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPSUBUSBYrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPSUBUSBrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPSUBUSWYrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPSUBUSWrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPSUBWYrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPSUBWrr")>;
+
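+// HWWriteResGroup9: single-uop, 1-cycle vector logic, blend and register-move
+// ops on any of ports 0, 1 or 5.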
+def HWWriteResGroup9 : SchedWriteRes<[HWPort015]> {
+ let Latency = 1;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[HWWriteResGroup9], (instregex "BLENDPDrri")>;
+def: InstRW<[HWWriteResGroup9], (instregex "BLENDPSrri")>;
+def: InstRW<[HWWriteResGroup9], (instregex "MMX_MOVD64from64rr")>;
+def: InstRW<[HWWriteResGroup9], (instregex "MMX_MOVQ64rr(_REV?)")>;
+def: InstRW<[HWWriteResGroup9], (instregex "MMX_PANDNirr")>;
+def: InstRW<[HWWriteResGroup9], (instregex "MMX_PANDirr")>;
+def: InstRW<[HWWriteResGroup9], (instregex "MMX_PORirr")>;
+def: InstRW<[HWWriteResGroup9], (instregex "MMX_PXORirr")>;
+def: InstRW<[HWWriteResGroup9], (instregex "MOVDQArr(_REV?)")>;
+def: InstRW<[HWWriteResGroup9], (instregex "MOVDQUrr(_REV?)")>;
+def: InstRW<[HWWriteResGroup9], (instregex "MOVPQI2QIrr")>;
+def: InstRW<[HWWriteResGroup9], (instregex "PANDNrr")>;
+def: InstRW<[HWWriteResGroup9], (instregex "PANDrr")>;
+def: InstRW<[HWWriteResGroup9], (instregex "PORrr")>;
+def: InstRW<[HWWriteResGroup9], (instregex "PXORrr")>;
+def: InstRW<[HWWriteResGroup9], (instregex "VBLENDPDYrri")>;
+def: InstRW<[HWWriteResGroup9], (instregex "VBLENDPDrri")>;
+def: InstRW<[HWWriteResGroup9], (instregex "VBLENDPSYrri")>;
+def: InstRW<[HWWriteResGroup9], (instregex "VBLENDPSrri")>;
+def: InstRW<[HWWriteResGroup9], (instregex "VMOVDQAYrr(_REV?)")>;
+def: InstRW<[HWWriteResGroup9], (instregex "VMOVDQArr(_REV?)")>;
+def: InstRW<[HWWriteResGroup9], (instregex "VMOVDQUYrr(_REV?)")>;
+def: InstRW<[HWWriteResGroup9], (instregex "VMOVDQUrr(_REV?)")>;
+def: InstRW<[HWWriteResGroup9], (instregex "VMOVPQI2QIrr")>;
+def: InstRW<[HWWriteResGroup9], (instregex "VMOVZPQILo2PQIrr")>;
+def: InstRW<[HWWriteResGroup9], (instregex "VPANDNYrr")>;
+def: InstRW<[HWWriteResGroup9], (instregex "VPANDNrr")>;
+def: InstRW<[HWWriteResGroup9], (instregex "VPANDYrr")>;
+def: InstRW<[HWWriteResGroup9], (instregex "VPANDrr")>;
+def: InstRW<[HWWriteResGroup9], (instregex "VPBLENDDYrri")>;
+def: InstRW<[HWWriteResGroup9], (instregex "VPBLENDDrri")>;
+def: InstRW<[HWWriteResGroup9], (instregex "VPORYrr")>;
+def: InstRW<[HWWriteResGroup9], (instregex "VPORrr")>;
+def: InstRW<[HWWriteResGroup9], (instregex "VPXORYrr")>;
+def: InstRW<[HWWriteResGroup9], (instregex "VPXORrr")>;
+
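+// HWWriteResGroup10: single-uop, 1-cycle scalar integer ALU ops on any of
+// ports 0, 1, 5 or 6.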
+def HWWriteResGroup10 : SchedWriteRes<[HWPort0156]> {
+ let Latency = 1;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[HWWriteResGroup10], (instregex "ADD(16|32|64)ri8")>;
+def: InstRW<[HWWriteResGroup10], (instregex "ADD(16|32|64)rr(_REV?)")>;
+def: InstRW<[HWWriteResGroup10], (instregex "ADD8i8")>;
+def: InstRW<[HWWriteResGroup10], (instregex "ADD8ri")>;
+def: InstRW<[HWWriteResGroup10], (instregex "ADD8rr(_REV?)")>;
+def: InstRW<[HWWriteResGroup10], (instregex "AND(16|32|64)ri8")>;
+def: InstRW<[HWWriteResGroup10], (instregex "AND(16|32|64)rr(_REV?)")>;
+def: InstRW<[HWWriteResGroup10], (instregex "AND8i8")>;
+def: InstRW<[HWWriteResGroup10], (instregex "AND8ri")>;
+def: InstRW<[HWWriteResGroup10], (instregex "AND8rr(_REV?)")>;
+def: InstRW<[HWWriteResGroup10], (instregex "CBW")>;
+def: InstRW<[HWWriteResGroup10], (instregex "CLC")>;
+def: InstRW<[HWWriteResGroup10], (instregex "CMC")>;
+def: InstRW<[HWWriteResGroup10], (instregex "CMP(16|32|64)ri8")>;
+def: InstRW<[HWWriteResGroup10], (instregex "CMP(16|32|64)rr(_REV?)")>;
+def: InstRW<[HWWriteResGroup10], (instregex "CMP8i8")>;
+def: InstRW<[HWWriteResGroup10], (instregex "CMP8ri")>;
+def: InstRW<[HWWriteResGroup10], (instregex "CMP8rr(_REV?)")>;
+def: InstRW<[HWWriteResGroup10], (instregex "CWDE")>;
+def: InstRW<[HWWriteResGroup10], (instregex "DEC(16|32|64)r")>;
+def: InstRW<[HWWriteResGroup10], (instregex "DEC8r")>;
+def: InstRW<[HWWriteResGroup10], (instregex "INC(16|32|64)r")>;
+def: InstRW<[HWWriteResGroup10], (instregex "INC8r")>;
+def: InstRW<[HWWriteResGroup10], (instregex "LAHF")>;
+def: InstRW<[HWWriteResGroup10], (instregex "MOV(16|32|64)rr(_REV?)")>;
+def: InstRW<[HWWriteResGroup10], (instregex "MOV8ri(_alt?)")>;
+def: InstRW<[HWWriteResGroup10], (instregex "MOV8rr(_REV?)")>;
+def: InstRW<[HWWriteResGroup10], (instregex "MOVSX(16|32|64)rr16")>;
+def: InstRW<[HWWriteResGroup10], (instregex "MOVSX(16|32|64)rr32")>;
+def: InstRW<[HWWriteResGroup10], (instregex "MOVSX(16|32|64)rr8")>;
+def: InstRW<[HWWriteResGroup10], (instregex "MOVZX(16|32|64)rr16")>;
+def: InstRW<[HWWriteResGroup10], (instregex "MOVZX(16|32|64)rr8")>;
+def: InstRW<[HWWriteResGroup10], (instregex "NEG(16|32|64)r")>;
+def: InstRW<[HWWriteResGroup10], (instregex "NEG8r")>;
+def: InstRW<[HWWriteResGroup10], (instregex "NOOP")>;
+def: InstRW<[HWWriteResGroup10], (instregex "NOT(16|32|64)r")>;
+def: InstRW<[HWWriteResGroup10], (instregex "NOT8r")>;
+def: InstRW<[HWWriteResGroup10], (instregex "OR(16|32|64)ri8")>;
+def: InstRW<[HWWriteResGroup10], (instregex "OR(16|32|64)rr(_REV?)")>;
+def: InstRW<[HWWriteResGroup10], (instregex "OR8i8")>;
+def: InstRW<[HWWriteResGroup10], (instregex "OR8ri")>;
+def: InstRW<[HWWriteResGroup10], (instregex "OR8rr(_REV?)")>;
+def: InstRW<[HWWriteResGroup10], (instregex "SAHF")>;
+def: InstRW<[HWWriteResGroup10], (instregex "SGDT64m")>;
+def: InstRW<[HWWriteResGroup10], (instregex "SIDT64m")>;
+def: InstRW<[HWWriteResGroup10], (instregex "SLDT64m")>;
+def: InstRW<[HWWriteResGroup10], (instregex "SMSW16m")>;
+def: InstRW<[HWWriteResGroup10], (instregex "STC")>;
+def: InstRW<[HWWriteResGroup10], (instregex "STRm")>;
+def: InstRW<[HWWriteResGroup10], (instregex "SUB(16|32|64)ri8")>;
+def: InstRW<[HWWriteResGroup10], (instregex "SUB(16|32|64)rr(_REV?)")>;
+def: InstRW<[HWWriteResGroup10], (instregex "SUB8i8")>;
+def: InstRW<[HWWriteResGroup10], (instregex "SUB8ri")>;
+def: InstRW<[HWWriteResGroup10], (instregex "SUB8rr(_REV?)")>;
+def: InstRW<[HWWriteResGroup10], (instregex "SYSCALL")>;
+def: InstRW<[HWWriteResGroup10], (instregex "TEST(16|32|64)rr")>;
+def: InstRW<[HWWriteResGroup10], (instregex "TEST8i8")>;
+def: InstRW<[HWWriteResGroup10], (instregex "TEST8ri")>;
+def: InstRW<[HWWriteResGroup10], (instregex "TEST8rr")>;
+def: InstRW<[HWWriteResGroup10], (instregex "XCHG(16|32|64)rr")>;
+def: InstRW<[HWWriteResGroup10], (instregex "XOR(16|32|64)ri8")>;
+def: InstRW<[HWWriteResGroup10], (instregex "XOR(16|32|64)rr")>;
+def: InstRW<[HWWriteResGroup10], (instregex "XOR8i8")>;
+def: InstRW<[HWWriteResGroup10], (instregex "XOR8ri")>;
+def: InstRW<[HWWriteResGroup10], (instregex "XOR8rr")>;
+
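+// HWWriteResGroup11: port-0 ops with a memory source (conversions, vector
+// shifts, VTESTPS/PD); one uop on port 0 plus one load uop.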
+def HWWriteResGroup11 : SchedWriteRes<[HWPort0,HWPort23]> {
+ let Latency = 1;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[HWWriteResGroup11], (instregex "CVTPS2PDrm")>;
+def: InstRW<[HWWriteResGroup11], (instregex "CVTSS2SDrm")>;
+def: InstRW<[HWWriteResGroup11], (instregex "MMX_PSLLDrm")>;
+def: InstRW<[HWWriteResGroup11], (instregex "MMX_PSLLQrm")>;
+def: InstRW<[HWWriteResGroup11], (instregex "MMX_PSLLWrm")>;
+def: InstRW<[HWWriteResGroup11], (instregex "MMX_PSRADrm")>;
+def: InstRW<[HWWriteResGroup11], (instregex "MMX_PSRAWrm")>;
+def: InstRW<[HWWriteResGroup11], (instregex "MMX_PSRLDrm")>;
+def: InstRW<[HWWriteResGroup11], (instregex "MMX_PSRLQrm")>;
+def: InstRW<[HWWriteResGroup11], (instregex "MMX_PSRLWrm")>;
+def: InstRW<[HWWriteResGroup11], (instregex "VCVTPH2PSYrm")>;
+def: InstRW<[HWWriteResGroup11], (instregex "VCVTPH2PSrm")>;
+def: InstRW<[HWWriteResGroup11], (instregex "VCVTPS2PDrm")>;
+def: InstRW<[HWWriteResGroup11], (instregex "VCVTSS2SDrm")>;
+def: InstRW<[HWWriteResGroup11], (instregex "VPSLLDYrm")>;
+def: InstRW<[HWWriteResGroup11], (instregex "VPSLLQYrm")>;
+def: InstRW<[HWWriteResGroup11], (instregex "VPSLLVQYrm")>;
+def: InstRW<[HWWriteResGroup11], (instregex "VPSLLVQrm")>;
+def: InstRW<[HWWriteResGroup11], (instregex "VPSLLWYrm")>;
+def: InstRW<[HWWriteResGroup11], (instregex "VPSRADYrm")>;
+def: InstRW<[HWWriteResGroup11], (instregex "VPSRAWYrm")>;
+def: InstRW<[HWWriteResGroup11], (instregex "VPSRLDYrm")>;
+def: InstRW<[HWWriteResGroup11], (instregex "VPSRLQYrm")>;
+def: InstRW<[HWWriteResGroup11], (instregex "VPSRLVQYrm")>;
+def: InstRW<[HWWriteResGroup11], (instregex "VPSRLVQrm")>;
+def: InstRW<[HWWriteResGroup11], (instregex "VPSRLWYrm")>;
+def: InstRW<[HWWriteResGroup11], (instregex "VTESTPDYrm")>;
+def: InstRW<[HWWriteResGroup11], (instregex "VTESTPDrm")>;
+def: InstRW<[HWWriteResGroup11], (instregex "VTESTPSYrm")>;
+def: InstRW<[HWWriteResGroup11], (instregex "VTESTPSrm")>;
+
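+// HWWriteResGroup12: x87 FCOM/FCOMP from memory; port 1 plus a load port.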
+def HWWriteResGroup12 : SchedWriteRes<[HWPort1,HWPort23]> {
+ let Latency = 1;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[HWWriteResGroup12], (instregex "FCOM32m")>;
+def: InstRW<[HWWriteResGroup12], (instregex "FCOM64m")>;
+def: InstRW<[HWWriteResGroup12], (instregex "FCOMP32m")>;
+def: InstRW<[HWWriteResGroup12], (instregex "FCOMP64m")>;
+
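+// HWWriteResGroup13: shuffle, unpack, pack, insert and sign/zero-extend ops
+// with a memory source; port 5 plus a load port.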
+def HWWriteResGroup13 : SchedWriteRes<[HWPort5,HWPort23]> {
+ let Latency = 1;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[HWWriteResGroup13], (instregex "ANDNPDrm")>;
+def: InstRW<[HWWriteResGroup13], (instregex "ANDNPSrm")>;
+def: InstRW<[HWWriteResGroup13], (instregex "ANDPDrm")>;
+def: InstRW<[HWWriteResGroup13], (instregex "ANDPSrm")>;
+def: InstRW<[HWWriteResGroup13], (instregex "INSERTPSrm")>;
+def: InstRW<[HWWriteResGroup13], (instregex "MMX_PALIGNR64irm")>;
+def: InstRW<[HWWriteResGroup13], (instregex "MMX_PINSRWirmi")>;
+def: InstRW<[HWWriteResGroup13], (instregex "MMX_PSHUFBrm64")>;
+def: InstRW<[HWWriteResGroup13], (instregex "MMX_PSHUFWmi")>;
+def: InstRW<[HWWriteResGroup13], (instregex "MMX_PUNPCKHBWirm")>;
+def: InstRW<[HWWriteResGroup13], (instregex "MMX_PUNPCKHDQirm")>;
+def: InstRW<[HWWriteResGroup13], (instregex "MMX_PUNPCKHWDirm")>;
+def: InstRW<[HWWriteResGroup13], (instregex "MMX_PUNPCKLBWirm")>;
+def: InstRW<[HWWriteResGroup13], (instregex "MMX_PUNPCKLDQirm")>;
+def: InstRW<[HWWriteResGroup13], (instregex "MMX_PUNPCKLWDirm")>;
+def: InstRW<[HWWriteResGroup13], (instregex "MOVHPDrm")>;
+def: InstRW<[HWWriteResGroup13], (instregex "MOVHPSrm")>;
+def: InstRW<[HWWriteResGroup13], (instregex "MOVLPDrm")>;
+def: InstRW<[HWWriteResGroup13], (instregex "MOVLPSrm")>;
+def: InstRW<[HWWriteResGroup13], (instregex "ORPDrm")>;
+def: InstRW<[HWWriteResGroup13], (instregex "ORPSrm")>;
+def: InstRW<[HWWriteResGroup13], (instregex "PACKSSDWrm")>;
+def: InstRW<[HWWriteResGroup13], (instregex "PACKSSWBrm")>;
+def: InstRW<[HWWriteResGroup13], (instregex "PACKUSDWrm")>;
+def: InstRW<[HWWriteResGroup13], (instregex "PACKUSWBrm")>;
+def: InstRW<[HWWriteResGroup13], (instregex "PALIGNRrmi")>;
+def: InstRW<[HWWriteResGroup13], (instregex "PBLENDWrmi")>;
+def: InstRW<[HWWriteResGroup13], (instregex "PINSRBrm")>;
+def: InstRW<[HWWriteResGroup13], (instregex "PINSRDrm")>;
+def: InstRW<[HWWriteResGroup13], (instregex "PINSRQrm")>;
+def: InstRW<[HWWriteResGroup13], (instregex "PINSRWrmi")>;
+def: InstRW<[HWWriteResGroup13], (instregex "PMOVSXBDrm")>;
+def: InstRW<[HWWriteResGroup13], (instregex "PMOVSXBQrm")>;
+def: InstRW<[HWWriteResGroup13], (instregex "PMOVSXBWrm")>;
+def: InstRW<[HWWriteResGroup13], (instregex "PMOVSXDQrm")>;
+def: InstRW<[HWWriteResGroup13], (instregex "PMOVSXWDrm")>;
+def: InstRW<[HWWriteResGroup13], (instregex "PMOVSXWQrm")>;
+def: InstRW<[HWWriteResGroup13], (instregex "PMOVZXBDrm")>;
+def: InstRW<[HWWriteResGroup13], (instregex "PMOVZXBQrm")>;
+def: InstRW<[HWWriteResGroup13], (instregex "PMOVZXBWrm")>;
+def: InstRW<[HWWriteResGroup13], (instregex "PMOVZXDQrm")>;
+def: InstRW<[HWWriteResGroup13], (instregex "PMOVZXWDrm")>;
+def: InstRW<[HWWriteResGroup13], (instregex "PMOVZXWQrm")>;
+def: InstRW<[HWWriteResGroup13], (instregex "PSHUFBrm")>;
+def: InstRW<[HWWriteResGroup13], (instregex "PSHUFDmi")>;
+def: InstRW<[HWWriteResGroup13], (instregex "PSHUFHWmi")>;
+def: InstRW<[HWWriteResGroup13], (instregex "PSHUFLWmi")>;
+def: InstRW<[HWWriteResGroup13], (instregex "PUNPCKHBWrm")>;
+def: InstRW<[HWWriteResGroup13], (instregex "PUNPCKHDQrm")>;
+def: InstRW<[HWWriteResGroup13], (instregex "PUNPCKHQDQrm")>;
+def: InstRW<[HWWriteResGroup13], (instregex "PUNPCKHWDrm")>;
+def: InstRW<[HWWriteResGroup13], (instregex "PUNPCKLBWrm")>;
+def: InstRW<[HWWriteResGroup13], (instregex "PUNPCKLDQrm")>;
+def: InstRW<[HWWriteResGroup13], (instregex "PUNPCKLQDQrm")>;
+def: InstRW<[HWWriteResGroup13], (instregex "PUNPCKLWDrm")>;
+def: InstRW<[HWWriteResGroup13], (instregex "SHUFPDrmi")>;
+def: InstRW<[HWWriteResGroup13], (instregex "SHUFPSrmi")>;
+def: InstRW<[HWWriteResGroup13], (instregex "UNPCKHPDrm")>;
+def: InstRW<[HWWriteResGroup13], (instregex "UNPCKHPSrm")>;
+def: InstRW<[HWWriteResGroup13], (instregex "UNPCKLPDrm")>;
+def: InstRW<[HWWriteResGroup13], (instregex "UNPCKLPSrm")>;
+def: InstRW<[HWWriteResGroup13], (instregex "VANDNPDYrm")>;
+def: InstRW<[HWWriteResGroup13], (instregex "VANDNPDrm")>;
+def: InstRW<[HWWriteResGroup13], (instregex "VANDNPSYrm")>;
+def: InstRW<[HWWriteResGroup13], (instregex "VANDNPSrm")>;
+def: InstRW<[HWWriteResGroup13], (instregex "VANDPDYrm")>;
+def: InstRW<[HWWriteResGroup13], (instregex "VANDPDrm")>;
+def: InstRW<[HWWriteResGroup13], (instregex "VANDPSYrm")>;
+def: InstRW<[HWWriteResGroup13], (instregex "VANDPSrm")>;
+def: InstRW<[HWWriteResGroup13], (instregex "VINSERTPSrm")>;
+def: InstRW<[HWWriteResGroup13], (instregex "VMOVHPDrm")>;
+def: InstRW<[HWWriteResGroup13], (instregex "VMOVHPSrm")>;
+def: InstRW<[HWWriteResGroup13], (instregex "VMOVLPDrm")>;
+def: InstRW<[HWWriteResGroup13], (instregex "VMOVLPSrm")>;
+def: InstRW<[HWWriteResGroup13], (instregex "VORPDYrm")>;
+def: InstRW<[HWWriteResGroup13], (instregex "VORPDrm")>;
+def: InstRW<[HWWriteResGroup13], (instregex "VORPSYrm")>;
+def: InstRW<[HWWriteResGroup13], (instregex "VORPSrm")>;
+def: InstRW<[HWWriteResGroup13], (instregex "VPACKSSDWYrm")>;
+def: InstRW<[HWWriteResGroup13], (instregex "VPACKSSDWrm")>;
+def: InstRW<[HWWriteResGroup13], (instregex "VPACKSSWBYrm")>;
+def: InstRW<[HWWriteResGroup13], (instregex "VPACKSSWBrm")>;
+def: InstRW<[HWWriteResGroup13], (instregex "VPACKUSDWYrm")>;
+def: InstRW<[HWWriteResGroup13], (instregex "VPACKUSDWrm")>;
+def: InstRW<[HWWriteResGroup13], (instregex "VPACKUSWBYrm")>;
+def: InstRW<[HWWriteResGroup13], (instregex "VPACKUSWBrm")>;
+def: InstRW<[HWWriteResGroup13], (instregex "VPALIGNRYrmi")>;
+def: InstRW<[HWWriteResGroup13], (instregex "VPALIGNRrmi")>;
+def: InstRW<[HWWriteResGroup13], (instregex "VPBLENDWYrmi")>;
+def: InstRW<[HWWriteResGroup13], (instregex "VPBLENDWrmi")>;
+def: InstRW<[HWWriteResGroup13], (instregex "VPERMILPDYmi")>;
+def: InstRW<[HWWriteResGroup13], (instregex "VPERMILPDYrm")>;
+def: InstRW<[HWWriteResGroup13], (instregex "VPERMILPDmi")>;
+def: InstRW<[HWWriteResGroup13], (instregex "VPERMILPDrm")>;
+def: InstRW<[HWWriteResGroup13], (instregex "VPERMILPSYmi")>;
+def: InstRW<[HWWriteResGroup13], (instregex "VPERMILPSYrm")>;
+def: InstRW<[HWWriteResGroup13], (instregex "VPERMILPSmi")>;
+def: InstRW<[HWWriteResGroup13], (instregex "VPERMILPSrm")>;
+def: InstRW<[HWWriteResGroup13], (instregex "VPINSRBrm")>;
+def: InstRW<[HWWriteResGroup13], (instregex "VPINSRDrm")>;
+def: InstRW<[HWWriteResGroup13], (instregex "VPINSRQrm")>;
+def: InstRW<[HWWriteResGroup13], (instregex "VPINSRWrmi")>;
+def: InstRW<[HWWriteResGroup13], (instregex "VPMOVSXBDrm")>;
+def: InstRW<[HWWriteResGroup13], (instregex "VPMOVSXBQrm")>;
+def: InstRW<[HWWriteResGroup13], (instregex "VPMOVSXBWrm")>;
+def: InstRW<[HWWriteResGroup13], (instregex "VPMOVSXDQrm")>;
+def: InstRW<[HWWriteResGroup13], (instregex "VPMOVSXWDrm")>;
+def: InstRW<[HWWriteResGroup13], (instregex "VPMOVSXWQrm")>;
+def: InstRW<[HWWriteResGroup13], (instregex "VPMOVZXBDrm")>;
+def: InstRW<[HWWriteResGroup13], (instregex "VPMOVZXBQrm")>;
+def: InstRW<[HWWriteResGroup13], (instregex "VPMOVZXBWrm")>;
+def: InstRW<[HWWriteResGroup13], (instregex "VPMOVZXDQrm")>;
+def: InstRW<[HWWriteResGroup13], (instregex "VPMOVZXWDrm")>;
+def: InstRW<[HWWriteResGroup13], (instregex "VPMOVZXWQrm")>;
+def: InstRW<[HWWriteResGroup13], (instregex "VPSHUFBYrm")>;
+def: InstRW<[HWWriteResGroup13], (instregex "VPSHUFBrm")>;
+def: InstRW<[HWWriteResGroup13], (instregex "VPSHUFDYmi")>;
+def: InstRW<[HWWriteResGroup13], (instregex "VPSHUFDmi")>;
+def: InstRW<[HWWriteResGroup13], (instregex "VPSHUFHWYmi")>;
+def: InstRW<[HWWriteResGroup13], (instregex "VPSHUFHWmi")>;
+def: InstRW<[HWWriteResGroup13], (instregex "VPSHUFLWYmi")>;
+def: InstRW<[HWWriteResGroup13], (instregex "VPSHUFLWmi")>;
+def: InstRW<[HWWriteResGroup13], (instregex "VPUNPCKHBWYrm")>;
+def: InstRW<[HWWriteResGroup13], (instregex "VPUNPCKHBWrm")>;
+def: InstRW<[HWWriteResGroup13], (instregex "VPUNPCKHDQYrm")>;
+def: InstRW<[HWWriteResGroup13], (instregex "VPUNPCKHDQrm")>;
+def: InstRW<[HWWriteResGroup13], (instregex "VPUNPCKHQDQYrm")>;
+def: InstRW<[HWWriteResGroup13], (instregex "VPUNPCKHQDQrm")>;
+def: InstRW<[HWWriteResGroup13], (instregex "VPUNPCKHWDYrm")>;
+def: InstRW<[HWWriteResGroup13], (instregex "VPUNPCKHWDrm")>;
+def: InstRW<[HWWriteResGroup13], (instregex "VPUNPCKLBWYrm")>;
+def: InstRW<[HWWriteResGroup13], (instregex "VPUNPCKLBWrm")>;
+def: InstRW<[HWWriteResGroup13], (instregex "VPUNPCKLDQYrm")>;
+def: InstRW<[HWWriteResGroup13], (instregex "VPUNPCKLDQrm")>;
+def: InstRW<[HWWriteResGroup13], (instregex "VPUNPCKLQDQYrm")>;
+def: InstRW<[HWWriteResGroup13], (instregex "VPUNPCKLQDQrm")>;
+def: InstRW<[HWWriteResGroup13], (instregex "VPUNPCKLWDYrm")>;
+def: InstRW<[HWWriteResGroup13], (instregex "VPUNPCKLWDrm")>;
+def: InstRW<[HWWriteResGroup13], (instregex "VSHUFPDYrmi")>;
+def: InstRW<[HWWriteResGroup13], (instregex "VSHUFPDrmi")>;
+def: InstRW<[HWWriteResGroup13], (instregex "VSHUFPSYrmi")>;
+def: InstRW<[HWWriteResGroup13], (instregex "VSHUFPSrmi")>;
+def: InstRW<[HWWriteResGroup13], (instregex "VUNPCKHPDYrm")>;
+def: InstRW<[HWWriteResGroup13], (instregex "VUNPCKHPDrm")>;
+def: InstRW<[HWWriteResGroup13], (instregex "VUNPCKHPSYrm")>;
+def: InstRW<[HWWriteResGroup13], (instregex "VUNPCKHPSrm")>;
+def: InstRW<[HWWriteResGroup13], (instregex "VUNPCKLPDYrm")>;
+def: InstRW<[HWWriteResGroup13], (instregex "VUNPCKLPDrm")>;
+def: InstRW<[HWWriteResGroup13], (instregex "VUNPCKLPSYrm")>;
+def: InstRW<[HWWriteResGroup13], (instregex "VUNPCKLPSrm")>;
+def: InstRW<[HWWriteResGroup13], (instregex "VXORPDYrm")>;
+def: InstRW<[HWWriteResGroup13], (instregex "VXORPDrm")>;
+def: InstRW<[HWWriteResGroup13], (instregex "VXORPSYrm")>;
+def: InstRW<[HWWriteResGroup13], (instregex "VXORPSrm")>;
+def: InstRW<[HWWriteResGroup13], (instregex "XORPDrm")>;
+def: InstRW<[HWWriteResGroup13], (instregex "XORPSrm")>;
+
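+// HWWriteResGroup14: jumps through memory; port 6 plus a load port.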
+def HWWriteResGroup14 : SchedWriteRes<[HWPort6,HWPort23]> {
+ let Latency = 1;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[HWWriteResGroup14], (instregex "FARJMP64")>;
+def: InstRW<[HWWriteResGroup14], (instregex "JMP(16|32|64)m")>;
+
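+// HWWriteResGroup15: bit-test and BMI2 shift/rotate ops with a memory source;
+// a load port plus port 0 or 6.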
+def HWWriteResGroup15 : SchedWriteRes<[HWPort23,HWPort06]> {
+ let Latency = 1;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[HWWriteResGroup15], (instregex "BT(16|32|64)mi8")>;
+def: InstRW<[HWWriteResGroup15], (instregex "RORX32mi")>;
+def: InstRW<[HWWriteResGroup15], (instregex "RORX64mi")>;
+def: InstRW<[HWWriteResGroup15], (instregex "SARX32rm")>;
+def: InstRW<[HWWriteResGroup15], (instregex "SARX64rm")>;
+def: InstRW<[HWWriteResGroup15], (instregex "SHLX32rm")>;
+def: InstRW<[HWWriteResGroup15], (instregex "SHLX64rm")>;
+def: InstRW<[HWWriteResGroup15], (instregex "SHRX32rm")>;
+def: InstRW<[HWWriteResGroup15], (instregex "SHRX64rm")>;
+
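+// HWWriteResGroup16: BMI and vector integer ALU ops with a memory source;
+// a load port plus port 1 or 5.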
+def HWWriteResGroup16 : SchedWriteRes<[HWPort23,HWPort15]> {
+ let Latency = 1;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[HWWriteResGroup16], (instregex "ANDN32rm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "ANDN64rm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "BLSI32rm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "BLSI64rm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "BLSMSK32rm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "BLSMSK64rm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "BLSR32rm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "BLSR64rm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "BZHI32rm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "BZHI64rm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "MMX_PABSBrm64")>;
+def: InstRW<[HWWriteResGroup16], (instregex "MMX_PABSDrm64")>;
+def: InstRW<[HWWriteResGroup16], (instregex "MMX_PABSWrm64")>;
+def: InstRW<[HWWriteResGroup16], (instregex "MMX_PADDBirm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "MMX_PADDDirm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "MMX_PADDQirm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "MMX_PADDSBirm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "MMX_PADDSWirm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "MMX_PADDUSBirm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "MMX_PADDUSWirm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "MMX_PADDWirm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "MMX_PAVGBirm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "MMX_PAVGWirm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "MMX_PCMPEQBirm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "MMX_PCMPEQDirm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "MMX_PCMPEQWirm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "MMX_PCMPGTBirm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "MMX_PCMPGTDirm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "MMX_PCMPGTWirm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "MMX_PMAXSWirm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "MMX_PMAXUBirm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "MMX_PMINSWirm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "MMX_PMINUBirm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "MMX_PSIGNBrm64")>;
+def: InstRW<[HWWriteResGroup16], (instregex "MMX_PSIGNDrm64")>;
+def: InstRW<[HWWriteResGroup16], (instregex "MMX_PSIGNWrm64")>;
+def: InstRW<[HWWriteResGroup16], (instregex "MMX_PSUBBirm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "MMX_PSUBDirm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "MMX_PSUBQirm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "MMX_PSUBSBirm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "MMX_PSUBSWirm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "MMX_PSUBUSBirm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "MMX_PSUBUSWirm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "MMX_PSUBWirm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "MOVBE(16|32|64)rm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "PABSBrm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "PABSDrm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "PABSWrm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "PADDBrm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "PADDDrm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "PADDQrm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "PADDSBrm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "PADDSWrm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "PADDUSBrm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "PADDUSWrm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "PADDWrm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "PAVGBrm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "PAVGWrm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "PCMPEQBrm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "PCMPEQDrm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "PCMPEQQrm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "PCMPEQWrm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "PCMPGTBrm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "PCMPGTDrm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "PCMPGTWrm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "PMAXSBrm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "PMAXSDrm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "PMAXSWrm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "PMAXUBrm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "PMAXUDrm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "PMAXUWrm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "PMINSBrm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "PMINSDrm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "PMINSWrm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "PMINUBrm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "PMINUDrm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "PMINUWrm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "PSIGNBrm128")>;
+def: InstRW<[HWWriteResGroup16], (instregex "PSIGNDrm128")>;
+def: InstRW<[HWWriteResGroup16], (instregex "PSIGNWrm128")>;
+def: InstRW<[HWWriteResGroup16], (instregex "PSUBBrm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "PSUBDrm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "PSUBQrm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "PSUBSBrm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "PSUBSWrm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "PSUBUSBrm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "PSUBUSWrm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "PSUBWrm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "VPABSBYrm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "VPABSBrm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "VPABSDYrm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "VPABSDrm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "VPABSWYrm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "VPABSWrm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "VPADDBYrm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "VPADDBrm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "VPADDDYrm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "VPADDDrm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "VPADDQYrm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "VPADDQrm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "VPADDSBYrm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "VPADDSBrm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "VPADDSWYrm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "VPADDSWrm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "VPADDUSBYrm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "VPADDUSBrm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "VPADDUSWYrm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "VPADDUSWrm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "VPADDWYrm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "VPADDWrm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "VPAVGBYrm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "VPAVGBrm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "VPAVGWYrm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "VPAVGWrm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "VPCMPEQBYrm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "VPCMPEQBrm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "VPCMPEQDYrm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "VPCMPEQDrm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "VPCMPEQQYrm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "VPCMPEQQrm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "VPCMPEQWYrm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "VPCMPEQWrm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "VPCMPGTBYrm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "VPCMPGTBrm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "VPCMPGTDYrm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "VPCMPGTDrm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "VPCMPGTWYrm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "VPCMPGTWrm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "VPMAXSBYrm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "VPMAXSBrm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "VPMAXSDYrm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "VPMAXSDrm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "VPMAXSWYrm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "VPMAXSWrm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "VPMAXUBYrm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "VPMAXUBrm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "VPMAXUDYrm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "VPMAXUDrm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "VPMAXUWYrm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "VPMAXUWrm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "VPMINSBYrm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "VPMINSBrm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "VPMINSDYrm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "VPMINSDrm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "VPMINSWYrm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "VPMINSWrm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "VPMINUBYrm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "VPMINUBrm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "VPMINUDYrm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "VPMINUDrm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "VPMINUWYrm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "VPMINUWrm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "VPSIGNBYrm256")>;
+def: InstRW<[HWWriteResGroup16], (instregex "VPSIGNBrm128")>;
+def: InstRW<[HWWriteResGroup16], (instregex "VPSIGNDYrm256")>;
+def: InstRW<[HWWriteResGroup16], (instregex "VPSIGNDrm128")>;
+def: InstRW<[HWWriteResGroup16], (instregex "VPSIGNWYrm256")>;
+def: InstRW<[HWWriteResGroup16], (instregex "VPSIGNWrm128")>;
+def: InstRW<[HWWriteResGroup16], (instregex "VPSUBBYrm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "VPSUBBrm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "VPSUBDYrm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "VPSUBDrm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "VPSUBQYrm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "VPSUBQrm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "VPSUBSBYrm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "VPSUBSBrm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "VPSUBSWYrm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "VPSUBSWrm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "VPSUBUSBYrm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "VPSUBUSBrm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "VPSUBUSWYrm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "VPSUBUSWrm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "VPSUBWYrm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "VPSUBWrm")>;
+
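+// HWWriteResGroup17: vector logic/blend/insert ops with a memory source;
+// a load port plus port 0, 1 or 5.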
+def HWWriteResGroup17 : SchedWriteRes<[HWPort23,HWPort015]> {
+ let Latency = 1;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[HWWriteResGroup17], (instregex "BLENDPDrmi")>;
+def: InstRW<[HWWriteResGroup17], (instregex "BLENDPSrmi")>;
+def: InstRW<[HWWriteResGroup17], (instregex "MMX_PANDNirm")>;
+def: InstRW<[HWWriteResGroup17], (instregex "MMX_PANDirm")>;
+def: InstRW<[HWWriteResGroup17], (instregex "MMX_PORirm")>;
+def: InstRW<[HWWriteResGroup17], (instregex "MMX_PXORirm")>;
+def: InstRW<[HWWriteResGroup17], (instregex "PANDNrm")>;
+def: InstRW<[HWWriteResGroup17], (instregex "PANDrm")>;
+def: InstRW<[HWWriteResGroup17], (instregex "PORrm")>;
+def: InstRW<[HWWriteResGroup17], (instregex "PXORrm")>;
+def: InstRW<[HWWriteResGroup17], (instregex "VBLENDPDYrmi")>;
+def: InstRW<[HWWriteResGroup17], (instregex "VBLENDPDrmi")>;
+def: InstRW<[HWWriteResGroup17], (instregex "VBLENDPSYrmi")>;
+def: InstRW<[HWWriteResGroup17], (instregex "VBLENDPSrmi")>;
+def: InstRW<[HWWriteResGroup17], (instregex "VINSERTF128rm")>;
+def: InstRW<[HWWriteResGroup17], (instregex "VINSERTI128rm")>;
+def: InstRW<[HWWriteResGroup17], (instregex "VPANDNYrm")>;
+def: InstRW<[HWWriteResGroup17], (instregex "VPANDNrm")>;
+def: InstRW<[HWWriteResGroup17], (instregex "VPANDYrm")>;
+def: InstRW<[HWWriteResGroup17], (instregex "VPANDrm")>;
+def: InstRW<[HWWriteResGroup17], (instregex "VPBLENDDYrmi")>;
+def: InstRW<[HWWriteResGroup17], (instregex "VPBLENDDrmi")>;
+def: InstRW<[HWWriteResGroup17], (instregex "VPORYrm")>;
+def: InstRW<[HWWriteResGroup17], (instregex "VPORrm")>;
+def: InstRW<[HWWriteResGroup17], (instregex "VPXORYrm")>;
+def: InstRW<[HWWriteResGroup17], (instregex "VPXORrm")>;
+
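+// HWWriteResGroup18: scalar integer ALU ops with a memory source (including
+// POP); a load port plus any of ports 0, 1, 5 or 6.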
+def HWWriteResGroup18 : SchedWriteRes<[HWPort23,HWPort0156]> {
+ let Latency = 1;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[HWWriteResGroup18], (instregex "ADD(16|32|64)rm")>;
+def: InstRW<[HWWriteResGroup18], (instregex "ADD8rm")>;
+def: InstRW<[HWWriteResGroup18], (instregex "AND(16|32|64)rm")>;
+def: InstRW<[HWWriteResGroup18], (instregex "AND8rm")>;
+def: InstRW<[HWWriteResGroup18], (instregex "CMP(16|32|64)mi8")>;
+def: InstRW<[HWWriteResGroup18], (instregex "CMP(16|32|64)mr")>;
+def: InstRW<[HWWriteResGroup18], (instregex "CMP(16|32|64)rm")>;
+def: InstRW<[HWWriteResGroup18], (instregex "CMP8mi")>;
+def: InstRW<[HWWriteResGroup18], (instregex "CMP8mr")>;
+def: InstRW<[HWWriteResGroup18], (instregex "CMP8rm")>;
+def: InstRW<[HWWriteResGroup18], (instregex "OR(16|32|64)rm")>;
+def: InstRW<[HWWriteResGroup18], (instregex "OR8rm")>;
+def: InstRW<[HWWriteResGroup18], (instregex "POP(16|32|64)r(mr?)")>;
+def: InstRW<[HWWriteResGroup18], (instregex "SUB(16|32|64)rm")>;
+def: InstRW<[HWWriteResGroup18], (instregex "SUB8rm")>;
+def: InstRW<[HWWriteResGroup18], (instregex "TEST(16|32|64)rm")>;
+def: InstRW<[HWWriteResGroup18], (instregex "TEST8mi")>;
+def: InstRW<[HWWriteResGroup18], (instregex "TEST8rm")>;
+def: InstRW<[HWWriteResGroup18], (instregex "XOR(16|32|64)rm")>;
+def: InstRW<[HWWriteResGroup18], (instregex "XOR8rm")>;
+
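+// HWWriteResGroup19: SFENCE, modeled here as a store-address uop plus an ALU uop.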
+def HWWriteResGroup19 : SchedWriteRes<[HWPort237,HWPort0156]> {
+ let Latency = 1;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[HWWriteResGroup19], (instregex "SFENCE")>;
+
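+// HWWriteResGroup20: 3-uop extract/state stores: store-data (port 4),
+// shuffle (port 5) and a store-address uop.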
+def HWWriteResGroup20 : SchedWriteRes<[HWPort4,HWPort5,HWPort237]> {
+ let Latency = 1;
let NumMicroOps = 3;
- let ResourceCycles = [1, 2];
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[HWWriteResGroup20], (instregex "EXTRACTPSmr")>;
+def: InstRW<[HWWriteResGroup20], (instregex "PEXTRBmr")>;
+def: InstRW<[HWWriteResGroup20], (instregex "PEXTRDmr")>;
+def: InstRW<[HWWriteResGroup20], (instregex "PEXTRQmr")>;
+def: InstRW<[HWWriteResGroup20], (instregex "PEXTRWmr")>;
+def: InstRW<[HWWriteResGroup20], (instregex "STMXCSR")>;
+def: InstRW<[HWWriteResGroup20], (instregex "VEXTRACTPSmr")>;
+def: InstRW<[HWWriteResGroup20], (instregex "VPEXTRBmr")>;
+def: InstRW<[HWWriteResGroup20], (instregex "VPEXTRDmr")>;
+def: InstRW<[HWWriteResGroup20], (instregex "VPEXTRQmr")>;
+def: InstRW<[HWWriteResGroup20], (instregex "VPEXTRWmr")>;
+def: InstRW<[HWWriteResGroup20], (instregex "VSTMXCSR")>;
+
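+// HWWriteResGroup21: FNSTCW16m; store-data, port 6 and a store-address uop.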
+def HWWriteResGroup21 : SchedWriteRes<[HWPort4,HWPort6,HWPort237]> {
+ let Latency = 1;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
}
-def : InstRW<[WritePHADDSUBr], (instregex "MMX_PHADD(W?)rr64",
- "MMX_PHADDSWrr64",
- "MMX_PHSUB(W|D)rr64",
- "MMX_PHSUBSWrr64",
- "(V?)PH(ADD|SUB)(W|D)(Y?)rr",
- "(V?)PH(ADD|SUB)SWrr(256)?")>;
+def: InstRW<[HWWriteResGroup21], (instregex "FNSTCW16m")>;
-// v <- v,m.
-def WritePHADDSUBm : SchedWriteRes<[HWPort1, HWPort5, HWPort23]> {
- let Latency = 6;
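+// HWWriteResGroup22: SETcc to memory; store-data, store-address and a
+// port 0/6 uop.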
+def HWWriteResGroup22 : SchedWriteRes<[HWPort4,HWPort237,HWPort06]> {
+ let Latency = 1;
let NumMicroOps = 3;
- let ResourceCycles = [1, 2, 1];
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[HWWriteResGroup22], (instregex "SETAEm")>;
+def: InstRW<[HWWriteResGroup22], (instregex "SETBm")>;
+def: InstRW<[HWWriteResGroup22], (instregex "SETEm")>;
+def: InstRW<[HWWriteResGroup22], (instregex "SETGEm")>;
+def: InstRW<[HWWriteResGroup22], (instregex "SETGm")>;
+def: InstRW<[HWWriteResGroup22], (instregex "SETLEm")>;
+def: InstRW<[HWWriteResGroup22], (instregex "SETLm")>;
+def: InstRW<[HWWriteResGroup22], (instregex "SETNEm")>;
+def: InstRW<[HWWriteResGroup22], (instregex "SETNOm")>;
+def: InstRW<[HWWriteResGroup22], (instregex "SETNPm")>;
+def: InstRW<[HWWriteResGroup22], (instregex "SETNSm")>;
+def: InstRW<[HWWriteResGroup22], (instregex "SETOm")>;
+def: InstRW<[HWWriteResGroup22], (instregex "SETPm")>;
+def: InstRW<[HWWriteResGroup22], (instregex "SETSm")>;
+
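+// HWWriteResGroup23: 32/64-bit MOVBE stores; the byte-swap uop issues on
+// port 1 or 5.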
+def HWWriteResGroup23 : SchedWriteRes<[HWPort4,HWPort237,HWPort15]> {
+ let Latency = 1;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
}
-def : InstRW<[WritePHADDSUBm, ReadAfterLd],
- (instregex "MMX_PHADD(W?)rm64",
- "MMX_PHADDSWrm64",
- "MMX_PHSUB(W|D)rm64",
- "MMX_PHSUBSWrm64",
- "(V?)PH(ADD|SUB)(W|D)(Y?)rm",
- "(V?)PH(ADD|SUB)SWrm(128|256)?")>;
+def: InstRW<[HWWriteResGroup23], (instregex "MOVBE(32|64)mr")>;
-// PCMPGTQ.
-// v <- v,v.
-def WritePCMPGTQr : SchedWriteRes<[HWPort0]> {
- let Latency = 5;
- let NumMicroOps = 1;
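+// HWWriteResGroup23_16: 16-bit MOVBE store; unlike the wider forms, the swap
+// uop issues on port 0 or 6.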
+def HWWriteResGroup23_16 : SchedWriteRes<[HWPort4,HWPort237,HWPort06]> {
+ let Latency = 1;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
}
-def : InstRW<[WritePCMPGTQr], (instregex "(V?)PCMPGTQ(Y?)rr")>;
+def: InstRW<[HWWriteResGroup23_16], (instregex "MOVBE16mr")>;
-// v <- v,m.
-def WritePCMPGTQm : SchedWriteRes<[HWPort0, HWPort23]> {
- let Latency = 5;
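+// HWWriteResGroup24: PUSH and STOS; store-data, store-address and an ALU uop.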
+def HWWriteResGroup24 : SchedWriteRes<[HWPort4,HWPort237,HWPort0156]> {
+ let Latency = 1;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[HWWriteResGroup24], (instregex "PUSH(16|32|64)r(mr?)")>;
+def: InstRW<[HWWriteResGroup24], (instregex "PUSH64i8")>;
+def: InstRW<[HWWriteResGroup24], (instregex "STOSB")>;
+def: InstRW<[HWWriteResGroup24], (instregex "STOSL")>;
+def: InstRW<[HWWriteResGroup24], (instregex "STOSQ")>;
+def: InstRW<[HWWriteResGroup24], (instregex "STOSW")>;
+
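+// HWWriteResGroup25: read-modify-write bit-test and shift ops; load,
+// store-data, store-address and a port 0/6 uop.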
+def HWWriteResGroup25 : SchedWriteRes<[HWPort4,HWPort23,HWPort237,HWPort06]> {
+ let Latency = 1;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1,1,1,1];
+}
+def: InstRW<[HWWriteResGroup25], (instregex "BTC(16|32|64)mi8")>;
+def: InstRW<[HWWriteResGroup25], (instregex "BTR(16|32|64)mi8")>;
+def: InstRW<[HWWriteResGroup25], (instregex "BTS(16|32|64)mi8")>;
+def: InstRW<[HWWriteResGroup25], (instregex "SAR(16|32|64)m1")>;
+def: InstRW<[HWWriteResGroup25], (instregex "SAR(16|32|64)mi")>;
+def: InstRW<[HWWriteResGroup25], (instregex "SAR8m1")>;
+def: InstRW<[HWWriteResGroup25], (instregex "SAR8mi")>;
+def: InstRW<[HWWriteResGroup25], (instregex "SHL(16|32|64)m1")>;
+def: InstRW<[HWWriteResGroup25], (instregex "SHL(16|32|64)mi")>;
+def: InstRW<[HWWriteResGroup25], (instregex "SHL8m1")>;
+def: InstRW<[HWWriteResGroup25], (instregex "SHL8mi")>;
+def: InstRW<[HWWriteResGroup25], (instregex "SHR(16|32|64)m1")>;
+def: InstRW<[HWWriteResGroup25], (instregex "SHR(16|32|64)mi")>;
+def: InstRW<[HWWriteResGroup25], (instregex "SHR8m1")>;
+def: InstRW<[HWWriteResGroup25], (instregex "SHR8mi")>;
+
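+// HWWriteResGroup26: read-modify-write integer ALU ops; load, store-data,
+// store-address and an ALU uop.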
+def HWWriteResGroup26 : SchedWriteRes<[HWPort4,HWPort23,HWPort237,HWPort0156]> {
+ let Latency = 1;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1,1,1,1];
+}
+def: InstRW<[HWWriteResGroup26], (instregex "ADD(16|32|64)mi8")>;
+def: InstRW<[HWWriteResGroup26], (instregex "ADD(16|32|64)mr")>;
+def: InstRW<[HWWriteResGroup26], (instregex "ADD8mi")>;
+def: InstRW<[HWWriteResGroup26], (instregex "ADD8mr")>;
+def: InstRW<[HWWriteResGroup26], (instregex "AND(16|32|64)mi8")>;
+def: InstRW<[HWWriteResGroup26], (instregex "AND(16|32|64)mr")>;
+def: InstRW<[HWWriteResGroup26], (instregex "AND8mi")>;
+def: InstRW<[HWWriteResGroup26], (instregex "AND8mr")>;
+def: InstRW<[HWWriteResGroup26], (instregex "DEC(16|32|64)m")>;
+def: InstRW<[HWWriteResGroup26], (instregex "DEC8m")>;
+def: InstRW<[HWWriteResGroup26], (instregex "INC(16|32|64)m")>;
+def: InstRW<[HWWriteResGroup26], (instregex "INC8m")>;
+def: InstRW<[HWWriteResGroup26], (instregex "NEG(16|32|64)m")>;
+def: InstRW<[HWWriteResGroup26], (instregex "NEG8m")>;
+def: InstRW<[HWWriteResGroup26], (instregex "NOT(16|32|64)m")>;
+def: InstRW<[HWWriteResGroup26], (instregex "NOT8m")>;
+def: InstRW<[HWWriteResGroup26], (instregex "OR(16|32|64)mi8")>;
+def: InstRW<[HWWriteResGroup26], (instregex "OR(16|32|64)mr")>;
+def: InstRW<[HWWriteResGroup26], (instregex "OR8mi")>;
+def: InstRW<[HWWriteResGroup26], (instregex "OR8mr")>;
+def: InstRW<[HWWriteResGroup26], (instregex "SUB(16|32|64)mi8")>;
+def: InstRW<[HWWriteResGroup26], (instregex "SUB(16|32|64)mr")>;
+def: InstRW<[HWWriteResGroup26], (instregex "SUB8mi")>;
+def: InstRW<[HWWriteResGroup26], (instregex "SUB8mr")>;
+def: InstRW<[HWWriteResGroup26], (instregex "XOR(16|32|64)mi8")>;
+def: InstRW<[HWWriteResGroup26], (instregex "XOR(16|32|64)mr")>;
+def: InstRW<[HWWriteResGroup26], (instregex "XOR8mi")>;
+def: InstRW<[HWWriteResGroup26], (instregex "XOR8mr")>;
+
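+// HWWriteResGroup27: 2-uop, 2-cycle port-5 ops (variable blends, PINSR*).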
+def HWWriteResGroup27 : SchedWriteRes<[HWPort5]> {
+ let Latency = 2;
let NumMicroOps = 2;
- let ResourceCycles = [1, 1];
+ let ResourceCycles = [2];
+}
+def: InstRW<[HWWriteResGroup27], (instregex "BLENDVPDrr0")>;
+def: InstRW<[HWWriteResGroup27], (instregex "BLENDVPSrr0")>;
+def: InstRW<[HWWriteResGroup27], (instregex "MMX_PINSRWirri")>;
+def: InstRW<[HWWriteResGroup27], (instregex "PBLENDVBrr0")>;
+def: InstRW<[HWWriteResGroup27], (instregex "PINSRBrr")>;
+def: InstRW<[HWWriteResGroup27], (instregex "PINSRDrr")>;
+def: InstRW<[HWWriteResGroup27], (instregex "PINSRQrr")>;
+def: InstRW<[HWWriteResGroup27], (instregex "PINSRWrri")>;
+def: InstRW<[HWWriteResGroup27], (instregex "VBLENDVPDYrr")>;
+def: InstRW<[HWWriteResGroup27], (instregex "VBLENDVPDrr")>;
+def: InstRW<[HWWriteResGroup27], (instregex "VBLENDVPSYrr")>;
+def: InstRW<[HWWriteResGroup27], (instregex "VBLENDVPSrr")>;
+def: InstRW<[HWWriteResGroup27], (instregex "VPBLENDVBYrr")>;
+def: InstRW<[HWWriteResGroup27], (instregex "VPBLENDVBrr")>;
+def: InstRW<[HWWriteResGroup27], (instregex "VPINSRBrr")>;
+def: InstRW<[HWWriteResGroup27], (instregex "VPINSRDrr")>;
+def: InstRW<[HWWriteResGroup27], (instregex "VPINSRQrr")>;
+def: InstRW<[HWWriteResGroup27], (instregex "VPINSRWrri")>;
+
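+// HWWriteResGroup28: FDECSTP; two uops on port 0 or 1.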
+def HWWriteResGroup28 : SchedWriteRes<[HWPort01]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+ let ResourceCycles = [2];
}
-def : InstRW<[WritePCMPGTQm, ReadAfterLd], (instregex "(V?)PCMPGTQ(Y?)rm")>;
+def: InstRW<[HWWriteResGroup28], (instregex "FDECSTP")>;
-// PMULLD.
-// x,x / y,y,y.
-def WritePMULLDr : SchedWriteRes<[HWPort0]> {
- let Latency = 10;
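+// HWWriteResGroup29: rotate by 1 or by immediate; two uops on port 0 or 6.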
+def HWWriteResGroup29 : SchedWriteRes<[HWPort06]> {
+ let Latency = 2;
let NumMicroOps = 2;
let ResourceCycles = [2];
}
-def : InstRW<[WritePMULLDr], (instregex "(V?)PMULLD(Y?)rr")>;
+def: InstRW<[HWWriteResGroup29], (instregex "ROL(16|32|64)r1")>;
+def: InstRW<[HWWriteResGroup29], (instregex "ROL(16|32|64)ri")>;
+def: InstRW<[HWWriteResGroup29], (instregex "ROL8r1")>;
+def: InstRW<[HWWriteResGroup29], (instregex "ROL8ri")>;
+def: InstRW<[HWWriteResGroup29], (instregex "ROR(16|32|64)r1")>;
+def: InstRW<[HWWriteResGroup29], (instregex "ROR(16|32|64)ri")>;
+def: InstRW<[HWWriteResGroup29], (instregex "ROR8r1")>;
+def: InstRW<[HWWriteResGroup29], (instregex "ROR8ri")>;
-// x,m / y,y,m.
-def WritePMULLDm : SchedWriteRes<[HWPort0, HWPort23]> {
- let Latency = 10;
- let NumMicroOps = 3;
- let ResourceCycles = [2, 1];
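+// HWWriteResGroup30: fences, WAIT and XGETBV; two uops on any ALU port.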
+def HWWriteResGroup30 : SchedWriteRes<[HWPort0156]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+ let ResourceCycles = [2];
+}
+def: InstRW<[HWWriteResGroup30], (instregex "LFENCE")>;
+def: InstRW<[HWWriteResGroup30], (instregex "MFENCE")>;
+def: InstRW<[HWWriteResGroup30], (instregex "WAIT")>;
+def: InstRW<[HWWriteResGroup30], (instregex "XGETBV")>;
+
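+// HWWriteResGroup31: 2-uop ops split across ports 0 and 5 (conversions,
+// extracts, vector shifts by an XMM count, PTEST).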
+def HWWriteResGroup31 : SchedWriteRes<[HWPort0,HWPort5]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[HWWriteResGroup31], (instregex "CVTPS2PDrr")>;
+def: InstRW<[HWWriteResGroup31], (instregex "CVTSS2SDrr")>;
+def: InstRW<[HWWriteResGroup31], (instregex "EXTRACTPSrr")>;
+def: InstRW<[HWWriteResGroup31], (instregex "MMX_PEXTRWirri")>;
+def: InstRW<[HWWriteResGroup31], (instregex "PEXTRBrr")>;
+def: InstRW<[HWWriteResGroup31], (instregex "PEXTRDrr")>;
+def: InstRW<[HWWriteResGroup31], (instregex "PEXTRQrr")>;
+def: InstRW<[HWWriteResGroup31], (instregex "PEXTRWri")>;
+def: InstRW<[HWWriteResGroup31], (instregex "PEXTRWrr_REV")>;
+def: InstRW<[HWWriteResGroup31], (instregex "PSLLDrr")>;
+def: InstRW<[HWWriteResGroup31], (instregex "PSLLQrr")>;
+def: InstRW<[HWWriteResGroup31], (instregex "PSLLWrr")>;
+def: InstRW<[HWWriteResGroup31], (instregex "PSRADrr")>;
+def: InstRW<[HWWriteResGroup31], (instregex "PSRAWrr")>;
+def: InstRW<[HWWriteResGroup31], (instregex "PSRLDrr")>;
+def: InstRW<[HWWriteResGroup31], (instregex "PSRLQrr")>;
+def: InstRW<[HWWriteResGroup31], (instregex "PSRLWrr")>;
+def: InstRW<[HWWriteResGroup31], (instregex "PTESTrr")>;
+def: InstRW<[HWWriteResGroup31], (instregex "VCVTPH2PSYrr")>;
+def: InstRW<[HWWriteResGroup31], (instregex "VCVTPH2PSrr")>;
+def: InstRW<[HWWriteResGroup31], (instregex "VCVTPS2PDrr")>;
+def: InstRW<[HWWriteResGroup31], (instregex "VCVTSS2SDrr")>;
+def: InstRW<[HWWriteResGroup31], (instregex "VEXTRACTPSrr")>;
+def: InstRW<[HWWriteResGroup31], (instregex "VPEXTRBrr")>;
+def: InstRW<[HWWriteResGroup31], (instregex "VPEXTRDrr")>;
+def: InstRW<[HWWriteResGroup31], (instregex "VPEXTRQrr")>;
+def: InstRW<[HWWriteResGroup31], (instregex "VPEXTRWri")>;
+def: InstRW<[HWWriteResGroup31], (instregex "VPEXTRWrr_REV")>;
+def: InstRW<[HWWriteResGroup31], (instregex "VPSLLDrr")>;
+def: InstRW<[HWWriteResGroup31], (instregex "VPSLLQrr")>;
+def: InstRW<[HWWriteResGroup31], (instregex "VPSLLWrr")>;
+def: InstRW<[HWWriteResGroup31], (instregex "VPSRADrr")>;
+def: InstRW<[HWWriteResGroup31], (instregex "VPSRAWrr")>;
+def: InstRW<[HWWriteResGroup31], (instregex "VPSRLDrr")>;
+def: InstRW<[HWWriteResGroup31], (instregex "VPSRLQrr")>;
+def: InstRW<[HWWriteResGroup31], (instregex "VPSRLWrr")>;
+def: InstRW<[HWWriteResGroup31], (instregex "VPTESTrr")>;
+
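+// HWWriteResGroup32: CLFLUSH; port 6 plus an ALU uop.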
+def HWWriteResGroup32 : SchedWriteRes<[HWPort6,HWPort0156]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
}
-def : InstRW<[WritePMULLDm, ReadAfterLd], (instregex "(V?)PMULLD(Y?)rm")>;
+def: InstRW<[HWWriteResGroup32], (instregex "CLFLUSH")>;
-//-- Logic instructions --//
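+// HWWriteResGroup33: MMX_MOVDQ2Qrr; one uop on port 0/1 plus one on
+// port 0/1/5.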
+def HWWriteResGroup33 : SchedWriteRes<[HWPort01,HWPort015]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[HWWriteResGroup33], (instregex "MMX_MOVDQ2Qrr")>;
-// PTEST.
-// v,v.
-def WritePTESTr : SchedWriteRes<[HWPort0, HWPort5]> {
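+// HWWriteResGroup34: BEXTR and BSWAP; one port 0/6 uop plus one port 1/5 uop.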
+def HWWriteResGroup34 : SchedWriteRes<[HWPort06,HWPort15]> {
let Latency = 2;
let NumMicroOps = 2;
- let ResourceCycles = [1, 1];
+ let ResourceCycles = [1,1];
}
-def : InstRW<[WritePTESTr], (instregex "(V?)PTEST(Y?)rr")>;
+def: InstRW<[HWWriteResGroup34], (instregex "BEXTR32rr")>;
+def: InstRW<[HWWriteResGroup34], (instregex "BEXTR64rr")>;
+def: InstRW<[HWWriteResGroup34], (instregex "BSWAP(16|32|64)r")>;
-// v,m.
-def WritePTESTm : SchedWriteRes<[HWPort0, HWPort5, HWPort23]> {
- let Latency = 6;
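+// ADC/SBB and the register CMOVs are two flag-dependent uops on Haswell,
+// split between a p06 and a p0156 slot.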
+def HWWriteResGroup35 : SchedWriteRes<[HWPort06,HWPort0156]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[HWWriteResGroup35], (instregex "ADC(16|32|64)ri8")>;
+def: InstRW<[HWWriteResGroup35], (instregex "ADC(16|32|64)rr(_REV?)")>;
+def: InstRW<[HWWriteResGroup35], (instregex "ADC8i8")>;
+def: InstRW<[HWWriteResGroup35], (instregex "ADC8ri")>;
+def: InstRW<[HWWriteResGroup35], (instregex "ADC8rr(_REV?)")>;
+def: InstRW<[HWWriteResGroup35], (instregex "CMOVAE(16|32|64)rr")>;
+def: InstRW<[HWWriteResGroup35], (instregex "CMOVB(16|32|64)rr")>;
+def: InstRW<[HWWriteResGroup35], (instregex "CMOVE(16|32|64)rr")>;
+def: InstRW<[HWWriteResGroup35], (instregex "CMOVG(16|32|64)rr")>;
+def: InstRW<[HWWriteResGroup35], (instregex "CMOVGE(16|32|64)rr")>;
+def: InstRW<[HWWriteResGroup35], (instregex "CMOVL(16|32|64)rr")>;
+def: InstRW<[HWWriteResGroup35], (instregex "CMOVLE(16|32|64)rr")>;
+def: InstRW<[HWWriteResGroup35], (instregex "CMOVNE(16|32|64)rr")>;
+def: InstRW<[HWWriteResGroup35], (instregex "CMOVNO(16|32|64)rr")>;
+def: InstRW<[HWWriteResGroup35], (instregex "CMOVNP(16|32|64)rr")>;
+def: InstRW<[HWWriteResGroup35], (instregex "CMOVNS(16|32|64)rr")>;
+def: InstRW<[HWWriteResGroup35], (instregex "CMOVO(16|32|64)rr")>;
+def: InstRW<[HWWriteResGroup35], (instregex "CMOVP(16|32|64)rr")>;
+def: InstRW<[HWWriteResGroup35], (instregex "CMOVS(16|32|64)rr")>;
+def: InstRW<[HWWriteResGroup35], (instregex "CWD")>;
+def: InstRW<[HWWriteResGroup35], (instregex "JRCXZ")>;
+def: InstRW<[HWWriteResGroup35], (instregex "SBB(16|32|64)ri8")>;
+def: InstRW<[HWWriteResGroup35], (instregex "SBB(16|32|64)rr(_REV?)")>;
+def: InstRW<[HWWriteResGroup35], (instregex "SBB8i8")>;
+def: InstRW<[HWWriteResGroup35], (instregex "SBB8ri")>;
+def: InstRW<[HWWriteResGroup35], (instregex "SBB8rr(_REV?)")>;
+def: InstRW<[HWWriteResGroup35], (instregex "SETAr")>;
+def: InstRW<[HWWriteResGroup35], (instregex "SETBEr")>;
+
+def HWWriteResGroup36 : SchedWriteRes<[HWPort5,HWPort23]> {
+ let Latency = 2;
let NumMicroOps = 3;
- let ResourceCycles = [1, 1, 1];
+ let ResourceCycles = [2,1];
+}
+def: InstRW<[HWWriteResGroup36], (instregex "BLENDVPDrm0")>;
+def: InstRW<[HWWriteResGroup36], (instregex "BLENDVPSrm0")>;
+def: InstRW<[HWWriteResGroup36], (instregex "MMX_PACKSSDWirm")>;
+def: InstRW<[HWWriteResGroup36], (instregex "MMX_PACKSSWBirm")>;
+def: InstRW<[HWWriteResGroup36], (instregex "MMX_PACKUSWBirm")>;
+def: InstRW<[HWWriteResGroup36], (instregex "PBLENDVBrm0")>;
+def: InstRW<[HWWriteResGroup36], (instregex "VBLENDVPDYrm")>;
+def: InstRW<[HWWriteResGroup36], (instregex "VBLENDVPDrm")>;
+def: InstRW<[HWWriteResGroup36], (instregex "VBLENDVPSYrm")>;
+def: InstRW<[HWWriteResGroup36], (instregex "VBLENDVPSrm")>;
+def: InstRW<[HWWriteResGroup36], (instregex "VMASKMOVPDYrm")>;
+def: InstRW<[HWWriteResGroup36], (instregex "VMASKMOVPDrm")>;
+def: InstRW<[HWWriteResGroup36], (instregex "VMASKMOVPSYrm")>;
+def: InstRW<[HWWriteResGroup36], (instregex "VMASKMOVPSrm")>;
+def: InstRW<[HWWriteResGroup36], (instregex "VPBLENDVBYrm")>;
+def: InstRW<[HWWriteResGroup36], (instregex "VPBLENDVBrm")>;
+def: InstRW<[HWWriteResGroup36], (instregex "VPMASKMOVDYrm")>;
+def: InstRW<[HWWriteResGroup36], (instregex "VPMASKMOVDrm")>;
+def: InstRW<[HWWriteResGroup36], (instregex "VPMASKMOVQYrm")>;
+def: InstRW<[HWWriteResGroup36], (instregex "VPMASKMOVQrm")>;
+
+def HWWriteResGroup37 : SchedWriteRes<[HWPort23,HWPort0156]> {
+ let Latency = 2;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,2];
}
-def : InstRW<[WritePTESTr], (instregex "(V?)PTEST(Y?)rm")>;
+def: InstRW<[HWWriteResGroup37], (instregex "LEAVE64")>;
+def: InstRW<[HWWriteResGroup37], (instregex "SCASB")>;
+def: InstRW<[HWWriteResGroup37], (instregex "SCASL")>;
+def: InstRW<[HWWriteResGroup37], (instregex "SCASQ")>;
+def: InstRW<[HWWriteResGroup37], (instregex "SCASW")>;
-// PSLL,PSRL,PSRA W/D/Q.
-// x,x / v,v,x.
-def WritePShift : SchedWriteRes<[HWPort0, HWPort5]> {
+def HWWriteResGroup38 : SchedWriteRes<[HWPort0,HWPort5,HWPort23]> {
let Latency = 2;
- let NumMicroOps = 2;
- let ResourceCycles = [1, 1];
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[HWWriteResGroup38], (instregex "PSLLDrm")>;
+def: InstRW<[HWWriteResGroup38], (instregex "PSLLQrm")>;
+def: InstRW<[HWWriteResGroup38], (instregex "PSLLWrm")>;
+def: InstRW<[HWWriteResGroup38], (instregex "PSRADrm")>;
+def: InstRW<[HWWriteResGroup38], (instregex "PSRAWrm")>;
+def: InstRW<[HWWriteResGroup38], (instregex "PSRLDrm")>;
+def: InstRW<[HWWriteResGroup38], (instregex "PSRLQrm")>;
+def: InstRW<[HWWriteResGroup38], (instregex "PSRLWrm")>;
+def: InstRW<[HWWriteResGroup38], (instregex "PTESTrm")>;
+def: InstRW<[HWWriteResGroup38], (instregex "VPSLLDrm")>;
+def: InstRW<[HWWriteResGroup38], (instregex "VPSLLQrm")>;
+def: InstRW<[HWWriteResGroup38], (instregex "VPSLLWrm")>;
+def: InstRW<[HWWriteResGroup38], (instregex "VPSRADrm")>;
+def: InstRW<[HWWriteResGroup38], (instregex "VPSRAWrm")>;
+def: InstRW<[HWWriteResGroup38], (instregex "VPSRLDrm")>;
+def: InstRW<[HWWriteResGroup38], (instregex "VPSRLQrm")>;
+def: InstRW<[HWWriteResGroup38], (instregex "VPSRLWrm")>;
+def: InstRW<[HWWriteResGroup38], (instregex "VPTESTrm")>;
+
+def HWWriteResGroup39 : SchedWriteRes<[HWPort0,HWPort01,HWPort23]> {
+ let Latency = 2;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
}
-def : InstRW<[WritePShift], (instregex "(V?)PS(LL|RL|RA)(W|D|Q)(Y?)rr")>;
+def: InstRW<[HWWriteResGroup39], (instregex "FLDCW16m")>;
-// PSLL,PSRL DQ.
-def : InstRW<[WriteP5], (instregex "(V?)PS(R|L)LDQ(Y?)ri")>;
+def HWWriteResGroup40 : SchedWriteRes<[HWPort0,HWPort23,HWPort0156]> {
+ let Latency = 2;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[HWWriteResGroup40], (instregex "LDMXCSR")>;
+def: InstRW<[HWWriteResGroup40], (instregex "VLDMXCSR")>;
-//-- Other --//
+def HWWriteResGroup41 : SchedWriteRes<[HWPort6,HWPort23,HWPort0156]> {
+ let Latency = 2;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[HWWriteResGroup41], (instregex "LRETQ")>;
+def: InstRW<[HWWriteResGroup41], (instregex "RETQ")>;
-// EMMS.
-def WriteEMMS : SchedWriteRes<[]> {
- let Latency = 13;
- let NumMicroOps = 31;
+def HWWriteResGroup42 : SchedWriteRes<[HWPort23,HWPort06,HWPort15]> {
+ let Latency = 2;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
}
-def : InstRW<[WriteEMMS], (instregex "MMX_EMMS")>;
+def: InstRW<[HWWriteResGroup42], (instregex "BEXTR32rm")>;
+def: InstRW<[HWWriteResGroup42], (instregex "BEXTR64rm")>;
-//=== Floating Point XMM and YMM Instructions ===//
-//-- Move instructions --//
+def HWWriteResGroup43 : SchedWriteRes<[HWPort23,HWPort06,HWPort0156]> {
+ let Latency = 2;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[HWWriteResGroup43], (instregex "ADC(16|32|64)rm")>;
+def: InstRW<[HWWriteResGroup43], (instregex "ADC8rm")>;
+def: InstRW<[HWWriteResGroup43], (instregex "CMOVAE(16|32|64)rm")>;
+def: InstRW<[HWWriteResGroup43], (instregex "CMOVB(16|32|64)rm")>;
+def: InstRW<[HWWriteResGroup43], (instregex "CMOVE(16|32|64)rm")>;
+def: InstRW<[HWWriteResGroup43], (instregex "CMOVG(16|32|64)rm")>;
+def: InstRW<[HWWriteResGroup43], (instregex "CMOVGE(16|32|64)rm")>;
+def: InstRW<[HWWriteResGroup43], (instregex "CMOVL(16|32|64)rm")>;
+def: InstRW<[HWWriteResGroup43], (instregex "CMOVLE(16|32|64)rm")>;
+def: InstRW<[HWWriteResGroup43], (instregex "CMOVNE(16|32|64)rm")>;
+def: InstRW<[HWWriteResGroup43], (instregex "CMOVNO(16|32|64)rm")>;
+def: InstRW<[HWWriteResGroup43], (instregex "CMOVNP(16|32|64)rm")>;
+def: InstRW<[HWWriteResGroup43], (instregex "CMOVNS(16|32|64)rm")>;
+def: InstRW<[HWWriteResGroup43], (instregex "CMOVO(16|32|64)rm")>;
+def: InstRW<[HWWriteResGroup43], (instregex "CMOVP(16|32|64)rm")>;
+def: InstRW<[HWWriteResGroup43], (instregex "CMOVS(16|32|64)rm")>;
+def: InstRW<[HWWriteResGroup43], (instregex "SBB(16|32|64)rm")>;
+def: InstRW<[HWWriteResGroup43], (instregex "SBB8rm")>;
+
+def HWWriteResGroup44 : SchedWriteRes<[HWPort4,HWPort6,HWPort237,HWPort0156]> {
+ let Latency = 2;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1,1,1,1];
+}
+def: InstRW<[HWWriteResGroup44], (instregex "CALL(16|32|64)r")>;
-// MOVMSKP S/D.
-// r32 <- x.
-def WriteMOVMSKPr : SchedWriteRes<[HWPort0]> {
- let Latency = 3;
+def HWWriteResGroup45 : SchedWriteRes<[HWPort4,HWPort237,HWPort06,HWPort0156]> {
+ let Latency = 2;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1,1,1,1];
+}
+def: InstRW<[HWWriteResGroup45], (instregex "CALL64pcrel32")>;
+def: InstRW<[HWWriteResGroup45], (instregex "SETAm")>;
+def: InstRW<[HWWriteResGroup45], (instregex "SETBEm")>;
+
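+// Rotate-by-one/immediate on memory decomposes into load (p23),
+// store-address (p237) and store-data (p4) uops plus two p06 rotate/flag
+// uops.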
+def HWWriteResGroup46 : SchedWriteRes<[HWPort4,HWPort23,HWPort237,HWPort06]> {
+ let Latency = 2;
+ let NumMicroOps = 5;
+ let ResourceCycles = [1,1,1,2];
+}
+def: InstRW<[HWWriteResGroup46], (instregex "ROL(16|32|64)m1")>;
+def: InstRW<[HWWriteResGroup46], (instregex "ROL(16|32|64)mi")>;
+def: InstRW<[HWWriteResGroup46], (instregex "ROL8m1")>;
+def: InstRW<[HWWriteResGroup46], (instregex "ROL8mi")>;
+def: InstRW<[HWWriteResGroup46], (instregex "ROR(16|32|64)m1")>;
+def: InstRW<[HWWriteResGroup46], (instregex "ROR(16|32|64)mi")>;
+def: InstRW<[HWWriteResGroup46], (instregex "ROR8m1")>;
+def: InstRW<[HWWriteResGroup46], (instregex "ROR8mi")>;
+
+def HWWriteResGroup47 : SchedWriteRes<[HWPort4,HWPort23,HWPort237,HWPort0156]> {
+ let Latency = 2;
+ let NumMicroOps = 5;
+ let ResourceCycles = [1,1,1,2];
}
-def : InstRW<[WriteMOVMSKPr], (instregex "(V?)MOVMSKP(S|D)rr")>;
+def: InstRW<[HWWriteResGroup47], (instregex "XADD(16|32|64)rm")>;
+def: InstRW<[HWWriteResGroup47], (instregex "XADD8rm")>;
-// r32 <- y.
-def WriteVMOVMSKPYr : SchedWriteRes<[HWPort0]> {
+def HWWriteResGroup48 : SchedWriteRes<[HWPort4,HWPort6,HWPort23,HWPort237,HWPort0156]> {
let Latency = 2;
+ let NumMicroOps = 5;
+ let ResourceCycles = [1,1,1,1,1];
+}
+def: InstRW<[HWWriteResGroup48], (instregex "CALL(16|32|64)m")>;
+def: InstRW<[HWWriteResGroup48], (instregex "FARCALL64")>;
+
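+// (V)MOVMSK/(V)PMOVMSKB: a single port-0 uop, but the vector-to-GPR
+// transfer costs 3 cycles.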
+def HWWriteResGroup49 : SchedWriteRes<[HWPort0]> {
+ let Latency = 3;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[HWWriteResGroup49], (instregex "MOVMSKPDrr")>;
+def: InstRW<[HWWriteResGroup49], (instregex "MOVMSKPSrr")>;
+def: InstRW<[HWWriteResGroup49], (instregex "PMOVMSKBrr")>;
+def: InstRW<[HWWriteResGroup49], (instregex "VMOVMSKPDYrr")>;
+def: InstRW<[HWWriteResGroup49], (instregex "VMOVMSKPDrr")>;
+def: InstRW<[HWWriteResGroup49], (instregex "VMOVMSKPSYrr")>;
+def: InstRW<[HWWriteResGroup49], (instregex "VMOVMSKPSrr")>;
+def: InstRW<[HWWriteResGroup49], (instregex "VPMOVMSKBYrr")>;
+def: InstRW<[HWWriteResGroup49], (instregex "VPMOVMSKBrr")>;
+
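+// Port 1 is the 3-cycle pipe: FP add/sub/compare, the DQ<->PS converts,
+// BSF/BSR, POPCNT/LZCNT/TZCNT and the other slow scalar-integer ops all
+// issue here as one uop.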
+def HWWriteResGroup50 : SchedWriteRes<[HWPort1]> {
+ let Latency = 3;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[HWWriteResGroup50], (instregex "ADDPDrr")>;
+def: InstRW<[HWWriteResGroup50], (instregex "ADDPSrr")>;
+def: InstRW<[HWWriteResGroup50], (instregex "ADDSDrr")>;
+def: InstRW<[HWWriteResGroup50], (instregex "ADDSSrr")>;
+def: InstRW<[HWWriteResGroup50], (instregex "ADDSUBPDrr")>;
+def: InstRW<[HWWriteResGroup50], (instregex "ADDSUBPSrr")>;
+def: InstRW<[HWWriteResGroup50], (instregex "ADD_FPrST0")>;
+def: InstRW<[HWWriteResGroup50], (instregex "ADD_FST0r")>;
+def: InstRW<[HWWriteResGroup50], (instregex "ADD_FrST0")>;
+def: InstRW<[HWWriteResGroup50], (instregex "BSF(16|32|64)rr")>;
+def: InstRW<[HWWriteResGroup50], (instregex "BSR(16|32|64)rr")>;
+def: InstRW<[HWWriteResGroup50], (instregex "CMPPDrri")>;
+def: InstRW<[HWWriteResGroup50], (instregex "CMPPSrri")>;
+def: InstRW<[HWWriteResGroup50], (instregex "CMPSSrr")>;
+def: InstRW<[HWWriteResGroup50], (instregex "COMISDrr")>;
+def: InstRW<[HWWriteResGroup50], (instregex "COMISSrr")>;
+def: InstRW<[HWWriteResGroup50], (instregex "CVTDQ2PSrr")>;
+def: InstRW<[HWWriteResGroup50], (instregex "CVTPS2DQrr")>;
+def: InstRW<[HWWriteResGroup50], (instregex "CVTTPS2DQrr")>;
+def: InstRW<[HWWriteResGroup50], (instregex "IMUL64rr(i8?)")>;
+def: InstRW<[HWWriteResGroup50], (instregex "IMUL8r")>;
+def: InstRW<[HWWriteResGroup50], (instregex "LZCNT(16|32|64)rr")>;
+def: InstRW<[HWWriteResGroup50], (instregex "MAXPDrr")>;
+def: InstRW<[HWWriteResGroup50], (instregex "MAXPSrr")>;
+def: InstRW<[HWWriteResGroup50], (instregex "MAXSDrr")>;
+def: InstRW<[HWWriteResGroup50], (instregex "MAXSSrr")>;
+def: InstRW<[HWWriteResGroup50], (instregex "MINPDrr")>;
+def: InstRW<[HWWriteResGroup50], (instregex "MINPSrr")>;
+def: InstRW<[HWWriteResGroup50], (instregex "MINSDrr")>;
+def: InstRW<[HWWriteResGroup50], (instregex "MINSSrr")>;
+def: InstRW<[HWWriteResGroup50], (instregex "MMX_CVTPI2PSirr")>;
+def: InstRW<[HWWriteResGroup50], (instregex "MUL8r")>;
+def: InstRW<[HWWriteResGroup50], (instregex "PDEP32rr")>;
+def: InstRW<[HWWriteResGroup50], (instregex "PDEP64rr")>;
+def: InstRW<[HWWriteResGroup50], (instregex "PEXT32rr")>;
+def: InstRW<[HWWriteResGroup50], (instregex "PEXT64rr")>;
+def: InstRW<[HWWriteResGroup50], (instregex "POPCNT(16|32|64)rr")>;
+def: InstRW<[HWWriteResGroup50], (instregex "SHLD(16|32|64)rri8")>;
+def: InstRW<[HWWriteResGroup50], (instregex "SHRD(16|32|64)rri8")>;
+def: InstRW<[HWWriteResGroup50], (instregex "SUBPDrr")>;
+def: InstRW<[HWWriteResGroup50], (instregex "SUBPSrr")>;
+def: InstRW<[HWWriteResGroup50], (instregex "SUBR_FPrST0")>;
+def: InstRW<[HWWriteResGroup50], (instregex "SUBR_FST0r")>;
+def: InstRW<[HWWriteResGroup50], (instregex "SUBR_FrST0")>;
+def: InstRW<[HWWriteResGroup50], (instregex "SUBSDrr")>;
+def: InstRW<[HWWriteResGroup50], (instregex "SUBSSrr")>;
+def: InstRW<[HWWriteResGroup50], (instregex "SUB_FPrST0")>;
+def: InstRW<[HWWriteResGroup50], (instregex "SUB_FST0r")>;
+def: InstRW<[HWWriteResGroup50], (instregex "SUB_FrST0")>;
+def: InstRW<[HWWriteResGroup50], (instregex "TZCNT(16|32|64)rr")>;
+def: InstRW<[HWWriteResGroup50], (instregex "UCOMISDrr")>;
+def: InstRW<[HWWriteResGroup50], (instregex "UCOMISSrr")>;
+def: InstRW<[HWWriteResGroup50], (instregex "VADDPDYrr")>;
+def: InstRW<[HWWriteResGroup50], (instregex "VADDPDrr")>;
+def: InstRW<[HWWriteResGroup50], (instregex "VADDPSYrr")>;
+def: InstRW<[HWWriteResGroup50], (instregex "VADDPSrr")>;
+def: InstRW<[HWWriteResGroup50], (instregex "VADDSDrr")>;
+def: InstRW<[HWWriteResGroup50], (instregex "VADDSSrr")>;
+def: InstRW<[HWWriteResGroup50], (instregex "VADDSUBPDYrr")>;
+def: InstRW<[HWWriteResGroup50], (instregex "VADDSUBPDrr")>;
+def: InstRW<[HWWriteResGroup50], (instregex "VADDSUBPSYrr")>;
+def: InstRW<[HWWriteResGroup50], (instregex "VADDSUBPSrr")>;
+def: InstRW<[HWWriteResGroup50], (instregex "VCMPPDYrri")>;
+def: InstRW<[HWWriteResGroup50], (instregex "VCMPPDrri")>;
+def: InstRW<[HWWriteResGroup50], (instregex "VCMPPSYrri")>;
+def: InstRW<[HWWriteResGroup50], (instregex "VCMPPSrri")>;
+def: InstRW<[HWWriteResGroup50], (instregex "VCMPSDrr")>;
+def: InstRW<[HWWriteResGroup50], (instregex "VCMPSSrr")>;
+def: InstRW<[HWWriteResGroup50], (instregex "VCOMISDrr")>;
+def: InstRW<[HWWriteResGroup50], (instregex "VCOMISSrr")>;
+def: InstRW<[HWWriteResGroup50], (instregex "VCVTDQ2PSYrr")>;
+def: InstRW<[HWWriteResGroup50], (instregex "VCVTDQ2PSrr")>;
+def: InstRW<[HWWriteResGroup50], (instregex "VCVTPS2DQYrr")>;
+def: InstRW<[HWWriteResGroup50], (instregex "VCVTPS2DQrr")>;
+def: InstRW<[HWWriteResGroup50], (instregex "VCVTTPS2DQYrr")>;
+def: InstRW<[HWWriteResGroup50], (instregex "VCVTTPS2DQrr")>;
+def: InstRW<[HWWriteResGroup50], (instregex "VMAXPDYrr")>;
+def: InstRW<[HWWriteResGroup50], (instregex "VMAXPDrr")>;
+def: InstRW<[HWWriteResGroup50], (instregex "VMAXPSYrr")>;
+def: InstRW<[HWWriteResGroup50], (instregex "VMAXPSrr")>;
+def: InstRW<[HWWriteResGroup50], (instregex "VMAXSDrr")>;
+def: InstRW<[HWWriteResGroup50], (instregex "VMAXSSrr")>;
+def: InstRW<[HWWriteResGroup50], (instregex "VMINPDYrr")>;
+def: InstRW<[HWWriteResGroup50], (instregex "VMINPDrr")>;
+def: InstRW<[HWWriteResGroup50], (instregex "VMINPSYrr")>;
+def: InstRW<[HWWriteResGroup50], (instregex "VMINPSrr")>;
+def: InstRW<[HWWriteResGroup50], (instregex "VMINSDrr")>;
+def: InstRW<[HWWriteResGroup50], (instregex "VMINSSrr")>;
+def: InstRW<[HWWriteResGroup50], (instregex "VSUBPDYrr")>;
+def: InstRW<[HWWriteResGroup50], (instregex "VSUBPDrr")>;
+def: InstRW<[HWWriteResGroup50], (instregex "VSUBPSYrr")>;
+def: InstRW<[HWWriteResGroup50], (instregex "VSUBPSrr")>;
+def: InstRW<[HWWriteResGroup50], (instregex "VSUBSDrr")>;
+def: InstRW<[HWWriteResGroup50], (instregex "VSUBSSrr")>;
+def: InstRW<[HWWriteResGroup50], (instregex "VUCOMISDrr")>;
+def: InstRW<[HWWriteResGroup50], (instregex "VUCOMISSrr")>;
+
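+// 16- and 32-bit multiplies take extra integer uops; the _16/_32 variants
+// below only override NumMicroOps and leave ResourceCycles at its default.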
+def HWWriteResGroup50_16 : SchedWriteRes<[HWPort1,HWPort0156]> {
+ let Latency = 3;
+ let NumMicroOps = 4;
}
-def : InstRW<[WriteVMOVMSKPYr], (instregex "VMOVMSKP(S|D)Yrr")>;
+def: InstRW<[HWWriteResGroup50_16], (instregex "IMUL16rr(i8?)")>;
-// VPERM2F128.
-def : InstRW<[WriteFShuffle256], (instregex "VPERM2F128rr")>;
-def : InstRW<[WriteFShuffle256Ld, ReadAfterLd], (instregex "VPERM2F128rm")>;
+def HWWriteResGroup50_32 : SchedWriteRes<[HWPort1,HWPort0156]> {
+ let Latency = 3;
+ let NumMicroOps = 3;
+}
+def: InstRW<[HWWriteResGroup50_32], (instregex "IMUL32rr(i8?)")>;
-// BLENDVP S/D.
-def : InstRW<[WriteFVarBlend], (instregex "BLENDVP(S|D)rr0")>;
-def : InstRW<[WriteFVarBlendLd, ReadAfterLd], (instregex "BLENDVP(S|D)rm0")>;
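+// Cross-lane YMM shuffles, broadcasts and permutes, plus the 128-bit
+// insert/extract forms, are single port-5 uops with 3-cycle latency.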
+def HWWriteResGroup51 : SchedWriteRes<[HWPort5]> {
+ let Latency = 3;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[HWWriteResGroup51], (instregex "VBROADCASTSDYrr")>;
+def: InstRW<[HWWriteResGroup51], (instregex "VBROADCASTSSYrr")>;
+def: InstRW<[HWWriteResGroup51], (instregex "VEXTRACTF128rr")>;
+def: InstRW<[HWWriteResGroup51], (instregex "VEXTRACTI128rr")>;
+def: InstRW<[HWWriteResGroup51], (instregex "VINSERTF128rr")>;
+def: InstRW<[HWWriteResGroup51], (instregex "VINSERTI128rr")>;
+def: InstRW<[HWWriteResGroup51], (instregex "VPBROADCASTBYrr")>;
+def: InstRW<[HWWriteResGroup51], (instregex "VPBROADCASTBrr")>;
+def: InstRW<[HWWriteResGroup51], (instregex "VPBROADCASTDYrr")>;
+def: InstRW<[HWWriteResGroup51], (instregex "VPBROADCASTQYrr")>;
+def: InstRW<[HWWriteResGroup51], (instregex "VPBROADCASTWYrr")>;
+def: InstRW<[HWWriteResGroup51], (instregex "VPBROADCASTWrr")>;
+def: InstRW<[HWWriteResGroup51], (instregex "VPERM2F128rr")>;
+def: InstRW<[HWWriteResGroup51], (instregex "VPERM2I128rr")>;
+def: InstRW<[HWWriteResGroup51], (instregex "VPERMDYrr")>;
+def: InstRW<[HWWriteResGroup51], (instregex "VPERMPDYri")>;
+def: InstRW<[HWWriteResGroup51], (instregex "VPERMPSYrr")>;
+def: InstRW<[HWWriteResGroup51], (instregex "VPERMQYri")>;
+def: InstRW<[HWWriteResGroup51], (instregex "VPMOVSXBDYrr")>;
+def: InstRW<[HWWriteResGroup51], (instregex "VPMOVSXBQYrr")>;
+def: InstRW<[HWWriteResGroup51], (instregex "VPMOVSXBWYrr")>;
+def: InstRW<[HWWriteResGroup51], (instregex "VPMOVSXDQYrr")>;
+def: InstRW<[HWWriteResGroup51], (instregex "VPMOVSXWDYrr")>;
+def: InstRW<[HWWriteResGroup51], (instregex "VPMOVSXWQYrr")>;
+def: InstRW<[HWWriteResGroup51], (instregex "VPMOVZXBDYrr")>;
+def: InstRW<[HWWriteResGroup51], (instregex "VPMOVZXBQYrr")>;
+def: InstRW<[HWWriteResGroup51], (instregex "VPMOVZXBWYrr")>;
+def: InstRW<[HWWriteResGroup51], (instregex "VPMOVZXDQYrr")>;
+def: InstRW<[HWWriteResGroup51], (instregex "VPMOVZXWDYrr")>;
+def: InstRW<[HWWriteResGroup51], (instregex "VPMOVZXWQYrr")>;
+
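+// The same 3-cycle port-1 operations as HWWriteResGroup50, with a folded
+// p23 load uop added.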
+def HWWriteResGroup52 : SchedWriteRes<[HWPort1,HWPort23]> {
+ let Latency = 3;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[HWWriteResGroup52], (instregex "ADDPDrm")>;
+def: InstRW<[HWWriteResGroup52], (instregex "ADDPSrm")>;
+def: InstRW<[HWWriteResGroup52], (instregex "ADDSDrm")>;
+def: InstRW<[HWWriteResGroup52], (instregex "ADDSSrm")>;
+def: InstRW<[HWWriteResGroup52], (instregex "ADDSUBPDrm")>;
+def: InstRW<[HWWriteResGroup52], (instregex "ADDSUBPSrm")>;
+def: InstRW<[HWWriteResGroup52], (instregex "ADD_F32m")>;
+def: InstRW<[HWWriteResGroup52], (instregex "ADD_F64m")>;
+def: InstRW<[HWWriteResGroup52], (instregex "BSF(16|32|64)rm")>;
+def: InstRW<[HWWriteResGroup52], (instregex "BSR(16|32|64)rm")>;
+def: InstRW<[HWWriteResGroup52], (instregex "CMPPDrmi")>;
+def: InstRW<[HWWriteResGroup52], (instregex "CMPPSrmi")>;
+def: InstRW<[HWWriteResGroup52], (instregex "CMPSSrm")>;
+def: InstRW<[HWWriteResGroup52], (instregex "COMISDrm")>;
+def: InstRW<[HWWriteResGroup52], (instregex "COMISSrm")>;
+def: InstRW<[HWWriteResGroup52], (instregex "CVTDQ2PSrm")>;
+def: InstRW<[HWWriteResGroup52], (instregex "CVTPS2DQrm")>;
+def: InstRW<[HWWriteResGroup52], (instregex "CVTTPS2DQrm")>;
+def: InstRW<[HWWriteResGroup52], (instregex "ILD_F16m")>;
+def: InstRW<[HWWriteResGroup52], (instregex "ILD_F32m")>;
+def: InstRW<[HWWriteResGroup52], (instregex "ILD_F64m")>;
+def: InstRW<[HWWriteResGroup52], (instregex "IMUL64m")>;
+def: InstRW<[HWWriteResGroup52], (instregex "IMUL64rm(i8?)")>;
+def: InstRW<[HWWriteResGroup52], (instregex "IMUL8m")>;
+def: InstRW<[HWWriteResGroup52], (instregex "LZCNT(16|32|64)rm")>;
+def: InstRW<[HWWriteResGroup52], (instregex "MAXPDrm")>;
+def: InstRW<[HWWriteResGroup52], (instregex "MAXPSrm")>;
+def: InstRW<[HWWriteResGroup52], (instregex "MAXSDrm")>;
+def: InstRW<[HWWriteResGroup52], (instregex "MAXSSrm")>;
+def: InstRW<[HWWriteResGroup52], (instregex "MINPDrm")>;
+def: InstRW<[HWWriteResGroup52], (instregex "MINPSrm")>;
+def: InstRW<[HWWriteResGroup52], (instregex "MINSDrm")>;
+def: InstRW<[HWWriteResGroup52], (instregex "MINSSrm")>;
+def: InstRW<[HWWriteResGroup52], (instregex "MMX_CVTPI2PSirm")>;
+def: InstRW<[HWWriteResGroup52], (instregex "MMX_CVTPS2PIirm")>;
+def: InstRW<[HWWriteResGroup52], (instregex "MMX_CVTTPS2PIirm")>;
+def: InstRW<[HWWriteResGroup52], (instregex "MUL64m")>;
+def: InstRW<[HWWriteResGroup52], (instregex "MUL8m")>;
+def: InstRW<[HWWriteResGroup52], (instregex "PDEP32rm")>;
+def: InstRW<[HWWriteResGroup52], (instregex "PDEP64rm")>;
+def: InstRW<[HWWriteResGroup52], (instregex "PEXT32rm")>;
+def: InstRW<[HWWriteResGroup52], (instregex "PEXT64rm")>;
+def: InstRW<[HWWriteResGroup52], (instregex "POPCNT(16|32|64)rm")>;
+def: InstRW<[HWWriteResGroup52], (instregex "SUBPDrm")>;
+def: InstRW<[HWWriteResGroup52], (instregex "SUBPSrm")>;
+def: InstRW<[HWWriteResGroup52], (instregex "SUBR_F32m")>;
+def: InstRW<[HWWriteResGroup52], (instregex "SUBR_F64m")>;
+def: InstRW<[HWWriteResGroup52], (instregex "SUBSDrm")>;
+def: InstRW<[HWWriteResGroup52], (instregex "SUBSSrm")>;
+def: InstRW<[HWWriteResGroup52], (instregex "SUB_F32m")>;
+def: InstRW<[HWWriteResGroup52], (instregex "SUB_F64m")>;
+def: InstRW<[HWWriteResGroup52], (instregex "TZCNT(16|32|64)rm")>;
+def: InstRW<[HWWriteResGroup52], (instregex "UCOMISDrm")>;
+def: InstRW<[HWWriteResGroup52], (instregex "UCOMISSrm")>;
+def: InstRW<[HWWriteResGroup52], (instregex "VADDPDYrm")>;
+def: InstRW<[HWWriteResGroup52], (instregex "VADDPDrm")>;
+def: InstRW<[HWWriteResGroup52], (instregex "VADDPSYrm")>;
+def: InstRW<[HWWriteResGroup52], (instregex "VADDPSrm")>;
+def: InstRW<[HWWriteResGroup52], (instregex "VADDSDrm")>;
+def: InstRW<[HWWriteResGroup52], (instregex "VADDSSrm")>;
+def: InstRW<[HWWriteResGroup52], (instregex "VADDSUBPDYrm")>;
+def: InstRW<[HWWriteResGroup52], (instregex "VADDSUBPDrm")>;
+def: InstRW<[HWWriteResGroup52], (instregex "VADDSUBPSYrm")>;
+def: InstRW<[HWWriteResGroup52], (instregex "VADDSUBPSrm")>;
+def: InstRW<[HWWriteResGroup52], (instregex "VCMPPDYrmi")>;
+def: InstRW<[HWWriteResGroup52], (instregex "VCMPPDrmi")>;
+def: InstRW<[HWWriteResGroup52], (instregex "VCMPPSYrmi")>;
+def: InstRW<[HWWriteResGroup52], (instregex "VCMPPSrmi")>;
+def: InstRW<[HWWriteResGroup52], (instregex "VCMPSDrm")>;
+def: InstRW<[HWWriteResGroup52], (instregex "VCMPSSrm")>;
+def: InstRW<[HWWriteResGroup52], (instregex "VCOMISDrm")>;
+def: InstRW<[HWWriteResGroup52], (instregex "VCOMISSrm")>;
+def: InstRW<[HWWriteResGroup52], (instregex "VCVTDQ2PSYrm")>;
+def: InstRW<[HWWriteResGroup52], (instregex "VCVTDQ2PSrm")>;
+def: InstRW<[HWWriteResGroup52], (instregex "VCVTPS2DQYrm")>;
+def: InstRW<[HWWriteResGroup52], (instregex "VCVTPS2DQrm")>;
+def: InstRW<[HWWriteResGroup52], (instregex "VCVTTPS2DQYrm")>;
+def: InstRW<[HWWriteResGroup52], (instregex "VCVTTPS2DQrm")>;
+def: InstRW<[HWWriteResGroup52], (instregex "VMAXPDYrm")>;
+def: InstRW<[HWWriteResGroup52], (instregex "VMAXPDrm")>;
+def: InstRW<[HWWriteResGroup52], (instregex "VMAXPSYrm")>;
+def: InstRW<[HWWriteResGroup52], (instregex "VMAXPSrm")>;
+def: InstRW<[HWWriteResGroup52], (instregex "VMAXSDrm")>;
+def: InstRW<[HWWriteResGroup52], (instregex "VMAXSSrm")>;
+def: InstRW<[HWWriteResGroup52], (instregex "VMINPDYrm")>;
+def: InstRW<[HWWriteResGroup52], (instregex "VMINPDrm")>;
+def: InstRW<[HWWriteResGroup52], (instregex "VMINPSYrm")>;
+def: InstRW<[HWWriteResGroup52], (instregex "VMINPSrm")>;
+def: InstRW<[HWWriteResGroup52], (instregex "VMINSDrm")>;
+def: InstRW<[HWWriteResGroup52], (instregex "VMINSSrm")>;
+def: InstRW<[HWWriteResGroup52], (instregex "VSUBPDYrm")>;
+def: InstRW<[HWWriteResGroup52], (instregex "VSUBPDrm")>;
+def: InstRW<[HWWriteResGroup52], (instregex "VSUBPSYrm")>;
+def: InstRW<[HWWriteResGroup52], (instregex "VSUBPSrm")>;
+def: InstRW<[HWWriteResGroup52], (instregex "VSUBSDrm")>;
+def: InstRW<[HWWriteResGroup52], (instregex "VSUBSSrm")>;
+def: InstRW<[HWWriteResGroup52], (instregex "VUCOMISDrm")>;
+def: InstRW<[HWWriteResGroup52], (instregex "VUCOMISSrm")>;
+
+def HWWriteResGroup52_16 : SchedWriteRes<[HWPort1,HWPort0156,HWPort23]> {
+ let Latency = 3;
+ let NumMicroOps = 4;
+}
+def: InstRW<[HWWriteResGroup52_16], (instregex "IMUL16m")>;
+def: InstRW<[HWWriteResGroup52_16], (instregex "IMUL16rm(i8?)")>;
+def: InstRW<[HWWriteResGroup52_16], (instregex "MUL16m")>;
-// VBROADCASTF128.
-def : InstRW<[WriteLoad], (instregex "VBROADCASTF128")>;
+def HWWriteResGroup52_32 : SchedWriteRes<[HWPort1,HWPort0156,HWPort23]> {
+ let Latency = 3;
+ let NumMicroOps = 3;
+}
+def: InstRW<[HWWriteResGroup52_32], (instregex "IMUL32m")>;
+def: InstRW<[HWWriteResGroup52_32], (instregex "IMUL32rm(i8?)")>;
+def: InstRW<[HWWriteResGroup52_32], (instregex "MUL32m")>;
-// EXTRACTPS.
-// r32,x,i.
-def WriteEXTRACTPSr : SchedWriteRes<[HWPort0, HWPort5]> {
+def HWWriteResGroup53 : SchedWriteRes<[HWPort5,HWPort23]> {
+ let Latency = 3;
let NumMicroOps = 2;
- let ResourceCycles = [1, 1];
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[HWWriteResGroup53], (instregex "VPERM2F128rm")>;
+def: InstRW<[HWWriteResGroup53], (instregex "VPERM2I128rm")>;
+def: InstRW<[HWWriteResGroup53], (instregex "VPERMDYrm")>;
+def: InstRW<[HWWriteResGroup53], (instregex "VPERMPDYmi")>;
+def: InstRW<[HWWriteResGroup53], (instregex "VPERMPSYrm")>;
+def: InstRW<[HWWriteResGroup53], (instregex "VPERMQYmi")>;
+def: InstRW<[HWWriteResGroup53], (instregex "VPMOVSXBDYrm")>;
+def: InstRW<[HWWriteResGroup53], (instregex "VPMOVSXBQYrm")>;
+def: InstRW<[HWWriteResGroup53], (instregex "VPMOVSXBWYrm")>;
+def: InstRW<[HWWriteResGroup53], (instregex "VPMOVSXDQYrm")>;
+def: InstRW<[HWWriteResGroup53], (instregex "VPMOVSXWDYrm")>;
+def: InstRW<[HWWriteResGroup53], (instregex "VPMOVSXWQYrm")>;
+def: InstRW<[HWWriteResGroup53], (instregex "VPMOVZXBDYrm")>;
+def: InstRW<[HWWriteResGroup53], (instregex "VPMOVZXBQYrm")>;
+def: InstRW<[HWWriteResGroup53], (instregex "VPMOVZXBWYrm")>;
+def: InstRW<[HWWriteResGroup53], (instregex "VPMOVZXDQYrm")>;
+def: InstRW<[HWWriteResGroup53], (instregex "VPMOVZXWDYrm")>;
+def: InstRW<[HWWriteResGroup53], (instregex "VPMOVZXWQYrm")>;
+
+def HWWriteResGroup54 : SchedWriteRes<[HWPort0156]> {
+ let Latency = 3;
+ let NumMicroOps = 3;
+ let ResourceCycles = [3];
}
-def : InstRW<[WriteEXTRACTPSr], (instregex "(V?)EXTRACTPSrr")>;
+def: InstRW<[HWWriteResGroup54], (instregex "XADD(16|32|64)rr")>;
+def: InstRW<[HWWriteResGroup54], (instregex "XADD8rr")>;
+def: InstRW<[HWWriteResGroup54], (instregex "XCHG8rr")>;
-// m32,x,i.
-def WriteEXTRACTPSm : SchedWriteRes<[HWPort0, HWPort5, HWPort23]> {
- let Latency = 4;
+def HWWriteResGroup55 : SchedWriteRes<[HWPort0,HWPort5]> {
+ let Latency = 3;
let NumMicroOps = 3;
- let ResourceCycles = [1, 1, 1];
+ let ResourceCycles = [2,1];
+}
+def: InstRW<[HWWriteResGroup55], (instregex "VPSLLVDYrr")>;
+def: InstRW<[HWWriteResGroup55], (instregex "VPSLLVDrr")>;
+def: InstRW<[HWWriteResGroup55], (instregex "VPSRAVDYrr")>;
+def: InstRW<[HWWriteResGroup55], (instregex "VPSRAVDrr")>;
+def: InstRW<[HWWriteResGroup55], (instregex "VPSRLVDYrr")>;
+def: InstRW<[HWWriteResGroup55], (instregex "VPSRLVDrr")>;
+
+def HWWriteResGroup56 : SchedWriteRes<[HWPort5,HWPort15]> {
+ let Latency = 3;
+ let NumMicroOps = 3;
+ let ResourceCycles = [2,1];
+}
+def: InstRW<[HWWriteResGroup56], (instregex "MMX_PHADDSWrr64")>;
+def: InstRW<[HWWriteResGroup56], (instregex "MMX_PHADDWrr64")>;
+def: InstRW<[HWWriteResGroup56], (instregex "MMX_PHADDrr64")>;
+def: InstRW<[HWWriteResGroup56], (instregex "MMX_PHSUBDrr64")>;
+def: InstRW<[HWWriteResGroup56], (instregex "MMX_PHSUBSWrr64")>;
+def: InstRW<[HWWriteResGroup56], (instregex "MMX_PHSUBWrr64")>;
+def: InstRW<[HWWriteResGroup56], (instregex "PHADDDrr")>;
+def: InstRW<[HWWriteResGroup56], (instregex "PHADDSWrr128")>;
+def: InstRW<[HWWriteResGroup56], (instregex "PHADDWrr")>;
+def: InstRW<[HWWriteResGroup56], (instregex "PHSUBDrr")>;
+def: InstRW<[HWWriteResGroup56], (instregex "PHSUBSWrr128")>;
+def: InstRW<[HWWriteResGroup56], (instregex "PHSUBWrr")>;
+def: InstRW<[HWWriteResGroup56], (instregex "VPHADDDYrr")>;
+def: InstRW<[HWWriteResGroup56], (instregex "VPHADDDrr")>;
+def: InstRW<[HWWriteResGroup56], (instregex "VPHADDSWrr128")>;
+def: InstRW<[HWWriteResGroup56], (instregex "VPHADDSWrr256")>;
+def: InstRW<[HWWriteResGroup56], (instregex "VPHADDWYrr")>;
+def: InstRW<[HWWriteResGroup56], (instregex "VPHADDWrr")>;
+def: InstRW<[HWWriteResGroup56], (instregex "VPHSUBDYrr")>;
+def: InstRW<[HWWriteResGroup56], (instregex "VPHSUBDrr")>;
+def: InstRW<[HWWriteResGroup56], (instregex "VPHSUBSWrr128")>;
+def: InstRW<[HWWriteResGroup56], (instregex "VPHSUBSWrr256")>;
+def: InstRW<[HWWriteResGroup56], (instregex "VPHSUBWYrr")>;
+def: InstRW<[HWWriteResGroup56], (instregex "VPHSUBWrr")>;
+
+def HWWriteResGroup57 : SchedWriteRes<[HWPort5,HWPort0156]> {
+ let Latency = 3;
+ let NumMicroOps = 3;
+ let ResourceCycles = [2,1];
}
-def : InstRW<[WriteEXTRACTPSm], (instregex "(V?)EXTRACTPSmr")>;
+def: InstRW<[HWWriteResGroup57], (instregex "MMX_PACKSSDWirr")>;
+def: InstRW<[HWWriteResGroup57], (instregex "MMX_PACKSSWBirr")>;
+def: InstRW<[HWWriteResGroup57], (instregex "MMX_PACKUSWBirr")>;
-// VEXTRACTF128.
-// x,y,i.
-def : InstRW<[WriteFShuffle256], (instregex "VEXTRACTF128rr")>;
+def HWWriteResGroup58 : SchedWriteRes<[HWPort6,HWPort0156]> {
+ let Latency = 3;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,2];
+}
+def: InstRW<[HWWriteResGroup58], (instregex "CLD")>;
-// m128,y,i.
-def WriteVEXTRACTF128m : SchedWriteRes<[HWPort23, HWPort4]> {
- let Latency = 4;
- let NumMicroOps = 2;
- let ResourceCycles = [1, 1];
+def HWWriteResGroup59 : SchedWriteRes<[HWPort06,HWPort0156]> {
+ let Latency = 3;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,2];
+}
+def: InstRW<[HWWriteResGroup59], (instregex "CMOVA(16|32|64)rr")>;
+def: InstRW<[HWWriteResGroup59], (instregex "CMOVBE(16|32|64)rr")>;
+def: InstRW<[HWWriteResGroup59], (instregex "RCL(16|32|64)r1")>;
+def: InstRW<[HWWriteResGroup59], (instregex "RCL(16|32|64)ri")>;
+def: InstRW<[HWWriteResGroup59], (instregex "RCL8r1")>;
+def: InstRW<[HWWriteResGroup59], (instregex "RCL8ri")>;
+def: InstRW<[HWWriteResGroup59], (instregex "RCR(16|32|64)r1")>;
+def: InstRW<[HWWriteResGroup59], (instregex "RCR(16|32|64)ri")>;
+def: InstRW<[HWWriteResGroup59], (instregex "RCR8r1")>;
+def: InstRW<[HWWriteResGroup59], (instregex "RCR8ri")>;
+
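+// Shifts and rotates by CL are two p06 uops plus one p0156 uop (presumably
+// the flag merge), 3 cycles in total.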
+def HWWriteResGroup60 : SchedWriteRes<[HWPort06,HWPort0156]> {
+ let Latency = 3;
+ let NumMicroOps = 3;
+ let ResourceCycles = [2,1];
+}
+def: InstRW<[HWWriteResGroup60], (instregex "ROL(16|32|64)rCL")>;
+def: InstRW<[HWWriteResGroup60], (instregex "ROL8rCL")>;
+def: InstRW<[HWWriteResGroup60], (instregex "ROR(16|32|64)rCL")>;
+def: InstRW<[HWWriteResGroup60], (instregex "ROR8rCL")>;
+def: InstRW<[HWWriteResGroup60], (instregex "SAR(16|32|64)rCL")>;
+def: InstRW<[HWWriteResGroup60], (instregex "SAR8rCL")>;
+def: InstRW<[HWWriteResGroup60], (instregex "SHL(16|32|64)rCL")>;
+def: InstRW<[HWWriteResGroup60], (instregex "SHL8rCL")>;
+def: InstRW<[HWWriteResGroup60], (instregex "SHR(16|32|64)rCL")>;
+def: InstRW<[HWWriteResGroup60], (instregex "SHR8rCL")>;
+
+def HWWriteResGroup61 : SchedWriteRes<[HWPort0,HWPort4,HWPort237]> {
+ let Latency = 3;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[HWWriteResGroup61], (instregex "FNSTSWm")>;
+
+def HWWriteResGroup62 : SchedWriteRes<[HWPort1,HWPort4,HWPort237]> {
+ let Latency = 3;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[HWWriteResGroup62], (instregex "ISTT_FP16m")>;
+def: InstRW<[HWWriteResGroup62], (instregex "ISTT_FP32m")>;
+def: InstRW<[HWWriteResGroup62], (instregex "ISTT_FP64m")>;
+def: InstRW<[HWWriteResGroup62], (instregex "IST_F16m")>;
+def: InstRW<[HWWriteResGroup62], (instregex "IST_F32m")>;
+def: InstRW<[HWWriteResGroup62], (instregex "IST_FP16m")>;
+def: InstRW<[HWWriteResGroup62], (instregex "IST_FP32m")>;
+def: InstRW<[HWWriteResGroup62], (instregex "IST_FP64m")>;
+
+def HWWriteResGroup63 : SchedWriteRes<[HWPort0,HWPort5,HWPort23]> {
+ let Latency = 3;
+ let NumMicroOps = 4;
+ let ResourceCycles = [2,1,1];
+}
+def: InstRW<[HWWriteResGroup63], (instregex "VPSLLVDYrm")>;
+def: InstRW<[HWWriteResGroup63], (instregex "VPSLLVDrm")>;
+def: InstRW<[HWWriteResGroup63], (instregex "VPSRAVDYrm")>;
+def: InstRW<[HWWriteResGroup63], (instregex "VPSRAVDrm")>;
+def: InstRW<[HWWriteResGroup63], (instregex "VPSRLVDYrm")>;
+def: InstRW<[HWWriteResGroup63], (instregex "VPSRLVDrm")>;
+
+def HWWriteResGroup64 : SchedWriteRes<[HWPort5,HWPort23,HWPort15]> {
+ let Latency = 3;
+ let NumMicroOps = 4;
+ let ResourceCycles = [2,1,1];
+}
+def: InstRW<[HWWriteResGroup64], (instregex "MMX_PHADDSWrm64")>;
+def: InstRW<[HWWriteResGroup64], (instregex "MMX_PHADDWrm64")>;
+def: InstRW<[HWWriteResGroup64], (instregex "MMX_PHADDrm64")>;
+def: InstRW<[HWWriteResGroup64], (instregex "MMX_PHSUBDrm64")>;
+def: InstRW<[HWWriteResGroup64], (instregex "MMX_PHSUBSWrm64")>;
+def: InstRW<[HWWriteResGroup64], (instregex "MMX_PHSUBWrm64")>;
+def: InstRW<[HWWriteResGroup64], (instregex "PHADDDrm")>;
+def: InstRW<[HWWriteResGroup64], (instregex "PHADDSWrm128")>;
+def: InstRW<[HWWriteResGroup64], (instregex "PHADDWrm")>;
+def: InstRW<[HWWriteResGroup64], (instregex "PHSUBDrm")>;
+def: InstRW<[HWWriteResGroup64], (instregex "PHSUBSWrm128")>;
+def: InstRW<[HWWriteResGroup64], (instregex "PHSUBWrm")>;
+def: InstRW<[HWWriteResGroup64], (instregex "VPHADDDYrm")>;
+def: InstRW<[HWWriteResGroup64], (instregex "VPHADDDrm")>;
+def: InstRW<[HWWriteResGroup64], (instregex "VPHADDSWrm128")>;
+def: InstRW<[HWWriteResGroup64], (instregex "VPHADDSWrm256")>;
+def: InstRW<[HWWriteResGroup64], (instregex "VPHADDWYrm")>;
+def: InstRW<[HWWriteResGroup64], (instregex "VPHADDWrm")>;
+def: InstRW<[HWWriteResGroup64], (instregex "VPHSUBDYrm")>;
+def: InstRW<[HWWriteResGroup64], (instregex "VPHSUBDrm")>;
+def: InstRW<[HWWriteResGroup64], (instregex "VPHSUBSWrm128")>;
+def: InstRW<[HWWriteResGroup64], (instregex "VPHSUBSWrm256")>;
+def: InstRW<[HWWriteResGroup64], (instregex "VPHSUBWYrm")>;
+def: InstRW<[HWWriteResGroup64], (instregex "VPHSUBWrm")>;
+
+def HWWriteResGroup65 : SchedWriteRes<[HWPort23,HWPort06,HWPort0156]> {
+ let Latency = 3;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1,1,2];
}
-def : InstRW<[WriteVEXTRACTF128m], (instregex "VEXTRACTF128mr")>;
+def: InstRW<[HWWriteResGroup65], (instregex "CMOVA(16|32|64)rm")>;
+def: InstRW<[HWWriteResGroup65], (instregex "CMOVBE(16|32|64)rm")>;
-// VINSERTF128.
-// y,y,x,i.
-def : InstRW<[WriteFShuffle256], (instregex "VINSERTF128rr")>;
+def HWWriteResGroup66 : SchedWriteRes<[HWPort23,HWPort237,HWPort06,HWPort0156]> {
+ let Latency = 3;
+ let NumMicroOps = 5;
+ let ResourceCycles = [1,1,1,2];
+}
+def: InstRW<[HWWriteResGroup66], (instregex "RCL(16|32|64)m1")>;
+def: InstRW<[HWWriteResGroup66], (instregex "RCL(16|32|64)mi")>;
+def: InstRW<[HWWriteResGroup66], (instregex "RCL8m1")>;
+def: InstRW<[HWWriteResGroup66], (instregex "RCL8mi")>;
+def: InstRW<[HWWriteResGroup66], (instregex "RCR(16|32|64)m1")>;
+def: InstRW<[HWWriteResGroup66], (instregex "RCR(16|32|64)mi")>;
+def: InstRW<[HWWriteResGroup66], (instregex "RCR8m1")>;
+def: InstRW<[HWWriteResGroup66], (instregex "RCR8mi")>;
+
+def HWWriteResGroup67 : SchedWriteRes<[HWPort23,HWPort237,HWPort06,HWPort0156]> {
+ let Latency = 3;
+ let NumMicroOps = 5;
+ let ResourceCycles = [1,1,2,1];
+}
+def: InstRW<[HWWriteResGroup67], (instregex "ROR(16|32|64)mCL")>;
+def: InstRW<[HWWriteResGroup67], (instregex "ROR8mCL")>;
-// y,y,m128,i.
-def WriteVINSERTF128m : SchedWriteRes<[HWPort015, HWPort23]> {
+def HWWriteResGroup68 : SchedWriteRes<[HWPort4,HWPort23,HWPort237,HWPort0156]> {
+ let Latency = 3;
+ let NumMicroOps = 6;
+ let ResourceCycles = [1,1,1,3];
+}
+def: InstRW<[HWWriteResGroup68], (instregex "ADC(16|32|64)mi8")>;
+def: InstRW<[HWWriteResGroup68], (instregex "ADC8mi")>;
+def: InstRW<[HWWriteResGroup68], (instregex "ADD8mi")>;
+def: InstRW<[HWWriteResGroup68], (instregex "AND8mi")>;
+def: InstRW<[HWWriteResGroup68], (instregex "OR8mi")>;
+def: InstRW<[HWWriteResGroup68], (instregex "SUB8mi")>;
+def: InstRW<[HWWriteResGroup68], (instregex "XCHG(16|32|64)rm")>;
+def: InstRW<[HWWriteResGroup68], (instregex "XCHG8rm")>;
+def: InstRW<[HWWriteResGroup68], (instregex "XOR8mi")>;
+
+def HWWriteResGroup69 : SchedWriteRes<[HWPort4,HWPort23,HWPort237,HWPort06,HWPort0156]> {
+ let Latency = 3;
+ let NumMicroOps = 6;
+ let ResourceCycles = [1,1,1,2,1];
+}
+def: InstRW<[HWWriteResGroup69], (instregex "ADC(16|32|64)mr")>;
+def: InstRW<[HWWriteResGroup69], (instregex "ADC8mr")>;
+def: InstRW<[HWWriteResGroup69], (instregex "CMPXCHG(16|32|64)rm")>;
+def: InstRW<[HWWriteResGroup69], (instregex "CMPXCHG8rm")>;
+def: InstRW<[HWWriteResGroup69], (instregex "ROL(16|32|64)mCL")>;
+def: InstRW<[HWWriteResGroup69], (instregex "ROL8mCL")>;
+def: InstRW<[HWWriteResGroup69], (instregex "SAR(16|32|64)mCL")>;
+def: InstRW<[HWWriteResGroup69], (instregex "SAR8mCL")>;
+def: InstRW<[HWWriteResGroup69], (instregex "SBB(16|32|64)mi8")>;
+def: InstRW<[HWWriteResGroup69], (instregex "SBB(16|32|64)mr")>;
+def: InstRW<[HWWriteResGroup69], (instregex "SBB8mi")>;
+def: InstRW<[HWWriteResGroup69], (instregex "SBB8mr")>;
+def: InstRW<[HWWriteResGroup69], (instregex "SHL(16|32|64)mCL")>;
+def: InstRW<[HWWriteResGroup69], (instregex "SHL8mCL")>;
+def: InstRW<[HWWriteResGroup69], (instregex "SHR(16|32|64)mCL")>;
+def: InstRW<[HWWriteResGroup69], (instregex "SHR8mCL")>;
+
+def HWWriteResGroup70 : SchedWriteRes<[HWPort0,HWPort1]> {
let Latency = 4;
let NumMicroOps = 2;
- let ResourceCycles = [1, 1];
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[HWWriteResGroup70], (instregex "CVTSD2SI64rr")>;
+def: InstRW<[HWWriteResGroup70], (instregex "CVTSD2SIrr")>;
+def: InstRW<[HWWriteResGroup70], (instregex "CVTSS2SI64rr")>;
+def: InstRW<[HWWriteResGroup70], (instregex "CVTSS2SIrr")>;
+def: InstRW<[HWWriteResGroup70], (instregex "CVTTSD2SI64rr")>;
+def: InstRW<[HWWriteResGroup70], (instregex "CVTTSD2SIrr")>;
+def: InstRW<[HWWriteResGroup70], (instregex "CVTTSS2SI64rr")>;
+def: InstRW<[HWWriteResGroup70], (instregex "CVTTSS2SIrr")>;
+def: InstRW<[HWWriteResGroup70], (instregex "VCVTSD2SI64rr")>;
+def: InstRW<[HWWriteResGroup70], (instregex "VCVTSD2SIrr")>;
+def: InstRW<[HWWriteResGroup70], (instregex "VCVTSS2SI64rr")>;
+def: InstRW<[HWWriteResGroup70], (instregex "VCVTSS2SIrr")>;
+def: InstRW<[HWWriteResGroup70], (instregex "VCVTTSD2SI64rr")>;
+def: InstRW<[HWWriteResGroup70], (instregex "VCVTTSD2SIrr")>;
+def: InstRW<[HWWriteResGroup70], (instregex "VCVTTSS2SI64rr")>;
+def: InstRW<[HWWriteResGroup70], (instregex "VCVTTSS2SIrr")>;
+
+def HWWriteResGroup71 : SchedWriteRes<[HWPort0,HWPort5]> {
+ let Latency = 4;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[HWWriteResGroup71], (instregex "VCVTPS2PDYrr")>;
+def: InstRW<[HWWriteResGroup71], (instregex "VPSLLDYrr")>;
+def: InstRW<[HWWriteResGroup71], (instregex "VPSLLQYrr")>;
+def: InstRW<[HWWriteResGroup71], (instregex "VPSLLWYrr")>;
+def: InstRW<[HWWriteResGroup71], (instregex "VPSRADYrr")>;
+def: InstRW<[HWWriteResGroup71], (instregex "VPSRAWYrr")>;
+def: InstRW<[HWWriteResGroup71], (instregex "VPSRLDYrr")>;
+def: InstRW<[HWWriteResGroup71], (instregex "VPSRLQYrr")>;
+def: InstRW<[HWWriteResGroup71], (instregex "VPSRLWYrr")>;
+def: InstRW<[HWWriteResGroup71], (instregex "VPTESTYrr")>;
+
+def HWWriteResGroup72 : SchedWriteRes<[HWPort0,HWPort0156]> {
+ let Latency = 4;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
}
-def : InstRW<[WriteFShuffle256, ReadAfterLd], (instregex "VINSERTF128rm")>;
+def: InstRW<[HWWriteResGroup72], (instregex "FNSTSW16r")>;
-// VMASKMOVP S/D.
-// v,v,m.
-def WriteVMASKMOVPrm : SchedWriteRes<[HWPort5, HWPort23]> {
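+// Converts that change element width (PD<->DQ/PS, SI->SD/SS) pair a
+// port-1 convert uop with a port-5 uop.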
+def HWWriteResGroup73 : SchedWriteRes<[HWPort1,HWPort5]> {
let Latency = 4;
- let NumMicroOps = 3;
- let ResourceCycles = [2, 1];
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[HWWriteResGroup73], (instregex "CVTDQ2PDrr")>;
+def: InstRW<[HWWriteResGroup73], (instregex "CVTPD2DQrr")>;
+def: InstRW<[HWWriteResGroup73], (instregex "CVTPD2PSrr")>;
+def: InstRW<[HWWriteResGroup73], (instregex "CVTSD2SSrr")>;
+def: InstRW<[HWWriteResGroup73], (instregex "CVTSI2SD64rr")>;
+def: InstRW<[HWWriteResGroup73], (instregex "CVTSI2SDrr")>;
+def: InstRW<[HWWriteResGroup73], (instregex "CVTSI2SSrr")>;
+def: InstRW<[HWWriteResGroup73], (instregex "CVTTPD2DQrr")>;
+def: InstRW<[HWWriteResGroup73], (instregex "MMX_CVTPD2PIirr")>;
+def: InstRW<[HWWriteResGroup73], (instregex "MMX_CVTPI2PDirr")>;
+def: InstRW<[HWWriteResGroup73], (instregex "MMX_CVTPS2PIirr")>;
+def: InstRW<[HWWriteResGroup73], (instregex "MMX_CVTTPD2PIirr")>;
+def: InstRW<[HWWriteResGroup73], (instregex "MMX_CVTTPS2PIirr")>;
+def: InstRW<[HWWriteResGroup73], (instregex "VCVTDQ2PDrr")>;
+def: InstRW<[HWWriteResGroup73], (instregex "VCVTPD2DQrr")>;
+def: InstRW<[HWWriteResGroup73], (instregex "VCVTPD2PSrr")>;
+def: InstRW<[HWWriteResGroup73], (instregex "VCVTPS2PHrr")>;
+def: InstRW<[HWWriteResGroup73], (instregex "VCVTSD2SSrr")>;
+def: InstRW<[HWWriteResGroup73], (instregex "VCVTSI2SD64rr")>;
+def: InstRW<[HWWriteResGroup73], (instregex "VCVTSI2SDrr")>;
+def: InstRW<[HWWriteResGroup73], (instregex "VCVTSI2SSrr")>;
+def: InstRW<[HWWriteResGroup73], (instregex "VCVTTPD2DQrr")>;
+
+def HWWriteResGroup74 : SchedWriteRes<[HWPort1,HWPort6]> {
+ let Latency = 4;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
}
-def : InstRW<[WriteVMASKMOVPrm], (instregex "VMASKMOVP(S|D)(Y?)rm")>;
+def: InstRW<[HWWriteResGroup74], (instregex "IMUL64r")>;
+def: InstRW<[HWWriteResGroup74], (instregex "MUL64r")>;
+def: InstRW<[HWWriteResGroup74], (instregex "MULX64rr")>;
-// m128,x,x.
-def WriteVMASKMOVPmr : SchedWriteRes<[HWPort0, HWPort1, HWPort4, HWPort23]> {
- let Latency = 13;
+def HWWriteResGroup74_16 : SchedWriteRes<[HWPort1,HWPort0156]> {
+ let Latency = 4;
let NumMicroOps = 4;
- let ResourceCycles = [1, 1, 1, 1];
}
-def : InstRW<[WriteVMASKMOVPmr], (instregex "VMASKMOVP(S|D)mr")>;
+def: InstRW<[HWWriteResGroup74_16], (instregex "IMUL16r")>;
+def: InstRW<[HWWriteResGroup74_16], (instregex "MUL16r")>;
-// m256,y,y.
-def WriteVMASKMOVPYmr : SchedWriteRes<[HWPort0, HWPort1, HWPort4, HWPort23]> {
- let Latency = 14;
- let NumMicroOps = 4;
- let ResourceCycles = [1, 1, 1, 1];
+def HWWriteResGroup74_32 : SchedWriteRes<[HWPort1,HWPort0156]> {
+ let Latency = 4;
+ let NumMicroOps = 3;
}
-def : InstRW<[WriteVMASKMOVPYmr], (instregex "VMASKMOVP(S|D)Ymr")>;
+def: InstRW<[HWWriteResGroup74_32], (instregex "IMUL32r")>;
+def: InstRW<[HWWriteResGroup74_32], (instregex "MUL32r")>;
-// VGATHERDPS.
-// x.
-def WriteVGATHERDPS128 : SchedWriteRes<[]> {
- let NumMicroOps = 20;
+def HWWriteResGroup75 : SchedWriteRes<[HWPort1,HWPort23]> {
+ let Latency = 4;
+ let NumMicroOps = 3;
+ let ResourceCycles = [2,1];
}
-def : InstRW<[WriteVGATHERDPS128, ReadAfterLd], (instregex "VGATHERDPSrm")>;
+def: InstRW<[HWWriteResGroup75], (instregex "FICOM16m")>;
+def: InstRW<[HWWriteResGroup75], (instregex "FICOM32m")>;
+def: InstRW<[HWWriteResGroup75], (instregex "FICOMP16m")>;
+def: InstRW<[HWWriteResGroup75], (instregex "FICOMP32m")>;
-// y.
-def WriteVGATHERDPS256 : SchedWriteRes<[]> {
- let NumMicroOps = 34;
+def HWWriteResGroup76 : SchedWriteRes<[HWPort0,HWPort1,HWPort23]> {
+ let Latency = 4;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[HWWriteResGroup76], (instregex "CVTSD2SI64rm")>;
+def: InstRW<[HWWriteResGroup76], (instregex "CVTSD2SIrm")>;
+def: InstRW<[HWWriteResGroup76], (instregex "CVTSS2SI64rm")>;
+def: InstRW<[HWWriteResGroup76], (instregex "CVTSS2SIrm")>;
+def: InstRW<[HWWriteResGroup76], (instregex "CVTTSD2SI64rm")>;
+def: InstRW<[HWWriteResGroup76], (instregex "CVTTSD2SIrm")>;
+def: InstRW<[HWWriteResGroup76], (instregex "CVTTSS2SIrm")>;
+def: InstRW<[HWWriteResGroup76], (instregex "VCVTSD2SI64rm")>;
+def: InstRW<[HWWriteResGroup76], (instregex "VCVTSD2SIrm")>;
+def: InstRW<[HWWriteResGroup76], (instregex "VCVTSS2SI64rm")>;
+def: InstRW<[HWWriteResGroup76], (instregex "VCVTSS2SIrm")>;
+def: InstRW<[HWWriteResGroup76], (instregex "VCVTTSD2SI64rm")>;
+def: InstRW<[HWWriteResGroup76], (instregex "VCVTTSD2SIrm")>;
+def: InstRW<[HWWriteResGroup76], (instregex "VCVTTSS2SI64rm")>;
+def: InstRW<[HWWriteResGroup76], (instregex "VCVTTSS2SIrm")>;
+
+def HWWriteResGroup77 : SchedWriteRes<[HWPort0,HWPort5,HWPort23]> {
+ let Latency = 4;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
}
-def : InstRW<[WriteVGATHERDPS256, ReadAfterLd], (instregex "VGATHERDPSYrm")>;
+def: InstRW<[HWWriteResGroup77], (instregex "VCVTPS2PDYrm")>;
+def: InstRW<[HWWriteResGroup77], (instregex "VPTESTYrm")>;
-// VGATHERQPS.
-// x.
-def WriteVGATHERQPS128 : SchedWriteRes<[]> {
- let NumMicroOps = 15;
+def HWWriteResGroup78 : SchedWriteRes<[HWPort1,HWPort5,HWPort23]> {
+ let Latency = 4;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[HWWriteResGroup78], (instregex "CVTDQ2PDrm")>;
+def: InstRW<[HWWriteResGroup78], (instregex "CVTPD2DQrm")>;
+def: InstRW<[HWWriteResGroup78], (instregex "CVTPD2PSrm")>;
+def: InstRW<[HWWriteResGroup78], (instregex "CVTSD2SSrm")>;
+def: InstRW<[HWWriteResGroup78], (instregex "CVTTPD2DQrm")>;
+def: InstRW<[HWWriteResGroup78], (instregex "MMX_CVTPD2PIirm")>;
+def: InstRW<[HWWriteResGroup78], (instregex "MMX_CVTPI2PDirm")>;
+def: InstRW<[HWWriteResGroup78], (instregex "MMX_CVTTPD2PIirm")>;
+def: InstRW<[HWWriteResGroup78], (instregex "VCVTDQ2PDrm")>;
+def: InstRW<[HWWriteResGroup78], (instregex "VCVTSD2SSrm")>;
+
+def HWWriteResGroup79 : SchedWriteRes<[HWPort1,HWPort6,HWPort23]> {
+ let Latency = 4;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
}
-def : InstRW<[WriteVGATHERQPS128, ReadAfterLd], (instregex "VGATHERQPSrm")>;
+def: InstRW<[HWWriteResGroup79], (instregex "MULX64rm")>;
-// y.
-def WriteVGATHERQPS256 : SchedWriteRes<[]> {
- let NumMicroOps = 22;
+def HWWriteResGroup80 : SchedWriteRes<[HWPort5,HWPort23,HWPort015]> {
+ let Latency = 4;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
}
-def : InstRW<[WriteVGATHERQPS256, ReadAfterLd], (instregex "VGATHERQPSYrm")>;
+def: InstRW<[HWWriteResGroup80], (instregex "VPBROADCASTBYrm")>;
+def: InstRW<[HWWriteResGroup80], (instregex "VPBROADCASTBrm")>;
+def: InstRW<[HWWriteResGroup80], (instregex "VPBROADCASTWYrm")>;
+def: InstRW<[HWWriteResGroup80], (instregex "VPBROADCASTWrm")>;
-// VGATHERDPD.
-// x.
-def WriteVGATHERDPD128 : SchedWriteRes<[]> {
- let NumMicroOps = 12;
+def HWWriteResGroup81 : SchedWriteRes<[HWPort0156]> {
+ let Latency = 4;
+ let NumMicroOps = 4;
+ let ResourceCycles = [4];
}
-def : InstRW<[WriteVGATHERDPD128, ReadAfterLd], (instregex "VGATHERDPDrm")>;
+def: InstRW<[HWWriteResGroup81], (instregex "FNCLEX")>;
-// y.
-def WriteVGATHERDPD256 : SchedWriteRes<[]> {
- let NumMicroOps = 20;
+def HWWriteResGroup82 : SchedWriteRes<[HWPort015,HWPort0156]> {
+ let Latency = 4;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1,3];
}
-def : InstRW<[WriteVGATHERDPD256, ReadAfterLd], (instregex "VGATHERDPDYrm")>;
+def: InstRW<[HWWriteResGroup82], (instregex "VZEROUPPER")>;
-// VGATHERQPD.
-// x.
-def WriteVGATHERQPD128 : SchedWriteRes<[]> {
- let NumMicroOps = 14;
+def HWWriteResGroup83 : SchedWriteRes<[HWPort1,HWPort6,HWPort0156]> {
+ let Latency = 4;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1,1,2];
}
-def : InstRW<[WriteVGATHERQPD128, ReadAfterLd], (instregex "VGATHERQPDrm")>;
+def: InstRW<[HWWriteResGroup83], (instregex "LAR(16|32|64)rr")>;
-// y.
-def WriteVGATHERQPD256 : SchedWriteRes<[]> {
- let NumMicroOps = 22;
+def HWWriteResGroup84 : SchedWriteRes<[HWPort0,HWPort4,HWPort237,HWPort15]> {
+ let Latency = 4;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1,1,1,1];
+}
+def: InstRW<[HWWriteResGroup84], (instregex "VMASKMOVPDYmr")>;
+def: InstRW<[HWWriteResGroup84], (instregex "VMASKMOVPDmr")>;
+def: InstRW<[HWWriteResGroup84], (instregex "VMASKMOVPSYmr")>;
+def: InstRW<[HWWriteResGroup84], (instregex "VMASKMOVPSmr")>;
+def: InstRW<[HWWriteResGroup84], (instregex "VPMASKMOVDYmr")>;
+def: InstRW<[HWWriteResGroup84], (instregex "VPMASKMOVDmr")>;
+def: InstRW<[HWWriteResGroup84], (instregex "VPMASKMOVQYmr")>;
+def: InstRW<[HWWriteResGroup84], (instregex "VPMASKMOVQmr")>;
+
+def HWWriteResGroup85 : SchedWriteRes<[HWPort1,HWPort4,HWPort5,HWPort237]> {
+ let Latency = 4;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1,1,1,1];
}
-def : InstRW<[WriteVGATHERQPD256, ReadAfterLd], (instregex "VGATHERQPDYrm")>;
+def: InstRW<[HWWriteResGroup85], (instregex "VCVTPS2PHmr")>;
-//-- Conversion instructions --//
+def HWWriteResGroup86 : SchedWriteRes<[HWPort1,HWPort23,HWPort237,HWPort0156]> {
+ let Latency = 4;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1,1,1,1];
+}
+def: InstRW<[HWWriteResGroup86], (instregex "SHLD(16|32|64)mri8")>;
+def: InstRW<[HWWriteResGroup86], (instregex "SHRD(16|32|64)mri8")>;
-// CVTPD2PS.
-// x,x.
-def : InstRW<[WriteP1_P5_Lat4], (instregex "(V?)CVTPD2PSrr")>;
+def HWWriteResGroup87 : SchedWriteRes<[HWPort1,HWPort6,HWPort23,HWPort0156]> {
+ let Latency = 4;
+ let NumMicroOps = 5;
+ let ResourceCycles = [1,2,1,1];
+}
+def: InstRW<[HWWriteResGroup87], (instregex "LAR(16|32|64)rm")>;
+def: InstRW<[HWWriteResGroup87], (instregex "LSL(16|32|64)rm")>;
-// x,m128.
-def : InstRW<[WriteP1_P5_Lat4Ld], (instregex "(V?)CVTPD2PS(X?)rm")>;
+def HWWriteResGroup88 : SchedWriteRes<[HWPort4,HWPort237,HWPort0156]> {
+ let Latency = 4;
+ let NumMicroOps = 6;
+ let ResourceCycles = [1,1,4];
+}
+def: InstRW<[HWWriteResGroup88], (instregex "PUSHF16")>;
+def: InstRW<[HWWriteResGroup88], (instregex "PUSHF64")>;
-// x,y.
-def WriteCVTPD2PSYrr : SchedWriteRes<[HWPort1, HWPort5]> {
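+// Port 0 hosts the 5-cycle pipes: vector integer multiply and PSADBW, x87
+// multiply, PCMPGTQ, and the RCP/RSQRT estimate units.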
+def HWWriteResGroup89 : SchedWriteRes<[HWPort0]> {
+ let Latency = 5;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[HWWriteResGroup89], (instregex "MMX_PMADDUBSWrr64")>;
+def: InstRW<[HWWriteResGroup89], (instregex "MMX_PMADDWDirr")>;
+def: InstRW<[HWWriteResGroup89], (instregex "MMX_PMULHRSWrr64")>;
+def: InstRW<[HWWriteResGroup89], (instregex "MMX_PMULHUWirr")>;
+def: InstRW<[HWWriteResGroup89], (instregex "MMX_PMULHWirr")>;
+def: InstRW<[HWWriteResGroup89], (instregex "MMX_PMULLWirr")>;
+def: InstRW<[HWWriteResGroup89], (instregex "MMX_PMULUDQirr")>;
+def: InstRW<[HWWriteResGroup89], (instregex "MMX_PSADBWirr")>;
+def: InstRW<[HWWriteResGroup89], (instregex "MUL_FPrST0")>;
+def: InstRW<[HWWriteResGroup89], (instregex "MUL_FST0r")>;
+def: InstRW<[HWWriteResGroup89], (instregex "MUL_FrST0")>;
+def: InstRW<[HWWriteResGroup89], (instregex "PCMPGTQrr")>;
+def: InstRW<[HWWriteResGroup89], (instregex "PHMINPOSUWrr128")>;
+def: InstRW<[HWWriteResGroup89], (instregex "PMADDUBSWrr")>;
+def: InstRW<[HWWriteResGroup89], (instregex "PMADDWDrr")>;
+def: InstRW<[HWWriteResGroup89], (instregex "PMULDQrr")>;
+def: InstRW<[HWWriteResGroup89], (instregex "PMULHRSWrr")>;
+def: InstRW<[HWWriteResGroup89], (instregex "PMULHUWrr")>;
+def: InstRW<[HWWriteResGroup89], (instregex "PMULHWrr")>;
+def: InstRW<[HWWriteResGroup89], (instregex "PMULLWrr")>;
+def: InstRW<[HWWriteResGroup89], (instregex "PMULUDQrr")>;
+def: InstRW<[HWWriteResGroup89], (instregex "PSADBWrr")>;
+def: InstRW<[HWWriteResGroup89], (instregex "RCPPSr")>;
+def: InstRW<[HWWriteResGroup89], (instregex "RCPSSr")>;
+def: InstRW<[HWWriteResGroup89], (instregex "RSQRTPSr")>;
+def: InstRW<[HWWriteResGroup89], (instregex "RSQRTSSr")>;
+def: InstRW<[HWWriteResGroup89], (instregex "VPCMPGTQYrr")>;
+def: InstRW<[HWWriteResGroup89], (instregex "VPCMPGTQrr")>;
+def: InstRW<[HWWriteResGroup89], (instregex "VPHMINPOSUWrr128")>;
+def: InstRW<[HWWriteResGroup89], (instregex "VPMADDUBSWYrr")>;
+def: InstRW<[HWWriteResGroup89], (instregex "VPMADDUBSWrr")>;
+def: InstRW<[HWWriteResGroup89], (instregex "VPMADDWDYrr")>;
+def: InstRW<[HWWriteResGroup89], (instregex "VPMADDWDrr")>;
+def: InstRW<[HWWriteResGroup89], (instregex "VPMULDQYrr")>;
+def: InstRW<[HWWriteResGroup89], (instregex "VPMULDQrr")>;
+def: InstRW<[HWWriteResGroup89], (instregex "VPMULHRSWYrr")>;
+def: InstRW<[HWWriteResGroup89], (instregex "VPMULHRSWrr")>;
+def: InstRW<[HWWriteResGroup89], (instregex "VPMULHUWYrr")>;
+def: InstRW<[HWWriteResGroup89], (instregex "VPMULHUWrr")>;
+def: InstRW<[HWWriteResGroup89], (instregex "VPMULHWYrr")>;
+def: InstRW<[HWWriteResGroup89], (instregex "VPMULHWrr")>;
+def: InstRW<[HWWriteResGroup89], (instregex "VPMULLWYrr")>;
+def: InstRW<[HWWriteResGroup89], (instregex "VPMULLWrr")>;
+def: InstRW<[HWWriteResGroup89], (instregex "VPMULUDQYrr")>;
+def: InstRW<[HWWriteResGroup89], (instregex "VPMULUDQrr")>;
+def: InstRW<[HWWriteResGroup89], (instregex "VPSADBWYrr")>;
+def: InstRW<[HWWriteResGroup89], (instregex "VPSADBWrr")>;
+def: InstRW<[HWWriteResGroup89], (instregex "VRCPPSr")>;
+def: InstRW<[HWWriteResGroup89], (instregex "VRCPSSr")>;
+def: InstRW<[HWWriteResGroup89], (instregex "VRSQRTPSr")>;
+def: InstRW<[HWWriteResGroup89], (instregex "VRSQRTSSr")>;
+
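+// FP multiply and every FMA form issue to port 0 or 1 (HWPort01) with the
+// 5-cycle Haswell FMA latency.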
+def HWWriteResGroup90 : SchedWriteRes<[HWPort01]> {
+ let Latency = 5;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[HWWriteResGroup90], (instregex "MULPDrr")>;
+def: InstRW<[HWWriteResGroup90], (instregex "MULPSrr")>;
+def: InstRW<[HWWriteResGroup90], (instregex "MULSDrr")>;
+def: InstRW<[HWWriteResGroup90], (instregex "MULSSrr")>;
+def: InstRW<[HWWriteResGroup90], (instregex "VFMADD132PDYr")>;
+def: InstRW<[HWWriteResGroup90], (instregex "VFMADD132PDr")>;
+def: InstRW<[HWWriteResGroup90], (instregex "VFMADD132PSYr")>;
+def: InstRW<[HWWriteResGroup90], (instregex "VFMADD132PSr")>;
+def: InstRW<[HWWriteResGroup90], (instregex "VFMADD132SDr")>;
+def: InstRW<[HWWriteResGroup90], (instregex "VFMADD132SSr")>;
+def: InstRW<[HWWriteResGroup90], (instregex "VFMADD213PDYr")>;
+def: InstRW<[HWWriteResGroup90], (instregex "VFMADD213PDr")>;
+def: InstRW<[HWWriteResGroup90], (instregex "VFMADD213PSYr")>;
+def: InstRW<[HWWriteResGroup90], (instregex "VFMADD213PSr")>;
+def: InstRW<[HWWriteResGroup90], (instregex "VFMADD213SDr")>;
+def: InstRW<[HWWriteResGroup90], (instregex "VFMADD213SSr")>;
+def: InstRW<[HWWriteResGroup90], (instregex "VFMADD231PDYr")>;
+def: InstRW<[HWWriteResGroup90], (instregex "VFMADD231PDr")>;
+def: InstRW<[HWWriteResGroup90], (instregex "VFMADD231PSYr")>;
+def: InstRW<[HWWriteResGroup90], (instregex "VFMADD231PSr")>;
+def: InstRW<[HWWriteResGroup90], (instregex "VFMADD231SDr")>;
+def: InstRW<[HWWriteResGroup90], (instregex "VFMADD231SSr")>;
+def: InstRW<[HWWriteResGroup90], (instregex "VFMADDSUB132PDYr")>;
+def: InstRW<[HWWriteResGroup90], (instregex "VFMADDSUB132PDr")>;
+def: InstRW<[HWWriteResGroup90], (instregex "VFMADDSUB132PSYr")>;
+def: InstRW<[HWWriteResGroup90], (instregex "VFMADDSUB132PSr")>;
+def: InstRW<[HWWriteResGroup90], (instregex "VFMADDSUB213PDYr")>;
+def: InstRW<[HWWriteResGroup90], (instregex "VFMADDSUB213PDr")>;
+def: InstRW<[HWWriteResGroup90], (instregex "VFMADDSUB213PSYr")>;
+def: InstRW<[HWWriteResGroup90], (instregex "VFMADDSUB213PSr")>;
+def: InstRW<[HWWriteResGroup90], (instregex "VFMADDSUB231PDYr")>;
+def: InstRW<[HWWriteResGroup90], (instregex "VFMADDSUB231PDr")>;
+def: InstRW<[HWWriteResGroup90], (instregex "VFMADDSUB231PSYr")>;
+def: InstRW<[HWWriteResGroup90], (instregex "VFMADDSUB231PSr")>;
+def: InstRW<[HWWriteResGroup90], (instregex "VFMSUB132PDYr")>;
+def: InstRW<[HWWriteResGroup90], (instregex "VFMSUB132PDr")>;
+def: InstRW<[HWWriteResGroup90], (instregex "VFMSUB132PSYr")>;
+def: InstRW<[HWWriteResGroup90], (instregex "VFMSUB132PSr")>;
+def: InstRW<[HWWriteResGroup90], (instregex "VFMSUB132SDr")>;
+def: InstRW<[HWWriteResGroup90], (instregex "VFMSUB132SSr")>;
+def: InstRW<[HWWriteResGroup90], (instregex "VFMSUB213PDYr")>;
+def: InstRW<[HWWriteResGroup90], (instregex "VFMSUB213PDr")>;
+def: InstRW<[HWWriteResGroup90], (instregex "VFMSUB213PSYr")>;
+def: InstRW<[HWWriteResGroup90], (instregex "VFMSUB213PSr")>;
+def: InstRW<[HWWriteResGroup90], (instregex "VFMSUB213SDr")>;
+def: InstRW<[HWWriteResGroup90], (instregex "VFMSUB213SSr")>;
+def: InstRW<[HWWriteResGroup90], (instregex "VFMSUB231PDYr")>;
+def: InstRW<[HWWriteResGroup90], (instregex "VFMSUB231PDr")>;
+def: InstRW<[HWWriteResGroup90], (instregex "VFMSUB231PSYr")>;
+def: InstRW<[HWWriteResGroup90], (instregex "VFMSUB231PSr")>;
+def: InstRW<[HWWriteResGroup90], (instregex "VFMSUB231SDr")>;
+def: InstRW<[HWWriteResGroup90], (instregex "VFMSUB231SSr")>;
+def: InstRW<[HWWriteResGroup90], (instregex "VFMSUBADD132PDYr")>;
+def: InstRW<[HWWriteResGroup90], (instregex "VFMSUBADD132PDr")>;
+def: InstRW<[HWWriteResGroup90], (instregex "VFMSUBADD132PSYr")>;
+def: InstRW<[HWWriteResGroup90], (instregex "VFMSUBADD132PSr")>;
+def: InstRW<[HWWriteResGroup90], (instregex "VFMSUBADD213PDYr")>;
+def: InstRW<[HWWriteResGroup90], (instregex "VFMSUBADD213PDr")>;
+def: InstRW<[HWWriteResGroup90], (instregex "VFMSUBADD213PSYr")>;
+def: InstRW<[HWWriteResGroup90], (instregex "VFMSUBADD213PSr")>;
+def: InstRW<[HWWriteResGroup90], (instregex "VFMSUBADD231PDYr")>;
+def: InstRW<[HWWriteResGroup90], (instregex "VFMSUBADD231PDr")>;
+def: InstRW<[HWWriteResGroup90], (instregex "VFMSUBADD231PSYr")>;
+def: InstRW<[HWWriteResGroup90], (instregex "VFMSUBADD231PSr")>;
+def: InstRW<[HWWriteResGroup90], (instregex "VFNMADD132PDYr")>;
+def: InstRW<[HWWriteResGroup90], (instregex "VFNMADD132PDr")>;
+def: InstRW<[HWWriteResGroup90], (instregex "VFNMADD132PSYr")>;
+def: InstRW<[HWWriteResGroup90], (instregex "VFNMADD132PSr")>;
+def: InstRW<[HWWriteResGroup90], (instregex "VFNMADD132SDr")>;
+def: InstRW<[HWWriteResGroup90], (instregex "VFNMADD132SSr")>;
+def: InstRW<[HWWriteResGroup90], (instregex "VFNMADD213PDYr")>;
+def: InstRW<[HWWriteResGroup90], (instregex "VFNMADD213PDr")>;
+def: InstRW<[HWWriteResGroup90], (instregex "VFNMADD213PSYr")>;
+def: InstRW<[HWWriteResGroup90], (instregex "VFNMADD213PSr")>;
+def: InstRW<[HWWriteResGroup90], (instregex "VFNMADD213SDr")>;
+def: InstRW<[HWWriteResGroup90], (instregex "VFNMADD213SSr")>;
+def: InstRW<[HWWriteResGroup90], (instregex "VFNMADD231PDYr")>;
+def: InstRW<[HWWriteResGroup90], (instregex "VFNMADD231PDr")>;
+def: InstRW<[HWWriteResGroup90], (instregex "VFNMADD231PSYr")>;
+def: InstRW<[HWWriteResGroup90], (instregex "VFNMADD231PSr")>;
+def: InstRW<[HWWriteResGroup90], (instregex "VFNMADD231SDr")>;
+def: InstRW<[HWWriteResGroup90], (instregex "VFNMADD231SSr")>;
+def: InstRW<[HWWriteResGroup90], (instregex "VFNMSUB132PDYr")>;
+def: InstRW<[HWWriteResGroup90], (instregex "VFNMSUB132PDr")>;
+def: InstRW<[HWWriteResGroup90], (instregex "VFNMSUB132PSYr")>;
+def: InstRW<[HWWriteResGroup90], (instregex "VFNMSUB132PSr")>;
+def: InstRW<[HWWriteResGroup90], (instregex "VFNMSUB132SDr")>;
+def: InstRW<[HWWriteResGroup90], (instregex "VFNMSUB132SSr")>;
+def: InstRW<[HWWriteResGroup90], (instregex "VFNMSUB213PDYr")>;
+def: InstRW<[HWWriteResGroup90], (instregex "VFNMSUB213PDr")>;
+def: InstRW<[HWWriteResGroup90], (instregex "VFNMSUB213PSYr")>;
+def: InstRW<[HWWriteResGroup90], (instregex "VFNMSUB213PSr")>;
+def: InstRW<[HWWriteResGroup90], (instregex "VFNMSUB213SDr")>;
+def: InstRW<[HWWriteResGroup90], (instregex "VFNMSUB213SSr")>;
+def: InstRW<[HWWriteResGroup90], (instregex "VFNMSUB231PDYr")>;
+def: InstRW<[HWWriteResGroup90], (instregex "VFNMSUB231PDr")>;
+def: InstRW<[HWWriteResGroup90], (instregex "VFNMSUB231PSYr")>;
+def: InstRW<[HWWriteResGroup90], (instregex "VFNMSUB231PSr")>;
+def: InstRW<[HWWriteResGroup90], (instregex "VFNMSUB231SDr")>;
+def: InstRW<[HWWriteResGroup90], (instregex "VFNMSUB231SSr")>;
+def: InstRW<[HWWriteResGroup90], (instregex "VMULPDYrr")>;
+def: InstRW<[HWWriteResGroup90], (instregex "VMULPDrr")>;
+def: InstRW<[HWWriteResGroup90], (instregex "VMULPSYrr")>;
+def: InstRW<[HWWriteResGroup90], (instregex "VMULPSrr")>;
+def: InstRW<[HWWriteResGroup90], (instregex "VMULSDrr")>;
+def: InstRW<[HWWriteResGroup90], (instregex "VMULSSrr")>;
+
+def HWWriteResGroup91 : SchedWriteRes<[HWPort0,HWPort23]> {
let Latency = 5;
let NumMicroOps = 2;
- let ResourceCycles = [1, 1];
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[HWWriteResGroup91], (instregex "MMX_PMADDUBSWrm64")>;
+def: InstRW<[HWWriteResGroup91], (instregex "MMX_PMADDWDirm")>;
+def: InstRW<[HWWriteResGroup91], (instregex "MMX_PMULHRSWrm64")>;
+def: InstRW<[HWWriteResGroup91], (instregex "MMX_PMULHUWirm")>;
+def: InstRW<[HWWriteResGroup91], (instregex "MMX_PMULHWirm")>;
+def: InstRW<[HWWriteResGroup91], (instregex "MMX_PMULLWirm")>;
+def: InstRW<[HWWriteResGroup91], (instregex "MMX_PMULUDQirm")>;
+def: InstRW<[HWWriteResGroup91], (instregex "MMX_PSADBWirm")>;
+def: InstRW<[HWWriteResGroup91], (instregex "MUL_F32m")>;
+def: InstRW<[HWWriteResGroup91], (instregex "MUL_F64m")>;
+def: InstRW<[HWWriteResGroup91], (instregex "PCMPGTQrm")>;
+def: InstRW<[HWWriteResGroup91], (instregex "PHMINPOSUWrm128")>;
+def: InstRW<[HWWriteResGroup91], (instregex "PMADDUBSWrm")>;
+def: InstRW<[HWWriteResGroup91], (instregex "PMADDWDrm")>;
+def: InstRW<[HWWriteResGroup91], (instregex "PMULDQrm")>;
+def: InstRW<[HWWriteResGroup91], (instregex "PMULHRSWrm")>;
+def: InstRW<[HWWriteResGroup91], (instregex "PMULHUWrm")>;
+def: InstRW<[HWWriteResGroup91], (instregex "PMULHWrm")>;
+def: InstRW<[HWWriteResGroup91], (instregex "PMULLWrm")>;
+def: InstRW<[HWWriteResGroup91], (instregex "PMULUDQrm")>;
+def: InstRW<[HWWriteResGroup91], (instregex "PSADBWrm")>;
+def: InstRW<[HWWriteResGroup91], (instregex "RCPPSm")>;
+def: InstRW<[HWWriteResGroup91], (instregex "RCPSSm")>;
+def: InstRW<[HWWriteResGroup91], (instregex "RSQRTPSm")>;
+def: InstRW<[HWWriteResGroup91], (instregex "RSQRTSSm")>;
+def: InstRW<[HWWriteResGroup91], (instregex "VPCMPGTQYrm")>;
+def: InstRW<[HWWriteResGroup91], (instregex "VPCMPGTQrm")>;
+def: InstRW<[HWWriteResGroup91], (instregex "VPHMINPOSUWrm128")>;
+def: InstRW<[HWWriteResGroup91], (instregex "VPMADDUBSWYrm")>;
+def: InstRW<[HWWriteResGroup91], (instregex "VPMADDUBSWrm")>;
+def: InstRW<[HWWriteResGroup91], (instregex "VPMADDWDYrm")>;
+def: InstRW<[HWWriteResGroup91], (instregex "VPMADDWDrm")>;
+def: InstRW<[HWWriteResGroup91], (instregex "VPMULDQYrm")>;
+def: InstRW<[HWWriteResGroup91], (instregex "VPMULDQrm")>;
+def: InstRW<[HWWriteResGroup91], (instregex "VPMULHRSWYrm")>;
+def: InstRW<[HWWriteResGroup91], (instregex "VPMULHRSWrm")>;
+def: InstRW<[HWWriteResGroup91], (instregex "VPMULHUWYrm")>;
+def: InstRW<[HWWriteResGroup91], (instregex "VPMULHUWrm")>;
+def: InstRW<[HWWriteResGroup91], (instregex "VPMULHWYrm")>;
+def: InstRW<[HWWriteResGroup91], (instregex "VPMULHWrm")>;
+def: InstRW<[HWWriteResGroup91], (instregex "VPMULLWYrm")>;
+def: InstRW<[HWWriteResGroup91], (instregex "VPMULLWrm")>;
+def: InstRW<[HWWriteResGroup91], (instregex "VPMULUDQYrm")>;
+def: InstRW<[HWWriteResGroup91], (instregex "VPMULUDQrm")>;
+def: InstRW<[HWWriteResGroup91], (instregex "VPSADBWYrm")>;
+def: InstRW<[HWWriteResGroup91], (instregex "VPSADBWrm")>;
+def: InstRW<[HWWriteResGroup91], (instregex "VRCPPSm")>;
+def: InstRW<[HWWriteResGroup91], (instregex "VRCPSSm")>;
+def: InstRW<[HWWriteResGroup91], (instregex "VRSQRTPSm")>;
+def: InstRW<[HWWriteResGroup91], (instregex "VRSQRTSSm")>;
+
+def HWWriteResGroup92 : SchedWriteRes<[HWPort01,HWPort23]> {
+ let Latency = 5;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[HWWriteResGroup92], (instregex "MULPDrm")>;
+def: InstRW<[HWWriteResGroup92], (instregex "MULPSrm")>;
+def: InstRW<[HWWriteResGroup92], (instregex "MULSDrm")>;
+def: InstRW<[HWWriteResGroup92], (instregex "MULSSrm")>;
+def: InstRW<[HWWriteResGroup92], (instregex "VFMADD132PDYm")>;
+def: InstRW<[HWWriteResGroup92], (instregex "VFMADD132PDm")>;
+def: InstRW<[HWWriteResGroup92], (instregex "VFMADD132PSYm")>;
+def: InstRW<[HWWriteResGroup92], (instregex "VFMADD132PSm")>;
+def: InstRW<[HWWriteResGroup92], (instregex "VFMADD132SDm")>;
+def: InstRW<[HWWriteResGroup92], (instregex "VFMADD132SSm")>;
+def: InstRW<[HWWriteResGroup92], (instregex "VFMADD213PDYm")>;
+def: InstRW<[HWWriteResGroup92], (instregex "VFMADD213PDm")>;
+def: InstRW<[HWWriteResGroup92], (instregex "VFMADD213PSYm")>;
+def: InstRW<[HWWriteResGroup92], (instregex "VFMADD213PSm")>;
+def: InstRW<[HWWriteResGroup92], (instregex "VFMADD213SDm")>;
+def: InstRW<[HWWriteResGroup92], (instregex "VFMADD213SSm")>;
+def: InstRW<[HWWriteResGroup92], (instregex "VFMADD231PDYm")>;
+def: InstRW<[HWWriteResGroup92], (instregex "VFMADD231PDm")>;
+def: InstRW<[HWWriteResGroup92], (instregex "VFMADD231PSYm")>;
+def: InstRW<[HWWriteResGroup92], (instregex "VFMADD231PSm")>;
+def: InstRW<[HWWriteResGroup92], (instregex "VFMADD231SDm")>;
+def: InstRW<[HWWriteResGroup92], (instregex "VFMADD231SSm")>;
+def: InstRW<[HWWriteResGroup92], (instregex "VFMADDSUB132PDYm")>;
+def: InstRW<[HWWriteResGroup92], (instregex "VFMADDSUB132PDm")>;
+def: InstRW<[HWWriteResGroup92], (instregex "VFMADDSUB132PSYm")>;
+def: InstRW<[HWWriteResGroup92], (instregex "VFMADDSUB132PSm")>;
+def: InstRW<[HWWriteResGroup92], (instregex "VFMADDSUB213PDYm")>;
+def: InstRW<[HWWriteResGroup92], (instregex "VFMADDSUB213PDm")>;
+def: InstRW<[HWWriteResGroup92], (instregex "VFMADDSUB213PSYm")>;
+def: InstRW<[HWWriteResGroup92], (instregex "VFMADDSUB213PSm")>;
+def: InstRW<[HWWriteResGroup92], (instregex "VFMADDSUB231PDYm")>;
+def: InstRW<[HWWriteResGroup92], (instregex "VFMADDSUB231PDm")>;
+def: InstRW<[HWWriteResGroup92], (instregex "VFMADDSUB231PSYm")>;
+def: InstRW<[HWWriteResGroup92], (instregex "VFMADDSUB231PSm")>;
+def: InstRW<[HWWriteResGroup92], (instregex "VFMSUB132PDYm")>;
+def: InstRW<[HWWriteResGroup92], (instregex "VFMSUB132PDm")>;
+def: InstRW<[HWWriteResGroup92], (instregex "VFMSUB132PSYm")>;
+def: InstRW<[HWWriteResGroup92], (instregex "VFMSUB132PSm")>;
+def: InstRW<[HWWriteResGroup92], (instregex "VFMSUB132SDm")>;
+def: InstRW<[HWWriteResGroup92], (instregex "VFMSUB132SSm")>;
+def: InstRW<[HWWriteResGroup92], (instregex "VFMSUB213PDYm")>;
+def: InstRW<[HWWriteResGroup92], (instregex "VFMSUB213PDm")>;
+def: InstRW<[HWWriteResGroup92], (instregex "VFMSUB213PSYm")>;
+def: InstRW<[HWWriteResGroup92], (instregex "VFMSUB213PSm")>;
+def: InstRW<[HWWriteResGroup92], (instregex "VFMSUB213SDm")>;
+def: InstRW<[HWWriteResGroup92], (instregex "VFMSUB213SSm")>;
+def: InstRW<[HWWriteResGroup92], (instregex "VFMSUB231PDYm")>;
+def: InstRW<[HWWriteResGroup92], (instregex "VFMSUB231PDm")>;
+def: InstRW<[HWWriteResGroup92], (instregex "VFMSUB231PSYm")>;
+def: InstRW<[HWWriteResGroup92], (instregex "VFMSUB231PSm")>;
+def: InstRW<[HWWriteResGroup92], (instregex "VFMSUB231SDm")>;
+def: InstRW<[HWWriteResGroup92], (instregex "VFMSUB231SSm")>;
+def: InstRW<[HWWriteResGroup92], (instregex "VFMSUBADD132PDYm")>;
+def: InstRW<[HWWriteResGroup92], (instregex "VFMSUBADD132PDm")>;
+def: InstRW<[HWWriteResGroup92], (instregex "VFMSUBADD132PSYm")>;
+def: InstRW<[HWWriteResGroup92], (instregex "VFMSUBADD132PSm")>;
+def: InstRW<[HWWriteResGroup92], (instregex "VFMSUBADD213PDYm")>;
+def: InstRW<[HWWriteResGroup92], (instregex "VFMSUBADD213PDm")>;
+def: InstRW<[HWWriteResGroup92], (instregex "VFMSUBADD213PSYm")>;
+def: InstRW<[HWWriteResGroup92], (instregex "VFMSUBADD213PSm")>;
+def: InstRW<[HWWriteResGroup92], (instregex "VFMSUBADD231PDYm")>;
+def: InstRW<[HWWriteResGroup92], (instregex "VFMSUBADD231PDm")>;
+def: InstRW<[HWWriteResGroup92], (instregex "VFMSUBADD231PSYm")>;
+def: InstRW<[HWWriteResGroup92], (instregex "VFMSUBADD231PSm")>;
+def: InstRW<[HWWriteResGroup92], (instregex "VFNMADD132PDYm")>;
+def: InstRW<[HWWriteResGroup92], (instregex "VFNMADD132PDm")>;
+def: InstRW<[HWWriteResGroup92], (instregex "VFNMADD132PSYm")>;
+def: InstRW<[HWWriteResGroup92], (instregex "VFNMADD132PSm")>;
+def: InstRW<[HWWriteResGroup92], (instregex "VFNMADD132SDm")>;
+def: InstRW<[HWWriteResGroup92], (instregex "VFNMADD132SSm")>;
+def: InstRW<[HWWriteResGroup92], (instregex "VFNMADD213PDYm")>;
+def: InstRW<[HWWriteResGroup92], (instregex "VFNMADD213PDm")>;
+def: InstRW<[HWWriteResGroup92], (instregex "VFNMADD213PSYm")>;
+def: InstRW<[HWWriteResGroup92], (instregex "VFNMADD213PSm")>;
+def: InstRW<[HWWriteResGroup92], (instregex "VFNMADD213SDm")>;
+def: InstRW<[HWWriteResGroup92], (instregex "VFNMADD213SSm")>;
+def: InstRW<[HWWriteResGroup92], (instregex "VFNMADD231PDYm")>;
+def: InstRW<[HWWriteResGroup92], (instregex "VFNMADD231PDm")>;
+def: InstRW<[HWWriteResGroup92], (instregex "VFNMADD231PSYm")>;
+def: InstRW<[HWWriteResGroup92], (instregex "VFNMADD231PSm")>;
+def: InstRW<[HWWriteResGroup92], (instregex "VFNMADD231SDm")>;
+def: InstRW<[HWWriteResGroup92], (instregex "VFNMADD231SSm")>;
+def: InstRW<[HWWriteResGroup92], (instregex "VFNMSUB132PDYm")>;
+def: InstRW<[HWWriteResGroup92], (instregex "VFNMSUB132PDm")>;
+def: InstRW<[HWWriteResGroup92], (instregex "VFNMSUB132PSYm")>;
+def: InstRW<[HWWriteResGroup92], (instregex "VFNMSUB132PSm")>;
+def: InstRW<[HWWriteResGroup92], (instregex "VFNMSUB132SDm")>;
+def: InstRW<[HWWriteResGroup92], (instregex "VFNMSUB132SSm")>;
+def: InstRW<[HWWriteResGroup92], (instregex "VFNMSUB213PDYm")>;
+def: InstRW<[HWWriteResGroup92], (instregex "VFNMSUB213PDm")>;
+def: InstRW<[HWWriteResGroup92], (instregex "VFNMSUB213PSYm")>;
+def: InstRW<[HWWriteResGroup92], (instregex "VFNMSUB213PSm")>;
+def: InstRW<[HWWriteResGroup92], (instregex "VFNMSUB213SDm")>;
+def: InstRW<[HWWriteResGroup92], (instregex "VFNMSUB213SSm")>;
+def: InstRW<[HWWriteResGroup92], (instregex "VFNMSUB231PDYm")>;
+def: InstRW<[HWWriteResGroup92], (instregex "VFNMSUB231PDm")>;
+def: InstRW<[HWWriteResGroup92], (instregex "VFNMSUB231PSYm")>;
+def: InstRW<[HWWriteResGroup92], (instregex "VFNMSUB231PSm")>;
+def: InstRW<[HWWriteResGroup92], (instregex "VFNMSUB231SDm")>;
+def: InstRW<[HWWriteResGroup92], (instregex "VFNMSUB231SSm")>;
+def: InstRW<[HWWriteResGroup92], (instregex "VMULPDYrm")>;
+def: InstRW<[HWWriteResGroup92], (instregex "VMULPDrm")>;
+def: InstRW<[HWWriteResGroup92], (instregex "VMULPSYrm")>;
+def: InstRW<[HWWriteResGroup92], (instregex "VMULPSrm")>;
+def: InstRW<[HWWriteResGroup92], (instregex "VMULSDrm")>;
+def: InstRW<[HWWriteResGroup92], (instregex "VMULSSrm")>;
+
+def HWWriteResGroup93 : SchedWriteRes<[HWPort1,HWPort5]> {
+ let Latency = 5;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,2];
+}
+def: InstRW<[HWWriteResGroup93], (instregex "CVTSI2SS64rr")>;
+def: InstRW<[HWWriteResGroup93], (instregex "HADDPDrr")>;
+def: InstRW<[HWWriteResGroup93], (instregex "HADDPSrr")>;
+def: InstRW<[HWWriteResGroup93], (instregex "HSUBPDrr")>;
+def: InstRW<[HWWriteResGroup93], (instregex "HSUBPSrr")>;
+def: InstRW<[HWWriteResGroup93], (instregex "VCVTSI2SS64rr")>;
+def: InstRW<[HWWriteResGroup93], (instregex "VHADDPDYrr")>;
+def: InstRW<[HWWriteResGroup93], (instregex "VHADDPDrr")>;
+def: InstRW<[HWWriteResGroup93], (instregex "VHADDPSYrr")>;
+def: InstRW<[HWWriteResGroup93], (instregex "VHADDPSrr")>;
+def: InstRW<[HWWriteResGroup93], (instregex "VHSUBPDYrr")>;
+def: InstRW<[HWWriteResGroup93], (instregex "VHSUBPDrr")>;
+def: InstRW<[HWWriteResGroup93], (instregex "VHSUBPSYrr")>;
+def: InstRW<[HWWriteResGroup93], (instregex "VHSUBPSrr")>;
+
+def HWWriteResGroup94 : SchedWriteRes<[HWPort1,HWPort6,HWPort06]> {
+ let Latency = 5;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
}
-def : InstRW<[WriteCVTPD2PSYrr], (instregex "(V?)CVTPD2PSYrr")>;
+def: InstRW<[HWWriteResGroup94], (instregex "STR(16|32|64)r")>;
-// x,m256.
-def WriteCVTPD2PSYrm : SchedWriteRes<[HWPort1, HWPort5, HWPort23]> {
- let Latency = 9;
+def HWWriteResGroup95 : SchedWriteRes<[HWPort1,HWPort06,HWPort0156]> {
+ let Latency = 5;
let NumMicroOps = 3;
- let ResourceCycles = [1, 1, 1];
+ let ResourceCycles = [1,1,1];
}
-def : InstRW<[WriteCVTPD2PSYrm], (instregex "(V?)CVTPD2PSYrm")>;
+def: InstRW<[HWWriteResGroup95], (instregex "MULX32rr")>;
-// CVTSD2SS.
-// x,x.
-def : InstRW<[WriteP1_P5_Lat4], (instregex "(Int_)?(V)?CVTSD2SSrr")>;
+def HWWriteResGroup96 : SchedWriteRes<[HWPort1,HWPort5,HWPort23]> {
+ let Latency = 5;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1,2,1];
+}
+def: InstRW<[HWWriteResGroup96], (instregex "HADDPDrm")>;
+def: InstRW<[HWWriteResGroup96], (instregex "HADDPSrm")>;
+def: InstRW<[HWWriteResGroup96], (instregex "HSUBPDrm")>;
+def: InstRW<[HWWriteResGroup96], (instregex "HSUBPSrm")>;
+def: InstRW<[HWWriteResGroup96], (instregex "VHADDPDYrm")>;
+def: InstRW<[HWWriteResGroup96], (instregex "VHADDPDrm")>;
+def: InstRW<[HWWriteResGroup96], (instregex "VHADDPSYrm")>;
+def: InstRW<[HWWriteResGroup96], (instregex "VHADDPSrm")>;
+def: InstRW<[HWWriteResGroup96], (instregex "VHSUBPDYrm")>;
+def: InstRW<[HWWriteResGroup96], (instregex "VHSUBPDrm")>;
+def: InstRW<[HWWriteResGroup96], (instregex "VHSUBPSYrm")>;
+def: InstRW<[HWWriteResGroup96], (instregex "VHSUBPSrm")>;
+
+def HWWriteResGroup97 : SchedWriteRes<[HWPort0,HWPort1,HWPort5,HWPort23]> {
+ let Latency = 5;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1,1,1,1];
+}
+def: InstRW<[HWWriteResGroup97], (instregex "CVTTSS2SI64rm")>;
-// x,m64.
-def : InstRW<[WriteP1_P5_Lat4Ld], (instregex "(Int_)?(V)?CVTSD2SSrm")>;
+def HWWriteResGroup98 : SchedWriteRes<[HWPort1,HWPort23,HWPort06,HWPort0156]> {
+ let Latency = 5;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1,1,1,1];
+}
+def: InstRW<[HWWriteResGroup98], (instregex "MULX32rm")>;
-// CVTPS2PD.
-// x,x.
-def WriteCVTPS2PDrr : SchedWriteRes<[HWPort0, HWPort5]> {
- let Latency = 2;
- let NumMicroOps = 2;
- let ResourceCycles = [1, 1];
+def HWWriteResGroup99 : SchedWriteRes<[HWPort6,HWPort0156]> {
+ let Latency = 5;
+ let NumMicroOps = 5;
+ let ResourceCycles = [1,4];
}
-def : InstRW<[WriteCVTPS2PDrr], (instregex "(V?)CVTPS2PDrr")>;
+def: InstRW<[HWWriteResGroup99], (instregex "PAUSE")>;
-// x,m64.
-// y,m128.
-def WriteCVTPS2PDrm : SchedWriteRes<[HWPort0, HWPort23]> {
+def HWWriteResGroup100 : SchedWriteRes<[HWPort06,HWPort0156]> {
let Latency = 5;
- let NumMicroOps = 2;
- let ResourceCycles = [1, 1];
+ let NumMicroOps = 5;
+ let ResourceCycles = [1,4];
}
-def : InstRW<[WriteCVTPS2PDrm], (instregex "(V?)CVTPS2PD(Y?)rm")>;
+def: InstRW<[HWWriteResGroup100], (instregex "XSETBV")>;
-// y,x.
-def WriteVCVTPS2PDYrr : SchedWriteRes<[HWPort0, HWPort5]> {
+def HWWriteResGroup101 : SchedWriteRes<[HWPort06,HWPort0156]> {
let Latency = 5;
+ let NumMicroOps = 5;
+ let ResourceCycles = [2,3];
+}
+def: InstRW<[HWWriteResGroup101], (instregex "CMPXCHG(16|32|64)rr")>;
+def: InstRW<[HWWriteResGroup101], (instregex "CMPXCHG8rr")>;
+def: InstRW<[HWWriteResGroup101], (instregex "ROUNDPDr")>;
+def: InstRW<[HWWriteResGroup101], (instregex "ROUNDPSr")>;
+def: InstRW<[HWWriteResGroup101], (instregex "ROUNDSDr")>;
+def: InstRW<[HWWriteResGroup101], (instregex "ROUNDSSr")>;
+def: InstRW<[HWWriteResGroup101], (instregex "VROUNDPDr")>;
+def: InstRW<[HWWriteResGroup101], (instregex "VROUNDPSr")>;
+def: InstRW<[HWWriteResGroup101], (instregex "VROUNDSDr")>;
+def: InstRW<[HWWriteResGroup101], (instregex "VROUNDSSr")>;
+def: InstRW<[HWWriteResGroup101], (instregex "VROUNDYPDr")>;
+def: InstRW<[HWWriteResGroup101], (instregex "VROUNDYPSr")>;
+
+def HWWriteResGroup102 : SchedWriteRes<[HWPort1,HWPort5]> {
+ let Latency = 6;
let NumMicroOps = 2;
- let ResourceCycles = [1, 1];
+ let ResourceCycles = [1,1];
}
-def : InstRW<[WriteVCVTPS2PDYrr], (instregex "VCVTPS2PDYrr")>;
+def: InstRW<[HWWriteResGroup102], (instregex "VCVTDQ2PDYrr")>;
+def: InstRW<[HWWriteResGroup102], (instregex "VCVTPD2DQYrr")>;
+def: InstRW<[HWWriteResGroup102], (instregex "VCVTPD2PSYrr")>;
+def: InstRW<[HWWriteResGroup102], (instregex "VCVTPS2PHYrr")>;
+def: InstRW<[HWWriteResGroup102], (instregex "VCVTTPD2DQYrr")>;
-// CVTSS2SD.
-// x,x.
-def WriteCVTSS2SDrr : SchedWriteRes<[HWPort0, HWPort5]> {
- let Latency = 2;
- let NumMicroOps = 2;
- let ResourceCycles = [1, 1];
+def HWWriteResGroup103 : SchedWriteRes<[HWPort1,HWPort23]> {
+ let Latency = 6;
+ let NumMicroOps = 3;
+ let ResourceCycles = [2,1];
+}
+def: InstRW<[HWWriteResGroup103], (instregex "ADD_FI16m")>;
+def: InstRW<[HWWriteResGroup103], (instregex "ADD_FI32m")>;
+def: InstRW<[HWWriteResGroup103], (instregex "ROUNDPDm")>;
+def: InstRW<[HWWriteResGroup103], (instregex "ROUNDPSm")>;
+def: InstRW<[HWWriteResGroup103], (instregex "ROUNDSDm")>;
+def: InstRW<[HWWriteResGroup103], (instregex "ROUNDSSm")>;
+def: InstRW<[HWWriteResGroup103], (instregex "SUBR_FI16m")>;
+def: InstRW<[HWWriteResGroup103], (instregex "SUBR_FI32m")>;
+def: InstRW<[HWWriteResGroup103], (instregex "SUB_FI16m")>;
+def: InstRW<[HWWriteResGroup103], (instregex "SUB_FI32m")>;
+def: InstRW<[HWWriteResGroup103], (instregex "VROUNDPDm")>;
+def: InstRW<[HWWriteResGroup103], (instregex "VROUNDPSm")>;
+def: InstRW<[HWWriteResGroup103], (instregex "VROUNDSDm")>;
+def: InstRW<[HWWriteResGroup103], (instregex "VROUNDSSm")>;
+def: InstRW<[HWWriteResGroup103], (instregex "VROUNDYPDm")>;
+def: InstRW<[HWWriteResGroup103], (instregex "VROUNDYPSm")>;
+
+def HWWriteResGroup104 : SchedWriteRes<[HWPort1,HWPort5,HWPort23]> {
+ let Latency = 6;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
}
-def : InstRW<[WriteCVTSS2SDrr], (instregex "(Int_)?(V?)CVTSS2SDrr")>;
+def: InstRW<[HWWriteResGroup104], (instregex "VCVTDQ2PDYrm")>;
-// x,m32.
-def WriteCVTSS2SDrm : SchedWriteRes<[HWPort0, HWPort23]> {
- let Latency = 5;
+def HWWriteResGroup105 : SchedWriteRes<[HWPort1,HWPort06,HWPort0156]> {
+ let Latency = 6;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1,1,2];
+}
+def: InstRW<[HWWriteResGroup105], (instregex "SHLD(16|32|64)rrCL")>;
+def: InstRW<[HWWriteResGroup105], (instregex "SHRD(16|32|64)rrCL")>;
+
+def HWWriteResGroup106 : SchedWriteRes<[HWPort1,HWPort4,HWPort5,HWPort237]> {
+ let Latency = 6;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1,1,1,1];
+}
+def: InstRW<[HWWriteResGroup106], (instregex "VCVTPS2PHYmr")>;
+
+def HWWriteResGroup107 : SchedWriteRes<[HWPort1,HWPort6,HWPort06,HWPort0156]> {
+ let Latency = 6;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1,1,1,1];
+}
+def: InstRW<[HWWriteResGroup107], (instregex "SLDT(16|32|64)r")>;
+
+def HWWriteResGroup108 : SchedWriteRes<[HWPort6,HWPort0156]> {
+ let Latency = 6;
+ let NumMicroOps = 6;
+ let ResourceCycles = [1,5];
+}
+def: InstRW<[HWWriteResGroup108], (instregex "STD")>;
+
+def HWWriteResGroup109 : SchedWriteRes<[HWPort1,HWPort23,HWPort237,HWPort06,HWPort0156]> {
+ let Latency = 6;
+ let NumMicroOps = 6;
+ let ResourceCycles = [1,1,1,1,2];
+}
+def: InstRW<[HWWriteResGroup109], (instregex "SHLD(16|32|64)mrCL")>;
+def: InstRW<[HWWriteResGroup109], (instregex "SHRD(16|32|64)mrCL")>;
+
+def HWWriteResGroup110 : SchedWriteRes<[HWPort5]> {
+ let Latency = 7;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[HWWriteResGroup110], (instregex "AESDECLASTrr")>;
+def: InstRW<[HWWriteResGroup110], (instregex "AESDECrr")>;
+def: InstRW<[HWWriteResGroup110], (instregex "AESENCLASTrr")>;
+def: InstRW<[HWWriteResGroup110], (instregex "AESENCrr")>;
+def: InstRW<[HWWriteResGroup110], (instregex "VAESDECLASTrr")>;
+def: InstRW<[HWWriteResGroup110], (instregex "VAESDECrr")>;
+def: InstRW<[HWWriteResGroup110], (instregex "VAESENCLASTrr")>;
+def: InstRW<[HWWriteResGroup110], (instregex "VAESENCrr")>;
+
+def HWWriteResGroup111 : SchedWriteRes<[HWPort5,HWPort23]> {
+ let Latency = 7;
let NumMicroOps = 2;
- let ResourceCycles = [1, 1];
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[HWWriteResGroup111], (instregex "AESDECLASTrm")>;
+def: InstRW<[HWWriteResGroup111], (instregex "AESDECrm")>;
+def: InstRW<[HWWriteResGroup111], (instregex "AESENCLASTrm")>;
+def: InstRW<[HWWriteResGroup111], (instregex "AESENCrm")>;
+def: InstRW<[HWWriteResGroup111], (instregex "VAESDECLASTrm")>;
+def: InstRW<[HWWriteResGroup111], (instregex "VAESDECrm")>;
+def: InstRW<[HWWriteResGroup111], (instregex "VAESENCLASTrm")>;
+def: InstRW<[HWWriteResGroup111], (instregex "VAESENCrm")>;
+
+def HWWriteResGroup112 : SchedWriteRes<[HWPort0,HWPort5]> {
+ let Latency = 7;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,2];
}
-def : InstRW<[WriteCVTSS2SDrm], (instregex "(Int_)?(V?)CVTSS2SDrm")>;
-
-// CVTDQ2PD.
-// x,x.
-def : InstRW<[WriteP1_P5_Lat4], (instregex "(V)?CVTDQ2PDrr")>;
-
-// y,x.
-def : InstRW<[WriteP1_P5_Lat6], (instregex "VCVTDQ2PDYrr")>;
-
-// CVT(T)PD2DQ.
-// x,x.
-def : InstRW<[WriteP1_P5_Lat4], (instregex "(V?)CVT(T?)PD2DQrr")>;
-// x,m128.
-def : InstRW<[WriteP1_P5_Lat4Ld], (instregex "(V?)CVT(T?)PD2DQrm")>;
-// x,y.
-def : InstRW<[WriteP1_P5_Lat6], (instregex "VCVT(T?)PD2DQYrr")>;
-// x,m256.
-def : InstRW<[WriteP1_P5_Lat6Ld], (instregex "VCVT(T?)PD2DQYrm")>;
-
-// CVT(T)PS2PI.
-// mm,x.
-def : InstRW<[WriteP1_P5_Lat4], (instregex "MMX_CVT(T?)PS2PIirr")>;
-
-// CVTPI2PD.
-// x,mm.
-def : InstRW<[WriteP1_P5_Lat4], (instregex "MMX_CVT(T?)PI2PDirr")>;
-
-// CVT(T)PD2PI.
-// mm,x.
-def : InstRW<[WriteP1_P5_Lat4], (instregex "MMX_CVT(T?)PD2PIirr")>;
-
-// CVSTSI2SS.
-// x,r32.
-def : InstRW<[WriteP1_P5_Lat4], (instregex "(Int_)?(V?)CVT(T?)SI2SS(64)?rr")>;
-
-// CVT(T)SS2SI.
-// r32,x.
-def : InstRW<[WriteP0_P1_Lat4], (instregex "(Int_)?(V?)CVT(T?)SS2SI(64)?rr")>;
-// r32,m32.
-def : InstRW<[WriteP0_P1_Lat4Ld], (instregex "(Int_)?(V?)CVT(T?)SS2SI(64)?rm")>;
-
-// CVTSI2SD.
-// x,r32/64.
-def : InstRW<[WriteP0_P1_Lat4], (instregex "(Int_)?(V?)CVTSI2SS(64)?rr")>;
-
-// CVTSD2SI.
-// r32/64
-def : InstRW<[WriteP0_P1_Lat4], (instregex "(Int_)?(V?)CVT(T?)SD2SI(64)?rr")>;
-// r32,m32.
-def : InstRW<[WriteP0_P1_Lat4Ld], (instregex "(Int_)?(V?)CVT(T?)SD2SI(64)?rm")>;
-
-// VCVTPS2PH.
-// x,v,i.
-def : InstRW<[WriteP1_P5_Lat4], (instregex "VCVTPS2PH(Y?)rr")>;
-// m,v,i.
-def : InstRW<[WriteP1_P5_Lat4Ld, WriteRMW], (instregex "VCVTPS2PH(Y?)mr")>;
-
-// VCVTPH2PS.
-// v,x.
-def : InstRW<[WriteP1_P5_Lat4], (instregex "VCVTPH2PS(Y?)rr")>;
+def: InstRW<[HWWriteResGroup112], (instregex "MPSADBWrri")>;
+def: InstRW<[HWWriteResGroup112], (instregex "VMPSADBWYrri")>;
+def: InstRW<[HWWriteResGroup112], (instregex "VMPSADBWrri")>;
-//-- Arithmetic instructions --//
+def HWWriteResGroup113 : SchedWriteRes<[HWPort0,HWPort5,HWPort23]> {
+ let Latency = 7;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1,2,1];
+}
+def: InstRW<[HWWriteResGroup113], (instregex "MPSADBWrmi")>;
+def: InstRW<[HWWriteResGroup113], (instregex "VMPSADBWYrmi")>;
+def: InstRW<[HWWriteResGroup113], (instregex "VMPSADBWrmi")>;
-// HADD, HSUB PS/PD
-// x,x / v,v,v.
-def WriteHADDSUBPr : SchedWriteRes<[HWPort1, HWPort5]> {
- let Latency = 5;
+def HWWriteResGroup114 : SchedWriteRes<[HWPort6,HWPort06,HWPort15,HWPort0156]> {
+ let Latency = 7;
+ let NumMicroOps = 7;
+ let ResourceCycles = [2,2,1,2];
+}
+def: InstRW<[HWWriteResGroup114], (instregex "LOOP")>;
+
+def HWWriteResGroup115 : SchedWriteRes<[HWPort0,HWPort1,HWPort23]> {
+ let Latency = 8;
let NumMicroOps = 3;
- let ResourceCycles = [1, 2];
+ let ResourceCycles = [1,1,1];
}
-def : InstRW<[WriteHADDSUBPr], (instregex "(V?)H(ADD|SUB)P(S|D)(Y?)rr")>;
+def: InstRW<[HWWriteResGroup115], (instregex "MUL_FI16m")>;
+def: InstRW<[HWWriteResGroup115], (instregex "MUL_FI32m")>;
-// x,m / v,v,m.
-def WriteHADDSUBPm : SchedWriteRes<[HWPort1, HWPort5, HWPort23]> {
+def HWWriteResGroup116 : SchedWriteRes<[HWPort0,HWPort1,HWPort5]> {
+ let Latency = 9;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[HWWriteResGroup116], (instregex "DPPDrri")>;
+def: InstRW<[HWWriteResGroup116], (instregex "VDPPDrri")>;
+
+def HWWriteResGroup117 : SchedWriteRes<[HWPort0,HWPort1,HWPort5,HWPort23]> {
let Latency = 9;
let NumMicroOps = 4;
- let ResourceCycles = [1, 2, 1];
+ let ResourceCycles = [1,1,1,1];
}
-def : InstRW<[WriteHADDSUBPm], (instregex "(V?)H(ADD|SUB)P(S|D)(Y?)rm")>;
+def: InstRW<[HWWriteResGroup117], (instregex "DPPDrmi")>;
+def: InstRW<[HWWriteResGroup117], (instregex "VDPPDrmi")>;
-// MULL SS/SD PS/PD.
-// x,x / v,v,v.
-def WriteMULr : SchedWriteRes<[HWPort01]> {
- let Latency = 5;
+def HWWriteResGroup118 : SchedWriteRes<[HWPort0]> {
+ let Latency = 10;
+ let NumMicroOps = 2;
+ let ResourceCycles = [2];
}
-def : InstRW<[WriteMULr], (instregex "(V?)MUL(P|S)(S|D)rr")>;
+def: InstRW<[HWWriteResGroup118], (instregex "PMULLDrr")>;
+def: InstRW<[HWWriteResGroup118], (instregex "VPMULLDYrr")>;
+def: InstRW<[HWWriteResGroup118], (instregex "VPMULLDrr")>;
-// x,m / v,v,m.
-def WriteMULm : SchedWriteRes<[HWPort01, HWPort23]> {
- let Latency = 9;
+def HWWriteResGroup119 : SchedWriteRes<[HWPort0,HWPort23]> {
+ let Latency = 10;
+ let NumMicroOps = 3;
+ let ResourceCycles = [2,1];
+}
+def: InstRW<[HWWriteResGroup119], (instregex "PMULLDrm")>;
+def: InstRW<[HWWriteResGroup119], (instregex "VPMULLDYrm")>;
+def: InstRW<[HWWriteResGroup119], (instregex "VPMULLDrm")>;
+
+def HWWriteResGroup120 : SchedWriteRes<[HWPort1,HWPort23,HWPort237,HWPort06,HWPort15,HWPort0156]> {
+ let Latency = 10;
+ let NumMicroOps = 10;
+ let ResourceCycles = [1,1,1,4,1,2];
+}
+def: InstRW<[HWWriteResGroup120], (instregex "RCL(16|32|64)mCL")>;
+def: InstRW<[HWWriteResGroup120], (instregex "RCL8mCL")>;
+
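+// A recurring pattern in the groups that follow: the memory form of an
+// instruction keeps the ports of its register form and adds HWPort23 (the
+// Haswell load ports) plus one uop, so e.g. an [HWPort0] group becomes
+// [HWPort0,HWPort23] with ResourceCycles = [1,1]. The ResourceCycles list is
+// positional: [2,1] over [HWPort0,HWPort23] means two cycles on HWPort0 and
+// one on the load port.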
+def HWWriteResGroup121 : SchedWriteRes<[HWPort0]> {
+ let Latency = 11;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[HWWriteResGroup121], (instregex "DIVPSrr")>;
+def: InstRW<[HWWriteResGroup121], (instregex "DIVSSrr")>;
+
+def HWWriteResGroup122 : SchedWriteRes<[HWPort0,HWPort23]> {
+ let Latency = 11;
let NumMicroOps = 2;
- let ResourceCycles = [1, 1];
+ let ResourceCycles = [1,1];
}
-def : InstRW<[WriteMULm], (instregex "(V?)MUL(P|S)(S|D)rm")>;
+def: InstRW<[HWWriteResGroup122], (instregex "DIVPSrm")>;
+def: InstRW<[HWWriteResGroup122], (instregex "DIVSSrm")>;
-// VDIVPS.
-// y,y,y.
-def WriteVDIVPSYrr : SchedWriteRes<[HWPort0, HWPort15]> {
- let Latency = 19; // 18-21 cycles.
+def HWWriteResGroup123 : SchedWriteRes<[HWPort0]> {
+ let Latency = 11;
let NumMicroOps = 3;
- let ResourceCycles = [2, 1];
+ let ResourceCycles = [3];
}
-def : InstRW<[WriteVDIVPSYrr], (instregex "VDIVPSYrr")>;
+def: InstRW<[HWWriteResGroup123], (instregex "PCMPISTRIrr")>;
+def: InstRW<[HWWriteResGroup123], (instregex "PCMPISTRM128rr")>;
+def: InstRW<[HWWriteResGroup123], (instregex "VPCMPISTRIrr")>;
+def: InstRW<[HWWriteResGroup123], (instregex "VPCMPISTRM128rr")>;
-// y,y,m256.
-def WriteVDIVPSYrm : SchedWriteRes<[HWPort0, HWPort15, HWPort23]> {
- let Latency = 23; // 18-21 + 4 cycles.
- let NumMicroOps = 4;
- let ResourceCycles = [2, 1, 1];
+def HWWriteResGroup124 : SchedWriteRes<[HWPort0,HWPort5]> {
+ let Latency = 11;
+ let NumMicroOps = 3;
+ let ResourceCycles = [2,1];
}
-def : InstRW<[WriteVDIVPSYrm, ReadAfterLd], (instregex "VDIVPSYrm")>;
+def: InstRW<[HWWriteResGroup124], (instregex "PCLMULQDQrr")>;
+def: InstRW<[HWWriteResGroup124], (instregex "VPCLMULQDQrr")>;
-// VDIVPD.
-// y,y,y.
-def WriteVDIVPDYrr : SchedWriteRes<[HWPort0, HWPort15]> {
- let Latency = 27; // 19-35 cycles.
+def HWWriteResGroup125 : SchedWriteRes<[HWPort0,HWPort015]> {
+ let Latency = 11;
let NumMicroOps = 3;
- let ResourceCycles = [2, 1];
+ let ResourceCycles = [2,1];
}
-def : InstRW<[WriteVDIVPDYrr], (instregex "VDIVPDYrr")>;
+def: InstRW<[HWWriteResGroup125], (instregex "VRCPPSYr")>;
+def: InstRW<[HWWriteResGroup125], (instregex "VRSQRTPSYr")>;
-// y,y,m256.
-def WriteVDIVPDYrm : SchedWriteRes<[HWPort0, HWPort15, HWPort23]> {
- let Latency = 31; // 19-35 + 4 cycles.
+def HWWriteResGroup126 : SchedWriteRes<[HWPort0,HWPort23]> {
+ let Latency = 11;
let NumMicroOps = 4;
- let ResourceCycles = [2, 1, 1];
+ let ResourceCycles = [3,1];
}
-def : InstRW<[WriteVDIVPDYrm, ReadAfterLd], (instregex "VDIVPDYrm")>;
+def: InstRW<[HWWriteResGroup126], (instregex "PCMPISTRIrm")>;
+def: InstRW<[HWWriteResGroup126], (instregex "PCMPISTRM128rm")>;
+def: InstRW<[HWWriteResGroup126], (instregex "VPCMPISTRIrm")>;
+def: InstRW<[HWWriteResGroup126], (instregex "VPCMPISTRM128rm")>;
-// VRCPPS.
-// y,y.
-def WriteVRCPPSr : SchedWriteRes<[HWPort0, HWPort15]> {
- let Latency = 7;
- let NumMicroOps = 3;
- let ResourceCycles = [2, 1];
+def HWWriteResGroup127 : SchedWriteRes<[HWPort0,HWPort5,HWPort23]> {
+ let Latency = 11;
+ let NumMicroOps = 4;
+ let ResourceCycles = [2,1,1];
}
-def : InstRW<[WriteVRCPPSr], (instregex "VRCPPSYr(_Int)?")>;
+def: InstRW<[HWWriteResGroup127], (instregex "PCLMULQDQrm")>;
+def: InstRW<[HWWriteResGroup127], (instregex "VPCLMULQDQrm")>;
-// y,m256.
-def WriteVRCPPSm : SchedWriteRes<[HWPort0, HWPort15, HWPort23]> {
+def HWWriteResGroup128 : SchedWriteRes<[HWPort0,HWPort23,HWPort015]> {
let Latency = 11;
let NumMicroOps = 4;
- let ResourceCycles = [2, 1, 1];
+ let ResourceCycles = [2,1,1];
}
-def : InstRW<[WriteVRCPPSm], (instregex "VRCPPSYm(_Int)?")>;
+def: InstRW<[HWWriteResGroup128], (instregex "VRCPPSYm")>;
+def: InstRW<[HWWriteResGroup128], (instregex "VRSQRTPSYm")>;
-// ROUND SS/SD PS/PD.
-// v,v,i.
-def WriteROUNDr : SchedWriteRes<[HWPort1]> {
- let Latency = 6;
+def HWWriteResGroup129 : SchedWriteRes<[HWPort1,HWPort06,HWPort0156]> {
+ let Latency = 11;
+ let NumMicroOps = 7;
+ let ResourceCycles = [2,2,3];
+}
+def: InstRW<[HWWriteResGroup129], (instregex "RCL(16|32|64)rCL")>;
+def: InstRW<[HWWriteResGroup129], (instregex "RCR(16|32|64)rCL")>;
+
+def HWWriteResGroup130 : SchedWriteRes<[HWPort1,HWPort06,HWPort15,HWPort0156]> {
+ let Latency = 11;
+ let NumMicroOps = 9;
+ let ResourceCycles = [1,4,1,3];
+}
+def: InstRW<[HWWriteResGroup130], (instregex "RCL8rCL")>;
+
+def HWWriteResGroup131 : SchedWriteRes<[HWPort06,HWPort0156]> {
+ let Latency = 11;
+ let NumMicroOps = 11;
+ let ResourceCycles = [2,9];
+}
+def: InstRW<[HWWriteResGroup131], (instregex "LOOPE")>;
+def: InstRW<[HWWriteResGroup131], (instregex "LOOPNE")>;
+
+def HWWriteResGroup132 : SchedWriteRes<[HWPort4,HWPort23,HWPort237,HWPort06,HWPort15,HWPort0156]> {
+ let Latency = 11;
+ let NumMicroOps = 14;
+ let ResourceCycles = [1,1,1,4,2,5];
+}
+def: InstRW<[HWWriteResGroup132], (instregex "CMPXCHG8B")>;
+
+def HWWriteResGroup133 : SchedWriteRes<[HWPort0]> {
+ let Latency = 13;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[HWWriteResGroup133], (instregex "SQRTPSr")>;
+def: InstRW<[HWWriteResGroup133], (instregex "SQRTSSr")>;
+def: InstRW<[HWWriteResGroup133], (instregex "VDIVPSrr")>;
+def: InstRW<[HWWriteResGroup133], (instregex "VDIVSSrr")>;
+
+def HWWriteResGroup134 : SchedWriteRes<[HWPort0,HWPort23]> {
+ let Latency = 13;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[HWWriteResGroup134], (instregex "SQRTPSm")>;
+def: InstRW<[HWWriteResGroup134], (instregex "SQRTSSm")>;
+def: InstRW<[HWWriteResGroup134], (instregex "VDIVPSrm")>;
+def: InstRW<[HWWriteResGroup134], (instregex "VDIVSSrm")>;
+
+def HWWriteResGroup135 : SchedWriteRes<[HWPort1,HWPort23,HWPort237,HWPort06,HWPort15,HWPort0156]> {
+ let Latency = 13;
+ let NumMicroOps = 11;
+ let ResourceCycles = [2,1,1,3,1,3];
+}
+def: InstRW<[HWWriteResGroup135], (instregex "RCR(16|32|64)mCL")>;
+def: InstRW<[HWWriteResGroup135], (instregex "RCR8mCL")>;
+
+def HWWriteResGroup136 : SchedWriteRes<[HWPort0]> {
+ let Latency = 14;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[HWWriteResGroup136], (instregex "DIVPDrr")>;
+def: InstRW<[HWWriteResGroup136], (instregex "DIVSDrr")>;
+def: InstRW<[HWWriteResGroup136], (instregex "VSQRTPSr")>;
+def: InstRW<[HWWriteResGroup136], (instregex "VSQRTSSr")>;
+
+def HWWriteResGroup137 : SchedWriteRes<[HWPort5]> {
+ let Latency = 14;
let NumMicroOps = 2;
let ResourceCycles = [2];
}
-def : InstRW<[WriteROUNDr], (instregex "(V?)ROUND(Y?)(S|P)(S|D)r(_Int)?")>;
+def: InstRW<[HWWriteResGroup137], (instregex "AESIMCrr")>;
+def: InstRW<[HWWriteResGroup137], (instregex "VAESIMCrr")>;
-// v,m,i.
-def WriteROUNDm : SchedWriteRes<[HWPort1, HWPort23]> {
- let Latency = 10;
+def HWWriteResGroup138 : SchedWriteRes<[HWPort0,HWPort23]> {
+ let Latency = 14;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[HWWriteResGroup138], (instregex "DIVPDrm")>;
+def: InstRW<[HWWriteResGroup138], (instregex "DIVSDrm")>;
+def: InstRW<[HWWriteResGroup138], (instregex "VSQRTPSm")>;
+def: InstRW<[HWWriteResGroup138], (instregex "VSQRTSSm")>;
+
+def HWWriteResGroup139 : SchedWriteRes<[HWPort5,HWPort23]> {
+ let Latency = 14;
let NumMicroOps = 3;
- let ResourceCycles = [2, 1];
+ let ResourceCycles = [2,1];
}
-def : InstRW<[WriteROUNDm], (instregex "(V?)ROUND(Y?)(S|P)(S|D)m(_Int)?")>;
+def: InstRW<[HWWriteResGroup139], (instregex "AESIMCrm")>;
+def: InstRW<[HWWriteResGroup139], (instregex "VAESIMCrm")>;
-// DPPS.
-// x,x,i / v,v,v,i.
-def WriteDPPSr : SchedWriteRes<[HWPort0, HWPort1, HWPort5]> {
+def HWWriteResGroup140 : SchedWriteRes<[HWPort0,HWPort1,HWPort5]> {
let Latency = 14;
let NumMicroOps = 4;
- let ResourceCycles = [2, 1, 1];
+ let ResourceCycles = [2,1,1];
}
-def : InstRW<[WriteDPPSr], (instregex "(V?)DPPS(Y?)rri")>;
+def: InstRW<[HWWriteResGroup140], (instregex "DPPSrri")>;
+def: InstRW<[HWWriteResGroup140], (instregex "VDPPSYrri")>;
+def: InstRW<[HWWriteResGroup140], (instregex "VDPPSrri")>;
-// x,m,i / v,v,m,i.
-def WriteDPPSm : SchedWriteRes<[HWPort0, HWPort1, HWPort5, HWPort23, HWPort6]> {
+def HWWriteResGroup141 : SchedWriteRes<[HWPort0,HWPort1,HWPort5,HWPort23]> {
+ let Latency = 14;
+ let NumMicroOps = 5;
+ let ResourceCycles = [2,1,1,1];
+}
+def: InstRW<[HWWriteResGroup141], (instregex "DPPSrmi")>;
+def: InstRW<[HWWriteResGroup141], (instregex "VDPPSYrmi")>;
+def: InstRW<[HWWriteResGroup141], (instregex "VDPPSrmi")>;
+
+def HWWriteResGroup142 : SchedWriteRes<[HWPort1,HWPort06,HWPort15,HWPort0156]> {
+ let Latency = 14;
+ let NumMicroOps = 10;
+ let ResourceCycles = [2,3,1,4];
+}
+def: InstRW<[HWWriteResGroup142], (instregex "RCR8rCL")>;
+
+def HWWriteResGroup143 : SchedWriteRes<[HWPort23,HWPort0156]> {
+ let Latency = 14;
+ let NumMicroOps = 15;
+ let ResourceCycles = [1,14];
+}
+def: InstRW<[HWWriteResGroup143], (instregex "POPF16")>;
+
+def HWWriteResGroup144 : SchedWriteRes<[HWPort4,HWPort5,HWPort6,HWPort23,HWPort237,HWPort06,HWPort0156]> {
+ let Latency = 15;
+ let NumMicroOps = 8;
+ let ResourceCycles = [1,1,1,1,1,1,2];
+}
+def: InstRW<[HWWriteResGroup144], (instregex "INSB")>;
+def: InstRW<[HWWriteResGroup144], (instregex "INSL")>;
+def: InstRW<[HWWriteResGroup144], (instregex "INSW")>;
+
+def HWWriteResGroup145 : SchedWriteRes<[HWPort5]> {
+ let Latency = 16;
+ let NumMicroOps = 16;
+ let ResourceCycles = [16];
+}
+def: InstRW<[HWWriteResGroup145], (instregex "VZEROALL")>;
+
+def HWWriteResGroup146 : SchedWriteRes<[HWPort0,HWPort4,HWPort5,HWPort23,HWPort237,HWPort06,HWPort0156]> {
+ let Latency = 16;
+ let NumMicroOps = 19;
+ let ResourceCycles = [2,1,4,1,1,4,6];
+}
+def: InstRW<[HWWriteResGroup146], (instregex "CMPXCHG16B")>;
+
+def HWWriteResGroup147 : SchedWriteRes<[HWPort0,HWPort1,HWPort5,HWPort6,HWPort01,HWPort0156]> {
+ let Latency = 17;
+ let NumMicroOps = 15;
+ let ResourceCycles = [2,1,2,4,2,4];
+}
+def: InstRW<[HWWriteResGroup147], (instregex "XCH_F")>;
+
+def HWWriteResGroup148 : SchedWriteRes<[HWPort0,HWPort5,HWPort0156]> {
let Latency = 18;
- let NumMicroOps = 6;
- let ResourceCycles = [2, 1, 1, 1, 1];
+ let NumMicroOps = 8;
+ let ResourceCycles = [4,3,1];
}
-def : InstRW<[WriteDPPSm, ReadAfterLd], (instregex "(V?)DPPS(Y?)rmi")>;
+def: InstRW<[HWWriteResGroup148], (instregex "PCMPESTRIrr")>;
+def: InstRW<[HWWriteResGroup148], (instregex "VPCMPESTRIrr")>;
-// DPPD.
-// x,x,i.
-def WriteDPPDr : SchedWriteRes<[HWPort0, HWPort1, HWPort5]> {
- let Latency = 9;
- let NumMicroOps = 3;
- let ResourceCycles = [1, 1, 1];
+def HWWriteResGroup149 : SchedWriteRes<[HWPort5,HWPort6,HWPort06,HWPort0156]> {
+ let Latency = 18;
+ let NumMicroOps = 8;
+ let ResourceCycles = [1,1,1,5];
}
-def : InstRW<[WriteDPPDr], (instregex "(V?)DPPDrri")>;
+def: InstRW<[HWWriteResGroup149], (instregex "CPUID")>;
+def: InstRW<[HWWriteResGroup149], (instregex "RDTSC")>;
-// x,m,i.
-def WriteDPPDm : SchedWriteRes<[HWPort0, HWPort1, HWPort5, HWPort23]> {
- let Latency = 13;
- let NumMicroOps = 4;
- let ResourceCycles = [1, 1, 1, 1];
+def HWWriteResGroup150 : SchedWriteRes<[HWPort0,HWPort5,HWPort23,HWPort0156]> {
+ let Latency = 18;
+ let NumMicroOps = 9;
+ let ResourceCycles = [4,3,1,1];
}
-def : InstRW<[WriteDPPDm], (instregex "(V?)DPPDrmi")>;
+def: InstRW<[HWWriteResGroup150], (instregex "PCMPESTRIrm")>;
+def: InstRW<[HWWriteResGroup150], (instregex "VPCMPESTRIrm")>;
-// VFMADD.
-// v,v,v.
-def WriteFMADDr : SchedWriteRes<[HWPort01]> {
- let Latency = 5;
+def HWWriteResGroup151 : SchedWriteRes<[HWPort6,HWPort23,HWPort0156]> {
+ let Latency = 18;
+ let NumMicroOps = 19;
+ let ResourceCycles = [3,1,15];
+}
+def: InstRW<[HWWriteResGroup151], (instregex "XRSTOR(64)?")>;
+
+def HWWriteResGroup152 : SchedWriteRes<[HWPort0,HWPort5,HWPort015,HWPort0156]> {
+ let Latency = 19;
+ let NumMicroOps = 9;
+ let ResourceCycles = [4,3,1,1];
+}
+def: InstRW<[HWWriteResGroup152], (instregex "PCMPESTRM128rr")>;
+def: InstRW<[HWWriteResGroup152], (instregex "VPCMPESTRM128rr")>;
+
+def HWWriteResGroup153 : SchedWriteRes<[HWPort0,HWPort5,HWPort23,HWPort015,HWPort0156]> {
+ let Latency = 19;
+ let NumMicroOps = 10;
+ let ResourceCycles = [4,3,1,1,1];
+}
+def: InstRW<[HWWriteResGroup153], (instregex "PCMPESTRM128rm")>;
+def: InstRW<[HWWriteResGroup153], (instregex "VPCMPESTRM128rm")>;
+
+def HWWriteResGroup154 : SchedWriteRes<[HWPort0]> {
+ let Latency = 20;
let NumMicroOps = 1;
+ let ResourceCycles = [1];
}
-def : InstRW<[WriteFMADDr],
- (instregex
- // 3p forms.
- "VF(N?)M(ADD|SUB|ADDSUB|SUBADD)P(S|D)(r213|r132|r231)r(Y)?",
- // 3s forms.
- "VF(N?)M(ADD|SUB)S(S|D)(r132|r231|r213)r",
- // 4s/4s_int forms.
- "VF(N?)M(ADD|SUB)S(S|D)4rr(_REV|_Int)?",
- // 4p forms.
- "VF(N?)M(ADD|SUB)P(S|D)4rr(Y)?(_REV)?")>;
-
-// v,v,m.
-def WriteFMADDm : SchedWriteRes<[HWPort01, HWPort23]> {
- let Latency = 9;
+def: InstRW<[HWWriteResGroup154], (instregex "DIV_FPrST0")>;
+def: InstRW<[HWWriteResGroup154], (instregex "DIV_FST0r")>;
+def: InstRW<[HWWriteResGroup154], (instregex "DIV_FrST0")>;
+def: InstRW<[HWWriteResGroup154], (instregex "SQRTPDr")>;
+def: InstRW<[HWWriteResGroup154], (instregex "SQRTSDr")>;
+def: InstRW<[HWWriteResGroup154], (instregex "VDIVPDrr")>;
+def: InstRW<[HWWriteResGroup154], (instregex "VDIVSDrr")>;
+
+def HWWriteResGroup155 : SchedWriteRes<[HWPort0,HWPort23]> {
+ let Latency = 20;
let NumMicroOps = 2;
- let ResourceCycles = [1, 1];
+ let ResourceCycles = [1,1];
}
-def : InstRW<[WriteFMADDm],
- (instregex
- // 3p forms.
- "VF(N?)M(ADD|SUB|ADDSUB|SUBADD)P(S|D)(r213|r132|r231)m(Y)?",
- // 3s forms.
- "VF(N?)M(ADD|SUB)S(S|D)(r132|r231|r213)m",
- // 4s/4s_int forms.
- "VF(N?)M(ADD|SUB)S(S|D)4(rm|mr)(_Int)?",
- // 4p forms.
- "VF(N?)M(ADD|SUB)P(S|D)4(rm|mr)(Y)?")>;
+def: InstRW<[HWWriteResGroup155], (instregex "DIVR_F32m")>;
+def: InstRW<[HWWriteResGroup155], (instregex "DIVR_F64m")>;
+def: InstRW<[HWWriteResGroup155], (instregex "SQRTPDm")>;
+def: InstRW<[HWWriteResGroup155], (instregex "SQRTSDm")>;
+def: InstRW<[HWWriteResGroup155], (instregex "VDIVPDrm")>;
+def: InstRW<[HWWriteResGroup155], (instregex "VDIVSDrm")>;
-//-- Math instructions --//
+def HWWriteResGroup156 : SchedWriteRes<[HWPort5,HWPort6,HWPort0156]> {
+ let Latency = 20;
+ let NumMicroOps = 10;
+ let ResourceCycles = [1,2,7];
+}
+def: InstRW<[HWWriteResGroup156], (instregex "MWAITrr")>;
-// VSQRTPS.
-// y,y.
-def WriteVSQRTPSYr : SchedWriteRes<[HWPort0, HWPort15]> {
- let Latency = 19;
- let NumMicroOps = 3;
- let ResourceCycles = [2, 1];
+def HWWriteResGroup157 : SchedWriteRes<[HWPort0]> {
+ let Latency = 21;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
}
-def : InstRW<[WriteVSQRTPSYr], (instregex "VSQRTPSYr")>;
+def: InstRW<[HWWriteResGroup157], (instregex "VSQRTPDr")>;
+def: InstRW<[HWWriteResGroup157], (instregex "VSQRTSDr")>;
-// y,m256.
-def WriteVSQRTPSYm : SchedWriteRes<[HWPort0, HWPort15, HWPort23]> {
- let Latency = 23;
- let NumMicroOps = 4;
- let ResourceCycles = [2, 1, 1];
+def HWWriteResGroup158 : SchedWriteRes<[HWPort0,HWPort23]> {
+ let Latency = 21;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
}
-def : InstRW<[WriteVSQRTPSYm], (instregex "VSQRTPSYm")>;
+def: InstRW<[HWWriteResGroup158], (instregex "VSQRTPDm")>;
+def: InstRW<[HWWriteResGroup158], (instregex "VSQRTSDm")>;
-// VSQRTPD.
-// y,y.
-def WriteVSQRTPDYr : SchedWriteRes<[HWPort0, HWPort15]> {
- let Latency = 28;
+def HWWriteResGroup159 : SchedWriteRes<[HWPort0,HWPort015]> {
+ let Latency = 21;
let NumMicroOps = 3;
- let ResourceCycles = [2, 1];
+ let ResourceCycles = [2,1];
}
-def : InstRW<[WriteVSQRTPDYr], (instregex "VSQRTPDYr")>;
+def: InstRW<[HWWriteResGroup159], (instregex "VDIVPSYrr")>;
+def: InstRW<[HWWriteResGroup159], (instregex "VSQRTPSYr")>;
-// y,m256.
-def WriteVSQRTPDYm : SchedWriteRes<[HWPort0, HWPort15, HWPort23]> {
- let Latency = 32;
+def HWWriteResGroup160 : SchedWriteRes<[HWPort0,HWPort23,HWPort015]> {
+ let Latency = 21;
let NumMicroOps = 4;
- let ResourceCycles = [2, 1, 1];
+ let ResourceCycles = [2,1,1];
}
-def : InstRW<[WriteVSQRTPDYm], (instregex "VSQRTPDYm")>;
+def: InstRW<[HWWriteResGroup160], (instregex "VDIVPSYrm")>;
+def: InstRW<[HWWriteResGroup160], (instregex "VSQRTPSYm")>;
-// RSQRT SS/PS.
-// x,x.
-def WriteRSQRTr : SchedWriteRes<[HWPort0]> {
- let Latency = 5;
+def HWWriteResGroup161 : SchedWriteRes<[HWPort0,HWPort1,HWPort23]> {
+ let Latency = 23;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
}
-def : InstRW<[WriteRSQRTr], (instregex "(V?)RSQRT(SS|PS)r(_Int)?")>;
+def: InstRW<[HWWriteResGroup161], (instregex "DIVR_FI16m")>;
+def: InstRW<[HWWriteResGroup161], (instregex "DIVR_FI32m")>;
-// x,m128.
-def WriteRSQRTm : SchedWriteRes<[HWPort0, HWPort23]> {
- let Latency = 9;
+def HWWriteResGroup162 : SchedWriteRes<[HWPort0]> {
+ let Latency = 24;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[HWWriteResGroup162], (instregex "DIVR_FPrST0")>;
+def: InstRW<[HWWriteResGroup162], (instregex "DIVR_FST0r")>;
+def: InstRW<[HWWriteResGroup162], (instregex "DIVR_FrST0")>;
+
+def HWWriteResGroup163 : SchedWriteRes<[HWPort0,HWPort23]> {
+ let Latency = 24;
let NumMicroOps = 2;
- let ResourceCycles = [1, 1];
+ let ResourceCycles = [1,1];
}
-def : InstRW<[WriteRSQRTm], (instregex "(V?)RSQRT(SS|PS)m(_Int)?")>;
+def: InstRW<[HWWriteResGroup163], (instregex "DIV_F32m")>;
+def: InstRW<[HWWriteResGroup163], (instregex "DIV_F64m")>;
-// RSQRTPS 256.
-// y,y.
-def WriteRSQRTPSYr : SchedWriteRes<[HWPort0, HWPort15]> {
- let Latency = 7;
- let NumMicroOps = 3;
- let ResourceCycles = [2, 1];
+def HWWriteResGroup164 : SchedWriteRes<[HWPort4,HWPort6,HWPort23,HWPort237,HWPort0156]> {
+ let Latency = 24;
+ let NumMicroOps = 27;
+ let ResourceCycles = [1,5,1,1,19];
}
-def : InstRW<[WriteRSQRTPSYr], (instregex "VRSQRTPSYr(_Int)?")>;
+def: InstRW<[HWWriteResGroup164], (instregex "XSAVE64")>;
-// y,m256.
-def WriteRSQRTPSYm : SchedWriteRes<[HWPort0, HWPort15, HWPort23]> {
- let Latency = 11;
- let NumMicroOps = 4;
- let ResourceCycles = [2, 1, 1];
+def HWWriteResGroup165 : SchedWriteRes<[HWPort4,HWPort6,HWPort23,HWPort237,HWPort0156]> {
+ let Latency = 25;
+ let NumMicroOps = 28;
+ let ResourceCycles = [1,6,1,1,19];
}
-def : InstRW<[WriteRSQRTPSYm], (instregex "VRSQRTPSYm(_Int)?")>;
+def: InstRW<[HWWriteResGroup165], (instregex "XSAVE(OPT)?")>;
-//-- Logic instructions --//
+def HWWriteResGroup166 : SchedWriteRes<[HWPort0,HWPort1,HWPort23]> {
+ let Latency = 27;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[HWWriteResGroup166], (instregex "DIV_FI16m")>;
+def: InstRW<[HWWriteResGroup166], (instregex "DIV_FI32m")>;
-// AND, ANDN, OR, XOR PS/PD.
-// x,x / v,v,v.
-def : InstRW<[WriteP5], (instregex "(V?)(AND|ANDN|OR|XOR)P(S|D)(Y?)rr")>;
-// x,m / v,v,m.
-def : InstRW<[WriteP5Ld, ReadAfterLd],
- (instregex "(V?)(AND|ANDN|OR|XOR)P(S|D)(Y?)rm")>;
+def HWWriteResGroup167 : SchedWriteRes<[HWPort0,HWPort5,HWPort23,HWPort015]> {
+ let Latency = 28;
+ let NumMicroOps = 11;
+ let ResourceCycles = [2,7,1,1];
+}
+def: InstRW<[HWWriteResGroup167], (instregex "AESKEYGENASSIST128rm")>;
+def: InstRW<[HWWriteResGroup167], (instregex "VAESKEYGENASSIST128rm")>;
-//-- Other instructions --//
+def HWWriteResGroup168 : SchedWriteRes<[HWPort0,HWPort5,HWPort015]> {
+ let Latency = 29;
+ let NumMicroOps = 11;
+ let ResourceCycles = [2,7,2];
+}
+def: InstRW<[HWWriteResGroup168], (instregex "AESKEYGENASSIST128rr")>;
+def: InstRW<[HWWriteResGroup168], (instregex "VAESKEYGENASSIST128rr")>;
-// VZEROUPPER.
-def WriteVZEROUPPER : SchedWriteRes<[]> {
- let NumMicroOps = 4;
+def HWWriteResGroup170 : SchedWriteRes<[HWPort5,HWPort6,HWPort23,HWPort06,HWPort0156]> {
+ let Latency = 30;
+ let NumMicroOps = 23;
+ let ResourceCycles = [1,5,3,4,10];
}
-def : InstRW<[WriteVZEROUPPER], (instregex "VZEROUPPER")>;
+def: InstRW<[HWWriteResGroup170], (instregex "IN32ri")>;
+def: InstRW<[HWWriteResGroup170], (instregex "IN32rr")>;
+def: InstRW<[HWWriteResGroup170], (instregex "IN8ri")>;
+def: InstRW<[HWWriteResGroup170], (instregex "IN8rr")>;
-// VZEROALL.
-def WriteVZEROALL : SchedWriteRes<[]> {
- let NumMicroOps = 12;
+def HWWriteResGroup171 : SchedWriteRes<[HWPort5,HWPort6,HWPort23,HWPort237,HWPort06,HWPort0156]> {
+ let Latency = 30;
+ let NumMicroOps = 23;
+ let ResourceCycles = [1,5,2,1,4,10];
}
-def : InstRW<[WriteVZEROALL], (instregex "VZEROALL")>;
+def: InstRW<[HWWriteResGroup171], (instregex "OUT32ir")>;
+def: InstRW<[HWWriteResGroup171], (instregex "OUT32rr")>;
+def: InstRW<[HWWriteResGroup171], (instregex "OUT8ir")>;
+def: InstRW<[HWWriteResGroup171], (instregex "OUT8rr")>;
-// LDMXCSR.
-def WriteLDMXCSR : SchedWriteRes<[HWPort0, HWPort6, HWPort23]> {
- let Latency = 6;
+def HWWriteResGroup172 : SchedWriteRes<[HWPort01,HWPort15,HWPort015,HWPort0156]> {
+ let Latency = 31;
+ let NumMicroOps = 31;
+ let ResourceCycles = [8,1,21,1];
+}
+def: InstRW<[HWWriteResGroup172], (instregex "MMX_EMMS")>;
+
+def HWWriteResGroup173 : SchedWriteRes<[HWPort0,HWPort015]> {
+ let Latency = 35;
let NumMicroOps = 3;
- let ResourceCycles = [1, 1, 1];
+ let ResourceCycles = [2,1];
}
-def : InstRW<[WriteLDMXCSR], (instregex "(V)?LDMXCSR")>;
+def: InstRW<[HWWriteResGroup173], (instregex "VDIVPDYrr")>;
+def: InstRW<[HWWriteResGroup173], (instregex "VSQRTPDYr")>;
-// STMXCSR.
-def WriteSTMXCSR : SchedWriteRes<[HWPort0, HWPort4, HWPort6, HWPort237]> {
- let Latency = 7;
+def HWWriteResGroup174 : SchedWriteRes<[HWPort0,HWPort23,HWPort015]> {
+ let Latency = 35;
let NumMicroOps = 4;
- let ResourceCycles = [1, 1, 1, 1];
+ let ResourceCycles = [2,1,1];
+}
+def: InstRW<[HWWriteResGroup174], (instregex "VDIVPDYrm")>;
+def: InstRW<[HWWriteResGroup174], (instregex "VSQRTPDYm")>;
+
+def HWWriteResGroup175 : SchedWriteRes<[HWPort1,HWPort4,HWPort5,HWPort6,HWPort23,HWPort237,HWPort15,HWPort0156]> {
+ let Latency = 35;
+ let NumMicroOps = 18;
+ let ResourceCycles = [1,1,2,3,1,1,1,8];
+}
+def: InstRW<[HWWriteResGroup175], (instregex "VMCLEARm")>;
+
+def HWWriteResGroup176 : SchedWriteRes<[HWPort5,HWPort0156]> {
+ let Latency = 42;
+ let NumMicroOps = 22;
+ let ResourceCycles = [2,20];
+}
+def: InstRW<[HWWriteResGroup176], (instregex "RDTSCP")>;
+
+def HWWriteResGroup177 : SchedWriteRes<[HWPort0,HWPort01,HWPort23,HWPort05,HWPort06,HWPort015,HWPort0156]> {
+ let Latency = 56;
+ let NumMicroOps = 64;
+ let ResourceCycles = [2,2,8,1,10,2,39];
+}
+def: InstRW<[HWWriteResGroup177], (instregex "FLDENVm")>;
+
+def HWWriteResGroup178 : SchedWriteRes<[HWPort0,HWPort6,HWPort23,HWPort05,HWPort06,HWPort15,HWPort0156]> {
+ let Latency = 59;
+ let NumMicroOps = 88;
+ let ResourceCycles = [4,4,31,1,2,1,45];
+}
+def: InstRW<[HWWriteResGroup178], (instregex "FXRSTOR64")>;
+
+def HWWriteResGroup179 : SchedWriteRes<[HWPort0,HWPort6,HWPort23,HWPort05,HWPort06,HWPort15,HWPort0156]> {
+ let Latency = 59;
+ let NumMicroOps = 90;
+ let ResourceCycles = [4,2,33,1,2,1,47];
+}
+def: InstRW<[HWWriteResGroup179], (instregex "FXRSTOR")>;
+
+def HWWriteResGroup180 : SchedWriteRes<[HWPort5,HWPort01,HWPort0156]> {
+ let Latency = 75;
+ let NumMicroOps = 15;
+ let ResourceCycles = [6,3,6];
+}
+def: InstRW<[HWWriteResGroup180], (instregex "FNINIT")>;
+
+def HWWriteResGroup181 : SchedWriteRes<[HWPort0,HWPort1,HWPort5,HWPort6,HWPort01,HWPort0156]> {
+ let Latency = 98;
+ let NumMicroOps = 32;
+ let ResourceCycles = [7,7,3,3,1,11];
+}
+def: InstRW<[HWWriteResGroup181], (instregex "DIV(16|32|64)r")>;
+
+def HWWriteResGroup182 : SchedWriteRes<[HWPort0,HWPort1,HWPort5,HWPort6,HWPort06,HWPort0156]> {
+ let Latency = 112;
+ let NumMicroOps = 66;
+ let ResourceCycles = [4,2,4,8,14,34];
+}
+def: InstRW<[HWWriteResGroup182], (instregex "IDIV(16|32|64)r")>;
+
+def HWWriteResGroup183 : SchedWriteRes<[HWPort0,HWPort1,HWPort4,HWPort5,HWPort6,HWPort237,HWPort06,HWPort0156]> {
+ let Latency = 114;
+ let NumMicroOps = 100;
+ let ResourceCycles = [9,9,11,8,1,11,21,30];
}
-def : InstRW<[WriteSTMXCSR], (instregex "(V)?STMXCSR")>;
+def: InstRW<[HWWriteResGroup183], (instregex "FSTENVm")>;
} // SchedModel
diff --git a/llvm/test/CodeGen/X86/aes-schedule.ll b/llvm/test/CodeGen/X86/aes-schedule.ll
index 372e395e088..ab86edac0b4 100644
--- a/llvm/test/CodeGen/X86/aes-schedule.ll
+++ b/llvm/test/CodeGen/X86/aes-schedule.ll
@@ -32,7 +32,7 @@ define <2 x i64> @test_aesdec(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) {
; HASWELL: # BB#0:
; HASWELL-NEXT: vaesdec %xmm1, %xmm0, %xmm0 # sched: [7:1.00]
; HASWELL-NEXT: vaesdec (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_aesdec:
; BTVER2: # BB#0:
@@ -75,7 +75,7 @@ define <2 x i64> @test_aesdeclast(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2)
; HASWELL: # BB#0:
; HASWELL-NEXT: vaesdeclast %xmm1, %xmm0, %xmm0 # sched: [7:1.00]
; HASWELL-NEXT: vaesdeclast (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_aesdeclast:
; BTVER2: # BB#0:
@@ -118,7 +118,7 @@ define <2 x i64> @test_aesenc(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) {
; HASWELL: # BB#0:
; HASWELL-NEXT: vaesenc %xmm1, %xmm0, %xmm0 # sched: [7:1.00]
; HASWELL-NEXT: vaesenc (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_aesenc:
; BTVER2: # BB#0:
@@ -161,7 +161,7 @@ define <2 x i64> @test_aesenclast(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2)
; HASWELL: # BB#0:
; HASWELL-NEXT: vaesenclast %xmm1, %xmm0, %xmm0 # sched: [7:1.00]
; HASWELL-NEXT: vaesenclast (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_aesenclast:
; BTVER2: # BB#0:
@@ -208,7 +208,7 @@ define <2 x i64> @test_aesimc(<2 x i64> %a0, <2 x i64> *%a1) {
; HASWELL-NEXT: vaesimc %xmm0, %xmm0 # sched: [14:2.00]
; HASWELL-NEXT: vaesimc (%rdi), %xmm1 # sched: [14:2.00]
; HASWELL-NEXT: vpor %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_aesimc:
; BTVER2: # BB#0:
@@ -255,10 +255,10 @@ define <2 x i64> @test_aeskeygenassist(<2 x i64> %a0, <2 x i64> *%a1) {
;
; HASWELL-LABEL: test_aeskeygenassist:
; HASWELL: # BB#0:
-; HASWELL-NEXT: vaeskeygenassist $7, %xmm0, %xmm0 # sched: [10:8.00]
-; HASWELL-NEXT: vaeskeygenassist $7, (%rdi), %xmm1 # sched: [10:7.00]
+; HASWELL-NEXT: vaeskeygenassist $7, %xmm0, %xmm0 # sched: [29:7.00]
+; HASWELL-NEXT: vaeskeygenassist $7, (%rdi), %xmm1 # sched: [28:7.00]
; HASWELL-NEXT: vpor %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_aeskeygenassist:
; BTVER2: # BB#0:
diff --git a/llvm/test/CodeGen/X86/avx-schedule.ll b/llvm/test/CodeGen/X86/avx-schedule.ll
index 23b30b5d316..17d2d9146c0 100644
--- a/llvm/test/CodeGen/X86/avx-schedule.ll
+++ b/llvm/test/CodeGen/X86/avx-schedule.ll
@@ -23,8 +23,8 @@ define <4 x double> @test_addpd(<4 x double> %a0, <4 x double> %a1, <4 x double>
; HASWELL-LABEL: test_addpd:
; HASWELL: # BB#0:
; HASWELL-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT: vaddpd (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vaddpd (%rdi), %ymm0, %ymm0 # sched: [3:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_addpd:
; BTVER2: # BB#0:
@@ -59,8 +59,8 @@ define <8 x float> @test_addps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a
; HASWELL-LABEL: test_addps:
; HASWELL: # BB#0:
; HASWELL-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT: vaddps (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vaddps (%rdi), %ymm0, %ymm0 # sched: [3:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_addps:
; BTVER2: # BB#0:
@@ -95,8 +95,8 @@ define <4 x double> @test_addsubpd(<4 x double> %a0, <4 x double> %a1, <4 x doub
; HASWELL-LABEL: test_addsubpd:
; HASWELL: # BB#0:
; HASWELL-NEXT: vaddsubpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT: vaddsubpd (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vaddsubpd (%rdi), %ymm0, %ymm0 # sched: [3:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_addsubpd:
; BTVER2: # BB#0:
@@ -132,8 +132,8 @@ define <8 x float> @test_addsubps(<8 x float> %a0, <8 x float> %a1, <8 x float>
; HASWELL-LABEL: test_addsubps:
; HASWELL: # BB#0:
; HASWELL-NEXT: vaddsubps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT: vaddsubps (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vaddsubps (%rdi), %ymm0, %ymm0 # sched: [3:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_addsubps:
; BTVER2: # BB#0:
@@ -171,9 +171,9 @@ define <4 x double> @test_andnotpd(<4 x double> %a0, <4 x double> %a1, <4 x doub
; HASWELL-LABEL: test_andnotpd:
; HASWELL: # BB#0:
; HASWELL-NEXT: vandnpd %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
-; HASWELL-NEXT: vandnpd (%rdi), %ymm0, %ymm0 # sched: [5:1.00]
+; HASWELL-NEXT: vandnpd (%rdi), %ymm0, %ymm0 # sched: [1:1.00]
; HASWELL-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_andnotpd:
; BTVER2: # BB#0:
@@ -219,9 +219,9 @@ define <8 x float> @test_andnotps(<8 x float> %a0, <8 x float> %a1, <8 x float>
; HASWELL-LABEL: test_andnotps:
; HASWELL: # BB#0:
; HASWELL-NEXT: vandnps %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
-; HASWELL-NEXT: vandnps (%rdi), %ymm0, %ymm0 # sched: [5:1.00]
+; HASWELL-NEXT: vandnps (%rdi), %ymm0, %ymm0 # sched: [1:1.00]
; HASWELL-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_andnotps:
; BTVER2: # BB#0:
@@ -267,9 +267,9 @@ define <4 x double> @test_andpd(<4 x double> %a0, <4 x double> %a1, <4 x double>
; HASWELL-LABEL: test_andpd:
; HASWELL: # BB#0:
; HASWELL-NEXT: vandpd %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
-; HASWELL-NEXT: vandpd (%rdi), %ymm0, %ymm0 # sched: [5:1.00]
+; HASWELL-NEXT: vandpd (%rdi), %ymm0, %ymm0 # sched: [1:1.00]
; HASWELL-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_andpd:
; BTVER2: # BB#0:
@@ -313,9 +313,9 @@ define <8 x float> @test_andps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a
; HASWELL-LABEL: test_andps:
; HASWELL: # BB#0:
; HASWELL-NEXT: vandps %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
-; HASWELL-NEXT: vandps (%rdi), %ymm0, %ymm0 # sched: [5:1.00]
+; HASWELL-NEXT: vandps (%rdi), %ymm0, %ymm0 # sched: [1:1.00]
; HASWELL-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_andps:
; BTVER2: # BB#0:
@@ -360,8 +360,8 @@ define <4 x double> @test_blendpd(<4 x double> %a0, <4 x double> %a1, <4 x doubl
; HASWELL: # BB#0:
; HASWELL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3] sched: [1:0.33]
; HASWELL-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],mem[1,2],ymm0[3] sched: [5:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],mem[1,2],ymm0[3] sched: [1:0.50]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_blendpd:
; BTVER2: # BB#0:
@@ -399,8 +399,8 @@ define <8 x float> @test_blendps(<8 x float> %a0, <8 x float> %a1, <8 x float> *
; HASWELL-LABEL: test_blendps:
; HASWELL: # BB#0:
; HASWELL-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3,4,5,6,7] sched: [1:0.33]
-; HASWELL-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],mem[2],ymm0[3],mem[4,5,6],ymm0[7] sched: [5:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],mem[2],ymm0[3],mem[4,5,6],ymm0[7] sched: [1:0.50]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_blendps:
; BTVER2: # BB#0:
@@ -435,8 +435,8 @@ define <4 x double> @test_blendvpd(<4 x double> %a0, <4 x double> %a1, <4 x doub
; HASWELL-LABEL: test_blendvpd:
; HASWELL: # BB#0:
; HASWELL-NEXT: vblendvpd %ymm2, %ymm1, %ymm0, %ymm0 # sched: [2:2.00]
-; HASWELL-NEXT: vblendvpd %ymm2, (%rdi), %ymm0, %ymm0 # sched: [6:2.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vblendvpd %ymm2, (%rdi), %ymm0, %ymm0 # sched: [2:2.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_blendvpd:
; BTVER2: # BB#0:
@@ -472,8 +472,8 @@ define <8 x float> @test_blendvps(<8 x float> %a0, <8 x float> %a1, <8 x float>
; HASWELL-LABEL: test_blendvps:
; HASWELL: # BB#0:
; HASWELL-NEXT: vblendvps %ymm2, %ymm1, %ymm0, %ymm0 # sched: [2:2.00]
-; HASWELL-NEXT: vblendvps %ymm2, (%rdi), %ymm0, %ymm0 # sched: [6:2.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vblendvps %ymm2, (%rdi), %ymm0, %ymm0 # sched: [2:2.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_blendvps:
; BTVER2: # BB#0:
@@ -506,8 +506,8 @@ define <8 x float> @test_broadcastf128(<4 x float> *%a0) {
;
; HASWELL-LABEL: test_broadcastf128:
; HASWELL: # BB#0:
-; HASWELL-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] sched: [4:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] sched: [1:0.50]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_broadcastf128:
; BTVER2: # BB#0:
@@ -536,8 +536,8 @@ define <4 x double> @test_broadcastsd_ymm(double *%a0) {
;
; HASWELL-LABEL: test_broadcastsd_ymm:
; HASWELL: # BB#0:
-; HASWELL-NEXT: vbroadcastsd (%rdi), %ymm0 # sched: [5:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vbroadcastsd (%rdi), %ymm0 # sched: [1:0.50]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_broadcastsd_ymm:
; BTVER2: # BB#0:
@@ -567,8 +567,8 @@ define <4 x float> @test_broadcastss(float *%a0) {
;
; HASWELL-LABEL: test_broadcastss:
; HASWELL: # BB#0:
-; HASWELL-NEXT: vbroadcastss (%rdi), %xmm0 # sched: [4:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vbroadcastss (%rdi), %xmm0 # sched: [1:0.50]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_broadcastss:
; BTVER2: # BB#0:
@@ -598,8 +598,8 @@ define <8 x float> @test_broadcastss_ymm(float *%a0) {
;
; HASWELL-LABEL: test_broadcastss_ymm:
; HASWELL: # BB#0:
-; HASWELL-NEXT: vbroadcastss (%rdi), %ymm0 # sched: [5:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vbroadcastss (%rdi), %ymm0 # sched: [1:0.50]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_broadcastss_ymm:
; BTVER2: # BB#0:
@@ -634,9 +634,9 @@ define <4 x double> @test_cmppd(<4 x double> %a0, <4 x double> %a1, <4 x double>
; HASWELL-LABEL: test_cmppd:
; HASWELL: # BB#0:
; HASWELL-NEXT: vcmpeqpd %ymm1, %ymm0, %ymm1 # sched: [3:1.00]
-; HASWELL-NEXT: vcmpeqpd (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
+; HASWELL-NEXT: vcmpeqpd (%rdi), %ymm0, %ymm0 # sched: [3:1.00]
; HASWELL-NEXT: vorpd %ymm0, %ymm1, %ymm0 # sched: [1:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_cmppd:
; BTVER2: # BB#0:
@@ -679,9 +679,9 @@ define <8 x float> @test_cmpps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a
; HASWELL-LABEL: test_cmpps:
; HASWELL: # BB#0:
; HASWELL-NEXT: vcmpeqps %ymm1, %ymm0, %ymm1 # sched: [3:1.00]
-; HASWELL-NEXT: vcmpeqps (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
+; HASWELL-NEXT: vcmpeqps (%rdi), %ymm0, %ymm0 # sched: [3:1.00]
; HASWELL-NEXT: vorps %ymm0, %ymm1, %ymm0 # sched: [1:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_cmpps:
; BTVER2: # BB#0:
@@ -724,9 +724,9 @@ define <4 x double> @test_cvtdq2pd(<4 x i32> %a0, <4 x i32> *%a1) {
; HASWELL-LABEL: test_cvtdq2pd:
; HASWELL: # BB#0:
; HASWELL-NEXT: vcvtdq2pd %xmm0, %ymm0 # sched: [6:1.00]
-; HASWELL-NEXT: vcvtdq2pd (%rdi), %ymm1 # sched: [8:1.00]
+; HASWELL-NEXT: vcvtdq2pd (%rdi), %ymm1 # sched: [6:1.00]
; HASWELL-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_cvtdq2pd:
; BTVER2: # BB#0:
@@ -767,10 +767,10 @@ define <8 x float> @test_cvtdq2ps(<8 x i32> %a0, <8 x i32> *%a1) {
;
; HASWELL-LABEL: test_cvtdq2ps:
; HASWELL: # BB#0:
-; HASWELL-NEXT: vcvtdq2ps %ymm0, %ymm0 # sched: [4:1.00]
-; HASWELL-NEXT: vcvtdq2ps (%rdi), %ymm1 # sched: [8:1.00]
+; HASWELL-NEXT: vcvtdq2ps %ymm0, %ymm0 # sched: [3:1.00]
+; HASWELL-NEXT: vcvtdq2ps (%rdi), %ymm1 # sched: [3:1.00]
; HASWELL-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_cvtdq2ps:
; BTVER2: # BB#0:
@@ -810,9 +810,9 @@ define <8 x i32> @test_cvtpd2dq(<4 x double> %a0, <4 x double> *%a1) {
; HASWELL-LABEL: test_cvtpd2dq:
; HASWELL: # BB#0:
; HASWELL-NEXT: vcvttpd2dq %ymm0, %xmm0 # sched: [6:1.00]
-; HASWELL-NEXT: vcvttpd2dqy (%rdi), %xmm1 # sched: [10:1.00]
+; HASWELL-NEXT: vcvttpd2dqy (%rdi), %xmm1 # sched: [7:1.00]
; HASWELL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_cvtpd2dq:
; BTVER2: # BB#0:
@@ -851,10 +851,10 @@ define <8 x float> @test_cvtpd2ps(<4 x double> %a0, <4 x double> *%a1) {
;
; HASWELL-LABEL: test_cvtpd2ps:
; HASWELL: # BB#0:
-; HASWELL-NEXT: vcvtpd2ps %ymm0, %xmm0 # sched: [5:1.00]
-; HASWELL-NEXT: vcvtpd2psy (%rdi), %xmm1 # sched: [9:1.00]
+; HASWELL-NEXT: vcvtpd2ps %ymm0, %xmm0 # sched: [6:1.00]
+; HASWELL-NEXT: vcvtpd2psy (%rdi), %xmm1 # sched: [7:1.00]
; HASWELL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_cvtpd2ps:
; BTVER2: # BB#0:
@@ -894,9 +894,9 @@ define <8 x i32> @test_cvtps2dq(<8 x float> %a0, <8 x float> *%a1) {
; HASWELL-LABEL: test_cvtps2dq:
; HASWELL: # BB#0:
; HASWELL-NEXT: vcvttps2dq %ymm0, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT: vcvttps2dq (%rdi), %ymm1 # sched: [7:1.00]
+; HASWELL-NEXT: vcvttps2dq (%rdi), %ymm1 # sched: [3:1.00]
; HASWELL-NEXT: vorps %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_cvtps2dq:
; BTVER2: # BB#0:
@@ -933,9 +933,9 @@ define <4 x double> @test_divpd(<4 x double> %a0, <4 x double> %a1, <4 x double>
;
; HASWELL-LABEL: test_divpd:
; HASWELL: # BB#0:
-; HASWELL-NEXT: vdivpd %ymm1, %ymm0, %ymm0 # sched: [27:2.00]
-; HASWELL-NEXT: vdivpd (%rdi), %ymm0, %ymm0 # sched: [31:2.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vdivpd %ymm1, %ymm0, %ymm0 # sched: [35:2.00]
+; HASWELL-NEXT: vdivpd (%rdi), %ymm0, %ymm0 # sched: [35:2.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_divpd:
; BTVER2: # BB#0:
@@ -969,9 +969,9 @@ define <8 x float> @test_divps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a
;
; HASWELL-LABEL: test_divps:
; HASWELL: # BB#0:
-; HASWELL-NEXT: vdivps %ymm1, %ymm0, %ymm0 # sched: [19:2.00]
-; HASWELL-NEXT: vdivps (%rdi), %ymm0, %ymm0 # sched: [23:2.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vdivps %ymm1, %ymm0, %ymm0 # sched: [21:2.00]
+; HASWELL-NEXT: vdivps (%rdi), %ymm0, %ymm0 # sched: [21:2.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_divps:
; BTVER2: # BB#0:
@@ -1006,8 +1006,8 @@ define <8 x float> @test_dpps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a2
; HASWELL-LABEL: test_dpps:
; HASWELL: # BB#0:
; HASWELL-NEXT: vdpps $7, %ymm1, %ymm0, %ymm0 # sched: [14:2.00]
-; HASWELL-NEXT: vdpps $7, (%rdi), %ymm0, %ymm0 # sched: [18:2.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vdpps $7, (%rdi), %ymm0, %ymm0 # sched: [14:2.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_dpps:
; BTVER2: # BB#0:
@@ -1045,9 +1045,9 @@ define <4 x float> @test_extractf128(<8 x float> %a0, <8 x float> %a1, <4 x floa
; HASWELL-LABEL: test_extractf128:
; HASWELL: # BB#0:
; HASWELL-NEXT: vextractf128 $1, %ymm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT: vextractf128 $1, %ymm1, (%rdi) # sched: [4:1.00]
-; HASWELL-NEXT: vzeroupper # sched: [1:?]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vextractf128 $1, %ymm1, (%rdi) # sched: [1:1.00]
+; HASWELL-NEXT: vzeroupper # sched: [4:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_extractf128:
; BTVER2: # BB#0:
@@ -1083,8 +1083,8 @@ define <4 x double> @test_haddpd(<4 x double> %a0, <4 x double> %a1, <4 x double
; HASWELL-LABEL: test_haddpd:
; HASWELL: # BB#0:
; HASWELL-NEXT: vhaddpd %ymm1, %ymm0, %ymm0 # sched: [5:2.00]
-; HASWELL-NEXT: vhaddpd (%rdi), %ymm0, %ymm0 # sched: [9:2.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vhaddpd (%rdi), %ymm0, %ymm0 # sched: [5:2.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_haddpd:
; BTVER2: # BB#0:
@@ -1120,8 +1120,8 @@ define <8 x float> @test_haddps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%
; HASWELL-LABEL: test_haddps:
; HASWELL: # BB#0:
; HASWELL-NEXT: vhaddps %ymm1, %ymm0, %ymm0 # sched: [5:2.00]
-; HASWELL-NEXT: vhaddps (%rdi), %ymm0, %ymm0 # sched: [9:2.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vhaddps (%rdi), %ymm0, %ymm0 # sched: [5:2.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_haddps:
; BTVER2: # BB#0:
@@ -1157,8 +1157,8 @@ define <4 x double> @test_hsubpd(<4 x double> %a0, <4 x double> %a1, <4 x double
; HASWELL-LABEL: test_hsubpd:
; HASWELL: # BB#0:
; HASWELL-NEXT: vhsubpd %ymm1, %ymm0, %ymm0 # sched: [5:2.00]
-; HASWELL-NEXT: vhsubpd (%rdi), %ymm0, %ymm0 # sched: [9:2.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vhsubpd (%rdi), %ymm0, %ymm0 # sched: [5:2.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_hsubpd:
; BTVER2: # BB#0:
@@ -1194,8 +1194,8 @@ define <8 x float> @test_hsubps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%
; HASWELL-LABEL: test_hsubps:
; HASWELL: # BB#0:
; HASWELL-NEXT: vhsubps %ymm1, %ymm0, %ymm0 # sched: [5:2.00]
-; HASWELL-NEXT: vhsubps (%rdi), %ymm0, %ymm0 # sched: [9:2.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vhsubps (%rdi), %ymm0, %ymm0 # sched: [5:2.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_hsubps:
; BTVER2: # BB#0:
@@ -1233,9 +1233,9 @@ define <8 x float> @test_insertf128(<8 x float> %a0, <4 x float> %a1, <4 x float
; HASWELL-LABEL: test_insertf128:
; HASWELL: # BB#0:
; HASWELL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 # sched: [3:1.00]
-; HASWELL-NEXT: vinsertf128 $1, (%rdi), %ymm0, %ymm0 # sched: [3:1.00]
+; HASWELL-NEXT: vinsertf128 $1, (%rdi), %ymm0, %ymm0 # sched: [1:0.50]
; HASWELL-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_insertf128:
; BTVER2: # BB#0:
@@ -1272,8 +1272,8 @@ define <32 x i8> @test_lddqu(i8* %a0) {
;
; HASWELL-LABEL: test_lddqu:
; HASWELL: # BB#0:
-; HASWELL-NEXT: vlddqu (%rdi), %ymm0 # sched: [4:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vlddqu (%rdi), %ymm0 # sched: [1:0.50]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_lddqu:
; BTVER2: # BB#0:
@@ -1306,10 +1306,10 @@ define <2 x double> @test_maskmovpd(i8* %a0, <2 x i64> %a1, <2 x double> %a2) {
;
; HASWELL-LABEL: test_maskmovpd:
; HASWELL: # BB#0:
-; HASWELL-NEXT: vmaskmovpd (%rdi), %xmm0, %xmm2 # sched: [4:2.00]
-; HASWELL-NEXT: vmaskmovpd %xmm1, %xmm0, (%rdi) # sched: [13:1.00]
+; HASWELL-NEXT: vmaskmovpd (%rdi), %xmm0, %xmm2 # sched: [2:2.00]
+; HASWELL-NEXT: vmaskmovpd %xmm1, %xmm0, (%rdi) # sched: [4:1.00]
; HASWELL-NEXT: vmovapd %xmm2, %xmm0 # sched: [1:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_maskmovpd:
; BTVER2: # BB#0:
@@ -1348,10 +1348,10 @@ define <4 x double> @test_maskmovpd_ymm(i8* %a0, <4 x i64> %a1, <4 x double> %a2
;
; HASWELL-LABEL: test_maskmovpd_ymm:
; HASWELL: # BB#0:
-; HASWELL-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm2 # sched: [4:2.00]
-; HASWELL-NEXT: vmaskmovpd %ymm1, %ymm0, (%rdi) # sched: [14:1.00]
+; HASWELL-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm2 # sched: [2:2.00]
+; HASWELL-NEXT: vmaskmovpd %ymm1, %ymm0, (%rdi) # sched: [4:1.00]
; HASWELL-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_maskmovpd_ymm:
; BTVER2: # BB#0:
@@ -1390,10 +1390,10 @@ define <4 x float> @test_maskmovps(i8* %a0, <4 x i32> %a1, <4 x float> %a2) {
;
; HASWELL-LABEL: test_maskmovps:
; HASWELL: # BB#0:
-; HASWELL-NEXT: vmaskmovps (%rdi), %xmm0, %xmm2 # sched: [4:2.00]
-; HASWELL-NEXT: vmaskmovps %xmm1, %xmm0, (%rdi) # sched: [13:1.00]
+; HASWELL-NEXT: vmaskmovps (%rdi), %xmm0, %xmm2 # sched: [2:2.00]
+; HASWELL-NEXT: vmaskmovps %xmm1, %xmm0, (%rdi) # sched: [4:1.00]
; HASWELL-NEXT: vmovaps %xmm2, %xmm0 # sched: [1:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_maskmovps:
; BTVER2: # BB#0:
@@ -1432,10 +1432,10 @@ define <8 x float> @test_maskmovps_ymm(i8* %a0, <8 x i32> %a1, <8 x float> %a2)
;
; HASWELL-LABEL: test_maskmovps_ymm:
; HASWELL: # BB#0:
-; HASWELL-NEXT: vmaskmovps (%rdi), %ymm0, %ymm2 # sched: [4:2.00]
-; HASWELL-NEXT: vmaskmovps %ymm1, %ymm0, (%rdi) # sched: [14:1.00]
+; HASWELL-NEXT: vmaskmovps (%rdi), %ymm0, %ymm2 # sched: [2:2.00]
+; HASWELL-NEXT: vmaskmovps %ymm1, %ymm0, (%rdi) # sched: [4:1.00]
; HASWELL-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_maskmovps_ymm:
; BTVER2: # BB#0:
@@ -1473,8 +1473,8 @@ define <4 x double> @test_maxpd(<4 x double> %a0, <4 x double> %a1, <4 x double>
; HASWELL-LABEL: test_maxpd:
; HASWELL: # BB#0:
; HASWELL-NEXT: vmaxpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT: vmaxpd (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vmaxpd (%rdi), %ymm0, %ymm0 # sched: [3:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_maxpd:
; BTVER2: # BB#0:
@@ -1510,8 +1510,8 @@ define <8 x float> @test_maxps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a
; HASWELL-LABEL: test_maxps:
; HASWELL: # BB#0:
; HASWELL-NEXT: vmaxps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT: vmaxps (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vmaxps (%rdi), %ymm0, %ymm0 # sched: [3:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_maxps:
; BTVER2: # BB#0:
@@ -1547,8 +1547,8 @@ define <4 x double> @test_minpd(<4 x double> %a0, <4 x double> %a1, <4 x double>
; HASWELL-LABEL: test_minpd:
; HASWELL: # BB#0:
; HASWELL-NEXT: vminpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT: vminpd (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vminpd (%rdi), %ymm0, %ymm0 # sched: [3:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_minpd:
; BTVER2: # BB#0:
@@ -1584,8 +1584,8 @@ define <8 x float> @test_minps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a
; HASWELL-LABEL: test_minps:
; HASWELL: # BB#0:
; HASWELL-NEXT: vminps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT: vminps (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vminps (%rdi), %ymm0, %ymm0 # sched: [3:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_minps:
; BTVER2: # BB#0:
@@ -1622,10 +1622,10 @@ define <4 x double> @test_movapd(<4 x double> *%a0, <4 x double> *%a1) {
;
; HASWELL-LABEL: test_movapd:
; HASWELL: # BB#0:
-; HASWELL-NEXT: vmovapd (%rdi), %ymm0 # sched: [4:0.50]
+; HASWELL-NEXT: vmovapd (%rdi), %ymm0 # sched: [1:0.50]
; HASWELL-NEXT: vaddpd %ymm0, %ymm0, %ymm0 # sched: [3:1.00]
; HASWELL-NEXT: vmovapd %ymm0, (%rsi) # sched: [1:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_movapd:
; BTVER2: # BB#0:
@@ -1663,10 +1663,10 @@ define <8 x float> @test_movaps(<8 x float> *%a0, <8 x float> *%a1) {
;
; HASWELL-LABEL: test_movaps:
; HASWELL: # BB#0:
-; HASWELL-NEXT: vmovaps (%rdi), %ymm0 # sched: [4:0.50]
+; HASWELL-NEXT: vmovaps (%rdi), %ymm0 # sched: [1:0.50]
; HASWELL-NEXT: vaddps %ymm0, %ymm0, %ymm0 # sched: [3:1.00]
; HASWELL-NEXT: vmovaps %ymm0, (%rsi) # sched: [1:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_movaps:
; BTVER2: # BB#0:
@@ -1705,9 +1705,9 @@ define <4 x double> @test_movddup(<4 x double> %a0, <4 x double> *%a1) {
; HASWELL-LABEL: test_movddup:
; HASWELL: # BB#0:
; HASWELL-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2] sched: [1:1.00]
-; HASWELL-NEXT: vmovddup {{.*#+}} ymm1 = mem[0,0,2,2] sched: [4:0.50]
+; HASWELL-NEXT: vmovddup {{.*#+}} ymm1 = mem[0,0,2,2] sched: [1:0.50]
; HASWELL-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_movddup:
; BTVER2: # BB#0:
@@ -1744,9 +1744,9 @@ define i32 @test_movmskpd(<4 x double> %a0) {
;
; HASWELL-LABEL: test_movmskpd:
; HASWELL: # BB#0:
-; HASWELL-NEXT: vmovmskpd %ymm0, %eax # sched: [2:1.00]
-; HASWELL-NEXT: vzeroupper # sched: [1:?]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vmovmskpd %ymm0, %eax # sched: [3:1.00]
+; HASWELL-NEXT: vzeroupper # sched: [4:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_movmskpd:
; BTVER2: # BB#0:
@@ -1778,9 +1778,9 @@ define i32 @test_movmskps(<8 x float> %a0) {
;
; HASWELL-LABEL: test_movmskps:
; HASWELL: # BB#0:
-; HASWELL-NEXT: vmovmskps %ymm0, %eax # sched: [2:1.00]
-; HASWELL-NEXT: vzeroupper # sched: [1:?]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vmovmskps %ymm0, %eax # sched: [3:1.00]
+; HASWELL-NEXT: vzeroupper # sched: [4:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_movmskps:
; BTVER2: # BB#0:
@@ -1814,7 +1814,7 @@ define <4 x double> @test_movntpd(<4 x double> %a0, <4 x double> *%a1) {
; HASWELL: # BB#0:
; HASWELL-NEXT: vaddpd %ymm0, %ymm0, %ymm0 # sched: [3:1.00]
; HASWELL-NEXT: vmovntpd %ymm0, (%rdi) # sched: [1:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_movntpd:
; BTVER2: # BB#0:
@@ -1849,7 +1849,7 @@ define <8 x float> @test_movntps(<8 x float> %a0, <8 x float> *%a1) {
; HASWELL: # BB#0:
; HASWELL-NEXT: vaddps %ymm0, %ymm0, %ymm0 # sched: [3:1.00]
; HASWELL-NEXT: vmovntps %ymm0, (%rdi) # sched: [1:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_movntps:
; BTVER2: # BB#0:
@@ -1885,9 +1885,9 @@ define <8 x float> @test_movshdup(<8 x float> %a0, <8 x float> *%a1) {
; HASWELL-LABEL: test_movshdup:
; HASWELL: # BB#0:
; HASWELL-NEXT: vmovshdup {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7] sched: [1:1.00]
-; HASWELL-NEXT: vmovshdup {{.*#+}} ymm1 = mem[1,1,3,3,5,5,7,7] sched: [4:0.50]
+; HASWELL-NEXT: vmovshdup {{.*#+}} ymm1 = mem[1,1,3,3,5,5,7,7] sched: [1:0.50]
; HASWELL-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_movshdup:
; BTVER2: # BB#0:
@@ -1927,9 +1927,9 @@ define <8 x float> @test_movsldup(<8 x float> %a0, <8 x float> *%a1) {
; HASWELL-LABEL: test_movsldup:
; HASWELL: # BB#0:
; HASWELL-NEXT: vmovsldup {{.*#+}} ymm0 = ymm0[0,0,2,2,4,4,6,6] sched: [1:1.00]
-; HASWELL-NEXT: vmovsldup {{.*#+}} ymm1 = mem[0,0,2,2,4,4,6,6] sched: [4:0.50]
+; HASWELL-NEXT: vmovsldup {{.*#+}} ymm1 = mem[0,0,2,2,4,4,6,6] sched: [1:0.50]
; HASWELL-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_movsldup:
; BTVER2: # BB#0:
@@ -1970,10 +1970,10 @@ define <4 x double> @test_movupd(<4 x double> *%a0, <4 x double> *%a1) {
;
; HASWELL-LABEL: test_movupd:
; HASWELL: # BB#0:
-; HASWELL-NEXT: vmovupd (%rdi), %ymm0 # sched: [4:0.50]
+; HASWELL-NEXT: vmovupd (%rdi), %ymm0 # sched: [1:0.50]
; HASWELL-NEXT: vaddpd %ymm0, %ymm0, %ymm0 # sched: [3:1.00]
; HASWELL-NEXT: vmovupd %ymm0, (%rsi) # sched: [1:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_movupd:
; BTVER2: # BB#0:
@@ -2013,10 +2013,10 @@ define <8 x float> @test_movups(<8 x float> *%a0, <8 x float> *%a1) {
;
; HASWELL-LABEL: test_movups:
; HASWELL: # BB#0:
-; HASWELL-NEXT: vmovups (%rdi), %ymm0 # sched: [4:0.50]
+; HASWELL-NEXT: vmovups (%rdi), %ymm0 # sched: [1:0.50]
; HASWELL-NEXT: vaddps %ymm0, %ymm0, %ymm0 # sched: [3:1.00]
; HASWELL-NEXT: vmovups %ymm0, (%rsi) # sched: [1:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_movups:
; BTVER2: # BB#0:
@@ -2052,9 +2052,9 @@ define <4 x double> @test_mulpd(<4 x double> %a0, <4 x double> %a1, <4 x double>
;
; HASWELL-LABEL: test_mulpd:
; HASWELL: # BB#0:
-; HASWELL-NEXT: vmulpd %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
-; HASWELL-NEXT: vmulpd (%rdi), %ymm0, %ymm0 # sched: [9:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vmulpd %ymm1, %ymm0, %ymm0 # sched: [5:0.50]
+; HASWELL-NEXT: vmulpd (%rdi), %ymm0, %ymm0 # sched: [5:0.50]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_mulpd:
; BTVER2: # BB#0:
@@ -2088,9 +2088,9 @@ define <8 x float> @test_mulps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a
;
; HASWELL-LABEL: test_mulps:
; HASWELL: # BB#0:
-; HASWELL-NEXT: vmulps %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
-; HASWELL-NEXT: vmulps (%rdi), %ymm0, %ymm0 # sched: [9:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vmulps %ymm1, %ymm0, %ymm0 # sched: [5:0.50]
+; HASWELL-NEXT: vmulps (%rdi), %ymm0, %ymm0 # sched: [5:0.50]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_mulps:
; BTVER2: # BB#0:
@@ -2127,9 +2127,9 @@ define <4 x double> @orpd(<4 x double> %a0, <4 x double> %a1, <4 x double> *%a2)
; HASWELL-LABEL: orpd:
; HASWELL: # BB#0:
; HASWELL-NEXT: vorpd %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
-; HASWELL-NEXT: vorpd (%rdi), %ymm0, %ymm0 # sched: [5:1.00]
+; HASWELL-NEXT: vorpd (%rdi), %ymm0, %ymm0 # sched: [1:1.00]
; HASWELL-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: orpd:
; BTVER2: # BB#0:
@@ -2173,9 +2173,9 @@ define <8 x float> @test_orps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a2
; HASWELL-LABEL: test_orps:
; HASWELL: # BB#0:
; HASWELL-NEXT: vorps %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
-; HASWELL-NEXT: vorps (%rdi), %ymm0, %ymm0 # sched: [5:1.00]
+; HASWELL-NEXT: vorps (%rdi), %ymm0, %ymm0 # sched: [1:1.00]
; HASWELL-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_orps:
; BTVER2: # BB#0:
@@ -2219,9 +2219,9 @@ define <2 x double> @test_permilpd(<2 x double> %a0, <2 x double> *%a1) {
; HASWELL-LABEL: test_permilpd:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] sched: [1:1.00]
-; HASWELL-NEXT: vpermilpd {{.*#+}} xmm1 = mem[1,0] sched: [5:1.00]
+; HASWELL-NEXT: vpermilpd {{.*#+}} xmm1 = mem[1,0] sched: [1:1.00]
; HASWELL-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_permilpd:
; BTVER2: # BB#0:
@@ -2261,9 +2261,9 @@ define <4 x double> @test_permilpd_ymm(<4 x double> %a0, <4 x double> *%a1) {
; HASWELL-LABEL: test_permilpd_ymm:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,2,3] sched: [1:1.00]
-; HASWELL-NEXT: vpermilpd {{.*#+}} ymm1 = mem[1,0,2,3] sched: [5:1.00]
+; HASWELL-NEXT: vpermilpd {{.*#+}} ymm1 = mem[1,0,2,3] sched: [1:1.00]
; HASWELL-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_permilpd_ymm:
; BTVER2: # BB#0:
@@ -2303,9 +2303,9 @@ define <4 x float> @test_permilps(<4 x float> %a0, <4 x float> *%a1) {
; HASWELL-LABEL: test_permilps:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0] sched: [1:1.00]
-; HASWELL-NEXT: vpermilps {{.*#+}} xmm1 = mem[3,2,1,0] sched: [5:1.00]
+; HASWELL-NEXT: vpermilps {{.*#+}} xmm1 = mem[3,2,1,0] sched: [1:1.00]
; HASWELL-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_permilps:
; BTVER2: # BB#0:
@@ -2345,9 +2345,9 @@ define <8 x float> @test_permilps_ymm(<8 x float> %a0, <8 x float> *%a1) {
; HASWELL-LABEL: test_permilps_ymm:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] sched: [1:1.00]
-; HASWELL-NEXT: vpermilps {{.*#+}} ymm1 = mem[3,2,1,0,7,6,5,4] sched: [5:1.00]
+; HASWELL-NEXT: vpermilps {{.*#+}} ymm1 = mem[3,2,1,0,7,6,5,4] sched: [1:1.00]
; HASWELL-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_permilps_ymm:
; BTVER2: # BB#0:
@@ -2385,8 +2385,8 @@ define <2 x double> @test_permilvarpd(<2 x double> %a0, <2 x i64> %a1, <2 x i64>
; HASWELL-LABEL: test_permilvarpd:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpermilpd %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
-; HASWELL-NEXT: vpermilpd (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpermilpd (%rdi), %xmm0, %xmm0 # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_permilvarpd:
; BTVER2: # BB#0:
@@ -2422,8 +2422,8 @@ define <4 x double> @test_permilvarpd_ymm(<4 x double> %a0, <4 x i64> %a1, <4 x
; HASWELL-LABEL: test_permilvarpd_ymm:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpermilpd %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
-; HASWELL-NEXT: vpermilpd (%rdi), %ymm0, %ymm0 # sched: [5:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpermilpd (%rdi), %ymm0, %ymm0 # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_permilvarpd_ymm:
; BTVER2: # BB#0:
@@ -2459,8 +2459,8 @@ define <4 x float> @test_permilvarps(<4 x float> %a0, <4 x i32> %a1, <4 x i32> *
; HASWELL-LABEL: test_permilvarps:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpermilps %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
-; HASWELL-NEXT: vpermilps (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpermilps (%rdi), %xmm0, %xmm0 # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_permilvarps:
; BTVER2: # BB#0:
@@ -2496,8 +2496,8 @@ define <8 x float> @test_permilvarps_ymm(<8 x float> %a0, <8 x i32> %a1, <8 x i3
; HASWELL-LABEL: test_permilvarps_ymm:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpermilps %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
-; HASWELL-NEXT: vpermilps (%rdi), %ymm0, %ymm0 # sched: [5:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpermilps (%rdi), %ymm0, %ymm0 # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_permilvarps_ymm:
; BTVER2: # BB#0:
@@ -2535,9 +2535,9 @@ define <8 x float> @test_rcpps(<8 x float> %a0, <8 x float> *%a1) {
; HASWELL-LABEL: test_rcpps:
; HASWELL: # BB#0:
; HASWELL-NEXT: vrcpps (%rdi), %ymm1 # sched: [11:2.00]
-; HASWELL-NEXT: vrcpps %ymm0, %ymm0 # sched: [7:2.00]
+; HASWELL-NEXT: vrcpps %ymm0, %ymm0 # sched: [11:2.00]
; HASWELL-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_rcpps:
; BTVER2: # BB#0:
@@ -2577,10 +2577,10 @@ define <4 x double> @test_roundpd(<4 x double> %a0, <4 x double> *%a1) {
;
; HASWELL-LABEL: test_roundpd:
; HASWELL: # BB#0:
-; HASWELL-NEXT: vroundpd $7, %ymm0, %ymm0 # sched: [6:2.00]
-; HASWELL-NEXT: vroundpd $7, (%rdi), %ymm1 # sched: [10:2.00]
+; HASWELL-NEXT: vroundpd $7, %ymm0, %ymm0 # sched: [5:1.25]
+; HASWELL-NEXT: vroundpd $7, (%rdi), %ymm1 # sched: [6:2.00]
; HASWELL-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_roundpd:
; BTVER2: # BB#0:
@@ -2620,10 +2620,10 @@ define <8 x float> @test_roundps(<8 x float> %a0, <8 x float> *%a1) {
;
; HASWELL-LABEL: test_roundps:
; HASWELL: # BB#0:
-; HASWELL-NEXT: vroundps $7, %ymm0, %ymm0 # sched: [6:2.00]
-; HASWELL-NEXT: vroundps $7, (%rdi), %ymm1 # sched: [10:2.00]
+; HASWELL-NEXT: vroundps $7, %ymm0, %ymm0 # sched: [5:1.25]
+; HASWELL-NEXT: vroundps $7, (%rdi), %ymm1 # sched: [6:2.00]
; HASWELL-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_roundps:
; BTVER2: # BB#0:
@@ -2664,9 +2664,9 @@ define <8 x float> @test_rsqrtps(<8 x float> %a0, <8 x float> *%a1) {
; HASWELL-LABEL: test_rsqrtps:
; HASWELL: # BB#0:
; HASWELL-NEXT: vrsqrtps (%rdi), %ymm1 # sched: [11:2.00]
-; HASWELL-NEXT: vrsqrtps %ymm0, %ymm0 # sched: [7:2.00]
+; HASWELL-NEXT: vrsqrtps %ymm0, %ymm0 # sched: [11:2.00]
; HASWELL-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_rsqrtps:
; BTVER2: # BB#0:
@@ -2707,9 +2707,9 @@ define <4 x double> @test_shufpd(<4 x double> %a0, <4 x double> %a1, <4 x double
; HASWELL-LABEL: test_shufpd:
; HASWELL: # BB#0:
; HASWELL-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm1[0],ymm0[2],ymm1[3] sched: [1:1.00]
-; HASWELL-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[1],mem[0],ymm1[2],mem[3] sched: [5:1.00]
+; HASWELL-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[1],mem[0],ymm1[2],mem[3] sched: [1:1.00]
; HASWELL-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_shufpd:
; BTVER2: # BB#0:
@@ -2747,8 +2747,8 @@ define <8 x float> @test_shufps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%
; HASWELL-LABEL: test_shufps:
; HASWELL: # BB#0:
; HASWELL-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0],ymm1[0,0],ymm0[4,4],ymm1[4,4] sched: [1:1.00]
-; HASWELL-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,3],mem[0,0],ymm0[4,7],mem[4,4] sched: [5:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,3],mem[0,0],ymm0[4,7],mem[4,4] sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_shufps:
; BTVER2: # BB#0:
@@ -2784,10 +2784,10 @@ define <4 x double> @test_sqrtpd(<4 x double> %a0, <4 x double> *%a1) {
;
; HASWELL-LABEL: test_sqrtpd:
; HASWELL: # BB#0:
-; HASWELL-NEXT: vsqrtpd (%rdi), %ymm1 # sched: [32:2.00]
-; HASWELL-NEXT: vsqrtpd %ymm0, %ymm0 # sched: [28:2.00]
+; HASWELL-NEXT: vsqrtpd (%rdi), %ymm1 # sched: [35:2.00]
+; HASWELL-NEXT: vsqrtpd %ymm0, %ymm0 # sched: [35:2.00]
; HASWELL-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_sqrtpd:
; BTVER2: # BB#0:
@@ -2827,10 +2827,10 @@ define <8 x float> @test_sqrtps(<8 x float> %a0, <8 x float> *%a1) {
;
; HASWELL-LABEL: test_sqrtps:
; HASWELL: # BB#0:
-; HASWELL-NEXT: vsqrtps (%rdi), %ymm1 # sched: [23:2.00]
-; HASWELL-NEXT: vsqrtps %ymm0, %ymm0 # sched: [19:2.00]
+; HASWELL-NEXT: vsqrtps (%rdi), %ymm1 # sched: [21:2.00]
+; HASWELL-NEXT: vsqrtps %ymm0, %ymm0 # sched: [21:2.00]
; HASWELL-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_sqrtps:
; BTVER2: # BB#0:
@@ -2869,8 +2869,8 @@ define <4 x double> @test_subpd(<4 x double> %a0, <4 x double> %a1, <4 x double>
; HASWELL-LABEL: test_subpd:
; HASWELL: # BB#0:
; HASWELL-NEXT: vsubpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT: vsubpd (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vsubpd (%rdi), %ymm0, %ymm0 # sched: [3:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_subpd:
; BTVER2: # BB#0:
@@ -2905,8 +2905,8 @@ define <8 x float> @test_subps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a
; HASWELL-LABEL: test_subps:
; HASWELL: # BB#0:
; HASWELL-NEXT: vsubps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT: vsubps (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vsubps (%rdi), %ymm0, %ymm0 # sched: [3:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_subps:
; BTVER2: # BB#0:
@@ -2947,11 +2947,11 @@ define i32 @test_testpd(<2 x double> %a0, <2 x double> %a1, <2 x double> *%a2) {
; HASWELL-LABEL: test_testpd:
; HASWELL: # BB#0:
; HASWELL-NEXT: xorl %eax, %eax # sched: [1:0.25]
-; HASWELL-NEXT: vtestpd %xmm1, %xmm0 # sched: [1:0.33]
+; HASWELL-NEXT: vtestpd %xmm1, %xmm0 # sched: [1:1.00]
; HASWELL-NEXT: setb %al # sched: [1:0.50]
-; HASWELL-NEXT: vtestpd (%rdi), %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT: vtestpd (%rdi), %xmm0 # sched: [1:1.00]
; HASWELL-NEXT: adcl $0, %eax # sched: [2:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_testpd:
; BTVER2: # BB#0:
@@ -3002,12 +3002,12 @@ define i32 @test_testpd_ymm(<4 x double> %a0, <4 x double> %a1, <4 x double> *%a
; HASWELL-LABEL: test_testpd_ymm:
; HASWELL: # BB#0:
; HASWELL-NEXT: xorl %eax, %eax # sched: [1:0.25]
-; HASWELL-NEXT: vtestpd %ymm1, %ymm0 # sched: [1:0.33]
+; HASWELL-NEXT: vtestpd %ymm1, %ymm0 # sched: [1:1.00]
; HASWELL-NEXT: setb %al # sched: [1:0.50]
-; HASWELL-NEXT: vtestpd (%rdi), %ymm0 # sched: [5:0.50]
+; HASWELL-NEXT: vtestpd (%rdi), %ymm0 # sched: [1:1.00]
; HASWELL-NEXT: adcl $0, %eax # sched: [2:0.50]
-; HASWELL-NEXT: vzeroupper # sched: [1:?]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vzeroupper # sched: [4:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_testpd_ymm:
; BTVER2: # BB#0:
@@ -3057,11 +3057,11 @@ define i32 @test_testps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a2) {
; HASWELL-LABEL: test_testps:
; HASWELL: # BB#0:
; HASWELL-NEXT: xorl %eax, %eax # sched: [1:0.25]
-; HASWELL-NEXT: vtestps %xmm1, %xmm0 # sched: [1:0.33]
+; HASWELL-NEXT: vtestps %xmm1, %xmm0 # sched: [1:1.00]
; HASWELL-NEXT: setb %al # sched: [1:0.50]
-; HASWELL-NEXT: vtestps (%rdi), %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT: vtestps (%rdi), %xmm0 # sched: [1:1.00]
; HASWELL-NEXT: adcl $0, %eax # sched: [2:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_testps:
; BTVER2: # BB#0:
@@ -3112,12 +3112,12 @@ define i32 @test_testps_ymm(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a2)
; HASWELL-LABEL: test_testps_ymm:
; HASWELL: # BB#0:
; HASWELL-NEXT: xorl %eax, %eax # sched: [1:0.25]
-; HASWELL-NEXT: vtestps %ymm1, %ymm0 # sched: [1:0.33]
+; HASWELL-NEXT: vtestps %ymm1, %ymm0 # sched: [1:1.00]
; HASWELL-NEXT: setb %al # sched: [1:0.50]
-; HASWELL-NEXT: vtestps (%rdi), %ymm0 # sched: [5:0.50]
+; HASWELL-NEXT: vtestps (%rdi), %ymm0 # sched: [1:1.00]
; HASWELL-NEXT: adcl $0, %eax # sched: [2:0.50]
-; HASWELL-NEXT: vzeroupper # sched: [1:?]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vzeroupper # sched: [4:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_testps_ymm:
; BTVER2: # BB#0:
@@ -3163,9 +3163,9 @@ define <4 x double> @test_unpckhpd(<4 x double> %a0, <4 x double> %a1, <4 x doub
; HASWELL-LABEL: test_unpckhpd:
; HASWELL: # BB#0:
; HASWELL-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00]
-; HASWELL-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] sched: [5:1.00]
+; HASWELL-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] sched: [1:1.00]
; HASWELL-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_unpckhpd:
; BTVER2: # BB#0:
@@ -3203,8 +3203,8 @@ define <8 x float> @test_unpckhps(<8 x float> %a0, <8 x float> %a1, <8 x float>
; HASWELL-LABEL: test_unpckhps:
; HASWELL: # BB#0:
; HASWELL-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00]
-; HASWELL-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [5:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_unpckhps:
; BTVER2: # BB#0:
@@ -3241,9 +3241,9 @@ define <4 x double> @test_unpcklpd(<4 x double> %a0, <4 x double> %a1, <4 x doub
; HASWELL-LABEL: test_unpcklpd:
; HASWELL: # BB#0:
; HASWELL-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00]
-; HASWELL-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],mem[0],ymm1[2],mem[2] sched: [5:1.00]
+; HASWELL-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],mem[0],ymm1[2],mem[2] sched: [1:1.00]
; HASWELL-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_unpcklpd:
; BTVER2: # BB#0:
@@ -3281,8 +3281,8 @@ define <8 x float> @test_unpcklps(<8 x float> %a0, <8 x float> %a1, <8 x float>
; HASWELL-LABEL: test_unpcklps:
; HASWELL: # BB#0:
; HASWELL-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00]
-; HASWELL-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [5:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_unpcklps:
; BTVER2: # BB#0:
@@ -3319,9 +3319,9 @@ define <4 x double> @test_xorpd(<4 x double> %a0, <4 x double> %a1, <4 x double>
; HASWELL-LABEL: test_xorpd:
; HASWELL: # BB#0:
; HASWELL-NEXT: vxorpd %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
-; HASWELL-NEXT: vxorpd (%rdi), %ymm0, %ymm0 # sched: [5:1.00]
+; HASWELL-NEXT: vxorpd (%rdi), %ymm0, %ymm0 # sched: [1:1.00]
; HASWELL-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_xorpd:
; BTVER2: # BB#0:
@@ -3365,9 +3365,9 @@ define <8 x float> @test_xorps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a
; HASWELL-LABEL: test_xorps:
; HASWELL: # BB#0:
; HASWELL-NEXT: vxorps %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
-; HASWELL-NEXT: vxorps (%rdi), %ymm0, %ymm0 # sched: [5:1.00]
+; HASWELL-NEXT: vxorps (%rdi), %ymm0, %ymm0 # sched: [1:1.00]
; HASWELL-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_xorps:
; BTVER2: # BB#0:
@@ -3406,8 +3406,8 @@ define void @test_zeroall() {
;
; HASWELL-LABEL: test_zeroall:
; HASWELL: # BB#0:
-; HASWELL-NEXT: vzeroall # sched: [1:?]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vzeroall # sched: [16:16.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_zeroall:
; BTVER2: # BB#0:
@@ -3436,8 +3436,8 @@ define void @test_zeroupper() {
;
; HASWELL-LABEL: test_zeroupper:
; HASWELL: # BB#0:
-; HASWELL-NEXT: vzeroupper # sched: [1:?]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vzeroupper # sched: [4:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_zeroupper:
; BTVER2: # BB#0:
diff --git a/llvm/test/CodeGen/X86/avx2-schedule.ll b/llvm/test/CodeGen/X86/avx2-schedule.ll
index 971ebe5e711..7dbe2dd98e8 100644
--- a/llvm/test/CodeGen/X86/avx2-schedule.ll
+++ b/llvm/test/CodeGen/X86/avx2-schedule.ll
@@ -15,9 +15,9 @@ define <32 x i8> @test_pabsb(<32 x i8> %a0, <32 x i8> *%a1) {
; HASWELL-LABEL: test_pabsb:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpabsb %ymm0, %ymm0 # sched: [1:0.50]
-; HASWELL-NEXT: vpabsb (%rdi), %ymm1 # sched: [5:0.50]
+; HASWELL-NEXT: vpabsb (%rdi), %ymm1 # sched: [1:0.50]
; HASWELL-NEXT: vpor %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; ZNVER1-LABEL: test_pabsb:
; ZNVER1: # BB#0:
@@ -44,9 +44,9 @@ define <8 x i32> @test_pabsd(<8 x i32> %a0, <8 x i32> *%a1) {
; HASWELL-LABEL: test_pabsd:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpabsd %ymm0, %ymm0 # sched: [1:0.50]
-; HASWELL-NEXT: vpabsd (%rdi), %ymm1 # sched: [5:0.50]
+; HASWELL-NEXT: vpabsd (%rdi), %ymm1 # sched: [1:0.50]
; HASWELL-NEXT: vpor %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; ZNVER1-LABEL: test_pabsd:
; ZNVER1: # BB#0:
@@ -73,9 +73,9 @@ define <16 x i16> @test_pabsw(<16 x i16> %a0, <16 x i16> *%a1) {
; HASWELL-LABEL: test_pabsw:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpabsw %ymm0, %ymm0 # sched: [1:0.50]
-; HASWELL-NEXT: vpabsw (%rdi), %ymm1 # sched: [5:0.50]
+; HASWELL-NEXT: vpabsw (%rdi), %ymm1 # sched: [1:0.50]
; HASWELL-NEXT: vpor %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; ZNVER1-LABEL: test_pabsw:
; ZNVER1: # BB#0:
@@ -101,8 +101,8 @@ define <32 x i8> @test_paddb(<32 x i8> %a0, <32 x i8> %a1, <32 x i8> *%a2) {
; HASWELL-LABEL: test_paddb:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpaddb %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
-; HASWELL-NEXT: vpaddb (%rdi), %ymm0, %ymm0 # sched: [5:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpaddb (%rdi), %ymm0, %ymm0 # sched: [1:0.50]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; ZNVER1-LABEL: test_paddb:
; ZNVER1: # BB#0:
@@ -125,8 +125,8 @@ define <8 x i32> @test_paddd(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> *%a2) {
; HASWELL-LABEL: test_paddd:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpaddd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
-; HASWELL-NEXT: vpaddd (%rdi), %ymm0, %ymm0 # sched: [5:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpaddd (%rdi), %ymm0, %ymm0 # sched: [1:0.50]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; ZNVER1-LABEL: test_paddd:
; ZNVER1: # BB#0:
@@ -149,8 +149,8 @@ define <4 x i64> @test_paddq(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> *%a2) {
; HASWELL-LABEL: test_paddq:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
-; HASWELL-NEXT: vpaddq (%rdi), %ymm0, %ymm0 # sched: [5:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpaddq (%rdi), %ymm0, %ymm0 # sched: [1:0.50]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; ZNVER1-LABEL: test_paddq:
; ZNVER1: # BB#0:
@@ -173,8 +173,8 @@ define <16 x i16> @test_paddw(<16 x i16> %a0, <16 x i16> %a1, <16 x i16> *%a2) {
; HASWELL-LABEL: test_paddw:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpaddw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
-; HASWELL-NEXT: vpaddw (%rdi), %ymm0, %ymm0 # sched: [5:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpaddw (%rdi), %ymm0, %ymm0 # sched: [1:0.50]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; ZNVER1-LABEL: test_paddw:
; ZNVER1: # BB#0:
@@ -198,9 +198,9 @@ define <4 x i64> @test_pand(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> *%a2) {
; HASWELL-LABEL: test_pand:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpand %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
-; HASWELL-NEXT: vpand (%rdi), %ymm0, %ymm0 # sched: [5:0.50]
+; HASWELL-NEXT: vpand (%rdi), %ymm0, %ymm0 # sched: [1:0.50]
; HASWELL-NEXT: vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; ZNVER1-LABEL: test_pand:
; ZNVER1: # BB#0:
@@ -226,9 +226,9 @@ define <4 x i64> @test_pandn(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> *%a2) {
; HASWELL-LABEL: test_pandn:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpandn %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
-; HASWELL-NEXT: vpandn (%rdi), %ymm0, %ymm1 # sched: [5:0.50]
+; HASWELL-NEXT: vpandn (%rdi), %ymm0, %ymm1 # sched: [1:0.50]
; HASWELL-NEXT: vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; ZNVER1-LABEL: test_pandn:
; ZNVER1: # BB#0:
@@ -256,7 +256,7 @@ define <8 x i32> @test_pmulld(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> *%a2) {
; HASWELL: # BB#0:
; HASWELL-NEXT: vpmulld %ymm1, %ymm0, %ymm0 # sched: [10:2.00]
; HASWELL-NEXT: vpmulld (%rdi), %ymm0, %ymm0 # sched: [10:2.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; ZNVER1-LABEL: test_pmulld:
; ZNVER1: # BB#0:
@@ -279,8 +279,8 @@ define <16 x i16> @test_pmullw(<16 x i16> %a0, <16 x i16> %a1, <16 x i16> *%a2)
; HASWELL-LABEL: test_pmullw:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpmullw %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
-; HASWELL-NEXT: vpmullw (%rdi), %ymm0, %ymm0 # sched: [9:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpmullw (%rdi), %ymm0, %ymm0 # sched: [5:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; ZNVER1-LABEL: test_pmullw:
; ZNVER1: # BB#0:
@@ -304,9 +304,9 @@ define <4 x i64> @test_por(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> *%a2) {
; HASWELL-LABEL: test_por:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpor %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
-; HASWELL-NEXT: vpor (%rdi), %ymm0, %ymm0 # sched: [5:0.50]
+; HASWELL-NEXT: vpor (%rdi), %ymm0, %ymm0 # sched: [1:0.50]
; HASWELL-NEXT: vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; ZNVER1-LABEL: test_por:
; ZNVER1: # BB#0:
@@ -331,8 +331,8 @@ define <32 x i8> @test_psubb(<32 x i8> %a0, <32 x i8> %a1, <32 x i8> *%a2) {
; HASWELL-LABEL: test_psubb:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpsubb %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
-; HASWELL-NEXT: vpsubb (%rdi), %ymm0, %ymm0 # sched: [5:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpsubb (%rdi), %ymm0, %ymm0 # sched: [1:0.50]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; ZNVER1-LABEL: test_psubb:
; ZNVER1: # BB#0:
@@ -355,8 +355,8 @@ define <8 x i32> @test_psubd(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> *%a2) {
; HASWELL-LABEL: test_psubd:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpsubd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
-; HASWELL-NEXT: vpsubd (%rdi), %ymm0, %ymm0 # sched: [5:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpsubd (%rdi), %ymm0, %ymm0 # sched: [1:0.50]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; ZNVER1-LABEL: test_psubd:
; ZNVER1: # BB#0:
@@ -379,8 +379,8 @@ define <4 x i64> @test_psubq(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> *%a2) {
; HASWELL-LABEL: test_psubq:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpsubq %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
-; HASWELL-NEXT: vpsubq (%rdi), %ymm0, %ymm0 # sched: [5:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpsubq (%rdi), %ymm0, %ymm0 # sched: [1:0.50]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; ZNVER1-LABEL: test_psubq:
; ZNVER1: # BB#0:
@@ -403,8 +403,8 @@ define <16 x i16> @test_psubw(<16 x i16> %a0, <16 x i16> %a1, <16 x i16> *%a2) {
; HASWELL-LABEL: test_psubw:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpsubw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
-; HASWELL-NEXT: vpsubw (%rdi), %ymm0, %ymm0 # sched: [5:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpsubw (%rdi), %ymm0, %ymm0 # sched: [1:0.50]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; ZNVER1-LABEL: test_psubw:
; ZNVER1: # BB#0:
@@ -428,9 +428,9 @@ define <4 x i64> @test_pxor(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> *%a2) {
; HASWELL-LABEL: test_pxor:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpxor %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
-; HASWELL-NEXT: vpxor (%rdi), %ymm0, %ymm0 # sched: [5:0.50]
+; HASWELL-NEXT: vpxor (%rdi), %ymm0, %ymm0 # sched: [1:0.50]
; HASWELL-NEXT: vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; ZNVER1-LABEL: test_pxor:
; ZNVER1: # BB#0:
diff --git a/llvm/test/CodeGen/X86/avx512-cmp.ll b/llvm/test/CodeGen/X86/avx512-cmp.ll
index b5a13404a23..d2e95f692e4 100644
--- a/llvm/test/CodeGen/X86/avx512-cmp.ll
+++ b/llvm/test/CodeGen/X86/avx512-cmp.ll
@@ -126,11 +126,11 @@ entry:
define i32 @test8(i32 %a1, i32 %a2, i32 %a3) {
; ALL-LABEL: test8:
; ALL: ## BB#0:
-; ALL-NEXT: notl %edi
; ALL-NEXT: xorl $-2147483648, %esi ## imm = 0x80000000
; ALL-NEXT: testl %edx, %edx
; ALL-NEXT: movl $1, %eax
; ALL-NEXT: cmovel %eax, %edx
+; ALL-NEXT: notl %edi
; ALL-NEXT: orl %edi, %esi
; ALL-NEXT: cmovnel %edx, %eax
; ALL-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/avx512-cvt.ll b/llvm/test/CodeGen/X86/avx512-cvt.ll
index bc53933423f..25559dd39a0 100644
--- a/llvm/test/CodeGen/X86/avx512-cvt.ll
+++ b/llvm/test/CodeGen/X86/avx512-cvt.ll
@@ -1530,19 +1530,19 @@ define <4 x double> @uitofp_4i1_double(<4 x i32> %a) {
}

define <2 x float> @uitofp_2i1_float(<2 x i32> %a) {
-; NOVL-LABEL: uitofp_2i1_float:
-; NOVL: # BB#0:
-; NOVL-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; NOVL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
-; NOVL-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
-; NOVL-NEXT: vpextrb $8, %xmm0, %eax
-; NOVL-NEXT: andl $1, %eax
-; NOVL-NEXT: vcvtsi2ssl %eax, %xmm2, %xmm1
-; NOVL-NEXT: vpextrb $0, %xmm0, %eax
-; NOVL-NEXT: andl $1, %eax
-; NOVL-NEXT: vcvtsi2ssl %eax, %xmm2, %xmm0
-; NOVL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
-; NOVL-NEXT: retq
+; KNL-LABEL: uitofp_2i1_float:
+; KNL: # BB#0:
+; KNL-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; KNL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; KNL-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; KNL-NEXT: vpextrb $8, %xmm0, %eax
+; KNL-NEXT: andl $1, %eax
+; KNL-NEXT: vpextrb $0, %xmm0, %ecx
+; KNL-NEXT: vcvtsi2ssl %eax, %xmm2, %xmm0
+; KNL-NEXT: andl $1, %ecx
+; KNL-NEXT: vcvtsi2ssl %ecx, %xmm2, %xmm1
+; KNL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
+; KNL-NEXT: retq
;
; VL-LABEL: uitofp_2i1_float:
; VL: # BB#0:
@@ -1552,6 +1552,34 @@ define <2 x float> @uitofp_2i1_float(<2 x i32> %a) {
; VL-NEXT: vpbroadcastd {{.*}}(%rip), %xmm0 {%k1} {z}
; VL-NEXT: vcvtudq2ps %xmm0, %xmm0
; VL-NEXT: retq
+;
+; AVX512DQ-LABEL: uitofp_2i1_float:
+; AVX512DQ: # BB#0:
+; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; AVX512DQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; AVX512DQ-NEXT: vpextrb $8, %xmm0, %eax
+; AVX512DQ-NEXT: andl $1, %eax
+; AVX512DQ-NEXT: vcvtsi2ssl %eax, %xmm2, %xmm1
+; AVX512DQ-NEXT: vpextrb $0, %xmm0, %eax
+; AVX512DQ-NEXT: andl $1, %eax
+; AVX512DQ-NEXT: vcvtsi2ssl %eax, %xmm2, %xmm0
+; AVX512DQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
+; AVX512DQ-NEXT: retq
+;
+; AVX512BW-LABEL: uitofp_2i1_float:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; AVX512BW-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; AVX512BW-NEXT: vpextrb $8, %xmm0, %eax
+; AVX512BW-NEXT: andl $1, %eax
+; AVX512BW-NEXT: vcvtsi2ssl %eax, %xmm2, %xmm1
+; AVX512BW-NEXT: vpextrb $0, %xmm0, %eax
+; AVX512BW-NEXT: andl $1, %eax
+; AVX512BW-NEXT: vcvtsi2ssl %eax, %xmm2, %xmm0
+; AVX512BW-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
+; AVX512BW-NEXT: retq
%mask = icmp ult <2 x i32> %a, zeroinitializer
%1 = uitofp <2 x i1> %mask to <2 x float>
ret <2 x float> %1
diff --git a/llvm/test/CodeGen/X86/avx512-ext.ll b/llvm/test/CodeGen/X86/avx512-ext.ll
index 1a76a9fb3bc..50e8484874e 100644
--- a/llvm/test/CodeGen/X86/avx512-ext.ll
+++ b/llvm/test/CodeGen/X86/avx512-ext.ll
@@ -48,8 +48,8 @@ define <8 x i16> @sext_8x8mem_to_8x16(<8 x i8> *%i , <8 x i1> %mask) nounwind re
define <16 x i16> @zext_16x8mem_to_16x16(<16 x i8> *%i , <16 x i1> %mask) nounwind readnone {
; KNL-LABEL: zext_16x8mem_to_16x16:
; KNL: # BB#0:
-; KNL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; KNL-NEXT: vpmovzxbw {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
+; KNL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; KNL-NEXT: vpsllw $15, %ymm0, %ymm0
; KNL-NEXT: vpsraw $15, %ymm0, %ymm0
; KNL-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -70,8 +70,8 @@ define <16 x i16> @zext_16x8mem_to_16x16(<16 x i8> *%i , <16 x i1> %mask) nounwi
define <16 x i16> @sext_16x8mem_to_16x16(<16 x i8> *%i , <16 x i1> %mask) nounwind readnone {
; KNL-LABEL: sext_16x8mem_to_16x16:
; KNL: # BB#0:
-; KNL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; KNL-NEXT: vpmovsxbw (%rdi), %ymm1
+; KNL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; KNL-NEXT: vpsllw $15, %ymm0, %ymm0
; KNL-NEXT: vpsraw $15, %ymm0, %ymm0
; KNL-NEXT: vpand %ymm1, %ymm0, %ymm0
diff --git a/llvm/test/CodeGen/X86/avx512-insert-extract.ll b/llvm/test/CodeGen/X86/avx512-insert-extract.ll
index eef49eda9c7..122c1183a99 100644
--- a/llvm/test/CodeGen/X86/avx512-insert-extract.ll
+++ b/llvm/test/CodeGen/X86/avx512-insert-extract.ll
@@ -936,7 +936,6 @@ define i32 @test_insertelement_v32i1(i32 %a, i32 %b, <32 x i32> %x , <32 x i32>
; KNL-NEXT: subq $32, %rsp
; KNL-NEXT: xorl %eax, %eax
; KNL-NEXT: cmpl %esi, %edi
-; KNL-NEXT: setb %al
; KNL-NEXT: vpcmpltud %zmm3, %zmm1, %k0
; KNL-NEXT: kshiftlw $14, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
@@ -1062,6 +1061,7 @@ define i32 @test_insertelement_v32i1(i32 %a, i32 %b, <32 x i32> %x , <32 x i32>
; KNL-NEXT: kshiftrw $15, %k1, %k1
; KNL-NEXT: kmovw %k1, %ecx
; KNL-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; KNL-NEXT: setb %al
; KNL-NEXT: kshiftrw $15, %k0, %k0
; KNL-NEXT: kmovw %k0, %ecx
; KNL-NEXT: vpinsrb $15, %ecx, %xmm0, %xmm0
@@ -1112,23 +1112,23 @@ define i8 @test_iinsertelement_v4i1(i32 %a, i32 %b, <4 x i32> %x , <4 x i32> %y)
; KNL-LABEL: test_iinsertelement_v4i1:
; KNL: ## BB#0:
; KNL-NEXT: cmpl %esi, %edi
-; KNL-NEXT: setb %al
; KNL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
; KNL-NEXT: vpxor %xmm2, %xmm0, %xmm0
; KNL-NEXT: vpxor %xmm2, %xmm1, %xmm1
; KNL-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
-; KNL-NEXT: vpextrb $4, %xmm0, %ecx
-; KNL-NEXT: kmovw %ecx, %k1
+; KNL-NEXT: vpextrb $4, %xmm0, %eax
+; KNL-NEXT: setb %cl
+; KNL-NEXT: kmovw %eax, %k1
; KNL-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; KNL-NEXT: vpextrb $0, %xmm0, %ecx
-; KNL-NEXT: kmovw %ecx, %k1
+; KNL-NEXT: vpextrb $0, %xmm0, %eax
+; KNL-NEXT: kmovw %eax, %k1
; KNL-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; KNL-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
; KNL-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
; KNL-NEXT: vpsllq $63, %zmm3, %zmm1
; KNL-NEXT: vptestmq %zmm1, %zmm1, %k1
; KNL-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; KNL-NEXT: kmovw %eax, %k1
+; KNL-NEXT: kmovw %ecx, %k1
; KNL-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; KNL-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
; KNL-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
@@ -1902,14 +1902,23 @@ define i16 @test_extractelement_variable_v32i16(<32 x i16> %t1, i32 %index) {
}
define i8 @test_extractelement_variable_v16i8(<16 x i8> %t1, i32 %index) {
-; CHECK-LABEL: test_extractelement_variable_v16i8:
-; CHECK: ## BB#0:
-; CHECK-NEXT: ## kill: %EDI<def> %EDI<kill> %RDI<def>
-; CHECK-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT: andl $15, %edi
-; CHECK-NEXT: leaq -{{[0-9]+}}(%rsp), %rax
-; CHECK-NEXT: movb (%rdi,%rax), %al
-; CHECK-NEXT: retq
+; KNL-LABEL: test_extractelement_variable_v16i8:
+; KNL: ## BB#0:
+; KNL-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; KNL-NEXT: ## kill: %EDI<def> %EDI<kill> %RDI<def>
+; KNL-NEXT: andl $15, %edi
+; KNL-NEXT: leaq -{{[0-9]+}}(%rsp), %rax
+; KNL-NEXT: movb (%rdi,%rax), %al
+; KNL-NEXT: retq
+;
+; SKX-LABEL: test_extractelement_variable_v16i8:
+; SKX: ## BB#0:
+; SKX-NEXT: ## kill: %EDI<def> %EDI<kill> %RDI<def>
+; SKX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; SKX-NEXT: andl $15, %edi
+; SKX-NEXT: leaq -{{[0-9]+}}(%rsp), %rax
+; SKX-NEXT: movb (%rdi,%rax), %al
+; SKX-NEXT: retq
%t2 = extractelement <16 x i8> %t1, i32 %index
ret i8 %t2
}
@@ -1927,8 +1936,8 @@ define i8 @test_extractelement_variable_v32i8(<32 x i8> %t1, i32 %index) {
; KNL-NEXT: .cfi_def_cfa_register %rbp
; KNL-NEXT: andq $-32, %rsp
; KNL-NEXT: subq $64, %rsp
-; KNL-NEXT: ## kill: %EDI<def> %EDI<kill> %RDI<def>
; KNL-NEXT: vmovaps %ymm0, (%rsp)
+; KNL-NEXT: ## kill: %EDI<def> %EDI<kill> %RDI<def>
; KNL-NEXT: andl $31, %edi
; KNL-NEXT: movq %rsp, %rax
; KNL-NEXT: movb (%rdi,%rax), %al
@@ -1975,9 +1984,9 @@ define i8 @test_extractelement_variable_v64i8(<64 x i8> %t1, i32 %index) {
; KNL-NEXT: .cfi_def_cfa_register %rbp
; KNL-NEXT: andq $-64, %rsp
; KNL-NEXT: subq $128, %rsp
-; KNL-NEXT: ## kill: %EDI<def> %EDI<kill> %RDI<def>
; KNL-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
; KNL-NEXT: vmovaps %ymm0, (%rsp)
+; KNL-NEXT: ## kill: %EDI<def> %EDI<kill> %RDI<def>
; KNL-NEXT: andl $63, %edi
; KNL-NEXT: movq %rsp, %rax
; KNL-NEXT: movb (%rdi,%rax), %al
@@ -2066,12 +2075,12 @@ define i8 @test_extractelement_variable_v64i8_indexi8(<64 x i8> %t1, i8 %index)
define zeroext i8 @test_extractelement_varible_v2i1(<2 x i64> %a, <2 x i64> %b, i32 %index) {
; KNL-LABEL: test_extractelement_varible_v2i1:
; KNL: ## BB#0:
-; KNL-NEXT: ## kill: %EDI<def> %EDI<kill> %RDI<def>
; KNL-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
; KNL-NEXT: vpxor %xmm2, %xmm1, %xmm1
; KNL-NEXT: vpxor %xmm2, %xmm0, %xmm0
; KNL-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; KNL-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp)
+; KNL-NEXT: ## kill: %EDI<def> %EDI<kill> %RDI<def>
; KNL-NEXT: andl $1, %edi
; KNL-NEXT: movl -24(%rsp,%rdi,8), %eax
; KNL-NEXT: andl $1, %eax
@@ -2096,12 +2105,12 @@ define zeroext i8 @test_extractelement_varible_v2i1(<2 x i64> %a, <2 x i64> %b,
define zeroext i8 @test_extractelement_varible_v4i1(<4 x i32> %a, <4 x i32> %b, i32 %index) {
; KNL-LABEL: test_extractelement_varible_v4i1:
; KNL: ## BB#0:
-; KNL-NEXT: ## kill: %EDI<def> %EDI<kill> %RDI<def>
; KNL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
; KNL-NEXT: vpxor %xmm2, %xmm1, %xmm1
; KNL-NEXT: vpxor %xmm2, %xmm0, %xmm0
; KNL-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
; KNL-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp)
+; KNL-NEXT: ## kill: %EDI<def> %EDI<kill> %RDI<def>
; KNL-NEXT: andl $3, %edi
; KNL-NEXT: movl -24(%rsp,%rdi,4), %eax
; KNL-NEXT: andl $1, %eax
diff --git a/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll
index 95679dc217e..afb463d9fe4 100644
--- a/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll
+++ b/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll
@@ -2880,7 +2880,6 @@ declare <8 x i64> @llvm.x86.avx512.mask.pmulu.dq.512(<16 x i32>, <16 x i32>, <8
define <4 x float> @test_mask_vextractf32x4(<4 x float> %b, <16 x float> %a, i8 %mask) {
; CHECK-LABEL: test_mask_vextractf32x4:
; CHECK: ## BB#0:
-; CHECK-NEXT: vextractf32x4 $2, %zmm1, %xmm1
; CHECK-NEXT: kmovw %edi, %k0
; CHECK-NEXT: kshiftlw $12, %k0, %k1
; CHECK-NEXT: kshiftrw $15, %k1, %k1
@@ -2898,6 +2897,7 @@ define <4 x float> @test_mask_vextractf32x4(<4 x float> %b, <16 x float> %a, i8
; CHECK-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2
; CHECK-NEXT: kmovw %k1, %eax
; CHECK-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2
+; CHECK-NEXT: vextractf32x4 $2, %zmm1, %xmm1
; CHECK-NEXT: vpslld $31, %xmm2, %xmm2
; CHECK-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0
; CHECK-NEXT: retq
@@ -2941,7 +2941,6 @@ declare <4 x i64> @llvm.x86.avx512.mask.vextracti64x4.512(<8 x i64>, i32, <4 x i
define <4 x i32> @test_maskz_vextracti32x4(<16 x i32> %a, i8 %mask) {
; CHECK-LABEL: test_maskz_vextracti32x4:
; CHECK: ## BB#0:
-; CHECK-NEXT: vextracti32x4 $2, %zmm0, %xmm0
; CHECK-NEXT: kmovw %edi, %k0
; CHECK-NEXT: kshiftlw $12, %k0, %k1
; CHECK-NEXT: kshiftrw $15, %k1, %k1
@@ -2959,6 +2958,7 @@ define <4 x i32> @test_maskz_vextracti32x4(<16 x i32> %a, i8 %mask) {
; CHECK-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; CHECK-NEXT: kmovw %k1, %eax
; CHECK-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; CHECK-NEXT: vextracti32x4 $2, %zmm0, %xmm0
; CHECK-NEXT: vpslld $31, %xmm1, %xmm1
; CHECK-NEXT: vpsrad $31, %xmm1, %xmm1
; CHECK-NEXT: vpand %xmm0, %xmm1, %xmm0
diff --git a/llvm/test/CodeGen/X86/avx512-mask-op.ll b/llvm/test/CodeGen/X86/avx512-mask-op.ll
index 1ae57c613cd..f6d752ddc3c 100644
--- a/llvm/test/CodeGen/X86/avx512-mask-op.ll
+++ b/llvm/test/CodeGen/X86/avx512-mask-op.ll
@@ -1835,73 +1835,8 @@ define void @ktest_2(<32 x float> %in, float * %base) {
; KNL-NEXT: .cfi_def_cfa_register %rbp
; KNL-NEXT: andq $-32, %rsp
; KNL-NEXT: subq $32, %rsp
-; KNL-NEXT: vmovups (%rdi), %zmm2
-; KNL-NEXT: vmovups 64(%rdi), %zmm3
-; KNL-NEXT: vcmpltps %zmm1, %zmm3, %k1
-; KNL-NEXT: kshiftlw $14, %k1, %k0
-; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: kmovw %k0, %eax
-; KNL-NEXT: kshiftlw $15, %k1, %k0
-; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: kmovw %k0, %ecx
-; KNL-NEXT: vmovd %ecx, %xmm3
-; KNL-NEXT: vpinsrb $1, %eax, %xmm3, %xmm3
-; KNL-NEXT: kshiftlw $13, %k1, %k0
-; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: kmovw %k0, %eax
-; KNL-NEXT: vpinsrb $2, %eax, %xmm3, %xmm3
-; KNL-NEXT: kshiftlw $12, %k1, %k0
-; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: kmovw %k0, %eax
-; KNL-NEXT: vpinsrb $3, %eax, %xmm3, %xmm3
-; KNL-NEXT: kshiftlw $11, %k1, %k0
-; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: kmovw %k0, %eax
-; KNL-NEXT: vpinsrb $4, %eax, %xmm3, %xmm3
-; KNL-NEXT: kshiftlw $10, %k1, %k0
-; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: kmovw %k0, %eax
-; KNL-NEXT: vpinsrb $5, %eax, %xmm3, %xmm3
-; KNL-NEXT: kshiftlw $9, %k1, %k0
-; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: kmovw %k0, %eax
-; KNL-NEXT: vpinsrb $6, %eax, %xmm3, %xmm3
-; KNL-NEXT: kshiftlw $8, %k1, %k0
-; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: kmovw %k0, %eax
-; KNL-NEXT: vpinsrb $7, %eax, %xmm3, %xmm3
-; KNL-NEXT: kshiftlw $7, %k1, %k0
-; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: kmovw %k0, %eax
-; KNL-NEXT: vpinsrb $8, %eax, %xmm3, %xmm3
-; KNL-NEXT: kshiftlw $6, %k1, %k0
-; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: kmovw %k0, %eax
-; KNL-NEXT: vpinsrb $9, %eax, %xmm3, %xmm3
-; KNL-NEXT: kshiftlw $5, %k1, %k0
-; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: kmovw %k0, %eax
-; KNL-NEXT: vpinsrb $10, %eax, %xmm3, %xmm3
-; KNL-NEXT: kshiftlw $4, %k1, %k0
-; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: kmovw %k0, %eax
-; KNL-NEXT: vpinsrb $11, %eax, %xmm3, %xmm3
-; KNL-NEXT: kshiftlw $3, %k1, %k0
-; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: kmovw %k0, %eax
-; KNL-NEXT: vpinsrb $12, %eax, %xmm3, %xmm3
-; KNL-NEXT: kshiftlw $2, %k1, %k0
-; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: kmovw %k0, %eax
-; KNL-NEXT: vpinsrb $13, %eax, %xmm3, %xmm3
-; KNL-NEXT: kshiftlw $1, %k1, %k0
-; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: kmovw %k0, %eax
-; KNL-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3
-; KNL-NEXT: kshiftrw $15, %k1, %k0
-; KNL-NEXT: kmovw %k0, %eax
-; KNL-NEXT: vpinsrb $15, %eax, %xmm3, %xmm3
-; KNL-NEXT: vcmpltps %zmm0, %zmm2, %k2
+; KNL-NEXT: vmovups 64(%rdi), %zmm2
+; KNL-NEXT: vcmpltps %zmm1, %zmm2, %k2
; KNL-NEXT: kshiftlw $14, %k2, %k0
; KNL-NEXT: kshiftrw $15, %k0, %k0
; KNL-NEXT: kmovw %k0, %eax
@@ -1965,138 +1900,203 @@ define void @ktest_2(<32 x float> %in, float * %base) {
; KNL-NEXT: kshiftrw $15, %k2, %k0
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2
-; KNL-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
-; KNL-NEXT: vmovups 4(%rdi), %zmm3 {%k2} {z}
-; KNL-NEXT: vmovups 68(%rdi), %zmm4 {%k1} {z}
+; KNL-NEXT: vmovups (%rdi), %zmm3
+; KNL-NEXT: vcmpltps %zmm0, %zmm3, %k1
+; KNL-NEXT: kshiftlw $14, %k1, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: kshiftlw $15, %k1, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %ecx
+; KNL-NEXT: vmovd %ecx, %xmm3
+; KNL-NEXT: vpinsrb $1, %eax, %xmm3, %xmm3
+; KNL-NEXT: kshiftlw $13, %k1, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: vpinsrb $2, %eax, %xmm3, %xmm3
+; KNL-NEXT: kshiftlw $12, %k1, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: vpinsrb $3, %eax, %xmm3, %xmm3
+; KNL-NEXT: kshiftlw $11, %k1, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: vpinsrb $4, %eax, %xmm3, %xmm3
+; KNL-NEXT: kshiftlw $10, %k1, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: vpinsrb $5, %eax, %xmm3, %xmm3
+; KNL-NEXT: kshiftlw $9, %k1, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: vpinsrb $6, %eax, %xmm3, %xmm3
+; KNL-NEXT: kshiftlw $8, %k1, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: vpinsrb $7, %eax, %xmm3, %xmm3
+; KNL-NEXT: kshiftlw $7, %k1, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: vpinsrb $8, %eax, %xmm3, %xmm3
+; KNL-NEXT: kshiftlw $6, %k1, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: vpinsrb $9, %eax, %xmm3, %xmm3
+; KNL-NEXT: kshiftlw $5, %k1, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: vpinsrb $10, %eax, %xmm3, %xmm3
+; KNL-NEXT: kshiftlw $4, %k1, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: vpinsrb $11, %eax, %xmm3, %xmm3
+; KNL-NEXT: kshiftlw $3, %k1, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: vpinsrb $12, %eax, %xmm3, %xmm3
+; KNL-NEXT: kshiftlw $2, %k1, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: vpinsrb $13, %eax, %xmm3, %xmm3
+; KNL-NEXT: kshiftlw $1, %k1, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3
+; KNL-NEXT: kshiftrw $15, %k1, %k0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: vpinsrb $15, %eax, %xmm3, %xmm3
+; KNL-NEXT: vmovups 68(%rdi), %zmm4 {%k2} {z}
; KNL-NEXT: vcmpltps %zmm4, %zmm1, %k0
-; KNL-NEXT: kshiftlw $14, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: kshiftlw $15, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %ecx
+; KNL-NEXT: kshiftlw $14, %k0, %k2
+; KNL-NEXT: kshiftrw $15, %k2, %k2
+; KNL-NEXT: kmovw %k2, %eax
+; KNL-NEXT: kshiftlw $15, %k0, %k2
+; KNL-NEXT: kshiftrw $15, %k2, %k2
+; KNL-NEXT: kmovw %k2, %ecx
; KNL-NEXT: vmovd %ecx, %xmm4
; KNL-NEXT: vpinsrb $1, %eax, %xmm4, %xmm4
-; KNL-NEXT: kshiftlw $13, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: kshiftlw $13, %k0, %k2
+; KNL-NEXT: kshiftrw $15, %k2, %k2
+; KNL-NEXT: kmovw %k2, %eax
; KNL-NEXT: vpinsrb $2, %eax, %xmm4, %xmm4
-; KNL-NEXT: kshiftlw $12, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: kshiftlw $12, %k0, %k2
+; KNL-NEXT: kshiftrw $15, %k2, %k2
+; KNL-NEXT: kmovw %k2, %eax
; KNL-NEXT: vpinsrb $3, %eax, %xmm4, %xmm4
-; KNL-NEXT: kshiftlw $11, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: kshiftlw $11, %k0, %k2
+; KNL-NEXT: kshiftrw $15, %k2, %k2
+; KNL-NEXT: kmovw %k2, %eax
; KNL-NEXT: vpinsrb $4, %eax, %xmm4, %xmm4
-; KNL-NEXT: kshiftlw $10, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: kshiftlw $10, %k0, %k2
+; KNL-NEXT: kshiftrw $15, %k2, %k2
+; KNL-NEXT: kmovw %k2, %eax
; KNL-NEXT: vpinsrb $5, %eax, %xmm4, %xmm4
-; KNL-NEXT: kshiftlw $9, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: kshiftlw $9, %k0, %k2
+; KNL-NEXT: kshiftrw $15, %k2, %k2
+; KNL-NEXT: kmovw %k2, %eax
; KNL-NEXT: vpinsrb $6, %eax, %xmm4, %xmm4
-; KNL-NEXT: kshiftlw $8, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: kshiftlw $8, %k0, %k2
+; KNL-NEXT: kshiftrw $15, %k2, %k2
+; KNL-NEXT: kmovw %k2, %eax
; KNL-NEXT: vpinsrb $7, %eax, %xmm4, %xmm4
-; KNL-NEXT: kshiftlw $7, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: kshiftlw $7, %k0, %k2
+; KNL-NEXT: kshiftrw $15, %k2, %k2
+; KNL-NEXT: kmovw %k2, %eax
; KNL-NEXT: vpinsrb $8, %eax, %xmm4, %xmm4
-; KNL-NEXT: kshiftlw $6, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: kshiftlw $6, %k0, %k2
+; KNL-NEXT: kshiftrw $15, %k2, %k2
+; KNL-NEXT: kmovw %k2, %eax
; KNL-NEXT: vpinsrb $9, %eax, %xmm4, %xmm4
-; KNL-NEXT: kshiftlw $5, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: kshiftlw $5, %k0, %k2
+; KNL-NEXT: kshiftrw $15, %k2, %k2
+; KNL-NEXT: kmovw %k2, %eax
; KNL-NEXT: vpinsrb $10, %eax, %xmm4, %xmm4
-; KNL-NEXT: kshiftlw $4, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: kshiftlw $4, %k0, %k2
+; KNL-NEXT: kshiftrw $15, %k2, %k2
+; KNL-NEXT: kmovw %k2, %eax
; KNL-NEXT: vpinsrb $11, %eax, %xmm4, %xmm4
-; KNL-NEXT: kshiftlw $3, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: kshiftlw $3, %k0, %k2
+; KNL-NEXT: kshiftrw $15, %k2, %k2
+; KNL-NEXT: kmovw %k2, %eax
; KNL-NEXT: vpinsrb $12, %eax, %xmm4, %xmm4
-; KNL-NEXT: kshiftlw $2, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: kshiftlw $2, %k0, %k2
+; KNL-NEXT: kshiftrw $15, %k2, %k2
+; KNL-NEXT: kmovw %k2, %eax
; KNL-NEXT: vpinsrb $13, %eax, %xmm4, %xmm4
-; KNL-NEXT: kshiftlw $1, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: kshiftlw $1, %k0, %k2
+; KNL-NEXT: kshiftrw $15, %k2, %k2
+; KNL-NEXT: kmovw %k2, %eax
; KNL-NEXT: vpinsrb $14, %eax, %xmm4, %xmm4
; KNL-NEXT: kshiftrw $15, %k0, %k0
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: vpinsrb $15, %eax, %xmm4, %xmm4
-; KNL-NEXT: vcmpltps %zmm3, %zmm0, %k0
+; KNL-NEXT: vmovups 4(%rdi), %zmm5 {%k1} {z}
+; KNL-NEXT: vcmpltps %zmm5, %zmm0, %k0
; KNL-NEXT: kshiftlw $14, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
; KNL-NEXT: kmovw %k1, %eax
; KNL-NEXT: kshiftlw $15, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
; KNL-NEXT: kmovw %k1, %ecx
-; KNL-NEXT: vmovd %ecx, %xmm3
-; KNL-NEXT: vpinsrb $1, %eax, %xmm3, %xmm3
+; KNL-NEXT: vmovd %ecx, %xmm5
+; KNL-NEXT: vpinsrb $1, %eax, %xmm5, %xmm5
; KNL-NEXT: kshiftlw $13, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: vpinsrb $2, %eax, %xmm3, %xmm3
+; KNL-NEXT: vpinsrb $2, %eax, %xmm5, %xmm5
; KNL-NEXT: kshiftlw $12, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: vpinsrb $3, %eax, %xmm3, %xmm3
+; KNL-NEXT: vpinsrb $3, %eax, %xmm5, %xmm5
; KNL-NEXT: kshiftlw $11, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: vpinsrb $4, %eax, %xmm3, %xmm3
+; KNL-NEXT: vpinsrb $4, %eax, %xmm5, %xmm5
; KNL-NEXT: kshiftlw $10, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: vpinsrb $5, %eax, %xmm3, %xmm3
+; KNL-NEXT: vpinsrb $5, %eax, %xmm5, %xmm5
; KNL-NEXT: kshiftlw $9, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: vpinsrb $6, %eax, %xmm3, %xmm3
+; KNL-NEXT: vpinsrb $6, %eax, %xmm5, %xmm5
; KNL-NEXT: kshiftlw $8, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: vpinsrb $7, %eax, %xmm3, %xmm3
+; KNL-NEXT: vpinsrb $7, %eax, %xmm5, %xmm5
; KNL-NEXT: kshiftlw $7, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: vpinsrb $8, %eax, %xmm3, %xmm3
+; KNL-NEXT: vpinsrb $8, %eax, %xmm5, %xmm5
; KNL-NEXT: kshiftlw $6, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: vpinsrb $9, %eax, %xmm3, %xmm3
+; KNL-NEXT: vpinsrb $9, %eax, %xmm5, %xmm5
; KNL-NEXT: kshiftlw $5, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: vpinsrb $10, %eax, %xmm3, %xmm3
+; KNL-NEXT: vpinsrb $10, %eax, %xmm5, %xmm5
; KNL-NEXT: kshiftlw $4, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: vpinsrb $11, %eax, %xmm3, %xmm3
+; KNL-NEXT: vpinsrb $11, %eax, %xmm5, %xmm5
; KNL-NEXT: kshiftlw $3, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: vpinsrb $12, %eax, %xmm3, %xmm3
+; KNL-NEXT: vpinsrb $12, %eax, %xmm5, %xmm5
; KNL-NEXT: kshiftlw $2, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: vpinsrb $13, %eax, %xmm3, %xmm3
+; KNL-NEXT: vpinsrb $13, %eax, %xmm5, %xmm5
; KNL-NEXT: kshiftlw $1, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3
+; KNL-NEXT: vpinsrb $14, %eax, %xmm5, %xmm5
; KNL-NEXT: kshiftrw $15, %k0, %k0
; KNL-NEXT: kmovw %k0, %eax
-; KNL-NEXT: vpinsrb $15, %eax, %xmm3, %xmm3
-; KNL-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3
+; KNL-NEXT: vpinsrb $15, %eax, %xmm5, %xmm5
+; KNL-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2
+; KNL-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm3
; KNL-NEXT: vpor %ymm3, %ymm2, %ymm2
; KNL-NEXT: vextracti128 $1, %ymm2, %xmm3
; KNL-NEXT: vpmovsxbd %xmm3, %zmm3
@@ -2941,36 +2941,6 @@ define void @store_64i1(<64 x i1>* %a, <64 x i1> %v) {
;
; KNL-LABEL: store_64i1:
; KNL: ## BB#0:
-; KNL-NEXT: pushq %rbp
-; KNL-NEXT: Lcfi9:
-; KNL-NEXT: .cfi_def_cfa_offset 16
-; KNL-NEXT: pushq %r15
-; KNL-NEXT: Lcfi10:
-; KNL-NEXT: .cfi_def_cfa_offset 24
-; KNL-NEXT: pushq %r14
-; KNL-NEXT: Lcfi11:
-; KNL-NEXT: .cfi_def_cfa_offset 32
-; KNL-NEXT: pushq %r13
-; KNL-NEXT: Lcfi12:
-; KNL-NEXT: .cfi_def_cfa_offset 40
-; KNL-NEXT: pushq %r12
-; KNL-NEXT: Lcfi13:
-; KNL-NEXT: .cfi_def_cfa_offset 48
-; KNL-NEXT: pushq %rbx
-; KNL-NEXT: Lcfi14:
-; KNL-NEXT: .cfi_def_cfa_offset 56
-; KNL-NEXT: Lcfi15:
-; KNL-NEXT: .cfi_offset %rbx, -56
-; KNL-NEXT: Lcfi16:
-; KNL-NEXT: .cfi_offset %r12, -48
-; KNL-NEXT: Lcfi17:
-; KNL-NEXT: .cfi_offset %r13, -40
-; KNL-NEXT: Lcfi18:
-; KNL-NEXT: .cfi_offset %r14, -32
-; KNL-NEXT: Lcfi19:
-; KNL-NEXT: .cfi_offset %r15, -24
-; KNL-NEXT: Lcfi20:
-; KNL-NEXT: .cfi_offset %rbp, -16
; KNL-NEXT: vpmovsxbd %xmm0, %zmm0
; KNL-NEXT: vpslld $31, %zmm0, %zmm0
; KNL-NEXT: vpmovsxbd %xmm1, %zmm1
@@ -2982,66 +2952,66 @@ define void @store_64i1(<64 x i1>* %a, <64 x i1> %v) {
; KNL-NEXT: vptestmd %zmm3, %zmm3, %k0
; KNL-NEXT: kshiftlw $14, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %r8d
+; KNL-NEXT: kmovw %k1, %eax
; KNL-NEXT: kshiftlw $15, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %r9d
+; KNL-NEXT: kmovw %k1, %ecx
; KNL-NEXT: kshiftlw $13, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %r10d
+; KNL-NEXT: kmovw %k1, %edx
; KNL-NEXT: kshiftlw $12, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %r11d
+; KNL-NEXT: vmovd %ecx, %xmm3
+; KNL-NEXT: kmovw %k1, %ecx
; KNL-NEXT: kshiftlw $11, %k0, %k1
+; KNL-NEXT: vpinsrb $1, %eax, %xmm3, %xmm3
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %r14d
+; KNL-NEXT: kmovw %k1, %eax
; KNL-NEXT: kshiftlw $10, %k0, %k1
+; KNL-NEXT: vpinsrb $2, %edx, %xmm3, %xmm3
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %r15d
+; KNL-NEXT: kmovw %k1, %edx
; KNL-NEXT: kshiftlw $9, %k0, %k1
+; KNL-NEXT: vpinsrb $3, %ecx, %xmm3, %xmm3
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %r12d
+; KNL-NEXT: kmovw %k1, %ecx
; KNL-NEXT: kshiftlw $8, %k0, %k1
+; KNL-NEXT: vpinsrb $4, %eax, %xmm3, %xmm3
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %r13d
+; KNL-NEXT: kmovw %k1, %eax
; KNL-NEXT: kshiftlw $7, %k0, %k1
+; KNL-NEXT: vpinsrb $5, %edx, %xmm3, %xmm3
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %ebx
+; KNL-NEXT: kmovw %k1, %edx
; KNL-NEXT: kshiftlw $6, %k0, %k1
+; KNL-NEXT: vpinsrb $6, %ecx, %xmm3, %xmm3
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %ebp
+; KNL-NEXT: kmovw %k1, %ecx
; KNL-NEXT: kshiftlw $5, %k0, %k1
+; KNL-NEXT: vpinsrb $7, %eax, %xmm3, %xmm3
; KNL-NEXT: kshiftrw $15, %k1, %k1
; KNL-NEXT: kmovw %k1, %eax
; KNL-NEXT: kshiftlw $4, %k0, %k1
+; KNL-NEXT: vpinsrb $8, %edx, %xmm3, %xmm3
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %ecx
+; KNL-NEXT: kmovw %k1, %edx
; KNL-NEXT: kshiftlw $3, %k0, %k1
+; KNL-NEXT: vpinsrb $9, %ecx, %xmm3, %xmm3
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %edx
+; KNL-NEXT: kmovw %k1, %ecx
; KNL-NEXT: kshiftlw $2, %k0, %k1
+; KNL-NEXT: vpinsrb $10, %eax, %xmm3, %xmm3
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %esi
+; KNL-NEXT: kmovw %k1, %eax
; KNL-NEXT: kshiftlw $1, %k0, %k1
+; KNL-NEXT: vpinsrb $11, %edx, %xmm3, %xmm3
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: vmovd %r9d, %xmm3
-; KNL-NEXT: kmovw %k1, %r9d
+; KNL-NEXT: kmovw %k1, %edx
; KNL-NEXT: vptestmd %zmm2, %zmm2, %k2
+; KNL-NEXT: vpinsrb $12, %ecx, %xmm3, %xmm2
+; KNL-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2
+; KNL-NEXT: vpinsrb $14, %edx, %xmm2, %xmm2
; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: vpinsrb $1, %r8d, %xmm3, %xmm2
-; KNL-NEXT: vpinsrb $2, %r10d, %xmm2, %xmm2
-; KNL-NEXT: vpinsrb $3, %r11d, %xmm2, %xmm2
-; KNL-NEXT: vpinsrb $4, %r14d, %xmm2, %xmm2
-; KNL-NEXT: vpinsrb $5, %r15d, %xmm2, %xmm2
-; KNL-NEXT: vpinsrb $6, %r12d, %xmm2, %xmm2
-; KNL-NEXT: vpinsrb $7, %r13d, %xmm2, %xmm2
-; KNL-NEXT: vpinsrb $8, %ebx, %xmm2, %xmm2
-; KNL-NEXT: vpinsrb $9, %ebp, %xmm2, %xmm2
-; KNL-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2
-; KNL-NEXT: vpinsrb $11, %ecx, %xmm2, %xmm2
-; KNL-NEXT: vpinsrb $12, %edx, %xmm2, %xmm2
-; KNL-NEXT: vpinsrb $13, %esi, %xmm2, %xmm2
-; KNL-NEXT: vpinsrb $14, %r9d, %xmm2, %xmm2
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2
; KNL-NEXT: vpmovsxbd %xmm2, %zmm2
@@ -3050,66 +3020,66 @@ define void @store_64i1(<64 x i1>* %a, <64 x i1> %v) {
; KNL-NEXT: kmovw %k0, 6(%rdi)
; KNL-NEXT: kshiftlw $14, %k2, %k0
; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: kmovw %k0, %r8d
+; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: kshiftlw $15, %k2, %k0
; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: kmovw %k0, %r10d
+; KNL-NEXT: kmovw %k0, %ecx
; KNL-NEXT: kshiftlw $13, %k2, %k0
; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: kmovw %k0, %r9d
+; KNL-NEXT: kmovw %k0, %edx
; KNL-NEXT: kshiftlw $12, %k2, %k0
; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: kmovw %k0, %r11d
+; KNL-NEXT: vmovd %ecx, %xmm2
+; KNL-NEXT: kmovw %k0, %ecx
; KNL-NEXT: kshiftlw $11, %k2, %k0
; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: kmovw %k0, %r14d
+; KNL-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2
+; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: kshiftlw $10, %k2, %k0
; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: kmovw %k0, %r15d
+; KNL-NEXT: vpinsrb $2, %edx, %xmm2, %xmm2
+; KNL-NEXT: kmovw %k0, %edx
; KNL-NEXT: kshiftlw $9, %k2, %k0
; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: kmovw %k0, %r12d
+; KNL-NEXT: vpinsrb $3, %ecx, %xmm2, %xmm2
+; KNL-NEXT: kmovw %k0, %ecx
; KNL-NEXT: kshiftlw $8, %k2, %k0
; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: kmovw %k0, %r13d
+; KNL-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2
+; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: kshiftlw $7, %k2, %k0
; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: kmovw %k0, %ecx
+; KNL-NEXT: vpinsrb $5, %edx, %xmm2, %xmm2
+; KNL-NEXT: kmovw %k0, %edx
; KNL-NEXT: kshiftlw $6, %k2, %k0
; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: kmovw %k0, %esi
+; KNL-NEXT: vpinsrb $6, %ecx, %xmm2, %xmm2
+; KNL-NEXT: kmovw %k0, %ecx
; KNL-NEXT: kshiftlw $5, %k2, %k0
; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: kmovw %k0, %ebp
+; KNL-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2
+; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: kshiftlw $4, %k2, %k0
; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: kmovw %k0, %ebx
+; KNL-NEXT: vpinsrb $8, %edx, %xmm2, %xmm2
+; KNL-NEXT: kmovw %k0, %edx
; KNL-NEXT: kshiftlw $3, %k2, %k0
; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: vpinsrb $9, %ecx, %xmm2, %xmm2
+; KNL-NEXT: kmovw %k0, %ecx
; KNL-NEXT: kshiftlw $2, %k2, %k0
; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: kmovw %k0, %edx
+; KNL-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2
+; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: kshiftlw $1, %k2, %k0
; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: vmovd %r10d, %xmm2
-; KNL-NEXT: kmovw %k0, %r10d
+; KNL-NEXT: vpinsrb $11, %edx, %xmm2, %xmm2
+; KNL-NEXT: kmovw %k0, %edx
; KNL-NEXT: vptestmd %zmm1, %zmm1, %k1
; KNL-NEXT: kshiftrw $15, %k2, %k0
-; KNL-NEXT: vpinsrb $1, %r8d, %xmm2, %xmm1
-; KNL-NEXT: vpinsrb $2, %r9d, %xmm1, %xmm1
-; KNL-NEXT: vpinsrb $3, %r11d, %xmm1, %xmm1
-; KNL-NEXT: vpinsrb $4, %r14d, %xmm1, %xmm1
-; KNL-NEXT: vpinsrb $5, %r15d, %xmm1, %xmm1
-; KNL-NEXT: vpinsrb $6, %r12d, %xmm1, %xmm1
-; KNL-NEXT: vpinsrb $7, %r13d, %xmm1, %xmm1
-; KNL-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
-; KNL-NEXT: vpinsrb $9, %esi, %xmm1, %xmm1
-; KNL-NEXT: vpinsrb $10, %ebp, %xmm1, %xmm1
-; KNL-NEXT: vpinsrb $11, %ebx, %xmm1, %xmm1
-; KNL-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
-; KNL-NEXT: vpinsrb $13, %edx, %xmm1, %xmm1
-; KNL-NEXT: vpinsrb $14, %r10d, %xmm1, %xmm1
+; KNL-NEXT: vpinsrb $12, %ecx, %xmm2, %xmm1
+; KNL-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1
+; KNL-NEXT: vpinsrb $14, %edx, %xmm1, %xmm1
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1
; KNL-NEXT: vpmovsxbd %xmm1, %zmm1
@@ -3118,145 +3088,139 @@ define void @store_64i1(<64 x i1>* %a, <64 x i1> %v) {
; KNL-NEXT: kmovw %k0, 4(%rdi)
; KNL-NEXT: kshiftlw $14, %k1, %k0
; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: kmovw %k0, %r8d
+; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: kshiftlw $15, %k1, %k0
; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: kmovw %k0, %r10d
+; KNL-NEXT: kmovw %k0, %ecx
; KNL-NEXT: kshiftlw $13, %k1, %k0
; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: kmovw %k0, %r9d
+; KNL-NEXT: kmovw %k0, %edx
; KNL-NEXT: kshiftlw $12, %k1, %k0
; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: kmovw %k0, %r11d
+; KNL-NEXT: vmovd %ecx, %xmm1
+; KNL-NEXT: kmovw %k0, %ecx
; KNL-NEXT: kshiftlw $11, %k1, %k0
; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: kmovw %k0, %r14d
+; KNL-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1
+; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: kshiftlw $10, %k1, %k0
; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: kmovw %k0, %r15d
+; KNL-NEXT: vpinsrb $2, %edx, %xmm1, %xmm1
+; KNL-NEXT: kmovw %k0, %edx
; KNL-NEXT: kshiftlw $9, %k1, %k0
; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: kmovw %k0, %r12d
+; KNL-NEXT: vpinsrb $3, %ecx, %xmm1, %xmm1
+; KNL-NEXT: kmovw %k0, %ecx
; KNL-NEXT: kshiftlw $8, %k1, %k0
; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: kmovw %k0, %r13d
+; KNL-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: kshiftlw $7, %k1, %k0
; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: kmovw %k0, %ecx
+; KNL-NEXT: vpinsrb $5, %edx, %xmm1, %xmm1
+; KNL-NEXT: kmovw %k0, %edx
; KNL-NEXT: kshiftlw $6, %k1, %k0
; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: kmovw %k0, %esi
+; KNL-NEXT: vpinsrb $6, %ecx, %xmm1, %xmm1
+; KNL-NEXT: kmovw %k0, %ecx
; KNL-NEXT: kshiftlw $5, %k1, %k0
; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: kmovw %k0, %ebp
+; KNL-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1
+; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: kshiftlw $4, %k1, %k0
; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: kmovw %k0, %ebx
+; KNL-NEXT: vpinsrb $8, %edx, %xmm1, %xmm1
+; KNL-NEXT: kmovw %k0, %edx
; KNL-NEXT: kshiftlw $3, %k1, %k0
; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: vpinsrb $9, %ecx, %xmm1, %xmm1
+; KNL-NEXT: kmovw %k0, %ecx
; KNL-NEXT: kshiftlw $2, %k1, %k0
; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: kmovw %k0, %edx
+; KNL-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1
+; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: kshiftlw $1, %k1, %k0
; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: vmovd %r10d, %xmm1
-; KNL-NEXT: kmovw %k0, %r10d
+; KNL-NEXT: vpinsrb $11, %edx, %xmm1, %xmm1
+; KNL-NEXT: kmovw %k0, %edx
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: vpinsrb $1, %r8d, %xmm1, %xmm0
-; KNL-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; KNL-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; KNL-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; KNL-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; KNL-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; KNL-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; KNL-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0
-; KNL-NEXT: vpinsrb $9, %esi, %xmm0, %xmm0
-; KNL-NEXT: vpinsrb $10, %ebp, %xmm0, %xmm0
-; KNL-NEXT: vpinsrb $11, %ebx, %xmm0, %xmm0
-; KNL-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0
-; KNL-NEXT: vpinsrb $13, %edx, %xmm0, %xmm0
-; KNL-NEXT: vpinsrb $14, %r10d, %xmm0, %xmm0
-; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
-; KNL-NEXT: vpmovsxbd %xmm0, %zmm0
-; KNL-NEXT: vpslld $31, %zmm0, %zmm0
-; KNL-NEXT: vptestmd %zmm0, %zmm0, %k1
-; KNL-NEXT: kmovw %k1, 2(%rdi)
+; KNL-NEXT: vpinsrb $12, %ecx, %xmm1, %xmm0
+; KNL-NEXT: kmovw %k1, %ecx
; KNL-NEXT: kshiftlw $14, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %r8d
+; KNL-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
+; KNL-NEXT: kmovw %k1, %eax
; KNL-NEXT: kshiftlw $15, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %r9d
+; KNL-NEXT: vpinsrb $14, %edx, %xmm0, %xmm0
+; KNL-NEXT: kmovw %k1, %edx
; KNL-NEXT: kshiftlw $13, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %r10d
+; KNL-NEXT: vpinsrb $15, %ecx, %xmm0, %xmm0
+; KNL-NEXT: kmovw %k1, %ecx
; KNL-NEXT: kshiftlw $12, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %r11d
+; KNL-NEXT: vmovd %edx, %xmm1
+; KNL-NEXT: kmovw %k1, %edx
; KNL-NEXT: kshiftlw $11, %k0, %k1
+; KNL-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %r14d
+; KNL-NEXT: kmovw %k1, %eax
; KNL-NEXT: kshiftlw $10, %k0, %k1
+; KNL-NEXT: vpinsrb $2, %ecx, %xmm1, %xmm1
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %r15d
+; KNL-NEXT: kmovw %k1, %ecx
; KNL-NEXT: kshiftlw $9, %k0, %k1
+; KNL-NEXT: vpinsrb $3, %edx, %xmm1, %xmm1
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %r12d
+; KNL-NEXT: kmovw %k1, %edx
; KNL-NEXT: kshiftlw $8, %k0, %k1
+; KNL-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %r13d
+; KNL-NEXT: kmovw %k1, %eax
; KNL-NEXT: kshiftlw $7, %k0, %k1
+; KNL-NEXT: vpinsrb $5, %ecx, %xmm1, %xmm1
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %edx
+; KNL-NEXT: kmovw %k1, %ecx
; KNL-NEXT: kshiftlw $6, %k0, %k1
+; KNL-NEXT: vpinsrb $6, %edx, %xmm1, %xmm1
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %esi
+; KNL-NEXT: kmovw %k1, %edx
; KNL-NEXT: kshiftlw $5, %k0, %k1
+; KNL-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %ebp
+; KNL-NEXT: kmovw %k1, %eax
; KNL-NEXT: kshiftlw $4, %k0, %k1
+; KNL-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %ebx
+; KNL-NEXT: kmovw %k1, %ecx
; KNL-NEXT: kshiftlw $3, %k0, %k1
+; KNL-NEXT: vpinsrb $9, %edx, %xmm1, %xmm1
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: kmovw %k1, %edx
; KNL-NEXT: kshiftlw $2, %k0, %k1
+; KNL-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %ecx
+; KNL-NEXT: kmovw %k1, %eax
; KNL-NEXT: kshiftlw $1, %k0, %k1
+; KNL-NEXT: vpinsrb $11, %ecx, %xmm1, %xmm1
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: vmovd %r9d, %xmm0
-; KNL-NEXT: kmovw %k1, %r9d
-; KNL-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; KNL-NEXT: vpinsrb $2, %r10d, %xmm0, %xmm0
-; KNL-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; KNL-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; KNL-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; KNL-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; KNL-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; KNL-NEXT: vpinsrb $8, %edx, %xmm0, %xmm0
-; KNL-NEXT: vpinsrb $9, %esi, %xmm0, %xmm0
-; KNL-NEXT: vpinsrb $10, %ebp, %xmm0, %xmm0
-; KNL-NEXT: vpinsrb $11, %ebx, %xmm0, %xmm0
-; KNL-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0
-; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0
-; KNL-NEXT: vpinsrb $14, %r9d, %xmm0, %xmm0
+; KNL-NEXT: kmovw %k1, %ecx
+; KNL-NEXT: vpinsrb $12, %edx, %xmm1, %xmm1
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: vpmovsxbd %xmm0, %zmm0
+; KNL-NEXT: vpslld $31, %zmm0, %zmm0
+; KNL-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1
; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
+; KNL-NEXT: vpinsrb $14, %ecx, %xmm1, %xmm0
; KNL-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; KNL-NEXT: kmovw %k0, 2(%rdi)
; KNL-NEXT: vpmovsxbd %xmm0, %zmm0
; KNL-NEXT: vpslld $31, %zmm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
; KNL-NEXT: kmovw %k0, (%rdi)
-; KNL-NEXT: popq %rbx
-; KNL-NEXT: popq %r12
-; KNL-NEXT: popq %r13
-; KNL-NEXT: popq %r14
-; KNL-NEXT: popq %r15
-; KNL-NEXT: popq %rbp
; KNL-NEXT: retq
;
; SKX-LABEL: store_64i1:
diff --git a/llvm/test/CodeGen/X86/avx512-vec-cmp.ll b/llvm/test/CodeGen/X86/avx512-vec-cmp.ll
index b3fbceea80a..3406fef9f13 100644
--- a/llvm/test/CodeGen/X86/avx512-vec-cmp.ll
+++ b/llvm/test/CodeGen/X86/avx512-vec-cmp.ll
@@ -269,8 +269,6 @@ define i32 @test12_v32i32(<32 x i32> %a, <32 x i32> %b) nounwind {
; KNL-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1
; KNL-NEXT: vpmovsxbd %xmm1, %zmm1
; KNL-NEXT: vpslld $31, %zmm1, %zmm1
-; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0
-; KNL-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; KNL-NEXT: vpcmpeqd %zmm2, %zmm0, %k0
; KNL-NEXT: kshiftlw $14, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
@@ -327,11 +325,13 @@ define i32 @test12_v32i32(<32 x i32> %a, <32 x i32> %b) nounwind {
; KNL-NEXT: kshiftlw $2, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
; KNL-NEXT: kshiftlw $1, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
+; KNL-NEXT: kmovw %k1, %ecx
+; KNL-NEXT: vptestmd %zmm1, %zmm1, %k1
+; KNL-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
+; KNL-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; KNL-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; KNL-NEXT: kshiftrw $15, %k0, %k0
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
@@ -577,75 +577,75 @@ define i64 @test12_v64i16(<64 x i16> %a, <64 x i16> %b) nounwind {
; KNL-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
; KNL-NEXT: vpmovsxbd %xmm0, %zmm0
; KNL-NEXT: vpslld $31, %zmm0, %zmm0
-; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
-; KNL-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; KNL-NEXT: vpcmpeqw %ymm6, %ymm2, %ymm0
-; KNL-NEXT: vpmovsxwd %ymm0, %zmm0
-; KNL-NEXT: vpslld $31, %zmm0, %zmm0
-; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
+; KNL-NEXT: vpcmpeqw %ymm6, %ymm2, %ymm1
+; KNL-NEXT: vpmovsxwd %ymm1, %zmm1
+; KNL-NEXT: vpslld $31, %zmm1, %zmm1
+; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0
; KNL-NEXT: kshiftlw $14, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
; KNL-NEXT: kmovw %k1, %eax
; KNL-NEXT: kshiftlw $15, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
; KNL-NEXT: kmovw %k1, %ecx
-; KNL-NEXT: vmovd %ecx, %xmm0
-; KNL-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
+; KNL-NEXT: vmovd %ecx, %xmm1
+; KNL-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1
; KNL-NEXT: kshiftlw $13, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; KNL-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1
; KNL-NEXT: kshiftlw $12, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
+; KNL-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1
; KNL-NEXT: kshiftlw $11, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
+; KNL-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
; KNL-NEXT: kshiftlw $10, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; KNL-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1
; KNL-NEXT: kshiftlw $9, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
+; KNL-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1
; KNL-NEXT: kshiftlw $8, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; KNL-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1
; KNL-NEXT: kshiftlw $7, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0
+; KNL-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; KNL-NEXT: kshiftlw $6, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
+; KNL-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1
; KNL-NEXT: kshiftlw $5, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0
+; KNL-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1
; KNL-NEXT: kshiftlw $4, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; KNL-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1
; KNL-NEXT: kshiftlw $3, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0
+; KNL-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; KNL-NEXT: kshiftlw $2, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
+; KNL-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1
; KNL-NEXT: kshiftlw $1, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
+; KNL-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1
+; KNL-NEXT: vptestmd %zmm0, %zmm0, %k1
; KNL-NEXT: kshiftrw $15, %k0, %k0
; KNL-NEXT: kmovw %k0, %eax
-; KNL-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; KNL-NEXT: vpinsrb $15, %eax, %xmm1, %xmm0
+; KNL-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; KNL-NEXT: vpmovsxbd %xmm0, %zmm0
; KNL-NEXT: vpslld $31, %zmm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
diff --git a/llvm/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll
index c6e1dbd8811..a788468cac3 100644
--- a/llvm/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll
+++ b/llvm/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll
@@ -1685,8 +1685,6 @@ define i64 @test_mask_cmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
; AVX512F-32-NEXT: .cfi_offset %esi, -12
; AVX512F-32-NEXT: .Lcfi9:
; AVX512F-32-NEXT: .cfi_offset %ebx, -8
-; AVX512F-32-NEXT: vmovdqa64 %zmm1, %zmm6
-; AVX512F-32-NEXT: vmovdqa64 %zmm0, %zmm5
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; AVX512F-32-NEXT: movb %cl, %al
; AVX512F-32-NEXT: shrb $5, %al
@@ -1707,39 +1705,39 @@ define i64 @test_mask_cmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
; AVX512F-32-NEXT: vpsllw $8, %xmm2, %xmm2
; AVX512F-32-NEXT: kmovd %ecx, %k0
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm2
; AVX512F-32-NEXT: vpmovm2b %k1, %zmm3
; AVX512F-32-NEXT: vpbroadcastw %xmm3, %xmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm2, %ymm3, %ymm3
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm3
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm3[0,1,2,3],zmm2[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm2
; AVX512F-32-NEXT: kmovd %edx, %k0
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
; AVX512F-32-NEXT: vpslld $24, %xmm3, %xmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm2, %ymm3, %ymm3
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm3
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm3[0,1,2,3],zmm2[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm2
; AVX512F-32-NEXT: kmovd %ebx, %k0
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
; AVX512F-32-NEXT: vpbroadcastd %xmm3, %xmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm2, %ymm3, %ymm3
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm3
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm3[0,1,2,3],zmm2[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm2
; AVX512F-32-NEXT: kmovd %eax, %k0
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
; AVX512F-32-NEXT: vpsllq $40, %xmm3, %xmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm2, %ymm3, %ymm3
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm3
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm3[0,1,2,3],zmm2[4,5,6,7]
; AVX512F-32-NEXT: movb %cl, %al
; AVX512F-32-NEXT: shrb $6, %al
@@ -1748,8 +1746,8 @@ define i64 @test_mask_cmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movb %cl, %al
@@ -1758,8 +1756,8 @@ define i64 @test_mask_cmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
; AVX512F-32-NEXT: vpsllq $56, %xmm2, %xmm2
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movb %ch, %al
@@ -1767,8 +1765,8 @@ define i64 @test_mask_cmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
; AVX512F-32-NEXT: vpbroadcastq %xmm2, %xmm2
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: andb $2, %al
@@ -1777,8 +1775,8 @@ define i64 @test_mask_cmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6]
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movb %ch, %dl
@@ -1789,8 +1787,8 @@ define i64 @test_mask_cmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: shrb $3, %al
@@ -1798,8 +1796,8 @@ define i64 @test_mask_cmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4]
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movl %ecx, %eax
@@ -1809,8 +1807,8 @@ define i64 @test_mask_cmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
; AVX512F-32-NEXT: vpbroadcastd %xmm2, %xmm2
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movl %ecx, %eax
@@ -1820,8 +1818,8 @@ define i64 @test_mask_cmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2]
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movl %ecx, %eax
@@ -1831,8 +1829,8 @@ define i64 @test_mask_cmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movl %ecx, %eax
@@ -1842,8 +1840,8 @@ define i64 @test_mask_cmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0]
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movl %ecx, %eax
@@ -1852,8 +1850,8 @@ define i64 @test_mask_cmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movb %al, %dl
@@ -1864,8 +1862,8 @@ define i64 @test_mask_cmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
; AVX512F-32-NEXT: vpsllw $8, %xmm2, %xmm2
; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movb %al, %bl
@@ -1877,8 +1875,8 @@ define i64 @test_mask_cmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2
; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: shrb $3, %dl
@@ -1887,8 +1885,8 @@ define i64 @test_mask_cmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
; AVX512F-32-NEXT: vpslld $24, %xmm2, %xmm2
; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movb %al, %dl
@@ -1898,8 +1896,8 @@ define i64 @test_mask_cmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
; AVX512F-32-NEXT: vpbroadcastd %xmm2, %xmm2
; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movb %al, %dl
@@ -1910,8 +1908,8 @@ define i64 @test_mask_cmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
; AVX512F-32-NEXT: vpsllq $40, %xmm2, %xmm2
; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movb %al, %dl
@@ -1921,8 +1919,8 @@ define i64 @test_mask_cmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2
; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> %EAX<def>
@@ -1932,464 +1930,464 @@ define i64 @test_mask_cmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
; AVX512F-32-NEXT: vpsllq $56, %xmm2, %xmm2
; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movl %ecx, %eax
; AVX512F-32-NEXT: shrl $24, %eax
; AVX512F-32-NEXT: kmovd %eax, %k1
; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT: vpbroadcastq %xmm2, %ymm2
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
-; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
-; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
+; AVX512F-32-NEXT: vpbroadcastq %xmm2, %ymm3
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
+; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm4[4,5,6,7]
+; AVX512F-32-NEXT: vpmovb2m %zmm3, %k0
; AVX512F-32-NEXT: movb %al, %dl
; AVX512F-32-NEXT: andb $2, %dl
; AVX512F-32-NEXT: shrb %dl
; AVX512F-32-NEXT: kmovd %edx, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6]
-; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
-; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
-; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm3
+; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1,2,3,4,5,6]
+; AVX512F-32-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm4
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm5
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm2, %ymm5, %ymm4, %ymm4
+; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm5[4,5,6,7]
+; AVX512F-32-NEXT: vpmovb2m %zmm4, %k0
; AVX512F-32-NEXT: movb %al, %dl
; AVX512F-32-NEXT: andb $15, %dl
; AVX512F-32-NEXT: movb %dl, %al
; AVX512F-32-NEXT: shrb $2, %dl
; AVX512F-32-NEXT: kmovd %edx, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2
-; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
-; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
-; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm4
+; AVX512F-32-NEXT: vpbroadcastw %xmm4, %xmm4
+; AVX512F-32-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm5
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm6
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm2, %ymm6, %ymm5, %ymm5
+; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm6[4,5,6,7]
+; AVX512F-32-NEXT: vpmovb2m %zmm5, %k0
; AVX512F-32-NEXT: shrb $3, %al
; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4]
-; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm3, %ymm4, %ymm2, %ymm2
-; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm4[4,5,6,7]
-; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm5
+; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm5[0,1,2,3,4]
+; AVX512F-32-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm7
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm6, %ymm7, %ymm5, %ymm5
+; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm7[4,5,6,7]
+; AVX512F-32-NEXT: vpmovb2m %zmm5, %k0
; AVX512F-32-NEXT: movl %ecx, %eax
; AVX512F-32-NEXT: shrl $28, %eax
; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT: vpbroadcastd %xmm2, %xmm2
-; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm4, %ymm2, %ymm2
-; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm4[4,5,6,7]
-; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm5
+; AVX512F-32-NEXT: vpbroadcastd %xmm5, %xmm5
+; AVX512F-32-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm7
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm2, %ymm7, %ymm5, %ymm5
+; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm7[4,5,6,7]
+; AVX512F-32-NEXT: vpmovb2m %zmm5, %k0
; AVX512F-32-NEXT: movl %ecx, %eax
; AVX512F-32-NEXT: movl %ecx, %esi
; AVX512F-32-NEXT: shrl $29, %eax
; AVX512F-32-NEXT: andb $1, %al
; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2]
-; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm0
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm1, %ymm0, %ymm2, %ymm2
-; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm2[0,1,2,3],zmm0[4,5,6,7]
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm5
+; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm5[0,1,2]
+; AVX512F-32-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm7
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm2
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm3, %ymm2, %ymm7, %ymm7
+; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm7[0,1,2,3],zmm2[4,5,6,7]
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movl %esi, %eax
; AVX512F-32-NEXT: shrl $30, %eax
; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpbroadcastw %xmm0, %xmm0
-; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm1
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm0
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255]
-; AVX512F-32-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm1
-; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[4,5,6,7]
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2
+; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255]
+; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movl %esi, %eax
; AVX512F-32-NEXT: shrl $31, %eax
; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0]
-; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm1, %ymm0, %ymm0
-; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7]
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0]
+; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0]
+; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; AVX512F-32-NEXT: kmovd %ecx, %k1
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm0
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm7
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm1, %ymm7, %ymm1
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm2
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm2, %ymm3
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movb %cl, %al
; AVX512F-32-NEXT: andb $2, %al
; AVX512F-32-NEXT: shrb %al
; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpsllw $8, %xmm0, %xmm0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT: vpsllw $8, %xmm2, %xmm2
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movb %cl, %dl
; AVX512F-32-NEXT: andb $15, %dl
; AVX512F-32-NEXT: movb %dl, %al
; AVX512F-32-NEXT: shrb $2, %dl
; AVX512F-32-NEXT: kmovd %edx, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpbroadcastw %xmm0, %xmm0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: shrb $3, %al
; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpslld $24, %xmm0, %xmm0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT: vpslld $24, %xmm2, %xmm2
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movb %cl, %al
; AVX512F-32-NEXT: shrb $4, %al
; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpbroadcastd %xmm0, %xmm0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT: vpbroadcastd %xmm2, %xmm2
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movb %cl, %al
; AVX512F-32-NEXT: shrb $5, %al
; AVX512F-32-NEXT: andb $1, %al
; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpsllq $40, %xmm0, %xmm0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT: vpsllq $40, %xmm2, %xmm2
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movb %cl, %al
; AVX512F-32-NEXT: shrb $6, %al
; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpbroadcastw %xmm0, %xmm0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movb %cl, %al
; AVX512F-32-NEXT: shrb $7, %al
; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpsllq $56, %xmm0, %xmm0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT: vpsllq $56, %xmm2, %xmm2
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movb %ch, %al
; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpbroadcastq %xmm0, %xmm0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT: vpbroadcastq %xmm2, %xmm2
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: andb $2, %al
; AVX512F-32-NEXT: shrb %al
; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6]
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6]
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movb %ch, %dl
; AVX512F-32-NEXT: andb $15, %dl
; AVX512F-32-NEXT: movb %dl, %al
; AVX512F-32-NEXT: shrb $2, %dl
; AVX512F-32-NEXT: kmovd %edx, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpbroadcastw %xmm0, %xmm0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: shrb $3, %al
; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4]
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4]
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movl %ecx, %eax
; AVX512F-32-NEXT: andl $61440, %eax # imm = 0xF000
; AVX512F-32-NEXT: shrl $12, %eax
; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpbroadcastd %xmm0, %xmm0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT: vpbroadcastd %xmm2, %xmm2
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movl %ecx, %eax
; AVX512F-32-NEXT: shrl $13, %eax
; AVX512F-32-NEXT: andb $1, %al
; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2]
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2]
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movl %ecx, %eax
; AVX512F-32-NEXT: andl $49152, %eax # imm = 0xC000
; AVX512F-32-NEXT: shrl $14, %eax
; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpbroadcastw %xmm0, %xmm0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movl %ecx, %eax
; AVX512F-32-NEXT: andl $32768, %eax # imm = 0x8000
; AVX512F-32-NEXT: shrl $15, %eax
; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0]
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0]
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movl %ecx, %ebx
; AVX512F-32-NEXT: shrl $16, %ebx
; AVX512F-32-NEXT: kmovd %ebx, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movb %bl, %dl
; AVX512F-32-NEXT: andb $2, %dl
; AVX512F-32-NEXT: shrb %dl
; AVX512F-32-NEXT: kmovd %edx, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpsllw $8, %xmm0, %xmm0
-; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT: vpsllw $8, %xmm2, %xmm2
+; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
; AVX512F-32-NEXT: movb %bl, %al
; AVX512F-32-NEXT: andb $15, %al
; AVX512F-32-NEXT: movb %al, %dl
; AVX512F-32-NEXT: shrb $2, %al
; AVX512F-32-NEXT: kmovd %eax, %k0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k1
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm0
-; AVX512F-32-NEXT: vpbroadcastw %xmm0, %xmm0
-; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k1
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm2
+; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2
+; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm3
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: shrb $3, %dl
; AVX512F-32-NEXT: kmovd %edx, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpslld $24, %xmm0, %xmm0
-; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT: vpslld $24, %xmm2, %xmm2
+; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movb %bl, %al
; AVX512F-32-NEXT: shrb $4, %al
; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpbroadcastd %xmm0, %xmm0
-; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT: vpbroadcastd %xmm2, %xmm2
+; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movb %bl, %al
; AVX512F-32-NEXT: shrb $5, %al
; AVX512F-32-NEXT: andb $1, %al
; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpsllq $40, %xmm0, %xmm0
-; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT: vpsllq $40, %xmm2, %xmm2
+; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movb %bl, %al
; AVX512F-32-NEXT: shrb $6, %al
; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpbroadcastw %xmm0, %xmm0
-; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2
+; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: # kill: %BL<def> %BL<kill> %EBX<kill> %EBX<def>
; AVX512F-32-NEXT: shrb $7, %bl
; AVX512F-32-NEXT: kmovd %ebx, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpsllq $56, %xmm0, %xmm0
-; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT: vpsllq $56, %xmm2, %xmm2
+; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movl %ecx, %eax
; AVX512F-32-NEXT: shrl $24, %eax
; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpbroadcastq %xmm0, %ymm0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT: vpbroadcastq %xmm2, %ymm2
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movb %al, %dl
; AVX512F-32-NEXT: andb $2, %dl
; AVX512F-32-NEXT: shrb %dl
; AVX512F-32-NEXT: kmovd %edx, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6]
-; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6]
+; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
; AVX512F-32-NEXT: movb %al, %dl
; AVX512F-32-NEXT: andb $15, %dl
; AVX512F-32-NEXT: movb %dl, %al
; AVX512F-32-NEXT: shrb $2, %dl
; AVX512F-32-NEXT: kmovd %edx, %k0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k1
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm0
-; AVX512F-32-NEXT: vpbroadcastw %xmm0, %xmm0
-; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k1
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm2
+; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2
+; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm3
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
; AVX512F-32-NEXT: shrb $3, %al
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm0
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm2
; AVX512F-32-NEXT: kmovd %eax, %k0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4]
-; AVX512F-32-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm0, %ymm4
-; AVX512F-32-NEXT: vpblendvb %ymm3, %ymm4, %ymm1, %ymm1
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1,2,3,4]
+; AVX512F-32-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm2, %ymm4
+; AVX512F-32-NEXT: vpblendvb %ymm6, %ymm4, %ymm3, %ymm3
; AVX512F-32-NEXT: movl %ecx, %eax
; AVX512F-32-NEXT: shrl $29, %eax
; AVX512F-32-NEXT: andb $1, %al
; AVX512F-32-NEXT: kmovd %eax, %k0
; AVX512F-32-NEXT: movl %ecx, %eax
; AVX512F-32-NEXT: shrl $28, %eax
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm3
-; AVX512F-32-NEXT: vpbroadcastd %xmm3, %xmm3
-; AVX512F-32-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm1, %ymm3, %ymm1
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1,2]
-; AVX512F-32-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm1, %ymm3, %ymm1
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k1
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm2, %ymm3
+; AVX512F-32-NEXT: kmovd %eax, %k1
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm4
+; AVX512F-32-NEXT: vpbroadcastd %xmm4, %xmm4
+; AVX512F-32-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k1
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm2, %ymm3
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm4
+; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1,2]
+; AVX512F-32-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3
; AVX512F-32-NEXT: movl %ecx, %eax
; AVX512F-32-NEXT: shrl $30, %eax
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm0
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm2
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm2, %ymm3
; AVX512F-32-NEXT: kmovd %eax, %k0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vpbroadcastw %xmm3, %xmm3
-; AVX512F-32-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512F-32-NEXT: vpblendvb %ymm2, %ymm1, %ymm3, %ymm1
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm4
+; AVX512F-32-NEXT: vpbroadcastw %xmm4, %xmm4
+; AVX512F-32-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm3, %ymm4, %ymm3
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movl %ecx, %eax
; AVX512F-32-NEXT: shrl $31, %eax
; AVX512F-32-NEXT: kshiftlq $1, %k0, %k0
@@ -2397,29 +2395,29 @@ define i64 @test_mask_cmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
; AVX512F-32-NEXT: kmovd %eax, %k1
; AVX512F-32-NEXT: kshiftlq $63, %k1, %k1
; AVX512F-32-NEXT: korq %k1, %k0, %k1
-; AVX512F-32-NEXT: vpcmpeqb %zmm6, %zmm5, %k0 {%k1}
-; AVX512F-32-NEXT: vpcmpgtb %zmm5, %zmm6, %k2 {%k1}
-; AVX512F-32-NEXT: vpcmpleb %zmm6, %zmm5, %k3 {%k1}
-; AVX512F-32-NEXT: vpcmpneqb %zmm6, %zmm5, %k4 {%k1}
-; AVX512F-32-NEXT: vpcmpleb %zmm5, %zmm6, %k5 {%k1}
-; AVX512F-32-NEXT: vpcmpgtb %zmm6, %zmm5, %k1 {%k1}
+; AVX512F-32-NEXT: vpcmpeqb %zmm1, %zmm0, %k0 {%k1}
; AVX512F-32-NEXT: kmovq %k0, (%esp)
; AVX512F-32-NEXT: movl (%esp), %eax
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %edx
-; AVX512F-32-NEXT: kmovq %k2, {{[0-9]+}}(%esp)
+; AVX512F-32-NEXT: vpcmpgtb %zmm0, %zmm1, %k0 {%k1}
+; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp)
; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax
; AVX512F-32-NEXT: adcxl {{[0-9]+}}(%esp), %edx
-; AVX512F-32-NEXT: kmovq %k3, {{[0-9]+}}(%esp)
+; AVX512F-32-NEXT: vpcmpleb %zmm1, %zmm0, %k0 {%k1}
+; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp)
; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax
; AVX512F-32-NEXT: adcxl {{[0-9]+}}(%esp), %edx
; AVX512F-32-NEXT: kxorq %k0, %k0, %k0
; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp)
; AVX512F-32-NEXT: orl {{[0-9]+}}(%esp), %edx
; AVX512F-32-NEXT: orl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT: kmovq %k4, {{[0-9]+}}(%esp)
+; AVX512F-32-NEXT: vpcmpneqb %zmm1, %zmm0, %k0 {%k1}
+; AVX512F-32-NEXT: vpcmpleb %zmm0, %zmm1, %k2 {%k1}
+; AVX512F-32-NEXT: vpcmpgtb %zmm1, %zmm0, %k1 {%k1}
+; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp)
; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax
; AVX512F-32-NEXT: adcxl {{[0-9]+}}(%esp), %edx
-; AVX512F-32-NEXT: kmovq %k5, {{[0-9]+}}(%esp)
+; AVX512F-32-NEXT: kmovq %k2, {{[0-9]+}}(%esp)
; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax
; AVX512F-32-NEXT: adcxl {{[0-9]+}}(%esp), %edx
; AVX512F-32-NEXT: kmovq %k1, {{[0-9]+}}(%esp)
@@ -2571,8 +2569,6 @@ define i64 @test_mask_x86_avx512_ucmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %m
; AVX512F-32-NEXT: .cfi_offset %esi, -12
; AVX512F-32-NEXT: .Lcfi15:
; AVX512F-32-NEXT: .cfi_offset %ebx, -8
-; AVX512F-32-NEXT: vmovdqa64 %zmm1, %zmm6
-; AVX512F-32-NEXT: vmovdqa64 %zmm0, %zmm5
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; AVX512F-32-NEXT: movb %cl, %al
; AVX512F-32-NEXT: shrb $5, %al
@@ -2593,39 +2589,39 @@ define i64 @test_mask_x86_avx512_ucmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %m
; AVX512F-32-NEXT: vpsllw $8, %xmm2, %xmm2
; AVX512F-32-NEXT: kmovd %ecx, %k0
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm2
; AVX512F-32-NEXT: vpmovm2b %k1, %zmm3
; AVX512F-32-NEXT: vpbroadcastw %xmm3, %xmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm2, %ymm3, %ymm3
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm3
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm3[0,1,2,3],zmm2[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm2
; AVX512F-32-NEXT: kmovd %edx, %k0
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
; AVX512F-32-NEXT: vpslld $24, %xmm3, %xmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm2, %ymm3, %ymm3
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm3
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm3[0,1,2,3],zmm2[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm2
; AVX512F-32-NEXT: kmovd %ebx, %k0
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
; AVX512F-32-NEXT: vpbroadcastd %xmm3, %xmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm2, %ymm3, %ymm3
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm3
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm3[0,1,2,3],zmm2[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm2
; AVX512F-32-NEXT: kmovd %eax, %k0
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
; AVX512F-32-NEXT: vpsllq $40, %xmm3, %xmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm2, %ymm3, %ymm3
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm3
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm3[0,1,2,3],zmm2[4,5,6,7]
; AVX512F-32-NEXT: movb %cl, %al
; AVX512F-32-NEXT: shrb $6, %al
@@ -2634,8 +2630,8 @@ define i64 @test_mask_x86_avx512_ucmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %m
; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movb %cl, %al
@@ -2644,8 +2640,8 @@ define i64 @test_mask_x86_avx512_ucmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %m
; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
; AVX512F-32-NEXT: vpsllq $56, %xmm2, %xmm2
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movb %ch, %al
@@ -2653,8 +2649,8 @@ define i64 @test_mask_x86_avx512_ucmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %m
; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
; AVX512F-32-NEXT: vpbroadcastq %xmm2, %xmm2
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: andb $2, %al
@@ -2663,8 +2659,8 @@ define i64 @test_mask_x86_avx512_ucmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %m
; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6]
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movb %ch, %dl
@@ -2675,8 +2671,8 @@ define i64 @test_mask_x86_avx512_ucmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %m
; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: shrb $3, %al
@@ -2684,8 +2680,8 @@ define i64 @test_mask_x86_avx512_ucmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %m
; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4]
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movl %ecx, %eax
@@ -2695,8 +2691,8 @@ define i64 @test_mask_x86_avx512_ucmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %m
; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
; AVX512F-32-NEXT: vpbroadcastd %xmm2, %xmm2
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movl %ecx, %eax
@@ -2706,8 +2702,8 @@ define i64 @test_mask_x86_avx512_ucmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %m
; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2]
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movl %ecx, %eax
@@ -2717,8 +2713,8 @@ define i64 @test_mask_x86_avx512_ucmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %m
; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movl %ecx, %eax
@@ -2728,8 +2724,8 @@ define i64 @test_mask_x86_avx512_ucmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %m
; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0]
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movl %ecx, %eax
@@ -2738,8 +2734,8 @@ define i64 @test_mask_x86_avx512_ucmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %m
; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movb %al, %dl
@@ -2750,8 +2746,8 @@ define i64 @test_mask_x86_avx512_ucmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %m
; AVX512F-32-NEXT: vpsllw $8, %xmm2, %xmm2
; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movb %al, %bl
@@ -2763,8 +2759,8 @@ define i64 @test_mask_x86_avx512_ucmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %m
; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2
; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: shrb $3, %dl
@@ -2773,8 +2769,8 @@ define i64 @test_mask_x86_avx512_ucmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %m
; AVX512F-32-NEXT: vpslld $24, %xmm2, %xmm2
; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movb %al, %dl
@@ -2784,8 +2780,8 @@ define i64 @test_mask_x86_avx512_ucmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %m
; AVX512F-32-NEXT: vpbroadcastd %xmm2, %xmm2
; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movb %al, %dl
@@ -2796,8 +2792,8 @@ define i64 @test_mask_x86_avx512_ucmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %m
; AVX512F-32-NEXT: vpsllq $40, %xmm2, %xmm2
; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movb %al, %dl
@@ -2807,8 +2803,8 @@ define i64 @test_mask_x86_avx512_ucmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %m
; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2
; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> %EAX<def>
@@ -2818,464 +2814,464 @@ define i64 @test_mask_x86_avx512_ucmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %m
; AVX512F-32-NEXT: vpsllq $56, %xmm2, %xmm2
; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movl %ecx, %eax
; AVX512F-32-NEXT: shrl $24, %eax
; AVX512F-32-NEXT: kmovd %eax, %k1
; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT: vpbroadcastq %xmm2, %ymm2
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
-; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
-; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
+; AVX512F-32-NEXT: vpbroadcastq %xmm2, %ymm3
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
+; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm4[4,5,6,7]
+; AVX512F-32-NEXT: vpmovb2m %zmm3, %k0
; AVX512F-32-NEXT: movb %al, %dl
; AVX512F-32-NEXT: andb $2, %dl
; AVX512F-32-NEXT: shrb %dl
; AVX512F-32-NEXT: kmovd %edx, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6]
-; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
-; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
-; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm3
+; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1,2,3,4,5,6]
+; AVX512F-32-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm4
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm5
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm2, %ymm5, %ymm4, %ymm4
+; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm5[4,5,6,7]
+; AVX512F-32-NEXT: vpmovb2m %zmm4, %k0
; AVX512F-32-NEXT: movb %al, %dl
; AVX512F-32-NEXT: andb $15, %dl
; AVX512F-32-NEXT: movb %dl, %al
; AVX512F-32-NEXT: shrb $2, %dl
; AVX512F-32-NEXT: kmovd %edx, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2
-; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
-; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
-; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm4
+; AVX512F-32-NEXT: vpbroadcastw %xmm4, %xmm4
+; AVX512F-32-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm5
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm6
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm2, %ymm6, %ymm5, %ymm5
+; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm6[4,5,6,7]
+; AVX512F-32-NEXT: vpmovb2m %zmm5, %k0
; AVX512F-32-NEXT: shrb $3, %al
; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4]
-; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm3, %ymm4, %ymm2, %ymm2
-; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm4[4,5,6,7]
-; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm5
+; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm5[0,1,2,3,4]
+; AVX512F-32-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm7
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm6, %ymm7, %ymm5, %ymm5
+; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm7[4,5,6,7]
+; AVX512F-32-NEXT: vpmovb2m %zmm5, %k0
; AVX512F-32-NEXT: movl %ecx, %eax
; AVX512F-32-NEXT: shrl $28, %eax
; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT: vpbroadcastd %xmm2, %xmm2
-; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm4, %ymm2, %ymm2
-; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm4[4,5,6,7]
-; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm5
+; AVX512F-32-NEXT: vpbroadcastd %xmm5, %xmm5
+; AVX512F-32-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm7
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm2, %ymm7, %ymm5, %ymm5
+; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm7[4,5,6,7]
+; AVX512F-32-NEXT: vpmovb2m %zmm5, %k0
; AVX512F-32-NEXT: movl %ecx, %eax
; AVX512F-32-NEXT: movl %ecx, %esi
; AVX512F-32-NEXT: shrl $29, %eax
; AVX512F-32-NEXT: andb $1, %al
; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2]
-; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm0
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm1, %ymm0, %ymm2, %ymm2
-; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm2[0,1,2,3],zmm0[4,5,6,7]
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm5
+; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm5[0,1,2]
+; AVX512F-32-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm7
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm2
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm3, %ymm2, %ymm7, %ymm7
+; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm7[0,1,2,3],zmm2[4,5,6,7]
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movl %esi, %eax
; AVX512F-32-NEXT: shrl $30, %eax
; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpbroadcastw %xmm0, %xmm0
-; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm1
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm0
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255]
-; AVX512F-32-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm1
-; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[4,5,6,7]
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2
+; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255]
+; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movl %esi, %eax
; AVX512F-32-NEXT: shrl $31, %eax
; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0]
-; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm1, %ymm0, %ymm0
-; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7]
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0]
+; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0]
+; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; AVX512F-32-NEXT: kmovd %ecx, %k1
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm0
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm7
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm1, %ymm7, %ymm1
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm2
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm2, %ymm3
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movb %cl, %al
; AVX512F-32-NEXT: andb $2, %al
; AVX512F-32-NEXT: shrb %al
; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpsllw $8, %xmm0, %xmm0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT: vpsllw $8, %xmm2, %xmm2
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movb %cl, %dl
; AVX512F-32-NEXT: andb $15, %dl
; AVX512F-32-NEXT: movb %dl, %al
; AVX512F-32-NEXT: shrb $2, %dl
; AVX512F-32-NEXT: kmovd %edx, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpbroadcastw %xmm0, %xmm0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: shrb $3, %al
; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpslld $24, %xmm0, %xmm0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT: vpslld $24, %xmm2, %xmm2
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movb %cl, %al
; AVX512F-32-NEXT: shrb $4, %al
; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpbroadcastd %xmm0, %xmm0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT: vpbroadcastd %xmm2, %xmm2
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movb %cl, %al
; AVX512F-32-NEXT: shrb $5, %al
; AVX512F-32-NEXT: andb $1, %al
; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpsllq $40, %xmm0, %xmm0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT: vpsllq $40, %xmm2, %xmm2
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movb %cl, %al
; AVX512F-32-NEXT: shrb $6, %al
; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpbroadcastw %xmm0, %xmm0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movb %cl, %al
; AVX512F-32-NEXT: shrb $7, %al
; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpsllq $56, %xmm0, %xmm0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT: vpsllq $56, %xmm2, %xmm2
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movb %ch, %al
; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpbroadcastq %xmm0, %xmm0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT: vpbroadcastq %xmm2, %xmm2
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: andb $2, %al
; AVX512F-32-NEXT: shrb %al
; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6]
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6]
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movb %ch, %dl
; AVX512F-32-NEXT: andb $15, %dl
; AVX512F-32-NEXT: movb %dl, %al
; AVX512F-32-NEXT: shrb $2, %dl
; AVX512F-32-NEXT: kmovd %edx, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpbroadcastw %xmm0, %xmm0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: shrb $3, %al
; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4]
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4]
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movl %ecx, %eax
; AVX512F-32-NEXT: andl $61440, %eax # imm = 0xF000
; AVX512F-32-NEXT: shrl $12, %eax
; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpbroadcastd %xmm0, %xmm0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT: vpbroadcastd %xmm2, %xmm2
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movl %ecx, %eax
; AVX512F-32-NEXT: shrl $13, %eax
; AVX512F-32-NEXT: andb $1, %al
; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2]
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2]
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movl %ecx, %eax
; AVX512F-32-NEXT: andl $49152, %eax # imm = 0xC000
; AVX512F-32-NEXT: shrl $14, %eax
; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpbroadcastw %xmm0, %xmm0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movl %ecx, %eax
; AVX512F-32-NEXT: andl $32768, %eax # imm = 0x8000
; AVX512F-32-NEXT: shrl $15, %eax
; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0]
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0]
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movl %ecx, %ebx
; AVX512F-32-NEXT: shrl $16, %ebx
; AVX512F-32-NEXT: kmovd %ebx, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movb %bl, %dl
; AVX512F-32-NEXT: andb $2, %dl
; AVX512F-32-NEXT: shrb %dl
; AVX512F-32-NEXT: kmovd %edx, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpsllw $8, %xmm0, %xmm0
-; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT: vpsllw $8, %xmm2, %xmm2
+; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
; AVX512F-32-NEXT: movb %bl, %al
; AVX512F-32-NEXT: andb $15, %al
; AVX512F-32-NEXT: movb %al, %dl
; AVX512F-32-NEXT: shrb $2, %al
; AVX512F-32-NEXT: kmovd %eax, %k0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k1
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm0
-; AVX512F-32-NEXT: vpbroadcastw %xmm0, %xmm0
-; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k1
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm2
+; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2
+; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm3
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: shrb $3, %dl
; AVX512F-32-NEXT: kmovd %edx, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpslld $24, %xmm0, %xmm0
-; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT: vpslld $24, %xmm2, %xmm2
+; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movb %bl, %al
; AVX512F-32-NEXT: shrb $4, %al
; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpbroadcastd %xmm0, %xmm0
-; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT: vpbroadcastd %xmm2, %xmm2
+; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movb %bl, %al
; AVX512F-32-NEXT: shrb $5, %al
; AVX512F-32-NEXT: andb $1, %al
; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpsllq $40, %xmm0, %xmm0
-; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT: vpsllq $40, %xmm2, %xmm2
+; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movb %bl, %al
; AVX512F-32-NEXT: shrb $6, %al
; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpbroadcastw %xmm0, %xmm0
-; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2
+; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: # kill: %BL<def> %BL<kill> %EBX<kill> %EBX<def>
; AVX512F-32-NEXT: shrb $7, %bl
; AVX512F-32-NEXT: kmovd %ebx, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpsllq $56, %xmm0, %xmm0
-; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT: vpsllq $56, %xmm2, %xmm2
+; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movl %ecx, %eax
; AVX512F-32-NEXT: shrl $24, %eax
; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpbroadcastq %xmm0, %ymm0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT: vpbroadcastq %xmm2, %ymm2
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movb %al, %dl
; AVX512F-32-NEXT: andb $2, %dl
; AVX512F-32-NEXT: shrb %dl
; AVX512F-32-NEXT: kmovd %edx, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6]
-; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6]
+; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
; AVX512F-32-NEXT: movb %al, %dl
; AVX512F-32-NEXT: andb $15, %dl
; AVX512F-32-NEXT: movb %dl, %al
; AVX512F-32-NEXT: shrb $2, %dl
; AVX512F-32-NEXT: kmovd %edx, %k0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k1
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm0
-; AVX512F-32-NEXT: vpbroadcastw %xmm0, %xmm0
-; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k1
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm2
+; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2
+; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm3
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
; AVX512F-32-NEXT: shrb $3, %al
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm0
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm2
; AVX512F-32-NEXT: kmovd %eax, %k0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4]
-; AVX512F-32-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm0, %ymm4
-; AVX512F-32-NEXT: vpblendvb %ymm3, %ymm4, %ymm1, %ymm1
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1,2,3,4]
+; AVX512F-32-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm2, %ymm4
+; AVX512F-32-NEXT: vpblendvb %ymm6, %ymm4, %ymm3, %ymm3
; AVX512F-32-NEXT: movl %ecx, %eax
; AVX512F-32-NEXT: shrl $29, %eax
; AVX512F-32-NEXT: andb $1, %al
; AVX512F-32-NEXT: kmovd %eax, %k0
; AVX512F-32-NEXT: movl %ecx, %eax
; AVX512F-32-NEXT: shrl $28, %eax
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm3
-; AVX512F-32-NEXT: vpbroadcastd %xmm3, %xmm3
-; AVX512F-32-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm1, %ymm3, %ymm1
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1,2]
-; AVX512F-32-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm1, %ymm3, %ymm1
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k1
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm2, %ymm3
+; AVX512F-32-NEXT: kmovd %eax, %k1
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm4
+; AVX512F-32-NEXT: vpbroadcastd %xmm4, %xmm4
+; AVX512F-32-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k1
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm2, %ymm3
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm4
+; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1,2]
+; AVX512F-32-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3
; AVX512F-32-NEXT: movl %ecx, %eax
; AVX512F-32-NEXT: shrl $30, %eax
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm0
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm2
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm2, %ymm3
; AVX512F-32-NEXT: kmovd %eax, %k0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vpbroadcastw %xmm3, %xmm3
-; AVX512F-32-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512F-32-NEXT: vpblendvb %ymm2, %ymm1, %ymm3, %ymm1
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm4
+; AVX512F-32-NEXT: vpbroadcastw %xmm4, %xmm4
+; AVX512F-32-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm3, %ymm4, %ymm3
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2
+; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movl %ecx, %eax
; AVX512F-32-NEXT: shrl $31, %eax
; AVX512F-32-NEXT: kshiftlq $1, %k0, %k0
@@ -3283,29 +3279,29 @@ define i64 @test_mask_x86_avx512_ucmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %m
; AVX512F-32-NEXT: kmovd %eax, %k1
; AVX512F-32-NEXT: kshiftlq $63, %k1, %k1
; AVX512F-32-NEXT: korq %k1, %k0, %k1
-; AVX512F-32-NEXT: vpcmpeqb %zmm6, %zmm5, %k0 {%k1}
-; AVX512F-32-NEXT: vpcmpltub %zmm6, %zmm5, %k2 {%k1}
-; AVX512F-32-NEXT: vpcmpleub %zmm6, %zmm5, %k3 {%k1}
-; AVX512F-32-NEXT: vpcmpneqb %zmm6, %zmm5, %k4 {%k1}
-; AVX512F-32-NEXT: vpcmpnltub %zmm6, %zmm5, %k5 {%k1}
-; AVX512F-32-NEXT: vpcmpnleub %zmm6, %zmm5, %k1 {%k1}
+; AVX512F-32-NEXT: vpcmpeqb %zmm1, %zmm0, %k0 {%k1}
; AVX512F-32-NEXT: kmovq %k0, (%esp)
; AVX512F-32-NEXT: movl (%esp), %eax
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %edx
-; AVX512F-32-NEXT: kmovq %k2, {{[0-9]+}}(%esp)
+; AVX512F-32-NEXT: vpcmpltub %zmm1, %zmm0, %k0 {%k1}
+; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp)
; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax
; AVX512F-32-NEXT: adcxl {{[0-9]+}}(%esp), %edx
-; AVX512F-32-NEXT: kmovq %k3, {{[0-9]+}}(%esp)
+; AVX512F-32-NEXT: vpcmpleub %zmm1, %zmm0, %k0 {%k1}
+; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp)
; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax
; AVX512F-32-NEXT: adcxl {{[0-9]+}}(%esp), %edx
; AVX512F-32-NEXT: kxorq %k0, %k0, %k0
; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp)
; AVX512F-32-NEXT: orl {{[0-9]+}}(%esp), %edx
; AVX512F-32-NEXT: orl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT: kmovq %k4, {{[0-9]+}}(%esp)
+; AVX512F-32-NEXT: vpcmpneqb %zmm1, %zmm0, %k0 {%k1}
+; AVX512F-32-NEXT: vpcmpnltub %zmm1, %zmm0, %k2 {%k1}
+; AVX512F-32-NEXT: vpcmpnleub %zmm1, %zmm0, %k1 {%k1}
+; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp)
; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax
; AVX512F-32-NEXT: adcxl {{[0-9]+}}(%esp), %edx
-; AVX512F-32-NEXT: kmovq %k5, {{[0-9]+}}(%esp)
+; AVX512F-32-NEXT: kmovq %k2, {{[0-9]+}}(%esp)
; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax
; AVX512F-32-NEXT: adcxl {{[0-9]+}}(%esp), %edx
; AVX512F-32-NEXT: kmovq %k1, {{[0-9]+}}(%esp)
diff --git a/llvm/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll
index f4504ed07fc..19dd83fb180 100644
--- a/llvm/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll
+++ b/llvm/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll
@@ -2750,23 +2750,23 @@ define <8 x i32> @test_mask_cmp_b_256(<32 x i8> %a0, <32 x i8> %a1, i32 %mask) {
; CHECK-NEXT: vpcmpgtb %ymm0, %ymm1, %k0 {%k1} ## encoding: [0x62,0xf1,0x75,0x29,0x64,0xc0]
; CHECK-NEXT: kmovd %k0, %r9d ## encoding: [0xc5,0x7b,0x93,0xc8]
; CHECK-NEXT: vpcmpleb %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x3f,0xc1,0x02]
-; CHECK-NEXT: kmovd %k0, %r10d ## encoding: [0xc5,0x7b,0x93,0xd0]
-; CHECK-NEXT: kxord %k0, %k0, %k0 ## encoding: [0xc4,0xe1,0xfd,0x47,0xc0]
-; CHECK-NEXT: kmovd %k0, %esi ## encoding: [0xc5,0xfb,0x93,0xf0]
+; CHECK-NEXT: kmovd %k0, %edx ## encoding: [0xc5,0xfb,0x93,0xd0]
; CHECK-NEXT: vpcmpneqb %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x3f,0xc1,0x04]
-; CHECK-NEXT: kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0]
+; CHECK-NEXT: kmovd %k0, %esi ## encoding: [0xc5,0xfb,0x93,0xf0]
; CHECK-NEXT: vpcmpleb %ymm0, %ymm1, %k0 {%k1} ## encoding: [0x62,0xf3,0x75,0x29,0x3f,0xc0,0x02]
-; CHECK-NEXT: kmovd %k0, %ecx ## encoding: [0xc5,0xfb,0x93,0xc8]
+; CHECK-NEXT: kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0]
; CHECK-NEXT: vpcmpgtb %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x64,0xc1]
-; CHECK-NEXT: kmovd %k0, %edx ## encoding: [0xc5,0xfb,0x93,0xd0]
-; CHECK-NEXT: vmovd %eax, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc0]
-; CHECK-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x22,0xc1,0x01]
-; CHECK-NEXT: vpinsrd $2, %edx, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x22,0xc2,0x02]
+; CHECK-NEXT: kmovd %k0, %ecx ## encoding: [0xc5,0xfb,0x93,0xc8]
+; CHECK-NEXT: vmovd %esi, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc6]
+; CHECK-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x22,0xc0,0x01]
+; CHECK-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x22,0xc1,0x02]
; CHECK-NEXT: vpinsrd $3, %edi, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x22,0xc7,0x03]
; CHECK-NEXT: vmovd %r8d, %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xc1,0x79,0x6e,0xc8]
; CHECK-NEXT: vpinsrd $1, %r9d, %xmm1, %xmm1 ## encoding: [0xc4,0xc3,0x71,0x22,0xc9,0x01]
-; CHECK-NEXT: vpinsrd $2, %r10d, %xmm1, %xmm1 ## encoding: [0xc4,0xc3,0x71,0x22,0xca,0x02]
-; CHECK-NEXT: vpinsrd $3, %esi, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x22,0xce,0x03]
+; CHECK-NEXT: vpinsrd $2, %edx, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x22,0xca,0x02]
+; CHECK-NEXT: kxord %k0, %k0, %k0 ## encoding: [0xc4,0xe1,0xfd,0x47,0xc0]
+; CHECK-NEXT: kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0]
+; CHECK-NEXT: vpinsrd $3, %eax, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x22,0xc8,0x03]
; CHECK-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x75,0x38,0xc0,0x01]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res0 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 0, i32 %mask)
@@ -2848,23 +2848,23 @@ define <8 x i32> @test_mask_ucmp_b_256(<32 x i8> %a0, <32 x i8> %a1, i32 %mask)
; CHECK-NEXT: vpcmpltub %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x3e,0xc1,0x01]
; CHECK-NEXT: kmovd %k0, %r9d ## encoding: [0xc5,0x7b,0x93,0xc8]
; CHECK-NEXT: vpcmpleub %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x3e,0xc1,0x02]
-; CHECK-NEXT: kmovd %k0, %r10d ## encoding: [0xc5,0x7b,0x93,0xd0]
-; CHECK-NEXT: kxord %k0, %k0, %k0 ## encoding: [0xc4,0xe1,0xfd,0x47,0xc0]
-; CHECK-NEXT: kmovd %k0, %esi ## encoding: [0xc5,0xfb,0x93,0xf0]
+; CHECK-NEXT: kmovd %k0, %edx ## encoding: [0xc5,0xfb,0x93,0xd0]
; CHECK-NEXT: vpcmpneqb %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x3f,0xc1,0x04]
-; CHECK-NEXT: kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0]
+; CHECK-NEXT: kmovd %k0, %esi ## encoding: [0xc5,0xfb,0x93,0xf0]
; CHECK-NEXT: vpcmpnltub %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x3e,0xc1,0x05]
-; CHECK-NEXT: kmovd %k0, %ecx ## encoding: [0xc5,0xfb,0x93,0xc8]
+; CHECK-NEXT: kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0]
; CHECK-NEXT: vpcmpnleub %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x3e,0xc1,0x06]
-; CHECK-NEXT: kmovd %k0, %edx ## encoding: [0xc5,0xfb,0x93,0xd0]
-; CHECK-NEXT: vmovd %eax, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc0]
-; CHECK-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x22,0xc1,0x01]
-; CHECK-NEXT: vpinsrd $2, %edx, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x22,0xc2,0x02]
+; CHECK-NEXT: kmovd %k0, %ecx ## encoding: [0xc5,0xfb,0x93,0xc8]
+; CHECK-NEXT: vmovd %esi, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc6]
+; CHECK-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x22,0xc0,0x01]
+; CHECK-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x22,0xc1,0x02]
; CHECK-NEXT: vpinsrd $3, %edi, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x22,0xc7,0x03]
; CHECK-NEXT: vmovd %r8d, %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xc1,0x79,0x6e,0xc8]
; CHECK-NEXT: vpinsrd $1, %r9d, %xmm1, %xmm1 ## encoding: [0xc4,0xc3,0x71,0x22,0xc9,0x01]
-; CHECK-NEXT: vpinsrd $2, %r10d, %xmm1, %xmm1 ## encoding: [0xc4,0xc3,0x71,0x22,0xca,0x02]
-; CHECK-NEXT: vpinsrd $3, %esi, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x22,0xce,0x03]
+; CHECK-NEXT: vpinsrd $2, %edx, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x22,0xca,0x02]
+; CHECK-NEXT: kxord %k0, %k0, %k0 ## encoding: [0xc4,0xe1,0xfd,0x47,0xc0]
+; CHECK-NEXT: kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0]
+; CHECK-NEXT: vpinsrd $3, %eax, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x22,0xc8,0x03]
; CHECK-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x75,0x38,0xc0,0x01]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res0 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 0, i32 %mask)
diff --git a/llvm/test/CodeGen/X86/avx512dq-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx512dq-intrinsics-upgrade.ll
index 482a054a5ff..fb297a90fad 100644
--- a/llvm/test/CodeGen/X86/avx512dq-intrinsics-upgrade.ll
+++ b/llvm/test/CodeGen/X86/avx512dq-intrinsics-upgrade.ll
@@ -6,7 +6,6 @@ declare <2 x double> @llvm.x86.avx512.mask.vextractf64x2.512(<8 x double>, i32,
define <2 x double>@test_int_x86_avx512_mask_vextractf64x2_512(<8 x double> %x0, <2 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_vextractf64x2_512:
; CHECK: ## BB#0:
-; CHECK-NEXT: vextractf32x4 $1, %zmm0, %xmm0
; CHECK-NEXT: kmovw %edi, %k0
; CHECK-NEXT: kshiftlb $7, %k0, %k1
; CHECK-NEXT: kshiftrb $7, %k1, %k1
@@ -16,6 +15,7 @@ define <2 x double>@test_int_x86_avx512_mask_vextractf64x2_512(<8 x double> %x0,
; CHECK-NEXT: kmovw %k1, %ecx
; CHECK-NEXT: vmovd %ecx, %xmm2
; CHECK-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2
+; CHECK-NEXT: vextractf32x4 $1, %zmm0, %xmm0
; CHECK-NEXT: vpsllq $63, %xmm2, %xmm2
; CHECK-NEXT: vpsraq $63, %zmm2, %zmm2
; CHECK-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm1
diff --git a/llvm/test/CodeGen/X86/avx512vl-vec-cmp.ll b/llvm/test/CodeGen/X86/avx512vl-vec-cmp.ll
index 43b1f53a09f..e7d8c889a02 100644
--- a/llvm/test/CodeGen/X86/avx512vl-vec-cmp.ll
+++ b/llvm/test/CodeGen/X86/avx512vl-vec-cmp.ll
@@ -314,8 +314,8 @@ define <4 x i64> @test256_11(<4 x i64> %x, <4 x i64>* %y.ptr, <4 x i64> %x1, <4
;
; NoVLX-LABEL: test256_11:
; NoVLX: # BB#0:
-; NoVLX-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm2
; NoVLX-NEXT: vpcmpgtq (%rdi), %ymm0, %ymm3
+; NoVLX-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm2
; NoVLX-NEXT: vpand %ymm2, %ymm3, %ymm2
; NoVLX-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
; NoVLX-NEXT: retq
@@ -824,8 +824,8 @@ define <2 x i64> @test128_11(<2 x i64> %x, <2 x i64>* %y.ptr, <2 x i64> %x1, <2
;
; NoVLX-LABEL: test128_11:
; NoVLX: # BB#0:
-; NoVLX-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm2
; NoVLX-NEXT: vpcmpgtq (%rdi), %xmm0, %xmm3
+; NoVLX-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm2
; NoVLX-NEXT: vpand %xmm2, %xmm3, %xmm2
; NoVLX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/avx512vl-vec-masked-cmp.ll b/llvm/test/CodeGen/X86/avx512vl-vec-masked-cmp.ll
index c8a1682e42f..f0d23e48716 100644
--- a/llvm/test/CodeGen/X86/avx512vl-vec-masked-cmp.ll
+++ b/llvm/test/CodeGen/X86/avx512vl-vec-masked-cmp.ll
@@ -19,23 +19,8 @@ define zeroext i32 @test_vpcmpeqb_v16i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__
; NoVLX-NEXT: movq %rsp, %rbp
; NoVLX-NEXT: .Lcfi2:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
-; NoVLX-NEXT: pushq %r15
-; NoVLX-NEXT: pushq %r14
-; NoVLX-NEXT: pushq %r13
-; NoVLX-NEXT: pushq %r12
-; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
-; NoVLX-NEXT: .Lcfi3:
-; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi4:
-; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi5:
-; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi6:
-; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi7:
-; NoVLX-NEXT: .cfi_offset %r15, -24
; NoVLX-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -44,64 +29,64 @@ define zeroext i32 @test_vpcmpeqb_v16i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__
; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $15, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $13, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: vmovd %ecx, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
@@ -110,12 +95,7 @@ define zeroext i32 @test_vpcmpeqb_v16i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
-; NoVLX-NEXT: leaq -40(%rbp), %rsp
-; NoVLX-NEXT: popq %rbx
-; NoVLX-NEXT: popq %r12
-; NoVLX-NEXT: popq %r13
-; NoVLX-NEXT: popq %r14
-; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
; NoVLX-NEXT: retq
entry:
@@ -137,30 +117,15 @@ define zeroext i32 @test_vpcmpeqb_v16i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>
; NoVLX-LABEL: test_vpcmpeqb_v16i1_v32i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi8:
+; NoVLX-NEXT: .Lcfi3:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi9:
+; NoVLX-NEXT: .Lcfi4:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi10:
+; NoVLX-NEXT: .Lcfi5:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
-; NoVLX-NEXT: pushq %r15
-; NoVLX-NEXT: pushq %r14
-; NoVLX-NEXT: pushq %r13
-; NoVLX-NEXT: pushq %r12
-; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
-; NoVLX-NEXT: .Lcfi11:
-; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi12:
-; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi13:
-; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi14:
-; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi15:
-; NoVLX-NEXT: .cfi_offset %r15, -24
; NoVLX-NEXT: vpcmpeqb (%rdi), %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -169,64 +134,64 @@ define zeroext i32 @test_vpcmpeqb_v16i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>
; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $15, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $13, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: vmovd %ecx, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
@@ -235,12 +200,7 @@ define zeroext i32 @test_vpcmpeqb_v16i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
-; NoVLX-NEXT: leaq -40(%rbp), %rsp
-; NoVLX-NEXT: popq %rbx
-; NoVLX-NEXT: popq %r12
-; NoVLX-NEXT: popq %r13
-; NoVLX-NEXT: popq %r14
-; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
; NoVLX-NEXT: retq
entry:
@@ -264,30 +224,15 @@ define zeroext i32 @test_masked_vpcmpeqb_v16i1_v32i1_mask(i16 zeroext %__u, <2 x
; NoVLX-LABEL: test_masked_vpcmpeqb_v16i1_v32i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi16:
+; NoVLX-NEXT: .Lcfi6:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi17:
+; NoVLX-NEXT: .Lcfi7:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi18:
+; NoVLX-NEXT: .Lcfi8:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
-; NoVLX-NEXT: pushq %r15
-; NoVLX-NEXT: pushq %r14
-; NoVLX-NEXT: pushq %r13
-; NoVLX-NEXT: pushq %r12
-; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
-; NoVLX-NEXT: .Lcfi19:
-; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi20:
-; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi21:
-; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi22:
-; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi23:
-; NoVLX-NEXT: .cfi_offset %r15, -24
; NoVLX-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -297,64 +242,64 @@ define zeroext i32 @test_masked_vpcmpeqb_v16i1_v32i1_mask(i16 zeroext %__u, <2 x
; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $15, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $13, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: vmovd %ecx, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
@@ -363,12 +308,7 @@ define zeroext i32 @test_masked_vpcmpeqb_v16i1_v32i1_mask(i16 zeroext %__u, <2 x
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
-; NoVLX-NEXT: leaq -40(%rbp), %rsp
-; NoVLX-NEXT: popq %rbx
-; NoVLX-NEXT: popq %r12
-; NoVLX-NEXT: popq %r13
-; NoVLX-NEXT: popq %r14
-; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
; NoVLX-NEXT: retq
entry:
@@ -393,30 +333,15 @@ define zeroext i32 @test_masked_vpcmpeqb_v16i1_v32i1_mask_mem(i16 zeroext %__u,
; NoVLX-LABEL: test_masked_vpcmpeqb_v16i1_v32i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi24:
+; NoVLX-NEXT: .Lcfi9:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi25:
+; NoVLX-NEXT: .Lcfi10:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi26:
+; NoVLX-NEXT: .Lcfi11:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
-; NoVLX-NEXT: pushq %r15
-; NoVLX-NEXT: pushq %r14
-; NoVLX-NEXT: pushq %r13
-; NoVLX-NEXT: pushq %r12
-; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
-; NoVLX-NEXT: .Lcfi27:
-; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi28:
-; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi29:
-; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi30:
-; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi31:
-; NoVLX-NEXT: .cfi_offset %r15, -24
; NoVLX-NEXT: vpcmpeqb (%rsi), %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -426,64 +351,64 @@ define zeroext i32 @test_masked_vpcmpeqb_v16i1_v32i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $15, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $13, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: vmovd %ecx, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
@@ -492,12 +417,7 @@ define zeroext i32 @test_masked_vpcmpeqb_v16i1_v32i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
-; NoVLX-NEXT: leaq -40(%rbp), %rsp
-; NoVLX-NEXT: popq %rbx
-; NoVLX-NEXT: popq %r12
-; NoVLX-NEXT: popq %r13
-; NoVLX-NEXT: popq %r14
-; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
; NoVLX-NEXT: retq
entry:
@@ -523,12 +443,12 @@ define zeroext i64 @test_vpcmpeqb_v16i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__
; NoVLX-LABEL: test_vpcmpeqb_v16i1_v64i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi32:
+; NoVLX-NEXT: .Lcfi12:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi33:
+; NoVLX-NEXT: .Lcfi13:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi34:
+; NoVLX-NEXT: .Lcfi14:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: pushq %r15
; NoVLX-NEXT: pushq %r14
@@ -537,24 +457,20 @@ define zeroext i64 @test_vpcmpeqb_v16i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__
; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: .Lcfi35:
+; NoVLX-NEXT: .Lcfi15:
; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi36:
+; NoVLX-NEXT: .Lcfi16:
; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi37:
+; NoVLX-NEXT: .Lcfi17:
; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi38:
+; NoVLX-NEXT: .Lcfi18:
; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi39:
+; NoVLX-NEXT: .Lcfi19:
; NoVLX-NEXT: .cfi_offset %r15, -24
; NoVLX-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
@@ -597,11 +513,11 @@ define zeroext i64 @test_vpcmpeqb_v16i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__
; NoVLX-NEXT: kshiftlw $2, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $1, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k1, %r8d
; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
@@ -613,11 +529,15 @@ define zeroext i64 @test_vpcmpeqb_v16i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__
; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpinsrb $14, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -653,12 +573,12 @@ define zeroext i64 @test_vpcmpeqb_v16i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>
; NoVLX-LABEL: test_vpcmpeqb_v16i1_v64i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi40:
+; NoVLX-NEXT: .Lcfi20:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi41:
+; NoVLX-NEXT: .Lcfi21:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi42:
+; NoVLX-NEXT: .Lcfi22:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: pushq %r15
; NoVLX-NEXT: pushq %r14
@@ -667,24 +587,20 @@ define zeroext i64 @test_vpcmpeqb_v16i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>
; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: .Lcfi43:
+; NoVLX-NEXT: .Lcfi23:
; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi44:
+; NoVLX-NEXT: .Lcfi24:
; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi45:
+; NoVLX-NEXT: .Lcfi25:
; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi46:
+; NoVLX-NEXT: .Lcfi26:
; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi47:
+; NoVLX-NEXT: .Lcfi27:
; NoVLX-NEXT: .cfi_offset %r15, -24
; NoVLX-NEXT: vpcmpeqb (%rdi), %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
@@ -727,11 +643,11 @@ define zeroext i64 @test_vpcmpeqb_v16i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>
; NoVLX-NEXT: kshiftlw $2, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $1, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k1, %r8d
; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
@@ -743,11 +659,15 @@ define zeroext i64 @test_vpcmpeqb_v16i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>
; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpinsrb $14, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -785,12 +705,12 @@ define zeroext i64 @test_masked_vpcmpeqb_v16i1_v64i1_mask(i16 zeroext %__u, <2 x
; NoVLX-LABEL: test_masked_vpcmpeqb_v16i1_v64i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi48:
+; NoVLX-NEXT: .Lcfi28:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi49:
+; NoVLX-NEXT: .Lcfi29:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi50:
+; NoVLX-NEXT: .Lcfi30:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: pushq %r15
; NoVLX-NEXT: pushq %r14
@@ -799,25 +719,21 @@ define zeroext i64 @test_masked_vpcmpeqb_v16i1_v64i1_mask(i16 zeroext %__u, <2 x
; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: .Lcfi51:
+; NoVLX-NEXT: .Lcfi31:
; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi52:
+; NoVLX-NEXT: .Lcfi32:
; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi53:
+; NoVLX-NEXT: .Lcfi33:
; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi54:
+; NoVLX-NEXT: .Lcfi34:
; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi55:
+; NoVLX-NEXT: .Lcfi35:
; NoVLX-NEXT: .cfi_offset %r15, -24
; NoVLX-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
@@ -860,11 +776,11 @@ define zeroext i64 @test_masked_vpcmpeqb_v16i1_v64i1_mask(i16 zeroext %__u, <2 x
; NoVLX-NEXT: kshiftlw $2, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $1, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k1, %r8d
; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
@@ -876,11 +792,15 @@ define zeroext i64 @test_masked_vpcmpeqb_v16i1_v64i1_mask(i16 zeroext %__u, <2 x
; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpinsrb $14, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -919,12 +839,12 @@ define zeroext i64 @test_masked_vpcmpeqb_v16i1_v64i1_mask_mem(i16 zeroext %__u,
; NoVLX-LABEL: test_masked_vpcmpeqb_v16i1_v64i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi56:
+; NoVLX-NEXT: .Lcfi36:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi57:
+; NoVLX-NEXT: .Lcfi37:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi58:
+; NoVLX-NEXT: .Lcfi38:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: pushq %r15
; NoVLX-NEXT: pushq %r14
@@ -933,25 +853,21 @@ define zeroext i64 @test_masked_vpcmpeqb_v16i1_v64i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: .Lcfi59:
+; NoVLX-NEXT: .Lcfi39:
; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi60:
+; NoVLX-NEXT: .Lcfi40:
; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi61:
+; NoVLX-NEXT: .Lcfi41:
; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi62:
+; NoVLX-NEXT: .Lcfi42:
; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi63:
+; NoVLX-NEXT: .Lcfi43:
; NoVLX-NEXT: .cfi_offset %r15, -24
; NoVLX-NEXT: vpcmpeqb (%rsi), %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
@@ -994,11 +910,11 @@ define zeroext i64 @test_masked_vpcmpeqb_v16i1_v64i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: kshiftlw $2, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $1, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k1, %r8d
; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
@@ -1010,11 +926,15 @@ define zeroext i64 @test_masked_vpcmpeqb_v16i1_v64i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpinsrb $14, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -1055,12 +975,12 @@ define zeroext i64 @test_vpcmpeqb_v32i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__
; NoVLX-LABEL: test_vpcmpeqb_v32i1_v64i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi64:
+; NoVLX-NEXT: .Lcfi44:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi65:
+; NoVLX-NEXT: .Lcfi45:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi66:
+; NoVLX-NEXT: .Lcfi46:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
@@ -1104,12 +1024,12 @@ define zeroext i64 @test_vpcmpeqb_v32i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>
; NoVLX-LABEL: test_vpcmpeqb_v32i1_v64i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi67:
+; NoVLX-NEXT: .Lcfi47:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi68:
+; NoVLX-NEXT: .Lcfi48:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi69:
+; NoVLX-NEXT: .Lcfi49:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
@@ -1155,12 +1075,12 @@ define zeroext i64 @test_masked_vpcmpeqb_v32i1_v64i1_mask(i32 zeroext %__u, <4 x
; NoVLX-LABEL: test_masked_vpcmpeqb_v32i1_v64i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi70:
+; NoVLX-NEXT: .Lcfi50:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi71:
+; NoVLX-NEXT: .Lcfi51:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi72:
+; NoVLX-NEXT: .Lcfi52:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $96, %rsp
@@ -1216,12 +1136,12 @@ define zeroext i64 @test_masked_vpcmpeqb_v32i1_v64i1_mask_mem(i32 zeroext %__u,
; NoVLX-LABEL: test_masked_vpcmpeqb_v32i1_v64i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi73:
+; NoVLX-NEXT: .Lcfi53:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi74:
+; NoVLX-NEXT: .Lcfi54:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi75:
+; NoVLX-NEXT: .Lcfi55:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $96, %rsp
@@ -1400,12 +1320,12 @@ define zeroext i32 @test_vpcmpeqw_v8i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__b
; NoVLX-LABEL: test_vpcmpeqw_v8i1_v32i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi76:
+; NoVLX-NEXT: .Lcfi56:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi77:
+; NoVLX-NEXT: .Lcfi57:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi78:
+; NoVLX-NEXT: .Lcfi58:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -1475,12 +1395,12 @@ define zeroext i32 @test_vpcmpeqw_v8i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>*
; NoVLX-LABEL: test_vpcmpeqw_v8i1_v32i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi79:
+; NoVLX-NEXT: .Lcfi59:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi80:
+; NoVLX-NEXT: .Lcfi60:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi81:
+; NoVLX-NEXT: .Lcfi61:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -1552,12 +1472,12 @@ define zeroext i32 @test_masked_vpcmpeqw_v8i1_v32i1_mask(i8 zeroext %__u, <2 x i
; NoVLX-LABEL: test_masked_vpcmpeqw_v8i1_v32i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi82:
+; NoVLX-NEXT: .Lcfi62:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi83:
+; NoVLX-NEXT: .Lcfi63:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi84:
+; NoVLX-NEXT: .Lcfi64:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -1631,12 +1551,12 @@ define zeroext i32 @test_masked_vpcmpeqw_v8i1_v32i1_mask_mem(i8 zeroext %__u, <2
; NoVLX-LABEL: test_masked_vpcmpeqw_v8i1_v32i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi85:
+; NoVLX-NEXT: .Lcfi65:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi86:
+; NoVLX-NEXT: .Lcfi66:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi87:
+; NoVLX-NEXT: .Lcfi67:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -1711,12 +1631,12 @@ define zeroext i64 @test_vpcmpeqw_v8i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__b
; NoVLX-LABEL: test_vpcmpeqw_v8i1_v64i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi88:
+; NoVLX-NEXT: .Lcfi68:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi89:
+; NoVLX-NEXT: .Lcfi69:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi90:
+; NoVLX-NEXT: .Lcfi70:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
@@ -1724,43 +1644,43 @@ define zeroext i64 @test_vpcmpeqw_v8i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__b
; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $15, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %ecx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -1791,12 +1711,12 @@ define zeroext i64 @test_vpcmpeqw_v8i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>*
; NoVLX-LABEL: test_vpcmpeqw_v8i1_v64i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi91:
+; NoVLX-NEXT: .Lcfi71:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi92:
+; NoVLX-NEXT: .Lcfi72:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi93:
+; NoVLX-NEXT: .Lcfi73:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
@@ -1804,43 +1724,43 @@ define zeroext i64 @test_vpcmpeqw_v8i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>*
; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $15, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %ecx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -1873,12 +1793,12 @@ define zeroext i64 @test_masked_vpcmpeqw_v8i1_v64i1_mask(i8 zeroext %__u, <2 x i
; NoVLX-LABEL: test_masked_vpcmpeqw_v8i1_v64i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi94:
+; NoVLX-NEXT: .Lcfi74:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi95:
+; NoVLX-NEXT: .Lcfi75:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi96:
+; NoVLX-NEXT: .Lcfi76:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
@@ -1887,43 +1807,43 @@ define zeroext i64 @test_masked_vpcmpeqw_v8i1_v64i1_mask(i8 zeroext %__u, <2 x i
; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $15, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %ecx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -1957,12 +1877,12 @@ define zeroext i64 @test_masked_vpcmpeqw_v8i1_v64i1_mask_mem(i8 zeroext %__u, <2
; NoVLX-LABEL: test_masked_vpcmpeqw_v8i1_v64i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi97:
+; NoVLX-NEXT: .Lcfi77:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi98:
+; NoVLX-NEXT: .Lcfi78:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi99:
+; NoVLX-NEXT: .Lcfi79:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
@@ -1971,43 +1891,43 @@ define zeroext i64 @test_masked_vpcmpeqw_v8i1_v64i1_mask_mem(i8 zeroext %__u, <2
; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $15, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %ecx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -2043,30 +1963,15 @@ define zeroext i32 @test_vpcmpeqw_v16i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__
; NoVLX-LABEL: test_vpcmpeqw_v16i1_v32i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi100:
+; NoVLX-NEXT: .Lcfi80:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi101:
+; NoVLX-NEXT: .Lcfi81:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi102:
+; NoVLX-NEXT: .Lcfi82:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
-; NoVLX-NEXT: pushq %r15
-; NoVLX-NEXT: pushq %r14
-; NoVLX-NEXT: pushq %r13
-; NoVLX-NEXT: pushq %r12
-; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
-; NoVLX-NEXT: .Lcfi103:
-; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi104:
-; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi105:
-; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi106:
-; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi107:
-; NoVLX-NEXT: .cfi_offset %r15, -24
; NoVLX-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0
; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -2075,64 +1980,64 @@ define zeroext i32 @test_vpcmpeqw_v16i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__
; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $15, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $13, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: vmovd %ecx, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
@@ -2141,12 +2046,7 @@ define zeroext i32 @test_vpcmpeqw_v16i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
-; NoVLX-NEXT: leaq -40(%rbp), %rsp
-; NoVLX-NEXT: popq %rbx
-; NoVLX-NEXT: popq %r12
-; NoVLX-NEXT: popq %r13
-; NoVLX-NEXT: popq %r14
-; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
; NoVLX-NEXT: retq
entry:
@@ -2169,30 +2069,15 @@ define zeroext i32 @test_vpcmpeqw_v16i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64>
; NoVLX-LABEL: test_vpcmpeqw_v16i1_v32i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi108:
+; NoVLX-NEXT: .Lcfi83:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi109:
+; NoVLX-NEXT: .Lcfi84:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi110:
+; NoVLX-NEXT: .Lcfi85:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
-; NoVLX-NEXT: pushq %r15
-; NoVLX-NEXT: pushq %r14
-; NoVLX-NEXT: pushq %r13
-; NoVLX-NEXT: pushq %r12
-; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
-; NoVLX-NEXT: .Lcfi111:
-; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi112:
-; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi113:
-; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi114:
-; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi115:
-; NoVLX-NEXT: .cfi_offset %r15, -24
; NoVLX-NEXT: vpcmpeqw (%rdi), %ymm0, %ymm0
; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -2201,64 +2086,64 @@ define zeroext i32 @test_vpcmpeqw_v16i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64>
; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $15, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $13, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: vmovd %ecx, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
@@ -2267,12 +2152,7 @@ define zeroext i32 @test_vpcmpeqw_v16i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64>
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
-; NoVLX-NEXT: leaq -40(%rbp), %rsp
-; NoVLX-NEXT: popq %rbx
-; NoVLX-NEXT: popq %r12
-; NoVLX-NEXT: popq %r13
-; NoVLX-NEXT: popq %r14
-; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
; NoVLX-NEXT: retq
entry:
@@ -2297,30 +2177,15 @@ define zeroext i32 @test_masked_vpcmpeqw_v16i1_v32i1_mask(i16 zeroext %__u, <4 x
; NoVLX-LABEL: test_masked_vpcmpeqw_v16i1_v32i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi116:
+; NoVLX-NEXT: .Lcfi86:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi117:
+; NoVLX-NEXT: .Lcfi87:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi118:
+; NoVLX-NEXT: .Lcfi88:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
-; NoVLX-NEXT: pushq %r15
-; NoVLX-NEXT: pushq %r14
-; NoVLX-NEXT: pushq %r13
-; NoVLX-NEXT: pushq %r12
-; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
-; NoVLX-NEXT: .Lcfi119:
-; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi120:
-; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi121:
-; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi122:
-; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi123:
-; NoVLX-NEXT: .cfi_offset %r15, -24
; NoVLX-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0
; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -2330,64 +2195,64 @@ define zeroext i32 @test_masked_vpcmpeqw_v16i1_v32i1_mask(i16 zeroext %__u, <4 x
; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $15, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $13, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: vmovd %ecx, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
@@ -2396,12 +2261,7 @@ define zeroext i32 @test_masked_vpcmpeqw_v16i1_v32i1_mask(i16 zeroext %__u, <4 x
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
-; NoVLX-NEXT: leaq -40(%rbp), %rsp
-; NoVLX-NEXT: popq %rbx
-; NoVLX-NEXT: popq %r12
-; NoVLX-NEXT: popq %r13
-; NoVLX-NEXT: popq %r14
-; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
; NoVLX-NEXT: retq
entry:
@@ -2427,30 +2287,15 @@ define zeroext i32 @test_masked_vpcmpeqw_v16i1_v32i1_mask_mem(i16 zeroext %__u,
; NoVLX-LABEL: test_masked_vpcmpeqw_v16i1_v32i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi124:
+; NoVLX-NEXT: .Lcfi89:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi125:
+; NoVLX-NEXT: .Lcfi90:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi126:
+; NoVLX-NEXT: .Lcfi91:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
-; NoVLX-NEXT: pushq %r15
-; NoVLX-NEXT: pushq %r14
-; NoVLX-NEXT: pushq %r13
-; NoVLX-NEXT: pushq %r12
-; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
-; NoVLX-NEXT: .Lcfi127:
-; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi128:
-; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi129:
-; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi130:
-; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi131:
-; NoVLX-NEXT: .cfi_offset %r15, -24
; NoVLX-NEXT: vpcmpeqw (%rsi), %ymm0, %ymm0
; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -2460,64 +2305,64 @@ define zeroext i32 @test_masked_vpcmpeqw_v16i1_v32i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $15, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $13, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: vmovd %ecx, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
@@ -2526,12 +2371,7 @@ define zeroext i32 @test_masked_vpcmpeqw_v16i1_v32i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
-; NoVLX-NEXT: leaq -40(%rbp), %rsp
-; NoVLX-NEXT: popq %rbx
-; NoVLX-NEXT: popq %r12
-; NoVLX-NEXT: popq %r13
-; NoVLX-NEXT: popq %r14
-; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
; NoVLX-NEXT: retq
entry:
@@ -2558,12 +2398,12 @@ define zeroext i64 @test_vpcmpeqw_v16i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__
; NoVLX-LABEL: test_vpcmpeqw_v16i1_v64i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi132:
+; NoVLX-NEXT: .Lcfi92:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi133:
+; NoVLX-NEXT: .Lcfi93:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi134:
+; NoVLX-NEXT: .Lcfi94:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: pushq %r15
; NoVLX-NEXT: pushq %r14
@@ -2572,24 +2412,20 @@ define zeroext i64 @test_vpcmpeqw_v16i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__
; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: .Lcfi135:
+; NoVLX-NEXT: .Lcfi95:
; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi136:
+; NoVLX-NEXT: .Lcfi96:
; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi137:
+; NoVLX-NEXT: .Lcfi97:
; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi138:
+; NoVLX-NEXT: .Lcfi98:
; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi139:
+; NoVLX-NEXT: .Lcfi99:
; NoVLX-NEXT: .cfi_offset %r15, -24
; NoVLX-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0
; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
@@ -2632,11 +2468,11 @@ define zeroext i64 @test_vpcmpeqw_v16i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__
; NoVLX-NEXT: kshiftlw $2, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $1, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k1, %r8d
; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
@@ -2648,11 +2484,15 @@ define zeroext i64 @test_vpcmpeqw_v16i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__
; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpinsrb $14, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -2689,12 +2529,12 @@ define zeroext i64 @test_vpcmpeqw_v16i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>
; NoVLX-LABEL: test_vpcmpeqw_v16i1_v64i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi140:
+; NoVLX-NEXT: .Lcfi100:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi141:
+; NoVLX-NEXT: .Lcfi101:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi142:
+; NoVLX-NEXT: .Lcfi102:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: pushq %r15
; NoVLX-NEXT: pushq %r14
@@ -2703,24 +2543,20 @@ define zeroext i64 @test_vpcmpeqw_v16i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>
; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: .Lcfi143:
+; NoVLX-NEXT: .Lcfi103:
; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi144:
+; NoVLX-NEXT: .Lcfi104:
; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi145:
+; NoVLX-NEXT: .Lcfi105:
; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi146:
+; NoVLX-NEXT: .Lcfi106:
; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi147:
+; NoVLX-NEXT: .Lcfi107:
; NoVLX-NEXT: .cfi_offset %r15, -24
; NoVLX-NEXT: vpcmpeqw (%rdi), %ymm0, %ymm0
; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
@@ -2763,11 +2599,11 @@ define zeroext i64 @test_vpcmpeqw_v16i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>
; NoVLX-NEXT: kshiftlw $2, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $1, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k1, %r8d
; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
@@ -2779,11 +2615,15 @@ define zeroext i64 @test_vpcmpeqw_v16i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>
; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpinsrb $14, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -2822,12 +2662,12 @@ define zeroext i64 @test_masked_vpcmpeqw_v16i1_v64i1_mask(i16 zeroext %__u, <4 x
; NoVLX-LABEL: test_masked_vpcmpeqw_v16i1_v64i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi148:
+; NoVLX-NEXT: .Lcfi108:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi149:
+; NoVLX-NEXT: .Lcfi109:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi150:
+; NoVLX-NEXT: .Lcfi110:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: pushq %r15
; NoVLX-NEXT: pushq %r14
@@ -2836,25 +2676,21 @@ define zeroext i64 @test_masked_vpcmpeqw_v16i1_v64i1_mask(i16 zeroext %__u, <4 x
; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: .Lcfi151:
+; NoVLX-NEXT: .Lcfi111:
; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi152:
+; NoVLX-NEXT: .Lcfi112:
; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi153:
+; NoVLX-NEXT: .Lcfi113:
; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi154:
+; NoVLX-NEXT: .Lcfi114:
; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi155:
+; NoVLX-NEXT: .Lcfi115:
; NoVLX-NEXT: .cfi_offset %r15, -24
; NoVLX-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0
; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
@@ -2897,11 +2733,11 @@ define zeroext i64 @test_masked_vpcmpeqw_v16i1_v64i1_mask(i16 zeroext %__u, <4 x
; NoVLX-NEXT: kshiftlw $2, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $1, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k1, %r8d
; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
@@ -2913,11 +2749,15 @@ define zeroext i64 @test_masked_vpcmpeqw_v16i1_v64i1_mask(i16 zeroext %__u, <4 x
; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpinsrb $14, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -2957,12 +2797,12 @@ define zeroext i64 @test_masked_vpcmpeqw_v16i1_v64i1_mask_mem(i16 zeroext %__u,
; NoVLX-LABEL: test_masked_vpcmpeqw_v16i1_v64i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi156:
+; NoVLX-NEXT: .Lcfi116:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi157:
+; NoVLX-NEXT: .Lcfi117:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi158:
+; NoVLX-NEXT: .Lcfi118:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: pushq %r15
; NoVLX-NEXT: pushq %r14
@@ -2971,25 +2811,21 @@ define zeroext i64 @test_masked_vpcmpeqw_v16i1_v64i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: .Lcfi159:
+; NoVLX-NEXT: .Lcfi119:
; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi160:
+; NoVLX-NEXT: .Lcfi120:
; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi161:
+; NoVLX-NEXT: .Lcfi121:
; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi162:
+; NoVLX-NEXT: .Lcfi122:
; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi163:
+; NoVLX-NEXT: .Lcfi123:
; NoVLX-NEXT: .cfi_offset %r15, -24
; NoVLX-NEXT: vpcmpeqw (%rsi), %ymm0, %ymm0
; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
@@ -3032,11 +2868,11 @@ define zeroext i64 @test_masked_vpcmpeqw_v16i1_v64i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: kshiftlw $2, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $1, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k1, %r8d
; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
@@ -3048,11 +2884,15 @@ define zeroext i64 @test_masked_vpcmpeqw_v16i1_v64i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpinsrb $14, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -3093,62 +2933,58 @@ define zeroext i64 @test_vpcmpeqw_v32i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %__
; NoVLX-LABEL: test_vpcmpeqw_v32i1_v64i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi164:
+; NoVLX-NEXT: .Lcfi124:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi165:
+; NoVLX-NEXT: .Lcfi125:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi166:
+; NoVLX-NEXT: .Lcfi126:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: vextracti32x4 $1, %zmm0, %xmm3
-; NoVLX-NEXT: vmovq %xmm3, %rax
+; NoVLX-NEXT: vextracti32x4 $1, %zmm0, %xmm2
+; NoVLX-NEXT: vmovq %xmm2, %rax
; NoVLX-NEXT: movq %rax, %rcx
; NoVLX-NEXT: movq %rax, %rdx
-; NoVLX-NEXT: vmovd %eax, %xmm2
+; NoVLX-NEXT: vmovd %eax, %xmm3
; NoVLX-NEXT: shrl $16, %eax
-; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm5
-; NoVLX-NEXT: vextracti32x4 $2, %zmm1, %xmm8
-; NoVLX-NEXT: vextracti32x4 $3, %zmm1, %xmm4
-; NoVLX-NEXT: vextracti32x4 $1, %zmm1, %xmm6
-; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm7
-; NoVLX-NEXT: vextracti32x4 $3, %zmm0, %xmm2
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3
; NoVLX-NEXT: shrq $32, %rdx
-; NoVLX-NEXT: vpinsrw $2, %edx, %xmm5, %xmm5
-; NoVLX-NEXT: vpextrq $1, %xmm3, %rax
+; NoVLX-NEXT: vpinsrw $2, %edx, %xmm3, %xmm3
+; NoVLX-NEXT: vpextrq $1, %xmm2, %rax
; NoVLX-NEXT: shrq $48, %rcx
-; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm5, %xmm3
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm2
; NoVLX-NEXT: movl %eax, %ecx
; NoVLX-NEXT: shrl $16, %ecx
-; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3
-; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2
; NoVLX-NEXT: movq %rax, %rcx
; NoVLX-NEXT: shrq $32, %rcx
-; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2
; NoVLX-NEXT: vmovq %xmm0, %rcx
; NoVLX-NEXT: shrq $48, %rax
-; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm9
; NoVLX-NEXT: movl %ecx, %eax
; NoVLX-NEXT: shrl $16, %eax
-; NoVLX-NEXT: vmovd %ecx, %xmm5
-; NoVLX-NEXT: vpinsrw $1, %eax, %xmm5, %xmm5
+; NoVLX-NEXT: vmovd %ecx, %xmm3
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3
; NoVLX-NEXT: movq %rcx, %rax
; NoVLX-NEXT: shrq $32, %rax
-; NoVLX-NEXT: vpinsrw $2, %eax, %xmm5, %xmm5
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3
; NoVLX-NEXT: vpextrq $1, %xmm0, %rax
; NoVLX-NEXT: shrq $48, %rcx
-; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm5, %xmm0
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm3
; NoVLX-NEXT: movl %eax, %ecx
; NoVLX-NEXT: shrl $16, %ecx
-; NoVLX-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3
; NoVLX-NEXT: movq %rax, %rcx
; NoVLX-NEXT: shrq $32, %rcx
-; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: vmovq %xmm2, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: vextracti32x4 $3, %zmm0, %xmm4
+; NoVLX-NEXT: vmovq %xmm4, %rcx
; NoVLX-NEXT: shrq $48, %rax
-; NoVLX-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm3
; NoVLX-NEXT: movl %ecx, %eax
; NoVLX-NEXT: shrl $16, %eax
; NoVLX-NEXT: vmovd %ecx, %xmm5
@@ -3156,190 +2992,194 @@ define zeroext i64 @test_vpcmpeqw_v32i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %__
; NoVLX-NEXT: movq %rcx, %rax
; NoVLX-NEXT: shrq $32, %rax
; NoVLX-NEXT: vpinsrw $2, %eax, %xmm5, %xmm5
-; NoVLX-NEXT: vpextrq $1, %xmm2, %rax
+; NoVLX-NEXT: vpextrq $1, %xmm4, %rax
; NoVLX-NEXT: shrq $48, %rcx
-; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm5, %xmm2
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm5, %xmm4
; NoVLX-NEXT: movl %eax, %ecx
; NoVLX-NEXT: shrl $16, %ecx
-; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm4, %xmm4
; NoVLX-NEXT: movq %rax, %rcx
; NoVLX-NEXT: shrq $32, %rcx
-; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2
-; NoVLX-NEXT: vmovq %xmm7, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm4, %xmm4
+; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm0
+; NoVLX-NEXT: vmovq %xmm0, %rcx
; NoVLX-NEXT: shrq $48, %rax
-; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm5
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm4, %xmm4
; NoVLX-NEXT: movl %ecx, %eax
; NoVLX-NEXT: shrl $16, %eax
-; NoVLX-NEXT: vmovd %ecx, %xmm2
-; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vmovd %ecx, %xmm5
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm5, %xmm5
; NoVLX-NEXT: movq %rcx, %rax
; NoVLX-NEXT: shrq $32, %rax
-; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: vpextrq $1, %xmm7, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm5, %xmm5
+; NoVLX-NEXT: vpextrq $1, %xmm0, %rax
; NoVLX-NEXT: shrq $48, %rcx
-; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm5, %xmm0
; NoVLX-NEXT: movl %eax, %ecx
; NoVLX-NEXT: shrl $16, %ecx
-; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: movq %rax, %rcx
; NoVLX-NEXT: shrq $32, %rcx
-; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vextracti32x4 $1, %zmm1, %xmm6
; NoVLX-NEXT: vmovq %xmm6, %rcx
; NoVLX-NEXT: shrq $48, %rax
-; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm7
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm0, %xmm5
; NoVLX-NEXT: movl %ecx, %eax
; NoVLX-NEXT: shrl $16, %eax
-; NoVLX-NEXT: vmovd %ecx, %xmm2
-; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vmovd %ecx, %xmm0
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0
; NoVLX-NEXT: movq %rcx, %rax
; NoVLX-NEXT: shrq $32, %rax
-; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0
; NoVLX-NEXT: vpextrq $1, %xmm6, %rax
; NoVLX-NEXT: shrq $48, %rcx
-; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: movl %eax, %ecx
; NoVLX-NEXT: shrl $16, %ecx
-; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: movq %rax, %rcx
; NoVLX-NEXT: shrq $32, %rcx
-; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: vmovq %xmm1, %rcx
; NoVLX-NEXT: shrq $48, %rax
-; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm6
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm0, %xmm6
; NoVLX-NEXT: movl %ecx, %eax
; NoVLX-NEXT: shrl $16, %eax
-; NoVLX-NEXT: vmovd %ecx, %xmm2
-; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vmovd %ecx, %xmm0
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0
; NoVLX-NEXT: movq %rcx, %rax
; NoVLX-NEXT: shrq $32, %rax
-; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0
; NoVLX-NEXT: vpextrq $1, %xmm1, %rax
; NoVLX-NEXT: shrq $48, %rcx
-; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm1
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: movl %eax, %ecx
; NoVLX-NEXT: shrl $16, %ecx
-; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: movq %rax, %rcx
; NoVLX-NEXT: shrq $32, %rcx
-; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1
-; NoVLX-NEXT: vmovq %xmm4, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vextracti32x4 $3, %zmm1, %xmm7
+; NoVLX-NEXT: vmovq %xmm7, %rcx
; NoVLX-NEXT: shrq $48, %rax
-; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm0, %xmm8
; NoVLX-NEXT: movl %ecx, %eax
; NoVLX-NEXT: shrl $16, %eax
-; NoVLX-NEXT: vmovd %ecx, %xmm2
-; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vmovd %ecx, %xmm0
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0
; NoVLX-NEXT: movq %rcx, %rax
; NoVLX-NEXT: shrq $32, %rax
-; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: vpextrq $1, %xmm4, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrq $1, %xmm7, %rax
; NoVLX-NEXT: shrq $48, %rcx
-; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: movl %eax, %ecx
; NoVLX-NEXT: shrl $16, %ecx
-; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: movq %rax, %rcx
; NoVLX-NEXT: shrq $32, %rcx
-; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2
-; NoVLX-NEXT: vmovq %xmm8, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vextracti32x4 $2, %zmm1, %xmm1
+; NoVLX-NEXT: vmovq %xmm1, %rcx
; NoVLX-NEXT: shrq $48, %rax
-; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm0, %xmm7
; NoVLX-NEXT: movl %ecx, %eax
; NoVLX-NEXT: shrl $16, %eax
-; NoVLX-NEXT: vmovd %ecx, %xmm4
-; NoVLX-NEXT: vpinsrw $1, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: vmovd %ecx, %xmm0
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0
; NoVLX-NEXT: movq %rcx, %rax
; NoVLX-NEXT: shrq $32, %rax
-; NoVLX-NEXT: vpinsrw $2, %eax, %xmm4, %xmm4
-; NoVLX-NEXT: vpextrq $1, %xmm8, %rax
-; NoVLX-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0
-; NoVLX-NEXT: vinserti128 $1, %xmm5, %ymm7, %ymm3
-; NoVLX-NEXT: vinserti128 $1, %xmm6, %ymm1, %ymm1
-; NoVLX-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm0, %xmm2
+; NoVLX-NEXT: vpextrq $1, %xmm1, %rax
+; NoVLX-NEXT: vinserti128 $1, %xmm9, %ymm3, %ymm0
+; NoVLX-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm3
+; NoVLX-NEXT: vinserti128 $1, %xmm6, %ymm8, %ymm1
; NoVLX-NEXT: shrq $48, %rcx
-; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm4, %xmm1
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm2
; NoVLX-NEXT: movl %eax, %ecx
; NoVLX-NEXT: shrl $16, %ecx
-; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2
; NoVLX-NEXT: movq %rax, %rcx
; NoVLX-NEXT: shrq $48, %rax
; NoVLX-NEXT: shrq $32, %rcx
-; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1
-; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
-; NoVLX-NEXT: vpcmpeqw %ymm1, %ymm3, %ymm1
-; NoVLX-NEXT: vpmovsxwd %ymm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vinserti128 $1, %xmm7, %ymm2, %ymm2
+; NoVLX-NEXT: vpcmpeqw %ymm2, %ymm3, %ymm2
+; NoVLX-NEXT: vpmovsxwd %ymm2, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2
+; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $15, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vmovd %ecx, %xmm2
+; NoVLX-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2
; NoVLX-NEXT: kshiftlw $13, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2
; NoVLX-NEXT: kshiftlw $11, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2
; NoVLX-NEXT: kshiftlw $10, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2
; NoVLX-NEXT: kshiftlw $9, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2
; NoVLX-NEXT: kshiftlw $8, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2
; NoVLX-NEXT: kshiftlw $7, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2
; NoVLX-NEXT: kshiftlw $6, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2
; NoVLX-NEXT: kshiftlw $5, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2
; NoVLX-NEXT: kshiftlw $4, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2
; NoVLX-NEXT: kshiftlw $3, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2
; NoVLX-NEXT: kshiftlw $2, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2
; NoVLX-NEXT: kshiftlw $1, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm2, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
@@ -3444,69 +3284,68 @@ define zeroext i64 @test_vpcmpeqw_v32i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64>
; NoVLX-LABEL: test_vpcmpeqw_v32i1_v64i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi167:
+; NoVLX-NEXT: .Lcfi127:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi168:
+; NoVLX-NEXT: .Lcfi128:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi169:
+; NoVLX-NEXT: .Lcfi129:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: vextracti32x4 $1, %zmm0, %xmm2
-; NoVLX-NEXT: vmovq %xmm2, %rax
+; NoVLX-NEXT: vextracti32x4 $1, %zmm0, %xmm1
+; NoVLX-NEXT: vmovq %xmm1, %rax
; NoVLX-NEXT: movq %rax, %rcx
; NoVLX-NEXT: movq %rax, %rdx
-; NoVLX-NEXT: vmovd %eax, %xmm1
+; NoVLX-NEXT: vmovd %eax, %xmm2
; NoVLX-NEXT: shrl $16, %eax
-; NoVLX-NEXT: vpinsrw $1, %eax, %xmm1, %xmm3
-; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm1
-; NoVLX-NEXT: vextracti32x4 $3, %zmm0, %xmm4
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2
; NoVLX-NEXT: shrq $32, %rdx
-; NoVLX-NEXT: vpinsrw $2, %edx, %xmm3, %xmm3
-; NoVLX-NEXT: vpextrq $1, %xmm2, %rax
+; NoVLX-NEXT: vpinsrw $2, %edx, %xmm2, %xmm2
+; NoVLX-NEXT: vpextrq $1, %xmm1, %rax
; NoVLX-NEXT: shrq $48, %rcx
-; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm2
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm1
; NoVLX-NEXT: movl %eax, %ecx
; NoVLX-NEXT: shrl $16, %ecx
-; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: movq %rax, %rcx
; NoVLX-NEXT: shrq $32, %rcx
-; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vmovq %xmm0, %rcx
; NoVLX-NEXT: shrq $48, %rax
-; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1
; NoVLX-NEXT: movl %ecx, %eax
; NoVLX-NEXT: shrl $16, %eax
-; NoVLX-NEXT: vmovd %ecx, %xmm3
-; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vmovd %ecx, %xmm2
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2
; NoVLX-NEXT: movq %rcx, %rax
; NoVLX-NEXT: shrq $32, %rax
-; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2
; NoVLX-NEXT: vpextrq $1, %xmm0, %rax
; NoVLX-NEXT: shrq $48, %rcx
-; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm0
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm2
; NoVLX-NEXT: movl %eax, %ecx
; NoVLX-NEXT: shrl $16, %ecx
-; NoVLX-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2
; NoVLX-NEXT: movq %rax, %rcx
; NoVLX-NEXT: shrq $32, %rcx
-; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: vmovq %xmm4, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: vextracti32x4 $3, %zmm0, %xmm3
+; NoVLX-NEXT: vmovq %xmm3, %rcx
; NoVLX-NEXT: shrq $48, %rax
-; NoVLX-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm2
; NoVLX-NEXT: movl %ecx, %eax
; NoVLX-NEXT: shrl $16, %eax
-; NoVLX-NEXT: vmovd %ecx, %xmm3
-; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vmovd %ecx, %xmm4
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm4, %xmm4
; NoVLX-NEXT: movq %rcx, %rax
; NoVLX-NEXT: shrq $32, %rax
-; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3
-; NoVLX-NEXT: vpextrq $1, %xmm4, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: vpextrq $1, %xmm3, %rax
; NoVLX-NEXT: shrq $48, %rcx
-; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm4, %xmm3
; NoVLX-NEXT: movl %eax, %ecx
; NoVLX-NEXT: shrl $16, %ecx
; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3
@@ -3514,7 +3353,8 @@ define zeroext i64 @test_vpcmpeqw_v32i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64>
; NoVLX-NEXT: movq %rax, %rcx
; NoVLX-NEXT: shrq $32, %rcx
; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3
-; NoVLX-NEXT: vmovq %xmm1, %rcx
+; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm0
+; NoVLX-NEXT: vmovq %xmm0, %rcx
; NoVLX-NEXT: shrq $48, %rax
; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm3
; NoVLX-NEXT: movl %ecx, %eax
@@ -3524,19 +3364,19 @@ define zeroext i64 @test_vpcmpeqw_v32i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64>
; NoVLX-NEXT: movq %rcx, %rax
; NoVLX-NEXT: shrq $32, %rax
; NoVLX-NEXT: vpinsrw $2, %eax, %xmm4, %xmm4
-; NoVLX-NEXT: vpextrq $1, %xmm1, %rax
+; NoVLX-NEXT: vpextrq $1, %xmm0, %rax
; NoVLX-NEXT: shrq $48, %rcx
-; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm4, %xmm1
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm4, %xmm0
; NoVLX-NEXT: movl %eax, %ecx
; NoVLX-NEXT: shrl $16, %ecx
-; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: movq %rax, %rcx
; NoVLX-NEXT: shrq $32, %rcx
-; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1
-; NoVLX-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm4
+; NoVLX-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm0
; NoVLX-NEXT: shrq $48, %rax
-; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm4, %xmm1
; NoVLX-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
; NoVLX-NEXT: vpcmpeqw 32(%rdi), %ymm1, %ymm1
; NoVLX-NEXT: vpmovsxwd %ymm1, %zmm1
@@ -3712,12 +3552,12 @@ define zeroext i64 @test_masked_vpcmpeqw_v32i1_v64i1_mask(i32 zeroext %__u, <8 x
; NoVLX-LABEL: test_masked_vpcmpeqw_v32i1_v64i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi170:
+; NoVLX-NEXT: .Lcfi130:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi171:
+; NoVLX-NEXT: .Lcfi131:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi172:
+; NoVLX-NEXT: .Lcfi132:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $96, %rsp
@@ -3728,17 +3568,12 @@ define zeroext i64 @test_masked_vpcmpeqw_v32i1_v64i1_mask(i32 zeroext %__u, <8 x
; NoVLX-NEXT: movq %rax, %rdx
; NoVLX-NEXT: vmovd %eax, %xmm3
; NoVLX-NEXT: shrl $16, %eax
-; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm4
-; NoVLX-NEXT: vextracti32x4 $1, %zmm1, %xmm8
-; NoVLX-NEXT: vextracti32x4 $2, %zmm1, %xmm5
-; NoVLX-NEXT: vextracti32x4 $3, %zmm1, %xmm7
-; NoVLX-NEXT: vextracti32x4 $1, %zmm0, %xmm6
-; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm3
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3
; NoVLX-NEXT: shrq $32, %rdx
-; NoVLX-NEXT: vpinsrw $2, %edx, %xmm4, %xmm4
+; NoVLX-NEXT: vpinsrw $2, %edx, %xmm3, %xmm3
; NoVLX-NEXT: vpextrq $1, %xmm2, %rax
; NoVLX-NEXT: shrq $48, %rcx
-; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm4, %xmm2
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm2
; NoVLX-NEXT: movl %eax, %ecx
; NoVLX-NEXT: shrl $16, %ecx
; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2
@@ -3746,9 +3581,10 @@ define zeroext i64 @test_masked_vpcmpeqw_v32i1_v64i1_mask(i32 zeroext %__u, <8 x
; NoVLX-NEXT: movq %rax, %rcx
; NoVLX-NEXT: shrq $32, %rcx
; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm3
; NoVLX-NEXT: vmovq %xmm3, %rcx
; NoVLX-NEXT: shrq $48, %rax
-; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm9
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm8
; NoVLX-NEXT: movl %ecx, %eax
; NoVLX-NEXT: shrl $16, %eax
; NoVLX-NEXT: vmovd %ecx, %xmm4
@@ -3766,39 +3602,40 @@ define zeroext i64 @test_masked_vpcmpeqw_v32i1_v64i1_mask(i32 zeroext %__u, <8 x
; NoVLX-NEXT: movq %rax, %rcx
; NoVLX-NEXT: shrq $32, %rcx
; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3
-; NoVLX-NEXT: vmovq %xmm6, %rcx
+; NoVLX-NEXT: vextracti32x4 $1, %zmm0, %xmm4
+; NoVLX-NEXT: vmovq %xmm4, %rcx
; NoVLX-NEXT: shrq $48, %rax
-; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm4
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm3
; NoVLX-NEXT: movl %ecx, %eax
; NoVLX-NEXT: shrl $16, %eax
-; NoVLX-NEXT: vmovd %ecx, %xmm3
-; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vmovd %ecx, %xmm5
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm5, %xmm5
; NoVLX-NEXT: movq %rcx, %rax
; NoVLX-NEXT: shrq $32, %rax
-; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3
-; NoVLX-NEXT: vpextrq $1, %xmm6, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm5, %xmm5
+; NoVLX-NEXT: vpextrq $1, %xmm4, %rax
; NoVLX-NEXT: shrq $48, %rcx
-; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm5, %xmm4
; NoVLX-NEXT: movl %eax, %ecx
; NoVLX-NEXT: shrl $16, %ecx
-; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3
-; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm4, %xmm4
; NoVLX-NEXT: movq %rax, %rcx
; NoVLX-NEXT: shrq $32, %rcx
-; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm4, %xmm4
; NoVLX-NEXT: vmovq %xmm0, %rcx
; NoVLX-NEXT: shrq $48, %rax
-; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm6
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm4, %xmm4
; NoVLX-NEXT: movl %ecx, %eax
; NoVLX-NEXT: shrl $16, %eax
-; NoVLX-NEXT: vmovd %ecx, %xmm3
-; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vmovd %ecx, %xmm5
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm5, %xmm5
; NoVLX-NEXT: movq %rcx, %rax
; NoVLX-NEXT: shrq $32, %rax
-; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm5, %xmm5
; NoVLX-NEXT: vpextrq $1, %xmm0, %rax
; NoVLX-NEXT: shrq $48, %rcx
-; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm0
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm5, %xmm0
; NoVLX-NEXT: movl %eax, %ecx
; NoVLX-NEXT: shrl $16, %ecx
; NoVLX-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0
@@ -3806,69 +3643,72 @@ define zeroext i64 @test_masked_vpcmpeqw_v32i1_v64i1_mask(i32 zeroext %__u, <8 x
; NoVLX-NEXT: movq %rax, %rcx
; NoVLX-NEXT: shrq $32, %rcx
; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: vmovq %xmm7, %rcx
+; NoVLX-NEXT: vextracti32x4 $3, %zmm1, %xmm5
+; NoVLX-NEXT: vmovq %xmm5, %rcx
; NoVLX-NEXT: shrq $48, %rax
; NoVLX-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0
; NoVLX-NEXT: movl %ecx, %eax
; NoVLX-NEXT: shrl $16, %eax
-; NoVLX-NEXT: vmovd %ecx, %xmm3
-; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vmovd %ecx, %xmm6
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm6, %xmm6
; NoVLX-NEXT: movq %rcx, %rax
; NoVLX-NEXT: shrq $32, %rax
-; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3
-; NoVLX-NEXT: vpextrq $1, %xmm7, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm6, %xmm6
+; NoVLX-NEXT: vpextrq $1, %xmm5, %rax
; NoVLX-NEXT: shrq $48, %rcx
-; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm6, %xmm5
; NoVLX-NEXT: movl %eax, %ecx
; NoVLX-NEXT: shrl $16, %ecx
-; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3
-; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm5, %xmm5
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm5, %xmm5
; NoVLX-NEXT: movq %rax, %rcx
; NoVLX-NEXT: shrq $32, %rcx
-; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3
-; NoVLX-NEXT: vmovq %xmm5, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm5, %xmm5
+; NoVLX-NEXT: vextracti32x4 $2, %zmm1, %xmm6
+; NoVLX-NEXT: vmovq %xmm6, %rcx
; NoVLX-NEXT: shrq $48, %rax
-; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm7
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm5, %xmm5
; NoVLX-NEXT: movl %ecx, %eax
; NoVLX-NEXT: shrl $16, %eax
-; NoVLX-NEXT: vmovd %ecx, %xmm3
-; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vmovd %ecx, %xmm7
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm7, %xmm7
; NoVLX-NEXT: movq %rcx, %rax
; NoVLX-NEXT: shrq $32, %rax
-; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3
-; NoVLX-NEXT: vpextrq $1, %xmm5, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm7, %xmm7
+; NoVLX-NEXT: vpextrq $1, %xmm6, %rax
; NoVLX-NEXT: shrq $48, %rcx
-; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm7, %xmm6
; NoVLX-NEXT: movl %eax, %ecx
; NoVLX-NEXT: shrl $16, %ecx
-; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3
-; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm6, %xmm6
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm6, %xmm6
; NoVLX-NEXT: movq %rax, %rcx
; NoVLX-NEXT: shrq $32, %rcx
-; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3
-; NoVLX-NEXT: vmovq %xmm8, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm6, %xmm6
+; NoVLX-NEXT: vextracti32x4 $1, %zmm1, %xmm7
+; NoVLX-NEXT: vmovq %xmm7, %rcx
; NoVLX-NEXT: shrq $48, %rax
-; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm6, %xmm6
; NoVLX-NEXT: movl %ecx, %eax
; NoVLX-NEXT: shrl $16, %eax
-; NoVLX-NEXT: vmovd %ecx, %xmm5
-; NoVLX-NEXT: vpinsrw $1, %eax, %xmm5, %xmm5
+; NoVLX-NEXT: vmovd %ecx, %xmm2
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2
; NoVLX-NEXT: movq %rcx, %rax
; NoVLX-NEXT: shrq $32, %rax
-; NoVLX-NEXT: vpinsrw $2, %eax, %xmm5, %xmm5
-; NoVLX-NEXT: vpextrq $1, %xmm8, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpextrq $1, %xmm7, %rax
; NoVLX-NEXT: shrq $48, %rcx
-; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm5, %xmm5
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm2
; NoVLX-NEXT: movl %eax, %ecx
; NoVLX-NEXT: shrl $16, %ecx
-; NoVLX-NEXT: vpinsrw $4, %eax, %xmm5, %xmm5
-; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm5, %xmm5
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2
; NoVLX-NEXT: movq %rax, %rcx
; NoVLX-NEXT: shrq $32, %rcx
-; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm5, %xmm5
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2
; NoVLX-NEXT: vmovq %xmm1, %rcx
; NoVLX-NEXT: shrq $48, %rax
-; NoVLX-NEXT: vpinsrw $7, %eax, %xmm5, %xmm5
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm7
; NoVLX-NEXT: movl %ecx, %eax
; NoVLX-NEXT: shrl $16, %eax
; NoVLX-NEXT: vmovd %ecx, %xmm2
@@ -3877,18 +3717,13 @@ define zeroext i64 @test_masked_vpcmpeqw_v32i1_v64i1_mask(i32 zeroext %__u, <8 x
; NoVLX-NEXT: shrq $32, %rax
; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2
; NoVLX-NEXT: vpextrq $1, %xmm1, %rax
-; NoVLX-NEXT: vinserti128 $1, %xmm9, %ymm4, %ymm1
-; NoVLX-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm4
+; NoVLX-NEXT: vinserti128 $1, %xmm8, %ymm3, %ymm1
+; NoVLX-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm3
; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k1
-; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k2
-; NoVLX-NEXT: vinserti128 $1, %xmm7, %ymm3, %ymm3
-; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
-; NoVLX-NEXT: vpcmpeqw %ymm3, %ymm1, %ymm3
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k2} {z}
-; NoVLX-NEXT: vpmovdb %zmm1, %xmm1
; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vinserti128 $1, %xmm5, %ymm6, %ymm4
; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: movl %eax, %ecx
; NoVLX-NEXT: shrl $16, %ecx
; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2
@@ -3897,147 +3732,152 @@ define zeroext i64 @test_masked_vpcmpeqw_v32i1_v64i1_mask(i32 zeroext %__u, <8 x
; NoVLX-NEXT: shrq $48, %rax
; NoVLX-NEXT: shrq $32, %rcx
; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2
-; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: vinserti128 $1, %xmm5, %ymm2, %ymm2
-; NoVLX-NEXT: vpcmpeqw %ymm2, %ymm4, %ymm2
-; NoVLX-NEXT: vpmovsxwd %ymm2, %zmm2
-; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2
-; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm5
+; NoVLX-NEXT: vpcmpeqw %ymm4, %ymm1, %ymm2
+; NoVLX-NEXT: vinserti128 $1, %xmm7, %ymm5, %ymm1
+; NoVLX-NEXT: vpcmpeqw %ymm1, %ymm3, %ymm1
+; NoVLX-NEXT: vpmovsxwd %ymm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $15, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm2
-; NoVLX-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kshiftlw $13, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kshiftlw $11, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kshiftlw $10, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kshiftlw $9, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kshiftlw $8, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kshiftlw $7, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kshiftlw $6, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kshiftlw $5, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kshiftlw $4, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kshiftlw $3, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kshiftlw $2, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kshiftlw $1, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: vpmovsxwd %ymm3, %zmm3
-; NoVLX-NEXT: vpslld $31, %zmm3, %zmm3
-; NoVLX-NEXT: vptestmd %zmm3, %zmm3, %k0
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpmovsxwd %ymm2, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2
+; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $15, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm3
-; NoVLX-NEXT: vpinsrb $1, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vmovd %ecx, %xmm2
+; NoVLX-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2
; NoVLX-NEXT: kshiftlw $13, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $2, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $3, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2
; NoVLX-NEXT: kshiftlw $11, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2
; NoVLX-NEXT: kshiftlw $10, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2
; NoVLX-NEXT: kshiftlw $9, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2
; NoVLX-NEXT: kshiftlw $8, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2
; NoVLX-NEXT: kshiftlw $7, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2
; NoVLX-NEXT: kshiftlw $6, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $9, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2
; NoVLX-NEXT: kshiftlw $5, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $10, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2
; NoVLX-NEXT: kshiftlw $4, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2
; NoVLX-NEXT: kshiftlw $3, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $12, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2
; NoVLX-NEXT: kshiftlw $2, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $13, %eax, %xmm3, %xmm3
; NoVLX-NEXT: kshiftlw $1, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k1
+; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k1} {z}
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $15, %eax, %xmm3, %xmm3
-; NoVLX-NEXT: vpand %xmm1, %xmm3, %xmm1
-; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpmovdb %zmm3, %xmm3
+; NoVLX-NEXT: vpand %xmm3, %xmm2, %xmm2
+; NoVLX-NEXT: vpmovsxbd %xmm2, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2
+; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpand %xmm0, %xmm2, %xmm0
+; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -4075,12 +3915,12 @@ define zeroext i64 @test_masked_vpcmpeqw_v32i1_v64i1_mask_mem(i32 zeroext %__u,
; NoVLX-LABEL: test_masked_vpcmpeqw_v32i1_v64i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi173:
+; NoVLX-NEXT: .Lcfi133:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi174:
+; NoVLX-NEXT: .Lcfi134:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi175:
+; NoVLX-NEXT: .Lcfi135:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $96, %rsp
@@ -4092,8 +3932,6 @@ define zeroext i64 @test_masked_vpcmpeqw_v32i1_v64i1_mask_mem(i32 zeroext %__u,
; NoVLX-NEXT: vmovd %eax, %xmm2
; NoVLX-NEXT: shrl $16, %eax
; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: vextracti32x4 $1, %zmm0, %xmm3
-; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm4
; NoVLX-NEXT: shrq $32, %rdx
; NoVLX-NEXT: vpinsrw $2, %edx, %xmm2, %xmm2
; NoVLX-NEXT: vpextrq $1, %xmm1, %rax
@@ -4106,19 +3944,20 @@ define zeroext i64 @test_masked_vpcmpeqw_v32i1_v64i1_mask_mem(i32 zeroext %__u,
; NoVLX-NEXT: movq %rax, %rcx
; NoVLX-NEXT: shrq $32, %rcx
; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1
-; NoVLX-NEXT: vmovq %xmm4, %rcx
+; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm2
+; NoVLX-NEXT: vmovq %xmm2, %rcx
; NoVLX-NEXT: shrq $48, %rax
; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1
; NoVLX-NEXT: movl %ecx, %eax
; NoVLX-NEXT: shrl $16, %eax
-; NoVLX-NEXT: vmovd %ecx, %xmm2
-; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vmovd %ecx, %xmm3
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3
; NoVLX-NEXT: movq %rcx, %rax
; NoVLX-NEXT: shrq $32, %rax
-; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: vpextrq $1, %xmm4, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpextrq $1, %xmm2, %rax
; NoVLX-NEXT: shrq $48, %rcx
-; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm2
; NoVLX-NEXT: movl %eax, %ecx
; NoVLX-NEXT: shrl $16, %ecx
; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2
@@ -4126,6 +3965,7 @@ define zeroext i64 @test_masked_vpcmpeqw_v32i1_v64i1_mask_mem(i32 zeroext %__u,
; NoVLX-NEXT: movq %rax, %rcx
; NoVLX-NEXT: shrq $32, %rcx
; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: vextracti32x4 $1, %zmm0, %xmm3
; NoVLX-NEXT: vmovq %xmm3, %rcx
; NoVLX-NEXT: shrq $48, %rax
; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm2
@@ -4148,174 +3988,174 @@ define zeroext i64 @test_masked_vpcmpeqw_v32i1_v64i1_mask_mem(i32 zeroext %__u,
; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3
; NoVLX-NEXT: vmovq %xmm0, %rcx
; NoVLX-NEXT: shrq $48, %rax
-; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm4
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm3
; NoVLX-NEXT: movl %ecx, %eax
; NoVLX-NEXT: shrl $16, %eax
-; NoVLX-NEXT: vmovd %ecx, %xmm3
-; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vmovd %ecx, %xmm4
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm4, %xmm4
; NoVLX-NEXT: movq %rcx, %rax
; NoVLX-NEXT: shrq $32, %rax
-; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm4, %xmm4
; NoVLX-NEXT: vpextrq $1, %xmm0, %rax
; NoVLX-NEXT: shrq $48, %rcx
-; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm0
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm4, %xmm0
; NoVLX-NEXT: movl %eax, %ecx
; NoVLX-NEXT: shrl $16, %ecx
; NoVLX-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: movq %rax, %rcx
; NoVLX-NEXT: shrq $32, %rcx
-; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm5
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k1
-; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k2
-; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
-; NoVLX-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm3
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k2} {z}
-; NoVLX-NEXT: vpmovdb %zmm1, %xmm1
; NoVLX-NEXT: shrq $48, %rax
-; NoVLX-NEXT: vpinsrw $7, %eax, %xmm5, %xmm2
-; NoVLX-NEXT: vinserti128 $1, %xmm4, %ymm2, %ymm2
-; NoVLX-NEXT: vpcmpeqw (%rsi), %ymm2, %ymm2
-; NoVLX-NEXT: vpmovsxwd %ymm2, %zmm2
-; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2
-; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm0, %xmm4
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vinserti128 $1, %xmm3, %ymm4, %ymm3
+; NoVLX-NEXT: vpcmpeqw (%rsi), %ymm3, %ymm3
+; NoVLX-NEXT: vpmovsxwd %ymm3, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm3
+; NoVLX-NEXT: vptestmd %zmm3, %zmm3, %k0
; NoVLX-NEXT: kshiftlw $15, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %eax, %xmm2
+; NoVLX-NEXT: vmovd %eax, %xmm3
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrb $1, %eax, %xmm3, %xmm3
; NoVLX-NEXT: kshiftlw $13, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm3, %xmm3
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrb $3, %eax, %xmm3, %xmm3
; NoVLX-NEXT: kshiftlw $11, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm3, %xmm3
; NoVLX-NEXT: kshiftlw $10, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm3, %xmm3
; NoVLX-NEXT: kshiftlw $9, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrb $6, %eax, %xmm3, %xmm3
; NoVLX-NEXT: kshiftlw $8, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm3, %xmm3
; NoVLX-NEXT: kshiftlw $7, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm3, %xmm3
; NoVLX-NEXT: kshiftlw $6, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrb $9, %eax, %xmm3, %xmm3
; NoVLX-NEXT: kshiftlw $5, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrb $10, %eax, %xmm3, %xmm3
; NoVLX-NEXT: kshiftlw $4, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm3, %xmm3
; NoVLX-NEXT: kshiftlw $3, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm3, %xmm3
; NoVLX-NEXT: kshiftlw $2, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrb $13, %eax, %xmm3, %xmm3
; NoVLX-NEXT: kshiftlw $1, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm2
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: vpcmpeqw 32(%rsi), %ymm3, %ymm3
-; NoVLX-NEXT: vpmovsxwd %ymm3, %zmm3
-; NoVLX-NEXT: vpslld $31, %zmm3, %zmm3
-; NoVLX-NEXT: vptestmd %zmm3, %zmm3, %k0
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm3, %xmm1
+; NoVLX-NEXT: vpcmpeqw 32(%rsi), %ymm2, %ymm2
+; NoVLX-NEXT: vpmovsxwd %ymm2, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2
+; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $15, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm3
-; NoVLX-NEXT: vpinsrb $1, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vmovd %ecx, %xmm2
+; NoVLX-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2
; NoVLX-NEXT: kshiftlw $13, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $2, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $3, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2
; NoVLX-NEXT: kshiftlw $11, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2
; NoVLX-NEXT: kshiftlw $10, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2
; NoVLX-NEXT: kshiftlw $9, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2
; NoVLX-NEXT: kshiftlw $8, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2
; NoVLX-NEXT: kshiftlw $7, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2
; NoVLX-NEXT: kshiftlw $6, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $9, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2
; NoVLX-NEXT: kshiftlw $5, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $10, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2
; NoVLX-NEXT: kshiftlw $4, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2
; NoVLX-NEXT: kshiftlw $3, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $12, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2
; NoVLX-NEXT: kshiftlw $2, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $13, %eax, %xmm3, %xmm3
; NoVLX-NEXT: kshiftlw $1, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k1
+; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k1} {z}
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $15, %eax, %xmm3, %xmm3
-; NoVLX-NEXT: vpand %xmm1, %xmm3, %xmm1
-; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpmovdb %zmm3, %xmm3
+; NoVLX-NEXT: vpand %xmm3, %xmm2, %xmm2
+; NoVLX-NEXT: vpmovsxbd %xmm2, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2
+; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpand %xmm0, %xmm2, %xmm0
+; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -4370,8 +4210,8 @@ define zeroext i8 @test_vpcmpeqd_v4i1_v8i1_mask(<2 x i64> %__a, <2 x i64> %__b)
; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: kmovw %eax, %k1
; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
@@ -4425,8 +4265,8 @@ define zeroext i8 @test_vpcmpeqd_v4i1_v8i1_mask_mem(<2 x i64> %__a, <2 x i64>* %
; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: kmovw %eax, %k1
; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
@@ -4465,7 +4305,6 @@ define zeroext i8 @test_masked_vpcmpeqd_v4i1_v8i1_mask(i8 zeroext %__u, <2 x i64
;
; NoVLX-LABEL: test_masked_vpcmpeqd_v4i1_v8i1_mask:
; NoVLX: # BB#0: # %entry
-; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
@@ -4477,13 +4316,14 @@ define zeroext i8 @test_masked_vpcmpeqd_v4i1_v8i1_mask(i8 zeroext %__u, <2 x i64
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: kmovw %k3, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vmovd %ecx, %xmm2
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2
; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpand %xmm2, %xmm0, %xmm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
; NoVLX-NEXT: kmovw %eax, %k1
; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
@@ -4500,8 +4340,8 @@ define zeroext i8 @test_masked_vpcmpeqd_v4i1_v8i1_mask(i8 zeroext %__u, <2 x i64
; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: kmovw %eax, %k1
; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
@@ -4577,8 +4417,8 @@ define zeroext i8 @test_masked_vpcmpeqd_v4i1_v8i1_mask_mem(i8 zeroext %__u, <2 x
; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: kmovw %eax, %k1
; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
@@ -4638,8 +4478,8 @@ define zeroext i8 @test_vpcmpeqd_v4i1_v8i1_mask_mem_b(<2 x i64> %__a, i32* %__b)
; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: kmovw %eax, %k1
; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
@@ -4680,7 +4520,6 @@ define zeroext i8 @test_masked_vpcmpeqd_v4i1_v8i1_mask_mem_b(i8 zeroext %__u, <2
; NoVLX-LABEL: test_masked_vpcmpeqd_v4i1_v8i1_mask_mem_b:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: vpbroadcastd (%rsi), %xmm1
-; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
@@ -4692,13 +4531,14 @@ define zeroext i8 @test_masked_vpcmpeqd_v4i1_v8i1_mask_mem_b(i8 zeroext %__u, <2
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: kmovw %k3, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vmovd %ecx, %xmm2
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2
; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpand %xmm0, %xmm2, %xmm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
; NoVLX-NEXT: kmovw %eax, %k1
; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
@@ -4715,8 +4555,8 @@ define zeroext i8 @test_masked_vpcmpeqd_v4i1_v8i1_mask_mem_b(i8 zeroext %__u, <2
; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: kmovw %eax, %k1
; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
@@ -4775,8 +4615,8 @@ define zeroext i16 @test_vpcmpeqd_v4i1_v16i1_mask(<2 x i64> %__a, <2 x i64> %__b
; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: kmovw %eax, %k1
; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
@@ -4829,8 +4669,8 @@ define zeroext i16 @test_vpcmpeqd_v4i1_v16i1_mask_mem(<2 x i64> %__a, <2 x i64>*
; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: kmovw %eax, %k1
; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
@@ -4869,7 +4709,6 @@ define zeroext i16 @test_masked_vpcmpeqd_v4i1_v16i1_mask(i8 zeroext %__u, <2 x i
;
; NoVLX-LABEL: test_masked_vpcmpeqd_v4i1_v16i1_mask:
; NoVLX: # BB#0: # %entry
-; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
@@ -4881,13 +4720,14 @@ define zeroext i16 @test_masked_vpcmpeqd_v4i1_v16i1_mask(i8 zeroext %__u, <2 x i
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: kmovw %k3, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vmovd %ecx, %xmm2
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2
; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpand %xmm2, %xmm0, %xmm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
; NoVLX-NEXT: kmovw %eax, %k1
; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
@@ -4903,8 +4743,8 @@ define zeroext i16 @test_masked_vpcmpeqd_v4i1_v16i1_mask(i8 zeroext %__u, <2 x i
; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: kmovw %eax, %k1
; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
@@ -4979,8 +4819,8 @@ define zeroext i16 @test_masked_vpcmpeqd_v4i1_v16i1_mask_mem(i8 zeroext %__u, <2
; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: kmovw %eax, %k1
; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
@@ -5039,8 +4879,8 @@ define zeroext i16 @test_vpcmpeqd_v4i1_v16i1_mask_mem_b(<2 x i64> %__a, i32* %__
; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: kmovw %eax, %k1
; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
@@ -5081,7 +4921,6 @@ define zeroext i16 @test_masked_vpcmpeqd_v4i1_v16i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-LABEL: test_masked_vpcmpeqd_v4i1_v16i1_mask_mem_b:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: vpbroadcastd (%rsi), %xmm1
-; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
@@ -5093,13 +4932,14 @@ define zeroext i16 @test_masked_vpcmpeqd_v4i1_v16i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: kmovw %k3, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vmovd %ecx, %xmm2
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2
; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpand %xmm0, %xmm2, %xmm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
; NoVLX-NEXT: kmovw %eax, %k1
; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
@@ -5115,8 +4955,8 @@ define zeroext i16 @test_masked_vpcmpeqd_v4i1_v16i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: kmovw %eax, %k1
; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
@@ -5159,12 +4999,12 @@ define zeroext i32 @test_vpcmpeqd_v4i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__b
; NoVLX-LABEL: test_vpcmpeqd_v4i1_v32i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi176:
+; NoVLX-NEXT: .Lcfi136:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi177:
+; NoVLX-NEXT: .Lcfi137:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi178:
+; NoVLX-NEXT: .Lcfi138:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -5202,12 +5042,12 @@ define zeroext i32 @test_vpcmpeqd_v4i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>*
; NoVLX-LABEL: test_vpcmpeqd_v4i1_v32i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi179:
+; NoVLX-NEXT: .Lcfi139:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi180:
+; NoVLX-NEXT: .Lcfi140:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi181:
+; NoVLX-NEXT: .Lcfi141:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -5247,16 +5087,15 @@ define zeroext i32 @test_masked_vpcmpeqd_v4i1_v32i1_mask(i8 zeroext %__u, <2 x i
; NoVLX-LABEL: test_masked_vpcmpeqd_v4i1_v32i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi182:
+; NoVLX-NEXT: .Lcfi142:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi183:
+; NoVLX-NEXT: .Lcfi143:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi184:
+; NoVLX-NEXT: .Lcfi144:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
-; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
@@ -5268,13 +5107,14 @@ define zeroext i32 @test_masked_vpcmpeqd_v4i1_v32i1_mask(i8 zeroext %__u, <2 x i
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: kmovw %k3, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vmovd %ecx, %xmm2
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2
; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpand %xmm2, %xmm0, %xmm0
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
@@ -5312,12 +5152,12 @@ define zeroext i32 @test_masked_vpcmpeqd_v4i1_v32i1_mask_mem(i8 zeroext %__u, <2
; NoVLX-LABEL: test_masked_vpcmpeqd_v4i1_v32i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi185:
+; NoVLX-NEXT: .Lcfi145:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi186:
+; NoVLX-NEXT: .Lcfi146:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi187:
+; NoVLX-NEXT: .Lcfi147:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -5378,12 +5218,12 @@ define zeroext i32 @test_vpcmpeqd_v4i1_v32i1_mask_mem_b(<2 x i64> %__a, i32* %__
; NoVLX-LABEL: test_vpcmpeqd_v4i1_v32i1_mask_mem_b:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi188:
+; NoVLX-NEXT: .Lcfi148:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi189:
+; NoVLX-NEXT: .Lcfi149:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi190:
+; NoVLX-NEXT: .Lcfi150:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -5425,17 +5265,16 @@ define zeroext i32 @test_masked_vpcmpeqd_v4i1_v32i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-LABEL: test_masked_vpcmpeqd_v4i1_v32i1_mask_mem_b:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi191:
+; NoVLX-NEXT: .Lcfi151:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi192:
+; NoVLX-NEXT: .Lcfi152:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi193:
+; NoVLX-NEXT: .Lcfi153:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
; NoVLX-NEXT: vpbroadcastd (%rsi), %xmm1
-; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
@@ -5447,13 +5286,14 @@ define zeroext i32 @test_masked_vpcmpeqd_v4i1_v32i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: kmovw %k3, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vmovd %ecx, %xmm2
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2
; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpand %xmm0, %xmm2, %xmm0
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
@@ -5493,20 +5333,20 @@ define zeroext i64 @test_vpcmpeqd_v4i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__b
; NoVLX-LABEL: test_vpcmpeqd_v4i1_v64i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi194:
+; NoVLX-NEXT: .Lcfi154:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi195:
+; NoVLX-NEXT: .Lcfi155:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi196:
+; NoVLX-NEXT: .Lcfi156:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
@@ -5542,20 +5382,20 @@ define zeroext i64 @test_vpcmpeqd_v4i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>*
; NoVLX-LABEL: test_vpcmpeqd_v4i1_v64i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi197:
+; NoVLX-NEXT: .Lcfi157:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi198:
+; NoVLX-NEXT: .Lcfi158:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi199:
+; NoVLX-NEXT: .Lcfi159:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: vpcmpeqd (%rdi), %xmm0, %xmm0
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
@@ -5593,19 +5433,18 @@ define zeroext i64 @test_masked_vpcmpeqd_v4i1_v64i1_mask(i8 zeroext %__u, <2 x i
; NoVLX-LABEL: test_masked_vpcmpeqd_v4i1_v64i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi200:
+; NoVLX-NEXT: .Lcfi160:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi201:
+; NoVLX-NEXT: .Lcfi161:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi202:
+; NoVLX-NEXT: .Lcfi162:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kshiftlw $13, %k0, %k2
; NoVLX-NEXT: kshiftrw $15, %k2, %k2
; NoVLX-NEXT: kshiftlw $15, %k0, %k3
@@ -5618,13 +5457,14 @@ define zeroext i64 @test_masked_vpcmpeqd_v4i1_v64i1_mask(i8 zeroext %__u, <2 x i
; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kmovw %k2, %eax
; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $15, %k1, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
@@ -5664,19 +5504,18 @@ define zeroext i64 @test_masked_vpcmpeqd_v4i1_v64i1_mask_mem(i8 zeroext %__u, <2
; NoVLX-LABEL: test_masked_vpcmpeqd_v4i1_v64i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi203:
+; NoVLX-NEXT: .Lcfi163:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi204:
+; NoVLX-NEXT: .Lcfi164:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi205:
+; NoVLX-NEXT: .Lcfi165:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: vpcmpeqd (%rsi), %xmm0, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kshiftlw $13, %k0, %k2
; NoVLX-NEXT: kshiftrw $15, %k2, %k2
; NoVLX-NEXT: kshiftlw $15, %k0, %k3
@@ -5689,13 +5528,14 @@ define zeroext i64 @test_masked_vpcmpeqd_v4i1_v64i1_mask_mem(i8 zeroext %__u, <2
; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kmovw %k2, %eax
; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $15, %k1, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
@@ -5736,12 +5576,12 @@ define zeroext i64 @test_vpcmpeqd_v4i1_v64i1_mask_mem_b(<2 x i64> %__a, i32* %__
; NoVLX-LABEL: test_vpcmpeqd_v4i1_v64i1_mask_mem_b:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi206:
+; NoVLX-NEXT: .Lcfi166:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi207:
+; NoVLX-NEXT: .Lcfi167:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi208:
+; NoVLX-NEXT: .Lcfi168:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
@@ -5749,8 +5589,8 @@ define zeroext i64 @test_vpcmpeqd_v4i1_v64i1_mask_mem_b(<2 x i64> %__a, i32* %__
; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
@@ -5789,12 +5629,12 @@ define zeroext i64 @test_masked_vpcmpeqd_v4i1_v64i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-LABEL: test_masked_vpcmpeqd_v4i1_v64i1_mask_mem_b:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi209:
+; NoVLX-NEXT: .Lcfi169:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi210:
+; NoVLX-NEXT: .Lcfi170:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi211:
+; NoVLX-NEXT: .Lcfi171:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
@@ -5802,7 +5642,6 @@ define zeroext i64 @test_masked_vpcmpeqd_v4i1_v64i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kshiftlw $13, %k0, %k2
; NoVLX-NEXT: kshiftrw $15, %k2, %k2
; NoVLX-NEXT: kshiftlw $15, %k0, %k3
@@ -5815,13 +5654,14 @@ define zeroext i64 @test_masked_vpcmpeqd_v4i1_v64i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kmovw %k2, %eax
; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $15, %k1, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
@@ -6052,12 +5892,12 @@ define zeroext i32 @test_vpcmpeqd_v8i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__b
; NoVLX-LABEL: test_vpcmpeqd_v8i1_v32i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi212:
+; NoVLX-NEXT: .Lcfi172:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi213:
+; NoVLX-NEXT: .Lcfi173:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi214:
+; NoVLX-NEXT: .Lcfi174:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -6127,12 +5967,12 @@ define zeroext i32 @test_vpcmpeqd_v8i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64>*
; NoVLX-LABEL: test_vpcmpeqd_v8i1_v32i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi215:
+; NoVLX-NEXT: .Lcfi175:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi216:
+; NoVLX-NEXT: .Lcfi176:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi217:
+; NoVLX-NEXT: .Lcfi177:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -6204,12 +6044,12 @@ define zeroext i32 @test_masked_vpcmpeqd_v8i1_v32i1_mask(i8 zeroext %__u, <4 x i
; NoVLX-LABEL: test_masked_vpcmpeqd_v8i1_v32i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi218:
+; NoVLX-NEXT: .Lcfi178:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi219:
+; NoVLX-NEXT: .Lcfi179:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi220:
+; NoVLX-NEXT: .Lcfi180:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -6284,12 +6124,12 @@ define zeroext i32 @test_masked_vpcmpeqd_v8i1_v32i1_mask_mem(i8 zeroext %__u, <4
; NoVLX-LABEL: test_masked_vpcmpeqd_v8i1_v32i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi221:
+; NoVLX-NEXT: .Lcfi181:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi222:
+; NoVLX-NEXT: .Lcfi182:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi223:
+; NoVLX-NEXT: .Lcfi183:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -6365,12 +6205,12 @@ define zeroext i32 @test_vpcmpeqd_v8i1_v32i1_mask_mem_b(<4 x i64> %__a, i32* %__
; NoVLX-LABEL: test_vpcmpeqd_v8i1_v32i1_mask_mem_b:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi224:
+; NoVLX-NEXT: .Lcfi184:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi225:
+; NoVLX-NEXT: .Lcfi185:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi226:
+; NoVLX-NEXT: .Lcfi186:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -6443,12 +6283,12 @@ define zeroext i32 @test_masked_vpcmpeqd_v8i1_v32i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-LABEL: test_masked_vpcmpeqd_v8i1_v32i1_mask_mem_b:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi227:
+; NoVLX-NEXT: .Lcfi187:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi228:
+; NoVLX-NEXT: .Lcfi188:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi229:
+; NoVLX-NEXT: .Lcfi189:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -6525,55 +6365,55 @@ define zeroext i64 @test_vpcmpeqd_v8i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__b
; NoVLX-LABEL: test_vpcmpeqd_v8i1_v64i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi230:
+; NoVLX-NEXT: .Lcfi190:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi231:
+; NoVLX-NEXT: .Lcfi191:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi232:
+; NoVLX-NEXT: .Lcfi192:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $15, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %ecx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -6605,55 +6445,55 @@ define zeroext i64 @test_vpcmpeqd_v8i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>*
; NoVLX-LABEL: test_vpcmpeqd_v8i1_v64i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi233:
+; NoVLX-NEXT: .Lcfi193:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi234:
+; NoVLX-NEXT: .Lcfi194:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi235:
+; NoVLX-NEXT: .Lcfi195:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; NoVLX-NEXT: vmovdqa (%rdi), %ymm1
; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $15, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %ecx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -6687,12 +6527,12 @@ define zeroext i64 @test_masked_vpcmpeqd_v8i1_v64i1_mask(i8 zeroext %__u, <4 x i
; NoVLX-LABEL: test_masked_vpcmpeqd_v8i1_v64i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi236:
+; NoVLX-NEXT: .Lcfi196:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi237:
+; NoVLX-NEXT: .Lcfi197:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi238:
+; NoVLX-NEXT: .Lcfi198:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
@@ -6701,43 +6541,43 @@ define zeroext i64 @test_masked_vpcmpeqd_v8i1_v64i1_mask(i8 zeroext %__u, <4 x i
; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: kandw %k1, %k0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $15, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %ecx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -6772,12 +6612,12 @@ define zeroext i64 @test_masked_vpcmpeqd_v8i1_v64i1_mask_mem(i8 zeroext %__u, <4
; NoVLX-LABEL: test_masked_vpcmpeqd_v8i1_v64i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi239:
+; NoVLX-NEXT: .Lcfi199:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi240:
+; NoVLX-NEXT: .Lcfi200:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi241:
+; NoVLX-NEXT: .Lcfi201:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
@@ -6786,43 +6626,43 @@ define zeroext i64 @test_masked_vpcmpeqd_v8i1_v64i1_mask_mem(i8 zeroext %__u, <4
; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: kandw %k1, %k0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $15, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %ecx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -6858,55 +6698,55 @@ define zeroext i64 @test_vpcmpeqd_v8i1_v64i1_mask_mem_b(<4 x i64> %__a, i32* %__
; NoVLX-LABEL: test_vpcmpeqd_v8i1_v64i1_mask_mem_b:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi242:
+; NoVLX-NEXT: .Lcfi202:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi243:
+; NoVLX-NEXT: .Lcfi203:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi244:
+; NoVLX-NEXT: .Lcfi204:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; NoVLX-NEXT: vpbroadcastd (%rdi), %ymm1
; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $15, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %ecx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -6941,12 +6781,12 @@ define zeroext i64 @test_masked_vpcmpeqd_v8i1_v64i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-LABEL: test_masked_vpcmpeqd_v8i1_v64i1_mask_mem_b:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi245:
+; NoVLX-NEXT: .Lcfi205:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi246:
+; NoVLX-NEXT: .Lcfi206:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi247:
+; NoVLX-NEXT: .Lcfi207:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
@@ -6955,43 +6795,43 @@ define zeroext i64 @test_masked_vpcmpeqd_v8i1_v64i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: kandw %k0, %k1, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $15, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %ecx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -7028,93 +6868,78 @@ define zeroext i32 @test_vpcmpeqd_v16i1_v32i1_mask(<8 x i64> %__a, <8 x i64> %__
; NoVLX-LABEL: test_vpcmpeqd_v16i1_v32i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi248:
+; NoVLX-NEXT: .Lcfi208:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi249:
+; NoVLX-NEXT: .Lcfi209:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi250:
+; NoVLX-NEXT: .Lcfi210:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
-; NoVLX-NEXT: pushq %r15
-; NoVLX-NEXT: pushq %r14
-; NoVLX-NEXT: pushq %r13
-; NoVLX-NEXT: pushq %r12
-; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
-; NoVLX-NEXT: .Lcfi251:
-; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi252:
-; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi253:
-; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi254:
-; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi255:
-; NoVLX-NEXT: .cfi_offset %r15, -24
; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
; NoVLX-NEXT: kxorw %k0, %k0, %k1
; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $15, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $13, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: vmovd %ecx, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
@@ -7123,12 +6948,7 @@ define zeroext i32 @test_vpcmpeqd_v16i1_v32i1_mask(<8 x i64> %__a, <8 x i64> %__
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
-; NoVLX-NEXT: leaq -40(%rbp), %rsp
-; NoVLX-NEXT: popq %rbx
-; NoVLX-NEXT: popq %r12
-; NoVLX-NEXT: popq %r13
-; NoVLX-NEXT: popq %r14
-; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
; NoVLX-NEXT: retq
entry:
@@ -7151,93 +6971,78 @@ define zeroext i32 @test_vpcmpeqd_v16i1_v32i1_mask_mem(<8 x i64> %__a, <8 x i64>
; NoVLX-LABEL: test_vpcmpeqd_v16i1_v32i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi256:
+; NoVLX-NEXT: .Lcfi211:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi257:
+; NoVLX-NEXT: .Lcfi212:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi258:
+; NoVLX-NEXT: .Lcfi213:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
-; NoVLX-NEXT: pushq %r15
-; NoVLX-NEXT: pushq %r14
-; NoVLX-NEXT: pushq %r13
-; NoVLX-NEXT: pushq %r12
-; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
-; NoVLX-NEXT: .Lcfi259:
-; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi260:
-; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi261:
-; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi262:
-; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi263:
-; NoVLX-NEXT: .cfi_offset %r15, -24
; NoVLX-NEXT: vpcmpeqd (%rdi), %zmm0, %k0
; NoVLX-NEXT: kxorw %k0, %k0, %k1
; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $15, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $13, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: vmovd %ecx, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
@@ -7246,12 +7051,7 @@ define zeroext i32 @test_vpcmpeqd_v16i1_v32i1_mask_mem(<8 x i64> %__a, <8 x i64>
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
-; NoVLX-NEXT: leaq -40(%rbp), %rsp
-; NoVLX-NEXT: popq %rbx
-; NoVLX-NEXT: popq %r12
-; NoVLX-NEXT: popq %r13
-; NoVLX-NEXT: popq %r14
-; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
; NoVLX-NEXT: retq
entry:
@@ -7276,94 +7076,79 @@ define zeroext i32 @test_masked_vpcmpeqd_v16i1_v32i1_mask(i16 zeroext %__u, <8 x
; NoVLX-LABEL: test_masked_vpcmpeqd_v16i1_v32i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi264:
+; NoVLX-NEXT: .Lcfi214:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi265:
+; NoVLX-NEXT: .Lcfi215:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi266:
+; NoVLX-NEXT: .Lcfi216:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
-; NoVLX-NEXT: pushq %r15
-; NoVLX-NEXT: pushq %r14
-; NoVLX-NEXT: pushq %r13
-; NoVLX-NEXT: pushq %r12
-; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
-; NoVLX-NEXT: .Lcfi267:
-; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi268:
-; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi269:
-; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi270:
-; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi271:
-; NoVLX-NEXT: .cfi_offset %r15, -24
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 {%k1}
; NoVLX-NEXT: kxorw %k0, %k0, %k1
; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $15, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $13, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: vmovd %ecx, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
@@ -7372,12 +7157,7 @@ define zeroext i32 @test_masked_vpcmpeqd_v16i1_v32i1_mask(i16 zeroext %__u, <8 x
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
-; NoVLX-NEXT: leaq -40(%rbp), %rsp
-; NoVLX-NEXT: popq %rbx
-; NoVLX-NEXT: popq %r12
-; NoVLX-NEXT: popq %r13
-; NoVLX-NEXT: popq %r14
-; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
; NoVLX-NEXT: retq
entry:
@@ -7403,94 +7183,79 @@ define zeroext i32 @test_masked_vpcmpeqd_v16i1_v32i1_mask_mem(i16 zeroext %__u,
; NoVLX-LABEL: test_masked_vpcmpeqd_v16i1_v32i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi272:
+; NoVLX-NEXT: .Lcfi217:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi273:
+; NoVLX-NEXT: .Lcfi218:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi274:
+; NoVLX-NEXT: .Lcfi219:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
-; NoVLX-NEXT: pushq %r15
-; NoVLX-NEXT: pushq %r14
-; NoVLX-NEXT: pushq %r13
-; NoVLX-NEXT: pushq %r12
-; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
-; NoVLX-NEXT: .Lcfi275:
-; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi276:
-; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi277:
-; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi278:
-; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi279:
-; NoVLX-NEXT: .cfi_offset %r15, -24
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpcmpeqd (%rsi), %zmm0, %k0 {%k1}
; NoVLX-NEXT: kxorw %k0, %k0, %k1
; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $15, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $13, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: vmovd %ecx, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
@@ -7499,12 +7264,7 @@ define zeroext i32 @test_masked_vpcmpeqd_v16i1_v32i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
-; NoVLX-NEXT: leaq -40(%rbp), %rsp
-; NoVLX-NEXT: popq %rbx
-; NoVLX-NEXT: popq %r12
-; NoVLX-NEXT: popq %r13
-; NoVLX-NEXT: popq %r14
-; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
; NoVLX-NEXT: retq
entry:
@@ -7531,93 +7291,78 @@ define zeroext i32 @test_vpcmpeqd_v16i1_v32i1_mask_mem_b(<8 x i64> %__a, i32* %_
; NoVLX-LABEL: test_vpcmpeqd_v16i1_v32i1_mask_mem_b:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi280:
+; NoVLX-NEXT: .Lcfi220:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi281:
+; NoVLX-NEXT: .Lcfi221:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi282:
+; NoVLX-NEXT: .Lcfi222:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
-; NoVLX-NEXT: pushq %r15
-; NoVLX-NEXT: pushq %r14
-; NoVLX-NEXT: pushq %r13
-; NoVLX-NEXT: pushq %r12
-; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
-; NoVLX-NEXT: .Lcfi283:
-; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi284:
-; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi285:
-; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi286:
-; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi287:
-; NoVLX-NEXT: .cfi_offset %r15, -24
; NoVLX-NEXT: vpcmpeqd (%rdi){1to16}, %zmm0, %k0
; NoVLX-NEXT: kxorw %k0, %k0, %k1
; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $15, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $13, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: vmovd %ecx, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
@@ -7626,12 +7371,7 @@ define zeroext i32 @test_vpcmpeqd_v16i1_v32i1_mask_mem_b(<8 x i64> %__a, i32* %_
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
-; NoVLX-NEXT: leaq -40(%rbp), %rsp
-; NoVLX-NEXT: popq %rbx
-; NoVLX-NEXT: popq %r12
-; NoVLX-NEXT: popq %r13
-; NoVLX-NEXT: popq %r14
-; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
; NoVLX-NEXT: retq
entry:
@@ -7657,94 +7397,79 @@ define zeroext i32 @test_masked_vpcmpeqd_v16i1_v32i1_mask_mem_b(i16 zeroext %__u
; NoVLX-LABEL: test_masked_vpcmpeqd_v16i1_v32i1_mask_mem_b:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi288:
+; NoVLX-NEXT: .Lcfi223:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi289:
+; NoVLX-NEXT: .Lcfi224:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi290:
+; NoVLX-NEXT: .Lcfi225:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
-; NoVLX-NEXT: pushq %r15
-; NoVLX-NEXT: pushq %r14
-; NoVLX-NEXT: pushq %r13
-; NoVLX-NEXT: pushq %r12
-; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
-; NoVLX-NEXT: .Lcfi291:
-; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi292:
-; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi293:
-; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi294:
-; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi295:
-; NoVLX-NEXT: .cfi_offset %r15, -24
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpcmpeqd (%rsi){1to16}, %zmm0, %k0 {%k1}
; NoVLX-NEXT: kxorw %k0, %k0, %k1
; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $15, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $13, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: vmovd %ecx, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
@@ -7753,12 +7478,7 @@ define zeroext i32 @test_masked_vpcmpeqd_v16i1_v32i1_mask_mem_b(i16 zeroext %__u
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
-; NoVLX-NEXT: leaq -40(%rbp), %rsp
-; NoVLX-NEXT: popq %rbx
-; NoVLX-NEXT: popq %r12
-; NoVLX-NEXT: popq %r13
-; NoVLX-NEXT: popq %r14
-; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
; NoVLX-NEXT: retq
entry:
@@ -7786,12 +7506,12 @@ define zeroext i64 @test_vpcmpeqd_v16i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %__
; NoVLX-LABEL: test_vpcmpeqd_v16i1_v64i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi296:
+; NoVLX-NEXT: .Lcfi226:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi297:
+; NoVLX-NEXT: .Lcfi227:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi298:
+; NoVLX-NEXT: .Lcfi228:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: pushq %r15
; NoVLX-NEXT: pushq %r14
@@ -7800,21 +7520,17 @@ define zeroext i64 @test_vpcmpeqd_v16i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %__
; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: .Lcfi299:
+; NoVLX-NEXT: .Lcfi229:
; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi300:
+; NoVLX-NEXT: .Lcfi230:
; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi301:
+; NoVLX-NEXT: .Lcfi231:
; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi302:
+; NoVLX-NEXT: .Lcfi232:
; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi303:
+; NoVLX-NEXT: .Lcfi233:
; NoVLX-NEXT: .cfi_offset %r15, -24
; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
@@ -7857,11 +7573,11 @@ define zeroext i64 @test_vpcmpeqd_v16i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %__
; NoVLX-NEXT: kshiftlw $2, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $1, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k1, %r8d
; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
@@ -7873,11 +7589,15 @@ define zeroext i64 @test_vpcmpeqd_v16i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %__
; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpinsrb $14, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -7914,12 +7634,12 @@ define zeroext i64 @test_vpcmpeqd_v16i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64>
; NoVLX-LABEL: test_vpcmpeqd_v16i1_v64i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi304:
+; NoVLX-NEXT: .Lcfi234:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi305:
+; NoVLX-NEXT: .Lcfi235:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi306:
+; NoVLX-NEXT: .Lcfi236:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: pushq %r15
; NoVLX-NEXT: pushq %r14
@@ -7928,21 +7648,17 @@ define zeroext i64 @test_vpcmpeqd_v16i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64>
; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: .Lcfi307:
+; NoVLX-NEXT: .Lcfi237:
; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi308:
+; NoVLX-NEXT: .Lcfi238:
; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi309:
+; NoVLX-NEXT: .Lcfi239:
; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi310:
+; NoVLX-NEXT: .Lcfi240:
; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi311:
+; NoVLX-NEXT: .Lcfi241:
; NoVLX-NEXT: .cfi_offset %r15, -24
; NoVLX-NEXT: vpcmpeqd (%rdi), %zmm0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
@@ -7985,11 +7701,11 @@ define zeroext i64 @test_vpcmpeqd_v16i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64>
; NoVLX-NEXT: kshiftlw $2, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $1, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k1, %r8d
; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
@@ -8001,11 +7717,15 @@ define zeroext i64 @test_vpcmpeqd_v16i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64>
; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpinsrb $14, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -8044,12 +7764,12 @@ define zeroext i64 @test_masked_vpcmpeqd_v16i1_v64i1_mask(i16 zeroext %__u, <8 x
; NoVLX-LABEL: test_masked_vpcmpeqd_v16i1_v64i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi312:
+; NoVLX-NEXT: .Lcfi242:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi313:
+; NoVLX-NEXT: .Lcfi243:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi314:
+; NoVLX-NEXT: .Lcfi244:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: pushq %r15
; NoVLX-NEXT: pushq %r14
@@ -8058,22 +7778,18 @@ define zeroext i64 @test_masked_vpcmpeqd_v16i1_v64i1_mask(i16 zeroext %__u, <8 x
; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: .Lcfi315:
+; NoVLX-NEXT: .Lcfi245:
; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi316:
+; NoVLX-NEXT: .Lcfi246:
; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi317:
+; NoVLX-NEXT: .Lcfi247:
; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi318:
+; NoVLX-NEXT: .Lcfi248:
; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi319:
+; NoVLX-NEXT: .Lcfi249:
; NoVLX-NEXT: .cfi_offset %r15, -24
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
@@ -8116,11 +7832,11 @@ define zeroext i64 @test_masked_vpcmpeqd_v16i1_v64i1_mask(i16 zeroext %__u, <8 x
; NoVLX-NEXT: kshiftlw $2, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $1, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k1, %r8d
; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
@@ -8132,11 +7848,15 @@ define zeroext i64 @test_masked_vpcmpeqd_v16i1_v64i1_mask(i16 zeroext %__u, <8 x
; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpinsrb $14, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -8176,12 +7896,12 @@ define zeroext i64 @test_masked_vpcmpeqd_v16i1_v64i1_mask_mem(i16 zeroext %__u,
; NoVLX-LABEL: test_masked_vpcmpeqd_v16i1_v64i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi320:
+; NoVLX-NEXT: .Lcfi250:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi321:
+; NoVLX-NEXT: .Lcfi251:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi322:
+; NoVLX-NEXT: .Lcfi252:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: pushq %r15
; NoVLX-NEXT: pushq %r14
@@ -8190,22 +7910,18 @@ define zeroext i64 @test_masked_vpcmpeqd_v16i1_v64i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: .Lcfi323:
+; NoVLX-NEXT: .Lcfi253:
; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi324:
+; NoVLX-NEXT: .Lcfi254:
; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi325:
+; NoVLX-NEXT: .Lcfi255:
; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi326:
+; NoVLX-NEXT: .Lcfi256:
; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi327:
+; NoVLX-NEXT: .Lcfi257:
; NoVLX-NEXT: .cfi_offset %r15, -24
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpcmpeqd (%rsi), %zmm0, %k0 {%k1}
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
@@ -8248,11 +7964,11 @@ define zeroext i64 @test_masked_vpcmpeqd_v16i1_v64i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: kshiftlw $2, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $1, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k1, %r8d
; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
@@ -8264,11 +7980,15 @@ define zeroext i64 @test_masked_vpcmpeqd_v16i1_v64i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpinsrb $14, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -8309,12 +8029,12 @@ define zeroext i64 @test_vpcmpeqd_v16i1_v64i1_mask_mem_b(<8 x i64> %__a, i32* %_
; NoVLX-LABEL: test_vpcmpeqd_v16i1_v64i1_mask_mem_b:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi328:
+; NoVLX-NEXT: .Lcfi258:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi329:
+; NoVLX-NEXT: .Lcfi259:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi330:
+; NoVLX-NEXT: .Lcfi260:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: pushq %r15
; NoVLX-NEXT: pushq %r14
@@ -8323,21 +8043,17 @@ define zeroext i64 @test_vpcmpeqd_v16i1_v64i1_mask_mem_b(<8 x i64> %__a, i32* %_
; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: .Lcfi331:
+; NoVLX-NEXT: .Lcfi261:
; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi332:
+; NoVLX-NEXT: .Lcfi262:
; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi333:
+; NoVLX-NEXT: .Lcfi263:
; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi334:
+; NoVLX-NEXT: .Lcfi264:
; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi335:
+; NoVLX-NEXT: .Lcfi265:
; NoVLX-NEXT: .cfi_offset %r15, -24
; NoVLX-NEXT: vpcmpeqd (%rdi){1to16}, %zmm0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
@@ -8380,11 +8096,11 @@ define zeroext i64 @test_vpcmpeqd_v16i1_v64i1_mask_mem_b(<8 x i64> %__a, i32* %_
; NoVLX-NEXT: kshiftlw $2, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $1, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k1, %r8d
; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
@@ -8396,11 +8112,15 @@ define zeroext i64 @test_vpcmpeqd_v16i1_v64i1_mask_mem_b(<8 x i64> %__a, i32* %_
; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpinsrb $14, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -8440,12 +8160,12 @@ define zeroext i64 @test_masked_vpcmpeqd_v16i1_v64i1_mask_mem_b(i16 zeroext %__u
; NoVLX-LABEL: test_masked_vpcmpeqd_v16i1_v64i1_mask_mem_b:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi336:
+; NoVLX-NEXT: .Lcfi266:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi337:
+; NoVLX-NEXT: .Lcfi267:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi338:
+; NoVLX-NEXT: .Lcfi268:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: pushq %r15
; NoVLX-NEXT: pushq %r14
@@ -8454,22 +8174,18 @@ define zeroext i64 @test_masked_vpcmpeqd_v16i1_v64i1_mask_mem_b(i16 zeroext %__u
; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: .Lcfi339:
+; NoVLX-NEXT: .Lcfi269:
; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi340:
+; NoVLX-NEXT: .Lcfi270:
; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi341:
+; NoVLX-NEXT: .Lcfi271:
; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi342:
+; NoVLX-NEXT: .Lcfi272:
; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi343:
+; NoVLX-NEXT: .Lcfi273:
; NoVLX-NEXT: .cfi_offset %r15, -24
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpcmpeqd (%rsi){1to16}, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
@@ -8512,11 +8228,11 @@ define zeroext i64 @test_masked_vpcmpeqd_v16i1_v64i1_mask_mem_b(i16 zeroext %__u
; NoVLX-NEXT: kshiftlw $2, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $1, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k1, %r8d
; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
@@ -8528,11 +8244,15 @@ define zeroext i64 @test_masked_vpcmpeqd_v16i1_v64i1_mask_mem_b(i16 zeroext %__u
; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpinsrb $14, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -8629,7 +8349,6 @@ define zeroext i4 @test_masked_vpcmpeqq_v2i1_v4i1_mask(i8 zeroext %__u, <2 x i64
;
; NoVLX-LABEL: test_masked_vpcmpeqq_v2i1_v4i1_mask:
; NoVLX: # BB#0: # %entry
-; NoVLX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftlw $15, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
@@ -8637,9 +8356,10 @@ define zeroext i4 @test_masked_vpcmpeqq_v2i1_v4i1_mask(i8 zeroext %__u, <2 x i64
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vmovd %ecx, %xmm2
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpand %xmm2, %xmm0, %xmm0
; NoVLX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
; NoVLX-NEXT: vpslld $31, %ymm0, %ymm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -8744,7 +8464,6 @@ define zeroext i4 @test_masked_vpcmpeqq_v2i1_v4i1_mask_mem_b(i8 zeroext %__u, <2
; NoVLX-LABEL: test_masked_vpcmpeqq_v2i1_v4i1_mask_mem_b:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: vpbroadcastq (%rsi), %xmm1
-; NoVLX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftlw $15, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
@@ -8752,9 +8471,10 @@ define zeroext i4 @test_masked_vpcmpeqq_v2i1_v4i1_mask_mem_b(i8 zeroext %__u, <2
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vmovd %ecx, %xmm2
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpand %xmm0, %xmm2, %xmm0
; NoVLX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
; NoVLX-NEXT: vpslld $31, %ymm0, %ymm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -9353,12 +9073,12 @@ define zeroext i32 @test_vpcmpeqq_v2i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__b
; NoVLX-LABEL: test_vpcmpeqq_v2i1_v32i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi344:
+; NoVLX-NEXT: .Lcfi274:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi345:
+; NoVLX-NEXT: .Lcfi275:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi346:
+; NoVLX-NEXT: .Lcfi276:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -9396,12 +9116,12 @@ define zeroext i32 @test_vpcmpeqq_v2i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>*
; NoVLX-LABEL: test_vpcmpeqq_v2i1_v32i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi347:
+; NoVLX-NEXT: .Lcfi277:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi348:
+; NoVLX-NEXT: .Lcfi278:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi349:
+; NoVLX-NEXT: .Lcfi279:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -9441,16 +9161,15 @@ define zeroext i32 @test_masked_vpcmpeqq_v2i1_v32i1_mask(i8 zeroext %__u, <2 x i
; NoVLX-LABEL: test_masked_vpcmpeqq_v2i1_v32i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi350:
+; NoVLX-NEXT: .Lcfi280:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi351:
+; NoVLX-NEXT: .Lcfi281:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi352:
+; NoVLX-NEXT: .Lcfi282:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
-; NoVLX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftlw $15, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
@@ -9458,9 +9177,10 @@ define zeroext i32 @test_masked_vpcmpeqq_v2i1_v32i1_mask(i8 zeroext %__u, <2 x i
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vmovd %ecx, %xmm2
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpand %xmm2, %xmm0, %xmm0
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
@@ -9498,12 +9218,12 @@ define zeroext i32 @test_masked_vpcmpeqq_v2i1_v32i1_mask_mem(i8 zeroext %__u, <2
; NoVLX-LABEL: test_masked_vpcmpeqq_v2i1_v32i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi353:
+; NoVLX-NEXT: .Lcfi283:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi354:
+; NoVLX-NEXT: .Lcfi284:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi355:
+; NoVLX-NEXT: .Lcfi285:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -9556,12 +9276,12 @@ define zeroext i32 @test_vpcmpeqq_v2i1_v32i1_mask_mem_b(<2 x i64> %__a, i64* %__
; NoVLX-LABEL: test_vpcmpeqq_v2i1_v32i1_mask_mem_b:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi356:
+; NoVLX-NEXT: .Lcfi286:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi357:
+; NoVLX-NEXT: .Lcfi287:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi358:
+; NoVLX-NEXT: .Lcfi288:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -9603,17 +9323,16 @@ define zeroext i32 @test_masked_vpcmpeqq_v2i1_v32i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-LABEL: test_masked_vpcmpeqq_v2i1_v32i1_mask_mem_b:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi359:
+; NoVLX-NEXT: .Lcfi289:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi360:
+; NoVLX-NEXT: .Lcfi290:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi361:
+; NoVLX-NEXT: .Lcfi291:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
; NoVLX-NEXT: vpbroadcastq (%rsi), %xmm1
-; NoVLX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftlw $15, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
@@ -9621,9 +9340,10 @@ define zeroext i32 @test_masked_vpcmpeqq_v2i1_v32i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vmovd %ecx, %xmm2
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpand %xmm0, %xmm2, %xmm0
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
@@ -9663,20 +9383,20 @@ define zeroext i64 @test_vpcmpeqq_v2i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__b
; NoVLX-LABEL: test_vpcmpeqq_v2i1_v64i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi362:
+; NoVLX-NEXT: .Lcfi292:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi363:
+; NoVLX-NEXT: .Lcfi293:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi364:
+; NoVLX-NEXT: .Lcfi294:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
@@ -9712,20 +9432,20 @@ define zeroext i64 @test_vpcmpeqq_v2i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>*
; NoVLX-LABEL: test_vpcmpeqq_v2i1_v64i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi365:
+; NoVLX-NEXT: .Lcfi295:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi366:
+; NoVLX-NEXT: .Lcfi296:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi367:
+; NoVLX-NEXT: .Lcfi297:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: vpcmpeqq (%rdi), %xmm0, %xmm0
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
@@ -9763,12 +9483,12 @@ define zeroext i64 @test_masked_vpcmpeqq_v2i1_v64i1_mask(i8 zeroext %__u, <2 x i
; NoVLX-LABEL: test_masked_vpcmpeqq_v2i1_v64i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi368:
+; NoVLX-NEXT: .Lcfi298:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi369:
+; NoVLX-NEXT: .Lcfi299:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi370:
+; NoVLX-NEXT: .Lcfi300:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
@@ -9785,8 +9505,8 @@ define zeroext i64 @test_masked_vpcmpeqq_v2i1_v64i1_mask(i8 zeroext %__u, <2 x i
; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
@@ -9826,12 +9546,12 @@ define zeroext i64 @test_masked_vpcmpeqq_v2i1_v64i1_mask_mem(i8 zeroext %__u, <2
; NoVLX-LABEL: test_masked_vpcmpeqq_v2i1_v64i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi371:
+; NoVLX-NEXT: .Lcfi301:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi372:
+; NoVLX-NEXT: .Lcfi302:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi373:
+; NoVLX-NEXT: .Lcfi303:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
@@ -9848,8 +9568,8 @@ define zeroext i64 @test_masked_vpcmpeqq_v2i1_v64i1_mask_mem(i8 zeroext %__u, <2
; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
@@ -9890,12 +9610,12 @@ define zeroext i64 @test_vpcmpeqq_v2i1_v64i1_mask_mem_b(<2 x i64> %__a, i64* %__
; NoVLX-LABEL: test_vpcmpeqq_v2i1_v64i1_mask_mem_b:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi374:
+; NoVLX-NEXT: .Lcfi304:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi375:
+; NoVLX-NEXT: .Lcfi305:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi376:
+; NoVLX-NEXT: .Lcfi306:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
@@ -9903,8 +9623,8 @@ define zeroext i64 @test_vpcmpeqq_v2i1_v64i1_mask_mem_b(<2 x i64> %__a, i64* %__
; NoVLX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
@@ -9943,12 +9663,12 @@ define zeroext i64 @test_masked_vpcmpeqq_v2i1_v64i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-LABEL: test_masked_vpcmpeqq_v2i1_v64i1_mask_mem_b:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi377:
+; NoVLX-NEXT: .Lcfi307:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi378:
+; NoVLX-NEXT: .Lcfi308:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi379:
+; NoVLX-NEXT: .Lcfi309:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
@@ -9966,8 +9686,8 @@ define zeroext i64 @test_masked_vpcmpeqq_v2i1_v64i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
@@ -10028,8 +9748,8 @@ define zeroext i8 @test_vpcmpeqq_v4i1_v8i1_mask(<4 x i64> %__a, <4 x i64> %__b)
; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: kmovw %eax, %k1
; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
@@ -10085,8 +9805,8 @@ define zeroext i8 @test_vpcmpeqq_v4i1_v8i1_mask_mem(<4 x i64> %__a, <4 x i64>* %
; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: kmovw %eax, %k1
; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
@@ -10127,7 +9847,6 @@ define zeroext i8 @test_masked_vpcmpeqq_v4i1_v8i1_mask(i8 zeroext %__u, <4 x i64
; NoVLX-LABEL: test_masked_vpcmpeqq_v4i1_v8i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0
-; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
@@ -10145,6 +9864,7 @@ define zeroext i8 @test_masked_vpcmpeqq_v4i1_v8i1_mask(i8 zeroext %__u, <4 x i64
; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
; NoVLX-NEXT: kmovw %eax, %k1
@@ -10162,8 +9882,8 @@ define zeroext i8 @test_masked_vpcmpeqq_v4i1_v8i1_mask(i8 zeroext %__u, <4 x i64
; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: kmovw %eax, %k1
; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
@@ -10206,7 +9926,6 @@ define zeroext i8 @test_masked_vpcmpeqq_v4i1_v8i1_mask_mem(i8 zeroext %__u, <4 x
; NoVLX-LABEL: test_masked_vpcmpeqq_v4i1_v8i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: vpcmpeqq (%rsi), %ymm0, %ymm0
-; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
@@ -10224,6 +9943,7 @@ define zeroext i8 @test_masked_vpcmpeqq_v4i1_v8i1_mask_mem(i8 zeroext %__u, <4 x
; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
; NoVLX-NEXT: kmovw %eax, %k1
@@ -10241,8 +9961,8 @@ define zeroext i8 @test_masked_vpcmpeqq_v4i1_v8i1_mask_mem(i8 zeroext %__u, <4 x
; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: kmovw %eax, %k1
; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
@@ -10304,8 +10024,8 @@ define zeroext i8 @test_vpcmpeqq_v4i1_v8i1_mask_mem_b(<4 x i64> %__a, i64* %__b)
; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: kmovw %eax, %k1
; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
@@ -10348,7 +10068,6 @@ define zeroext i8 @test_masked_vpcmpeqq_v4i1_v8i1_mask_mem_b(i8 zeroext %__u, <4
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: vpbroadcastq (%rsi), %ymm1
; NoVLX-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0
-; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
@@ -10366,6 +10085,7 @@ define zeroext i8 @test_masked_vpcmpeqq_v4i1_v8i1_mask_mem_b(i8 zeroext %__u, <4
; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
; NoVLX-NEXT: kmovw %eax, %k1
@@ -10383,8 +10103,8 @@ define zeroext i8 @test_masked_vpcmpeqq_v4i1_v8i1_mask_mem_b(i8 zeroext %__u, <4
; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: kmovw %eax, %k1
; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
@@ -10445,8 +10165,8 @@ define zeroext i16 @test_vpcmpeqq_v4i1_v16i1_mask(<4 x i64> %__a, <4 x i64> %__b
; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: kmovw %eax, %k1
; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
@@ -10501,8 +10221,8 @@ define zeroext i16 @test_vpcmpeqq_v4i1_v16i1_mask_mem(<4 x i64> %__a, <4 x i64>*
; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: kmovw %eax, %k1
; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
@@ -10543,7 +10263,6 @@ define zeroext i16 @test_masked_vpcmpeqq_v4i1_v16i1_mask(i8 zeroext %__u, <4 x i
; NoVLX-LABEL: test_masked_vpcmpeqq_v4i1_v16i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0
-; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
@@ -10561,6 +10280,7 @@ define zeroext i16 @test_masked_vpcmpeqq_v4i1_v16i1_mask(i8 zeroext %__u, <4 x i
; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
; NoVLX-NEXT: kmovw %eax, %k1
@@ -10577,8 +10297,8 @@ define zeroext i16 @test_masked_vpcmpeqq_v4i1_v16i1_mask(i8 zeroext %__u, <4 x i
; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: kmovw %eax, %k1
; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
@@ -10621,7 +10341,6 @@ define zeroext i16 @test_masked_vpcmpeqq_v4i1_v16i1_mask_mem(i8 zeroext %__u, <4
; NoVLX-LABEL: test_masked_vpcmpeqq_v4i1_v16i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: vpcmpeqq (%rsi), %ymm0, %ymm0
-; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
@@ -10639,6 +10358,7 @@ define zeroext i16 @test_masked_vpcmpeqq_v4i1_v16i1_mask_mem(i8 zeroext %__u, <4
; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
; NoVLX-NEXT: kmovw %eax, %k1
@@ -10655,8 +10375,8 @@ define zeroext i16 @test_masked_vpcmpeqq_v4i1_v16i1_mask_mem(i8 zeroext %__u, <4
; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: kmovw %eax, %k1
; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
@@ -10717,8 +10437,8 @@ define zeroext i16 @test_vpcmpeqq_v4i1_v16i1_mask_mem_b(<4 x i64> %__a, i64* %__
; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: kmovw %eax, %k1
; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
@@ -10761,7 +10481,6 @@ define zeroext i16 @test_masked_vpcmpeqq_v4i1_v16i1_mask_mem_b(i8 zeroext %__u,
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: vpbroadcastq (%rsi), %ymm1
; NoVLX-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0
-; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
@@ -10779,6 +10498,7 @@ define zeroext i16 @test_masked_vpcmpeqq_v4i1_v16i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
; NoVLX-NEXT: kmovw %eax, %k1
@@ -10795,8 +10515,8 @@ define zeroext i16 @test_masked_vpcmpeqq_v4i1_v16i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: kmovw %eax, %k1
; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
@@ -10840,12 +10560,12 @@ define zeroext i32 @test_vpcmpeqq_v4i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__b
; NoVLX-LABEL: test_vpcmpeqq_v4i1_v32i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi380:
+; NoVLX-NEXT: .Lcfi310:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi381:
+; NoVLX-NEXT: .Lcfi311:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi382:
+; NoVLX-NEXT: .Lcfi312:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -10885,12 +10605,12 @@ define zeroext i32 @test_vpcmpeqq_v4i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64>*
; NoVLX-LABEL: test_vpcmpeqq_v4i1_v32i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi383:
+; NoVLX-NEXT: .Lcfi313:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi384:
+; NoVLX-NEXT: .Lcfi314:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi385:
+; NoVLX-NEXT: .Lcfi315:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -10932,17 +10652,16 @@ define zeroext i32 @test_masked_vpcmpeqq_v4i1_v32i1_mask(i8 zeroext %__u, <4 x i
; NoVLX-LABEL: test_masked_vpcmpeqq_v4i1_v32i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi386:
+; NoVLX-NEXT: .Lcfi316:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi387:
+; NoVLX-NEXT: .Lcfi317:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi388:
+; NoVLX-NEXT: .Lcfi318:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
; NoVLX-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0
-; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
@@ -10960,6 +10679,7 @@ define zeroext i32 @test_masked_vpcmpeqq_v4i1_v32i1_mask(i8 zeroext %__u, <4 x i
; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
@@ -10999,17 +10719,16 @@ define zeroext i32 @test_masked_vpcmpeqq_v4i1_v32i1_mask_mem(i8 zeroext %__u, <4
; NoVLX-LABEL: test_masked_vpcmpeqq_v4i1_v32i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi389:
+; NoVLX-NEXT: .Lcfi319:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi390:
+; NoVLX-NEXT: .Lcfi320:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi391:
+; NoVLX-NEXT: .Lcfi321:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
; NoVLX-NEXT: vpcmpeqq (%rsi), %ymm0, %ymm0
-; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
@@ -11027,6 +10746,7 @@ define zeroext i32 @test_masked_vpcmpeqq_v4i1_v32i1_mask_mem(i8 zeroext %__u, <4
; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
@@ -11067,12 +10787,12 @@ define zeroext i32 @test_vpcmpeqq_v4i1_v32i1_mask_mem_b(<4 x i64> %__a, i64* %__
; NoVLX-LABEL: test_vpcmpeqq_v4i1_v32i1_mask_mem_b:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi392:
+; NoVLX-NEXT: .Lcfi322:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi393:
+; NoVLX-NEXT: .Lcfi323:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi394:
+; NoVLX-NEXT: .Lcfi324:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -11116,18 +10836,17 @@ define zeroext i32 @test_masked_vpcmpeqq_v4i1_v32i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-LABEL: test_masked_vpcmpeqq_v4i1_v32i1_mask_mem_b:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi395:
+; NoVLX-NEXT: .Lcfi325:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi396:
+; NoVLX-NEXT: .Lcfi326:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi397:
+; NoVLX-NEXT: .Lcfi327:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
; NoVLX-NEXT: vpbroadcastq (%rsi), %ymm1
; NoVLX-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0
-; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
@@ -11145,6 +10864,7 @@ define zeroext i32 @test_masked_vpcmpeqq_v4i1_v32i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
@@ -11186,12 +10906,12 @@ define zeroext i64 @test_vpcmpeqq_v4i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__b
; NoVLX-LABEL: test_vpcmpeqq_v4i1_v64i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi398:
+; NoVLX-NEXT: .Lcfi328:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi399:
+; NoVLX-NEXT: .Lcfi329:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi400:
+; NoVLX-NEXT: .Lcfi330:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
@@ -11199,8 +10919,8 @@ define zeroext i64 @test_vpcmpeqq_v4i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__b
; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
@@ -11237,12 +10957,12 @@ define zeroext i64 @test_vpcmpeqq_v4i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>*
; NoVLX-LABEL: test_vpcmpeqq_v4i1_v64i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi401:
+; NoVLX-NEXT: .Lcfi331:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi402:
+; NoVLX-NEXT: .Lcfi332:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi403:
+; NoVLX-NEXT: .Lcfi333:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
@@ -11250,8 +10970,8 @@ define zeroext i64 @test_vpcmpeqq_v4i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>*
; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
@@ -11290,12 +11010,12 @@ define zeroext i64 @test_masked_vpcmpeqq_v4i1_v64i1_mask(i8 zeroext %__u, <4 x i
; NoVLX-LABEL: test_masked_vpcmpeqq_v4i1_v64i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi404:
+; NoVLX-NEXT: .Lcfi334:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi405:
+; NoVLX-NEXT: .Lcfi335:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi406:
+; NoVLX-NEXT: .Lcfi336:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
@@ -11303,7 +11023,6 @@ define zeroext i64 @test_masked_vpcmpeqq_v4i1_v64i1_mask(i8 zeroext %__u, <4 x i
; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kshiftlw $13, %k0, %k2
; NoVLX-NEXT: kshiftrw $15, %k2, %k2
; NoVLX-NEXT: kshiftlw $15, %k0, %k3
@@ -11316,13 +11035,14 @@ define zeroext i64 @test_masked_vpcmpeqq_v4i1_v64i1_mask(i8 zeroext %__u, <4 x i
; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kmovw %k2, %eax
; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $15, %k1, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
@@ -11363,12 +11083,12 @@ define zeroext i64 @test_masked_vpcmpeqq_v4i1_v64i1_mask_mem(i8 zeroext %__u, <4
; NoVLX-LABEL: test_masked_vpcmpeqq_v4i1_v64i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi407:
+; NoVLX-NEXT: .Lcfi337:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi408:
+; NoVLX-NEXT: .Lcfi338:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi409:
+; NoVLX-NEXT: .Lcfi339:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
@@ -11376,7 +11096,6 @@ define zeroext i64 @test_masked_vpcmpeqq_v4i1_v64i1_mask_mem(i8 zeroext %__u, <4
; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kshiftlw $13, %k0, %k2
; NoVLX-NEXT: kshiftrw $15, %k2, %k2
; NoVLX-NEXT: kshiftlw $15, %k0, %k3
@@ -11389,13 +11108,14 @@ define zeroext i64 @test_masked_vpcmpeqq_v4i1_v64i1_mask_mem(i8 zeroext %__u, <4
; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kmovw %k2, %eax
; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $15, %k1, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
@@ -11437,12 +11157,12 @@ define zeroext i64 @test_vpcmpeqq_v4i1_v64i1_mask_mem_b(<4 x i64> %__a, i64* %__
; NoVLX-LABEL: test_vpcmpeqq_v4i1_v64i1_mask_mem_b:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi410:
+; NoVLX-NEXT: .Lcfi340:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi411:
+; NoVLX-NEXT: .Lcfi341:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi412:
+; NoVLX-NEXT: .Lcfi342:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
@@ -11451,8 +11171,8 @@ define zeroext i64 @test_vpcmpeqq_v4i1_v64i1_mask_mem_b(<4 x i64> %__a, i64* %__
; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
@@ -11492,12 +11212,12 @@ define zeroext i64 @test_masked_vpcmpeqq_v4i1_v64i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-LABEL: test_masked_vpcmpeqq_v4i1_v64i1_mask_mem_b:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi413:
+; NoVLX-NEXT: .Lcfi343:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi414:
+; NoVLX-NEXT: .Lcfi344:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi415:
+; NoVLX-NEXT: .Lcfi345:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
@@ -11506,7 +11226,6 @@ define zeroext i64 @test_masked_vpcmpeqq_v4i1_v64i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kshiftlw $13, %k0, %k2
; NoVLX-NEXT: kshiftrw $15, %k2, %k2
; NoVLX-NEXT: kshiftlw $15, %k0, %k3
@@ -11519,13 +11238,14 @@ define zeroext i64 @test_masked_vpcmpeqq_v4i1_v64i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kmovw %k2, %eax
; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $15, %k1, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
@@ -11732,12 +11452,12 @@ define zeroext i32 @test_vpcmpeqq_v8i1_v32i1_mask(<8 x i64> %__a, <8 x i64> %__b
; NoVLX-LABEL: test_vpcmpeqq_v8i1_v32i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi416:
+; NoVLX-NEXT: .Lcfi346:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi417:
+; NoVLX-NEXT: .Lcfi347:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi418:
+; NoVLX-NEXT: .Lcfi348:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -11805,12 +11525,12 @@ define zeroext i32 @test_vpcmpeqq_v8i1_v32i1_mask_mem(<8 x i64> %__a, <8 x i64>*
; NoVLX-LABEL: test_vpcmpeqq_v8i1_v32i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi419:
+; NoVLX-NEXT: .Lcfi349:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi420:
+; NoVLX-NEXT: .Lcfi350:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi421:
+; NoVLX-NEXT: .Lcfi351:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -11880,12 +11600,12 @@ define zeroext i32 @test_masked_vpcmpeqq_v8i1_v32i1_mask(i8 zeroext %__u, <8 x i
; NoVLX-LABEL: test_masked_vpcmpeqq_v8i1_v32i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi422:
+; NoVLX-NEXT: .Lcfi352:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi423:
+; NoVLX-NEXT: .Lcfi353:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi424:
+; NoVLX-NEXT: .Lcfi354:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -11957,12 +11677,12 @@ define zeroext i32 @test_masked_vpcmpeqq_v8i1_v32i1_mask_mem(i8 zeroext %__u, <8
; NoVLX-LABEL: test_masked_vpcmpeqq_v8i1_v32i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi425:
+; NoVLX-NEXT: .Lcfi355:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi426:
+; NoVLX-NEXT: .Lcfi356:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi427:
+; NoVLX-NEXT: .Lcfi357:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -12035,12 +11755,12 @@ define zeroext i32 @test_vpcmpeqq_v8i1_v32i1_mask_mem_b(<8 x i64> %__a, i64* %__
; NoVLX-LABEL: test_vpcmpeqq_v8i1_v32i1_mask_mem_b:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi428:
+; NoVLX-NEXT: .Lcfi358:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi429:
+; NoVLX-NEXT: .Lcfi359:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi430:
+; NoVLX-NEXT: .Lcfi360:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -12111,12 +11831,12 @@ define zeroext i32 @test_masked_vpcmpeqq_v8i1_v32i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-LABEL: test_masked_vpcmpeqq_v8i1_v32i1_mask_mem_b:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi431:
+; NoVLX-NEXT: .Lcfi361:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi432:
+; NoVLX-NEXT: .Lcfi362:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi433:
+; NoVLX-NEXT: .Lcfi363:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -12190,53 +11910,53 @@ define zeroext i64 @test_vpcmpeqq_v8i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %__b
; NoVLX-LABEL: test_vpcmpeqq_v8i1_v64i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi434:
+; NoVLX-NEXT: .Lcfi364:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi435:
+; NoVLX-NEXT: .Lcfi365:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi436:
+; NoVLX-NEXT: .Lcfi366:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: vpcmpeqq %zmm1, %zmm0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $15, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %ecx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -12268,53 +11988,53 @@ define zeroext i64 @test_vpcmpeqq_v8i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64>*
; NoVLX-LABEL: test_vpcmpeqq_v8i1_v64i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi437:
+; NoVLX-NEXT: .Lcfi367:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi438:
+; NoVLX-NEXT: .Lcfi368:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi439:
+; NoVLX-NEXT: .Lcfi369:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: vpcmpeqq (%rdi), %zmm0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $15, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %ecx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -12348,54 +12068,54 @@ define zeroext i64 @test_masked_vpcmpeqq_v8i1_v64i1_mask(i8 zeroext %__u, <8 x i
; NoVLX-LABEL: test_masked_vpcmpeqq_v8i1_v64i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi440:
+; NoVLX-NEXT: .Lcfi370:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi441:
+; NoVLX-NEXT: .Lcfi371:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi442:
+; NoVLX-NEXT: .Lcfi372:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpcmpeqq %zmm1, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $15, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %ecx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -12430,54 +12150,54 @@ define zeroext i64 @test_masked_vpcmpeqq_v8i1_v64i1_mask_mem(i8 zeroext %__u, <8
; NoVLX-LABEL: test_masked_vpcmpeqq_v8i1_v64i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi443:
+; NoVLX-NEXT: .Lcfi373:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi444:
+; NoVLX-NEXT: .Lcfi374:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi445:
+; NoVLX-NEXT: .Lcfi375:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpcmpeqq (%rsi), %zmm0, %k0 {%k1}
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $15, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %ecx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -12513,53 +12233,53 @@ define zeroext i64 @test_vpcmpeqq_v8i1_v64i1_mask_mem_b(<8 x i64> %__a, i64* %__
; NoVLX-LABEL: test_vpcmpeqq_v8i1_v64i1_mask_mem_b:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi446:
+; NoVLX-NEXT: .Lcfi376:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi447:
+; NoVLX-NEXT: .Lcfi377:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi448:
+; NoVLX-NEXT: .Lcfi378:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: vpcmpeqq (%rdi){1to8}, %zmm0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $15, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %ecx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -12594,54 +12314,54 @@ define zeroext i64 @test_masked_vpcmpeqq_v8i1_v64i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-LABEL: test_masked_vpcmpeqq_v8i1_v64i1_mask_mem_b:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi449:
+; NoVLX-NEXT: .Lcfi379:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi450:
+; NoVLX-NEXT: .Lcfi380:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi451:
+; NoVLX-NEXT: .Lcfi381:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpcmpeqq (%rsi){1to8}, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $15, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %ecx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -12677,30 +12397,15 @@ define zeroext i32 @test_vpcmpsgtb_v16i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %_
; NoVLX-LABEL: test_vpcmpsgtb_v16i1_v32i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi452:
+; NoVLX-NEXT: .Lcfi382:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi453:
+; NoVLX-NEXT: .Lcfi383:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi454:
+; NoVLX-NEXT: .Lcfi384:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
-; NoVLX-NEXT: pushq %r15
-; NoVLX-NEXT: pushq %r14
-; NoVLX-NEXT: pushq %r13
-; NoVLX-NEXT: pushq %r12
-; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
-; NoVLX-NEXT: .Lcfi455:
-; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi456:
-; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi457:
-; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi458:
-; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi459:
-; NoVLX-NEXT: .cfi_offset %r15, -24
; NoVLX-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -12709,64 +12414,64 @@ define zeroext i32 @test_vpcmpsgtb_v16i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %_
; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $15, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $13, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: vmovd %ecx, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
@@ -12775,12 +12480,7 @@ define zeroext i32 @test_vpcmpsgtb_v16i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %_
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
-; NoVLX-NEXT: leaq -40(%rbp), %rsp
-; NoVLX-NEXT: popq %rbx
-; NoVLX-NEXT: popq %r12
-; NoVLX-NEXT: popq %r13
-; NoVLX-NEXT: popq %r14
-; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
; NoVLX-NEXT: retq
entry:
@@ -12802,30 +12502,15 @@ define zeroext i32 @test_vpcmpsgtb_v16i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64
; NoVLX-LABEL: test_vpcmpsgtb_v16i1_v32i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi460:
+; NoVLX-NEXT: .Lcfi385:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi461:
+; NoVLX-NEXT: .Lcfi386:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi462:
+; NoVLX-NEXT: .Lcfi387:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
-; NoVLX-NEXT: pushq %r15
-; NoVLX-NEXT: pushq %r14
-; NoVLX-NEXT: pushq %r13
-; NoVLX-NEXT: pushq %r12
-; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
-; NoVLX-NEXT: .Lcfi463:
-; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi464:
-; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi465:
-; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi466:
-; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi467:
-; NoVLX-NEXT: .cfi_offset %r15, -24
; NoVLX-NEXT: vpcmpgtb (%rdi), %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -12834,64 +12519,64 @@ define zeroext i32 @test_vpcmpsgtb_v16i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64
; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $15, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $13, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: vmovd %ecx, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
@@ -12900,12 +12585,7 @@ define zeroext i32 @test_vpcmpsgtb_v16i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
-; NoVLX-NEXT: leaq -40(%rbp), %rsp
-; NoVLX-NEXT: popq %rbx
-; NoVLX-NEXT: popq %r12
-; NoVLX-NEXT: popq %r13
-; NoVLX-NEXT: popq %r14
-; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
; NoVLX-NEXT: retq
entry:
@@ -12929,30 +12609,15 @@ define zeroext i32 @test_masked_vpcmpsgtb_v16i1_v32i1_mask(i16 zeroext %__u, <2
; NoVLX-LABEL: test_masked_vpcmpsgtb_v16i1_v32i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi468:
+; NoVLX-NEXT: .Lcfi388:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi469:
+; NoVLX-NEXT: .Lcfi389:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi470:
+; NoVLX-NEXT: .Lcfi390:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
-; NoVLX-NEXT: pushq %r15
-; NoVLX-NEXT: pushq %r14
-; NoVLX-NEXT: pushq %r13
-; NoVLX-NEXT: pushq %r12
-; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
-; NoVLX-NEXT: .Lcfi471:
-; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi472:
-; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi473:
-; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi474:
-; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi475:
-; NoVLX-NEXT: .cfi_offset %r15, -24
; NoVLX-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -12962,64 +12627,64 @@ define zeroext i32 @test_masked_vpcmpsgtb_v16i1_v32i1_mask(i16 zeroext %__u, <2
; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $15, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $13, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: vmovd %ecx, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
@@ -13028,12 +12693,7 @@ define zeroext i32 @test_masked_vpcmpsgtb_v16i1_v32i1_mask(i16 zeroext %__u, <2
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
-; NoVLX-NEXT: leaq -40(%rbp), %rsp
-; NoVLX-NEXT: popq %rbx
-; NoVLX-NEXT: popq %r12
-; NoVLX-NEXT: popq %r13
-; NoVLX-NEXT: popq %r14
-; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
; NoVLX-NEXT: retq
entry:
@@ -13058,30 +12718,15 @@ define zeroext i32 @test_masked_vpcmpsgtb_v16i1_v32i1_mask_mem(i16 zeroext %__u,
; NoVLX-LABEL: test_masked_vpcmpsgtb_v16i1_v32i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi476:
+; NoVLX-NEXT: .Lcfi391:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi477:
+; NoVLX-NEXT: .Lcfi392:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi478:
+; NoVLX-NEXT: .Lcfi393:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
-; NoVLX-NEXT: pushq %r15
-; NoVLX-NEXT: pushq %r14
-; NoVLX-NEXT: pushq %r13
-; NoVLX-NEXT: pushq %r12
-; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
-; NoVLX-NEXT: .Lcfi479:
-; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi480:
-; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi481:
-; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi482:
-; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi483:
-; NoVLX-NEXT: .cfi_offset %r15, -24
; NoVLX-NEXT: vpcmpgtb (%rsi), %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -13091,64 +12736,64 @@ define zeroext i32 @test_masked_vpcmpsgtb_v16i1_v32i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $15, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $13, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: vmovd %ecx, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
@@ -13157,12 +12802,7 @@ define zeroext i32 @test_masked_vpcmpsgtb_v16i1_v32i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
-; NoVLX-NEXT: leaq -40(%rbp), %rsp
-; NoVLX-NEXT: popq %rbx
-; NoVLX-NEXT: popq %r12
-; NoVLX-NEXT: popq %r13
-; NoVLX-NEXT: popq %r14
-; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
; NoVLX-NEXT: retq
entry:
@@ -13188,12 +12828,12 @@ define zeroext i64 @test_vpcmpsgtb_v16i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %_
; NoVLX-LABEL: test_vpcmpsgtb_v16i1_v64i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi484:
+; NoVLX-NEXT: .Lcfi394:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi485:
+; NoVLX-NEXT: .Lcfi395:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi486:
+; NoVLX-NEXT: .Lcfi396:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: pushq %r15
; NoVLX-NEXT: pushq %r14
@@ -13202,24 +12842,20 @@ define zeroext i64 @test_vpcmpsgtb_v16i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %_
; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: .Lcfi487:
+; NoVLX-NEXT: .Lcfi397:
; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi488:
+; NoVLX-NEXT: .Lcfi398:
; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi489:
+; NoVLX-NEXT: .Lcfi399:
; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi490:
+; NoVLX-NEXT: .Lcfi400:
; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi491:
+; NoVLX-NEXT: .Lcfi401:
; NoVLX-NEXT: .cfi_offset %r15, -24
; NoVLX-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
@@ -13262,11 +12898,11 @@ define zeroext i64 @test_vpcmpsgtb_v16i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %_
; NoVLX-NEXT: kshiftlw $2, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $1, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k1, %r8d
; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
@@ -13278,11 +12914,15 @@ define zeroext i64 @test_vpcmpsgtb_v16i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %_
; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpinsrb $14, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -13318,12 +12958,12 @@ define zeroext i64 @test_vpcmpsgtb_v16i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64
; NoVLX-LABEL: test_vpcmpsgtb_v16i1_v64i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi492:
+; NoVLX-NEXT: .Lcfi402:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi493:
+; NoVLX-NEXT: .Lcfi403:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi494:
+; NoVLX-NEXT: .Lcfi404:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: pushq %r15
; NoVLX-NEXT: pushq %r14
@@ -13332,24 +12972,20 @@ define zeroext i64 @test_vpcmpsgtb_v16i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64
; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: .Lcfi495:
+; NoVLX-NEXT: .Lcfi405:
; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi496:
+; NoVLX-NEXT: .Lcfi406:
; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi497:
+; NoVLX-NEXT: .Lcfi407:
; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi498:
+; NoVLX-NEXT: .Lcfi408:
; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi499:
+; NoVLX-NEXT: .Lcfi409:
; NoVLX-NEXT: .cfi_offset %r15, -24
; NoVLX-NEXT: vpcmpgtb (%rdi), %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
@@ -13392,11 +13028,11 @@ define zeroext i64 @test_vpcmpsgtb_v16i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64
; NoVLX-NEXT: kshiftlw $2, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $1, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k1, %r8d
; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
@@ -13408,11 +13044,15 @@ define zeroext i64 @test_vpcmpsgtb_v16i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64
; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpinsrb $14, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -13450,12 +13090,12 @@ define zeroext i64 @test_masked_vpcmpsgtb_v16i1_v64i1_mask(i16 zeroext %__u, <2
; NoVLX-LABEL: test_masked_vpcmpsgtb_v16i1_v64i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi500:
+; NoVLX-NEXT: .Lcfi410:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi501:
+; NoVLX-NEXT: .Lcfi411:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi502:
+; NoVLX-NEXT: .Lcfi412:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: pushq %r15
; NoVLX-NEXT: pushq %r14
@@ -13464,25 +13104,21 @@ define zeroext i64 @test_masked_vpcmpsgtb_v16i1_v64i1_mask(i16 zeroext %__u, <2
; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: .Lcfi503:
+; NoVLX-NEXT: .Lcfi413:
; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi504:
+; NoVLX-NEXT: .Lcfi414:
; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi505:
+; NoVLX-NEXT: .Lcfi415:
; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi506:
+; NoVLX-NEXT: .Lcfi416:
; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi507:
+; NoVLX-NEXT: .Lcfi417:
; NoVLX-NEXT: .cfi_offset %r15, -24
; NoVLX-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
@@ -13525,11 +13161,11 @@ define zeroext i64 @test_masked_vpcmpsgtb_v16i1_v64i1_mask(i16 zeroext %__u, <2
; NoVLX-NEXT: kshiftlw $2, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $1, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k1, %r8d
; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
@@ -13541,11 +13177,15 @@ define zeroext i64 @test_masked_vpcmpsgtb_v16i1_v64i1_mask(i16 zeroext %__u, <2
; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpinsrb $14, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -13584,12 +13224,12 @@ define zeroext i64 @test_masked_vpcmpsgtb_v16i1_v64i1_mask_mem(i16 zeroext %__u,
; NoVLX-LABEL: test_masked_vpcmpsgtb_v16i1_v64i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi508:
+; NoVLX-NEXT: .Lcfi418:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi509:
+; NoVLX-NEXT: .Lcfi419:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi510:
+; NoVLX-NEXT: .Lcfi420:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: pushq %r15
; NoVLX-NEXT: pushq %r14
@@ -13598,25 +13238,21 @@ define zeroext i64 @test_masked_vpcmpsgtb_v16i1_v64i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: .Lcfi511:
+; NoVLX-NEXT: .Lcfi421:
; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi512:
+; NoVLX-NEXT: .Lcfi422:
; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi513:
+; NoVLX-NEXT: .Lcfi423:
; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi514:
+; NoVLX-NEXT: .Lcfi424:
; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi515:
+; NoVLX-NEXT: .Lcfi425:
; NoVLX-NEXT: .cfi_offset %r15, -24
; NoVLX-NEXT: vpcmpgtb (%rsi), %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
@@ -13659,11 +13295,11 @@ define zeroext i64 @test_masked_vpcmpsgtb_v16i1_v64i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: kshiftlw $2, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $1, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k1, %r8d
; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
@@ -13675,11 +13311,15 @@ define zeroext i64 @test_masked_vpcmpsgtb_v16i1_v64i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpinsrb $14, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -13720,12 +13360,12 @@ define zeroext i64 @test_vpcmpsgtb_v32i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %_
; NoVLX-LABEL: test_vpcmpsgtb_v32i1_v64i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi516:
+; NoVLX-NEXT: .Lcfi426:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi517:
+; NoVLX-NEXT: .Lcfi427:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi518:
+; NoVLX-NEXT: .Lcfi428:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
@@ -13769,12 +13409,12 @@ define zeroext i64 @test_vpcmpsgtb_v32i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64
; NoVLX-LABEL: test_vpcmpsgtb_v32i1_v64i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi519:
+; NoVLX-NEXT: .Lcfi429:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi520:
+; NoVLX-NEXT: .Lcfi430:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi521:
+; NoVLX-NEXT: .Lcfi431:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
@@ -13820,12 +13460,12 @@ define zeroext i64 @test_masked_vpcmpsgtb_v32i1_v64i1_mask(i32 zeroext %__u, <4
; NoVLX-LABEL: test_masked_vpcmpsgtb_v32i1_v64i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi522:
+; NoVLX-NEXT: .Lcfi432:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi523:
+; NoVLX-NEXT: .Lcfi433:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi524:
+; NoVLX-NEXT: .Lcfi434:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $96, %rsp
@@ -13881,12 +13521,12 @@ define zeroext i64 @test_masked_vpcmpsgtb_v32i1_v64i1_mask_mem(i32 zeroext %__u,
; NoVLX-LABEL: test_masked_vpcmpsgtb_v32i1_v64i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi525:
+; NoVLX-NEXT: .Lcfi435:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi526:
+; NoVLX-NEXT: .Lcfi436:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi527:
+; NoVLX-NEXT: .Lcfi437:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $96, %rsp
@@ -14065,12 +13705,12 @@ define zeroext i32 @test_vpcmpsgtw_v8i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__
; NoVLX-LABEL: test_vpcmpsgtw_v8i1_v32i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi528:
+; NoVLX-NEXT: .Lcfi438:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi529:
+; NoVLX-NEXT: .Lcfi439:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi530:
+; NoVLX-NEXT: .Lcfi440:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -14140,12 +13780,12 @@ define zeroext i32 @test_vpcmpsgtw_v8i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>
; NoVLX-LABEL: test_vpcmpsgtw_v8i1_v32i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi531:
+; NoVLX-NEXT: .Lcfi441:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi532:
+; NoVLX-NEXT: .Lcfi442:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi533:
+; NoVLX-NEXT: .Lcfi443:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -14217,12 +13857,12 @@ define zeroext i32 @test_masked_vpcmpsgtw_v8i1_v32i1_mask(i8 zeroext %__u, <2 x
; NoVLX-LABEL: test_masked_vpcmpsgtw_v8i1_v32i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi534:
+; NoVLX-NEXT: .Lcfi444:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi535:
+; NoVLX-NEXT: .Lcfi445:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi536:
+; NoVLX-NEXT: .Lcfi446:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -14296,12 +13936,12 @@ define zeroext i32 @test_masked_vpcmpsgtw_v8i1_v32i1_mask_mem(i8 zeroext %__u, <
; NoVLX-LABEL: test_masked_vpcmpsgtw_v8i1_v32i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi537:
+; NoVLX-NEXT: .Lcfi447:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi538:
+; NoVLX-NEXT: .Lcfi448:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi539:
+; NoVLX-NEXT: .Lcfi449:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -14376,12 +14016,12 @@ define zeroext i64 @test_vpcmpsgtw_v8i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__
; NoVLX-LABEL: test_vpcmpsgtw_v8i1_v64i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi540:
+; NoVLX-NEXT: .Lcfi450:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi541:
+; NoVLX-NEXT: .Lcfi451:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi542:
+; NoVLX-NEXT: .Lcfi452:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
@@ -14389,43 +14029,43 @@ define zeroext i64 @test_vpcmpsgtw_v8i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__
; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $15, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %ecx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -14456,12 +14096,12 @@ define zeroext i64 @test_vpcmpsgtw_v8i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>
; NoVLX-LABEL: test_vpcmpsgtw_v8i1_v64i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi543:
+; NoVLX-NEXT: .Lcfi453:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi544:
+; NoVLX-NEXT: .Lcfi454:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi545:
+; NoVLX-NEXT: .Lcfi455:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
@@ -14469,43 +14109,43 @@ define zeroext i64 @test_vpcmpsgtw_v8i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>
; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $15, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %ecx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -14538,12 +14178,12 @@ define zeroext i64 @test_masked_vpcmpsgtw_v8i1_v64i1_mask(i8 zeroext %__u, <2 x
; NoVLX-LABEL: test_masked_vpcmpsgtw_v8i1_v64i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi546:
+; NoVLX-NEXT: .Lcfi456:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi547:
+; NoVLX-NEXT: .Lcfi457:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi548:
+; NoVLX-NEXT: .Lcfi458:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
@@ -14552,43 +14192,43 @@ define zeroext i64 @test_masked_vpcmpsgtw_v8i1_v64i1_mask(i8 zeroext %__u, <2 x
; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $15, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %ecx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -14622,12 +14262,12 @@ define zeroext i64 @test_masked_vpcmpsgtw_v8i1_v64i1_mask_mem(i8 zeroext %__u, <
; NoVLX-LABEL: test_masked_vpcmpsgtw_v8i1_v64i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi549:
+; NoVLX-NEXT: .Lcfi459:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi550:
+; NoVLX-NEXT: .Lcfi460:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi551:
+; NoVLX-NEXT: .Lcfi461:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
@@ -14636,43 +14276,43 @@ define zeroext i64 @test_masked_vpcmpsgtw_v8i1_v64i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $15, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %ecx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -14708,30 +14348,15 @@ define zeroext i32 @test_vpcmpsgtw_v16i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %_
; NoVLX-LABEL: test_vpcmpsgtw_v16i1_v32i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi552:
+; NoVLX-NEXT: .Lcfi462:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi553:
+; NoVLX-NEXT: .Lcfi463:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi554:
+; NoVLX-NEXT: .Lcfi464:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
-; NoVLX-NEXT: pushq %r15
-; NoVLX-NEXT: pushq %r14
-; NoVLX-NEXT: pushq %r13
-; NoVLX-NEXT: pushq %r12
-; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
-; NoVLX-NEXT: .Lcfi555:
-; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi556:
-; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi557:
-; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi558:
-; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi559:
-; NoVLX-NEXT: .cfi_offset %r15, -24
; NoVLX-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0
; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -14740,64 +14365,64 @@ define zeroext i32 @test_vpcmpsgtw_v16i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %_
; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $15, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $13, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: vmovd %ecx, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
@@ -14806,12 +14431,7 @@ define zeroext i32 @test_vpcmpsgtw_v16i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %_
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
-; NoVLX-NEXT: leaq -40(%rbp), %rsp
-; NoVLX-NEXT: popq %rbx
-; NoVLX-NEXT: popq %r12
-; NoVLX-NEXT: popq %r13
-; NoVLX-NEXT: popq %r14
-; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
; NoVLX-NEXT: retq
entry:
@@ -14834,30 +14454,15 @@ define zeroext i32 @test_vpcmpsgtw_v16i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64
; NoVLX-LABEL: test_vpcmpsgtw_v16i1_v32i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi560:
+; NoVLX-NEXT: .Lcfi465:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi561:
+; NoVLX-NEXT: .Lcfi466:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi562:
+; NoVLX-NEXT: .Lcfi467:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
-; NoVLX-NEXT: pushq %r15
-; NoVLX-NEXT: pushq %r14
-; NoVLX-NEXT: pushq %r13
-; NoVLX-NEXT: pushq %r12
-; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
-; NoVLX-NEXT: .Lcfi563:
-; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi564:
-; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi565:
-; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi566:
-; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi567:
-; NoVLX-NEXT: .cfi_offset %r15, -24
; NoVLX-NEXT: vpcmpgtw (%rdi), %ymm0, %ymm0
; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -14866,64 +14471,64 @@ define zeroext i32 @test_vpcmpsgtw_v16i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64
; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $15, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $13, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: vmovd %ecx, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
@@ -14932,12 +14537,7 @@ define zeroext i32 @test_vpcmpsgtw_v16i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
-; NoVLX-NEXT: leaq -40(%rbp), %rsp
-; NoVLX-NEXT: popq %rbx
-; NoVLX-NEXT: popq %r12
-; NoVLX-NEXT: popq %r13
-; NoVLX-NEXT: popq %r14
-; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
; NoVLX-NEXT: retq
entry:
@@ -14962,30 +14562,15 @@ define zeroext i32 @test_masked_vpcmpsgtw_v16i1_v32i1_mask(i16 zeroext %__u, <4
; NoVLX-LABEL: test_masked_vpcmpsgtw_v16i1_v32i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi568:
+; NoVLX-NEXT: .Lcfi468:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi569:
+; NoVLX-NEXT: .Lcfi469:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi570:
+; NoVLX-NEXT: .Lcfi470:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
-; NoVLX-NEXT: pushq %r15
-; NoVLX-NEXT: pushq %r14
-; NoVLX-NEXT: pushq %r13
-; NoVLX-NEXT: pushq %r12
-; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
-; NoVLX-NEXT: .Lcfi571:
-; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi572:
-; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi573:
-; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi574:
-; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi575:
-; NoVLX-NEXT: .cfi_offset %r15, -24
; NoVLX-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0
; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -14995,64 +14580,64 @@ define zeroext i32 @test_masked_vpcmpsgtw_v16i1_v32i1_mask(i16 zeroext %__u, <4
; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $15, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $13, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: vmovd %ecx, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
@@ -15061,12 +14646,7 @@ define zeroext i32 @test_masked_vpcmpsgtw_v16i1_v32i1_mask(i16 zeroext %__u, <4
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
-; NoVLX-NEXT: leaq -40(%rbp), %rsp
-; NoVLX-NEXT: popq %rbx
-; NoVLX-NEXT: popq %r12
-; NoVLX-NEXT: popq %r13
-; NoVLX-NEXT: popq %r14
-; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
; NoVLX-NEXT: retq
entry:
@@ -15092,30 +14672,15 @@ define zeroext i32 @test_masked_vpcmpsgtw_v16i1_v32i1_mask_mem(i16 zeroext %__u,
; NoVLX-LABEL: test_masked_vpcmpsgtw_v16i1_v32i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi576:
+; NoVLX-NEXT: .Lcfi471:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi577:
+; NoVLX-NEXT: .Lcfi472:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi578:
+; NoVLX-NEXT: .Lcfi473:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
-; NoVLX-NEXT: pushq %r15
-; NoVLX-NEXT: pushq %r14
-; NoVLX-NEXT: pushq %r13
-; NoVLX-NEXT: pushq %r12
-; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
-; NoVLX-NEXT: .Lcfi579:
-; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi580:
-; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi581:
-; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi582:
-; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi583:
-; NoVLX-NEXT: .cfi_offset %r15, -24
; NoVLX-NEXT: vpcmpgtw (%rsi), %ymm0, %ymm0
; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -15125,64 +14690,64 @@ define zeroext i32 @test_masked_vpcmpsgtw_v16i1_v32i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $15, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $13, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: vmovd %ecx, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
@@ -15191,12 +14756,7 @@ define zeroext i32 @test_masked_vpcmpsgtw_v16i1_v32i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
-; NoVLX-NEXT: leaq -40(%rbp), %rsp
-; NoVLX-NEXT: popq %rbx
-; NoVLX-NEXT: popq %r12
-; NoVLX-NEXT: popq %r13
-; NoVLX-NEXT: popq %r14
-; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
; NoVLX-NEXT: retq
entry:
@@ -15223,12 +14783,12 @@ define zeroext i64 @test_vpcmpsgtw_v16i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %_
; NoVLX-LABEL: test_vpcmpsgtw_v16i1_v64i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi584:
+; NoVLX-NEXT: .Lcfi474:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi585:
+; NoVLX-NEXT: .Lcfi475:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi586:
+; NoVLX-NEXT: .Lcfi476:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: pushq %r15
; NoVLX-NEXT: pushq %r14
@@ -15237,24 +14797,20 @@ define zeroext i64 @test_vpcmpsgtw_v16i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %_
; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: .Lcfi587:
+; NoVLX-NEXT: .Lcfi477:
; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi588:
+; NoVLX-NEXT: .Lcfi478:
; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi589:
+; NoVLX-NEXT: .Lcfi479:
; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi590:
+; NoVLX-NEXT: .Lcfi480:
; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi591:
+; NoVLX-NEXT: .Lcfi481:
; NoVLX-NEXT: .cfi_offset %r15, -24
; NoVLX-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0
; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
@@ -15297,11 +14853,11 @@ define zeroext i64 @test_vpcmpsgtw_v16i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %_
; NoVLX-NEXT: kshiftlw $2, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $1, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k1, %r8d
; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
@@ -15313,11 +14869,15 @@ define zeroext i64 @test_vpcmpsgtw_v16i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %_
; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpinsrb $14, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -15354,12 +14914,12 @@ define zeroext i64 @test_vpcmpsgtw_v16i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64
; NoVLX-LABEL: test_vpcmpsgtw_v16i1_v64i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi592:
+; NoVLX-NEXT: .Lcfi482:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi593:
+; NoVLX-NEXT: .Lcfi483:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi594:
+; NoVLX-NEXT: .Lcfi484:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: pushq %r15
; NoVLX-NEXT: pushq %r14
@@ -15368,24 +14928,20 @@ define zeroext i64 @test_vpcmpsgtw_v16i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64
; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: .Lcfi595:
+; NoVLX-NEXT: .Lcfi485:
; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi596:
+; NoVLX-NEXT: .Lcfi486:
; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi597:
+; NoVLX-NEXT: .Lcfi487:
; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi598:
+; NoVLX-NEXT: .Lcfi488:
; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi599:
+; NoVLX-NEXT: .Lcfi489:
; NoVLX-NEXT: .cfi_offset %r15, -24
; NoVLX-NEXT: vpcmpgtw (%rdi), %ymm0, %ymm0
; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
@@ -15428,11 +14984,11 @@ define zeroext i64 @test_vpcmpsgtw_v16i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64
; NoVLX-NEXT: kshiftlw $2, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $1, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k1, %r8d
; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
@@ -15444,11 +15000,15 @@ define zeroext i64 @test_vpcmpsgtw_v16i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64
; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpinsrb $14, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -15487,12 +15047,12 @@ define zeroext i64 @test_masked_vpcmpsgtw_v16i1_v64i1_mask(i16 zeroext %__u, <4
; NoVLX-LABEL: test_masked_vpcmpsgtw_v16i1_v64i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi600:
+; NoVLX-NEXT: .Lcfi490:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi601:
+; NoVLX-NEXT: .Lcfi491:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi602:
+; NoVLX-NEXT: .Lcfi492:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: pushq %r15
; NoVLX-NEXT: pushq %r14
@@ -15501,25 +15061,21 @@ define zeroext i64 @test_masked_vpcmpsgtw_v16i1_v64i1_mask(i16 zeroext %__u, <4
; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: .Lcfi603:
+; NoVLX-NEXT: .Lcfi493:
; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi604:
+; NoVLX-NEXT: .Lcfi494:
; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi605:
+; NoVLX-NEXT: .Lcfi495:
; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi606:
+; NoVLX-NEXT: .Lcfi496:
; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi607:
+; NoVLX-NEXT: .Lcfi497:
; NoVLX-NEXT: .cfi_offset %r15, -24
; NoVLX-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0
; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
@@ -15562,11 +15118,11 @@ define zeroext i64 @test_masked_vpcmpsgtw_v16i1_v64i1_mask(i16 zeroext %__u, <4
; NoVLX-NEXT: kshiftlw $2, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $1, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k1, %r8d
; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
@@ -15578,11 +15134,15 @@ define zeroext i64 @test_masked_vpcmpsgtw_v16i1_v64i1_mask(i16 zeroext %__u, <4
; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpinsrb $14, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -15622,12 +15182,12 @@ define zeroext i64 @test_masked_vpcmpsgtw_v16i1_v64i1_mask_mem(i16 zeroext %__u,
; NoVLX-LABEL: test_masked_vpcmpsgtw_v16i1_v64i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi608:
+; NoVLX-NEXT: .Lcfi498:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi609:
+; NoVLX-NEXT: .Lcfi499:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi610:
+; NoVLX-NEXT: .Lcfi500:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: pushq %r15
; NoVLX-NEXT: pushq %r14
@@ -15636,25 +15196,21 @@ define zeroext i64 @test_masked_vpcmpsgtw_v16i1_v64i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: .Lcfi611:
+; NoVLX-NEXT: .Lcfi501:
; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi612:
+; NoVLX-NEXT: .Lcfi502:
; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi613:
+; NoVLX-NEXT: .Lcfi503:
; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi614:
+; NoVLX-NEXT: .Lcfi504:
; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi615:
+; NoVLX-NEXT: .Lcfi505:
; NoVLX-NEXT: .cfi_offset %r15, -24
; NoVLX-NEXT: vpcmpgtw (%rsi), %ymm0, %ymm0
; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
@@ -15697,11 +15253,11 @@ define zeroext i64 @test_masked_vpcmpsgtw_v16i1_v64i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: kshiftlw $2, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $1, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k1, %r8d
; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
@@ -15713,11 +15269,15 @@ define zeroext i64 @test_masked_vpcmpsgtw_v16i1_v64i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpinsrb $14, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -15758,62 +15318,58 @@ define zeroext i64 @test_vpcmpsgtw_v32i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %_
; NoVLX-LABEL: test_vpcmpsgtw_v32i1_v64i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi616:
+; NoVLX-NEXT: .Lcfi506:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi617:
+; NoVLX-NEXT: .Lcfi507:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi618:
+; NoVLX-NEXT: .Lcfi508:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: vextracti32x4 $1, %zmm0, %xmm3
-; NoVLX-NEXT: vmovq %xmm3, %rax
+; NoVLX-NEXT: vextracti32x4 $1, %zmm0, %xmm2
+; NoVLX-NEXT: vmovq %xmm2, %rax
; NoVLX-NEXT: movq %rax, %rcx
; NoVLX-NEXT: movq %rax, %rdx
-; NoVLX-NEXT: vmovd %eax, %xmm2
+; NoVLX-NEXT: vmovd %eax, %xmm3
; NoVLX-NEXT: shrl $16, %eax
-; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm5
-; NoVLX-NEXT: vextracti32x4 $2, %zmm1, %xmm8
-; NoVLX-NEXT: vextracti32x4 $3, %zmm1, %xmm4
-; NoVLX-NEXT: vextracti32x4 $1, %zmm1, %xmm6
-; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm7
-; NoVLX-NEXT: vextracti32x4 $3, %zmm0, %xmm2
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3
; NoVLX-NEXT: shrq $32, %rdx
-; NoVLX-NEXT: vpinsrw $2, %edx, %xmm5, %xmm5
-; NoVLX-NEXT: vpextrq $1, %xmm3, %rax
+; NoVLX-NEXT: vpinsrw $2, %edx, %xmm3, %xmm3
+; NoVLX-NEXT: vpextrq $1, %xmm2, %rax
; NoVLX-NEXT: shrq $48, %rcx
-; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm5, %xmm3
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm2
; NoVLX-NEXT: movl %eax, %ecx
; NoVLX-NEXT: shrl $16, %ecx
-; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3
-; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2
; NoVLX-NEXT: movq %rax, %rcx
; NoVLX-NEXT: shrq $32, %rcx
-; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2
; NoVLX-NEXT: vmovq %xmm0, %rcx
; NoVLX-NEXT: shrq $48, %rax
-; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm9
; NoVLX-NEXT: movl %ecx, %eax
; NoVLX-NEXT: shrl $16, %eax
-; NoVLX-NEXT: vmovd %ecx, %xmm5
-; NoVLX-NEXT: vpinsrw $1, %eax, %xmm5, %xmm5
+; NoVLX-NEXT: vmovd %ecx, %xmm3
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3
; NoVLX-NEXT: movq %rcx, %rax
; NoVLX-NEXT: shrq $32, %rax
-; NoVLX-NEXT: vpinsrw $2, %eax, %xmm5, %xmm5
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3
; NoVLX-NEXT: vpextrq $1, %xmm0, %rax
; NoVLX-NEXT: shrq $48, %rcx
-; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm5, %xmm0
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm3
; NoVLX-NEXT: movl %eax, %ecx
; NoVLX-NEXT: shrl $16, %ecx
-; NoVLX-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3
; NoVLX-NEXT: movq %rax, %rcx
; NoVLX-NEXT: shrq $32, %rcx
-; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: vmovq %xmm2, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: vextracti32x4 $3, %zmm0, %xmm4
+; NoVLX-NEXT: vmovq %xmm4, %rcx
; NoVLX-NEXT: shrq $48, %rax
-; NoVLX-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm3
; NoVLX-NEXT: movl %ecx, %eax
; NoVLX-NEXT: shrl $16, %eax
; NoVLX-NEXT: vmovd %ecx, %xmm5
@@ -15821,190 +15377,194 @@ define zeroext i64 @test_vpcmpsgtw_v32i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %_
; NoVLX-NEXT: movq %rcx, %rax
; NoVLX-NEXT: shrq $32, %rax
; NoVLX-NEXT: vpinsrw $2, %eax, %xmm5, %xmm5
-; NoVLX-NEXT: vpextrq $1, %xmm2, %rax
+; NoVLX-NEXT: vpextrq $1, %xmm4, %rax
; NoVLX-NEXT: shrq $48, %rcx
-; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm5, %xmm2
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm5, %xmm4
; NoVLX-NEXT: movl %eax, %ecx
; NoVLX-NEXT: shrl $16, %ecx
-; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm4, %xmm4
; NoVLX-NEXT: movq %rax, %rcx
; NoVLX-NEXT: shrq $32, %rcx
-; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2
-; NoVLX-NEXT: vmovq %xmm7, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm4, %xmm4
+; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm0
+; NoVLX-NEXT: vmovq %xmm0, %rcx
; NoVLX-NEXT: shrq $48, %rax
-; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm5
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm4, %xmm4
; NoVLX-NEXT: movl %ecx, %eax
; NoVLX-NEXT: shrl $16, %eax
-; NoVLX-NEXT: vmovd %ecx, %xmm2
-; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vmovd %ecx, %xmm5
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm5, %xmm5
; NoVLX-NEXT: movq %rcx, %rax
; NoVLX-NEXT: shrq $32, %rax
-; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: vpextrq $1, %xmm7, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm5, %xmm5
+; NoVLX-NEXT: vpextrq $1, %xmm0, %rax
; NoVLX-NEXT: shrq $48, %rcx
-; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm5, %xmm0
; NoVLX-NEXT: movl %eax, %ecx
; NoVLX-NEXT: shrl $16, %ecx
-; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: movq %rax, %rcx
; NoVLX-NEXT: shrq $32, %rcx
-; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vextracti32x4 $1, %zmm1, %xmm6
; NoVLX-NEXT: vmovq %xmm6, %rcx
; NoVLX-NEXT: shrq $48, %rax
-; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm7
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm0, %xmm5
; NoVLX-NEXT: movl %ecx, %eax
; NoVLX-NEXT: shrl $16, %eax
-; NoVLX-NEXT: vmovd %ecx, %xmm2
-; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vmovd %ecx, %xmm0
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0
; NoVLX-NEXT: movq %rcx, %rax
; NoVLX-NEXT: shrq $32, %rax
-; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0
; NoVLX-NEXT: vpextrq $1, %xmm6, %rax
; NoVLX-NEXT: shrq $48, %rcx
-; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: movl %eax, %ecx
; NoVLX-NEXT: shrl $16, %ecx
-; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: movq %rax, %rcx
; NoVLX-NEXT: shrq $32, %rcx
-; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: vmovq %xmm1, %rcx
; NoVLX-NEXT: shrq $48, %rax
-; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm6
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm0, %xmm6
; NoVLX-NEXT: movl %ecx, %eax
; NoVLX-NEXT: shrl $16, %eax
-; NoVLX-NEXT: vmovd %ecx, %xmm2
-; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vmovd %ecx, %xmm0
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0
; NoVLX-NEXT: movq %rcx, %rax
; NoVLX-NEXT: shrq $32, %rax
-; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0
; NoVLX-NEXT: vpextrq $1, %xmm1, %rax
; NoVLX-NEXT: shrq $48, %rcx
-; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm1
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: movl %eax, %ecx
; NoVLX-NEXT: shrl $16, %ecx
-; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: movq %rax, %rcx
; NoVLX-NEXT: shrq $32, %rcx
-; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1
-; NoVLX-NEXT: vmovq %xmm4, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vextracti32x4 $3, %zmm1, %xmm7
+; NoVLX-NEXT: vmovq %xmm7, %rcx
; NoVLX-NEXT: shrq $48, %rax
-; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm0, %xmm8
; NoVLX-NEXT: movl %ecx, %eax
; NoVLX-NEXT: shrl $16, %eax
-; NoVLX-NEXT: vmovd %ecx, %xmm2
-; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vmovd %ecx, %xmm0
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0
; NoVLX-NEXT: movq %rcx, %rax
; NoVLX-NEXT: shrq $32, %rax
-; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: vpextrq $1, %xmm4, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrq $1, %xmm7, %rax
; NoVLX-NEXT: shrq $48, %rcx
-; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: movl %eax, %ecx
; NoVLX-NEXT: shrl $16, %ecx
-; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: movq %rax, %rcx
; NoVLX-NEXT: shrq $32, %rcx
-; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2
-; NoVLX-NEXT: vmovq %xmm8, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vextracti32x4 $2, %zmm1, %xmm1
+; NoVLX-NEXT: vmovq %xmm1, %rcx
; NoVLX-NEXT: shrq $48, %rax
-; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm0, %xmm7
; NoVLX-NEXT: movl %ecx, %eax
; NoVLX-NEXT: shrl $16, %eax
-; NoVLX-NEXT: vmovd %ecx, %xmm4
-; NoVLX-NEXT: vpinsrw $1, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: vmovd %ecx, %xmm0
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0
; NoVLX-NEXT: movq %rcx, %rax
; NoVLX-NEXT: shrq $32, %rax
-; NoVLX-NEXT: vpinsrw $2, %eax, %xmm4, %xmm4
-; NoVLX-NEXT: vpextrq $1, %xmm8, %rax
-; NoVLX-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0
-; NoVLX-NEXT: vinserti128 $1, %xmm5, %ymm7, %ymm3
-; NoVLX-NEXT: vinserti128 $1, %xmm6, %ymm1, %ymm1
-; NoVLX-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm0, %xmm2
+; NoVLX-NEXT: vpextrq $1, %xmm1, %rax
+; NoVLX-NEXT: vinserti128 $1, %xmm9, %ymm3, %ymm0
+; NoVLX-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm3
+; NoVLX-NEXT: vinserti128 $1, %xmm6, %ymm8, %ymm1
; NoVLX-NEXT: shrq $48, %rcx
-; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm4, %xmm1
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm2
; NoVLX-NEXT: movl %eax, %ecx
; NoVLX-NEXT: shrl $16, %ecx
-; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2
; NoVLX-NEXT: movq %rax, %rcx
; NoVLX-NEXT: shrq $48, %rax
; NoVLX-NEXT: shrq $32, %rcx
-; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1
-; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
-; NoVLX-NEXT: vpcmpgtw %ymm1, %ymm3, %ymm1
-; NoVLX-NEXT: vpmovsxwd %ymm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vinserti128 $1, %xmm7, %ymm2, %ymm2
+; NoVLX-NEXT: vpcmpgtw %ymm2, %ymm3, %ymm2
+; NoVLX-NEXT: vpmovsxwd %ymm2, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2
+; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $15, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vmovd %ecx, %xmm2
+; NoVLX-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2
; NoVLX-NEXT: kshiftlw $13, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2
; NoVLX-NEXT: kshiftlw $11, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2
; NoVLX-NEXT: kshiftlw $10, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2
; NoVLX-NEXT: kshiftlw $9, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2
; NoVLX-NEXT: kshiftlw $8, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2
; NoVLX-NEXT: kshiftlw $7, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2
; NoVLX-NEXT: kshiftlw $6, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2
; NoVLX-NEXT: kshiftlw $5, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2
; NoVLX-NEXT: kshiftlw $4, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2
; NoVLX-NEXT: kshiftlw $3, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2
; NoVLX-NEXT: kshiftlw $2, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2
; NoVLX-NEXT: kshiftlw $1, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm2, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
@@ -16109,69 +15669,68 @@ define zeroext i64 @test_vpcmpsgtw_v32i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64
; NoVLX-LABEL: test_vpcmpsgtw_v32i1_v64i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi619:
+; NoVLX-NEXT: .Lcfi509:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi620:
+; NoVLX-NEXT: .Lcfi510:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi621:
+; NoVLX-NEXT: .Lcfi511:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: vextracti32x4 $1, %zmm0, %xmm2
-; NoVLX-NEXT: vmovq %xmm2, %rax
+; NoVLX-NEXT: vextracti32x4 $1, %zmm0, %xmm1
+; NoVLX-NEXT: vmovq %xmm1, %rax
; NoVLX-NEXT: movq %rax, %rcx
; NoVLX-NEXT: movq %rax, %rdx
-; NoVLX-NEXT: vmovd %eax, %xmm1
+; NoVLX-NEXT: vmovd %eax, %xmm2
; NoVLX-NEXT: shrl $16, %eax
-; NoVLX-NEXT: vpinsrw $1, %eax, %xmm1, %xmm3
-; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm1
-; NoVLX-NEXT: vextracti32x4 $3, %zmm0, %xmm4
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2
; NoVLX-NEXT: shrq $32, %rdx
-; NoVLX-NEXT: vpinsrw $2, %edx, %xmm3, %xmm3
-; NoVLX-NEXT: vpextrq $1, %xmm2, %rax
+; NoVLX-NEXT: vpinsrw $2, %edx, %xmm2, %xmm2
+; NoVLX-NEXT: vpextrq $1, %xmm1, %rax
; NoVLX-NEXT: shrq $48, %rcx
-; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm2
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm1
; NoVLX-NEXT: movl %eax, %ecx
; NoVLX-NEXT: shrl $16, %ecx
-; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: movq %rax, %rcx
; NoVLX-NEXT: shrq $32, %rcx
-; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vmovq %xmm0, %rcx
; NoVLX-NEXT: shrq $48, %rax
-; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1
; NoVLX-NEXT: movl %ecx, %eax
; NoVLX-NEXT: shrl $16, %eax
-; NoVLX-NEXT: vmovd %ecx, %xmm3
-; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vmovd %ecx, %xmm2
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2
; NoVLX-NEXT: movq %rcx, %rax
; NoVLX-NEXT: shrq $32, %rax
-; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2
; NoVLX-NEXT: vpextrq $1, %xmm0, %rax
; NoVLX-NEXT: shrq $48, %rcx
-; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm0
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm2
; NoVLX-NEXT: movl %eax, %ecx
; NoVLX-NEXT: shrl $16, %ecx
-; NoVLX-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2
; NoVLX-NEXT: movq %rax, %rcx
; NoVLX-NEXT: shrq $32, %rcx
-; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: vmovq %xmm4, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: vextracti32x4 $3, %zmm0, %xmm3
+; NoVLX-NEXT: vmovq %xmm3, %rcx
; NoVLX-NEXT: shrq $48, %rax
-; NoVLX-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm2
; NoVLX-NEXT: movl %ecx, %eax
; NoVLX-NEXT: shrl $16, %eax
-; NoVLX-NEXT: vmovd %ecx, %xmm3
-; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vmovd %ecx, %xmm4
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm4, %xmm4
; NoVLX-NEXT: movq %rcx, %rax
; NoVLX-NEXT: shrq $32, %rax
-; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3
-; NoVLX-NEXT: vpextrq $1, %xmm4, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: vpextrq $1, %xmm3, %rax
; NoVLX-NEXT: shrq $48, %rcx
-; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm4, %xmm3
; NoVLX-NEXT: movl %eax, %ecx
; NoVLX-NEXT: shrl $16, %ecx
; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3
@@ -16179,7 +15738,8 @@ define zeroext i64 @test_vpcmpsgtw_v32i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64
; NoVLX-NEXT: movq %rax, %rcx
; NoVLX-NEXT: shrq $32, %rcx
; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3
-; NoVLX-NEXT: vmovq %xmm1, %rcx
+; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm0
+; NoVLX-NEXT: vmovq %xmm0, %rcx
; NoVLX-NEXT: shrq $48, %rax
; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm3
; NoVLX-NEXT: movl %ecx, %eax
@@ -16189,19 +15749,19 @@ define zeroext i64 @test_vpcmpsgtw_v32i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64
; NoVLX-NEXT: movq %rcx, %rax
; NoVLX-NEXT: shrq $32, %rax
; NoVLX-NEXT: vpinsrw $2, %eax, %xmm4, %xmm4
-; NoVLX-NEXT: vpextrq $1, %xmm1, %rax
+; NoVLX-NEXT: vpextrq $1, %xmm0, %rax
; NoVLX-NEXT: shrq $48, %rcx
-; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm4, %xmm1
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm4, %xmm0
; NoVLX-NEXT: movl %eax, %ecx
; NoVLX-NEXT: shrl $16, %ecx
-; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: movq %rax, %rcx
; NoVLX-NEXT: shrq $32, %rcx
-; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1
-; NoVLX-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm4
+; NoVLX-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm0
; NoVLX-NEXT: shrq $48, %rax
-; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm4, %xmm1
; NoVLX-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
; NoVLX-NEXT: vpcmpgtw 32(%rdi), %ymm1, %ymm1
; NoVLX-NEXT: vpmovsxwd %ymm1, %zmm1
@@ -16377,12 +15937,12 @@ define zeroext i64 @test_masked_vpcmpsgtw_v32i1_v64i1_mask(i32 zeroext %__u, <8
; NoVLX-LABEL: test_masked_vpcmpsgtw_v32i1_v64i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi622:
+; NoVLX-NEXT: .Lcfi512:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi623:
+; NoVLX-NEXT: .Lcfi513:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi624:
+; NoVLX-NEXT: .Lcfi514:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $96, %rsp
@@ -16393,17 +15953,12 @@ define zeroext i64 @test_masked_vpcmpsgtw_v32i1_v64i1_mask(i32 zeroext %__u, <8
; NoVLX-NEXT: movq %rax, %rdx
; NoVLX-NEXT: vmovd %eax, %xmm3
; NoVLX-NEXT: shrl $16, %eax
-; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm4
-; NoVLX-NEXT: vextracti32x4 $1, %zmm1, %xmm8
-; NoVLX-NEXT: vextracti32x4 $2, %zmm1, %xmm5
-; NoVLX-NEXT: vextracti32x4 $3, %zmm1, %xmm7
-; NoVLX-NEXT: vextracti32x4 $1, %zmm0, %xmm6
-; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm3
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3
; NoVLX-NEXT: shrq $32, %rdx
-; NoVLX-NEXT: vpinsrw $2, %edx, %xmm4, %xmm4
+; NoVLX-NEXT: vpinsrw $2, %edx, %xmm3, %xmm3
; NoVLX-NEXT: vpextrq $1, %xmm2, %rax
; NoVLX-NEXT: shrq $48, %rcx
-; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm4, %xmm2
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm2
; NoVLX-NEXT: movl %eax, %ecx
; NoVLX-NEXT: shrl $16, %ecx
; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2
@@ -16411,9 +15966,10 @@ define zeroext i64 @test_masked_vpcmpsgtw_v32i1_v64i1_mask(i32 zeroext %__u, <8
; NoVLX-NEXT: movq %rax, %rcx
; NoVLX-NEXT: shrq $32, %rcx
; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm3
; NoVLX-NEXT: vmovq %xmm3, %rcx
; NoVLX-NEXT: shrq $48, %rax
-; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm9
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm8
; NoVLX-NEXT: movl %ecx, %eax
; NoVLX-NEXT: shrl $16, %eax
; NoVLX-NEXT: vmovd %ecx, %xmm4
@@ -16431,39 +15987,40 @@ define zeroext i64 @test_masked_vpcmpsgtw_v32i1_v64i1_mask(i32 zeroext %__u, <8
; NoVLX-NEXT: movq %rax, %rcx
; NoVLX-NEXT: shrq $32, %rcx
; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3
-; NoVLX-NEXT: vmovq %xmm6, %rcx
+; NoVLX-NEXT: vextracti32x4 $1, %zmm0, %xmm4
+; NoVLX-NEXT: vmovq %xmm4, %rcx
; NoVLX-NEXT: shrq $48, %rax
-; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm4
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm3
; NoVLX-NEXT: movl %ecx, %eax
; NoVLX-NEXT: shrl $16, %eax
-; NoVLX-NEXT: vmovd %ecx, %xmm3
-; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vmovd %ecx, %xmm5
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm5, %xmm5
; NoVLX-NEXT: movq %rcx, %rax
; NoVLX-NEXT: shrq $32, %rax
-; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3
-; NoVLX-NEXT: vpextrq $1, %xmm6, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm5, %xmm5
+; NoVLX-NEXT: vpextrq $1, %xmm4, %rax
; NoVLX-NEXT: shrq $48, %rcx
-; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm5, %xmm4
; NoVLX-NEXT: movl %eax, %ecx
; NoVLX-NEXT: shrl $16, %ecx
-; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3
-; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm4, %xmm4
; NoVLX-NEXT: movq %rax, %rcx
; NoVLX-NEXT: shrq $32, %rcx
-; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm4, %xmm4
; NoVLX-NEXT: vmovq %xmm0, %rcx
; NoVLX-NEXT: shrq $48, %rax
-; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm6
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm4, %xmm4
; NoVLX-NEXT: movl %ecx, %eax
; NoVLX-NEXT: shrl $16, %eax
-; NoVLX-NEXT: vmovd %ecx, %xmm3
-; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vmovd %ecx, %xmm5
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm5, %xmm5
; NoVLX-NEXT: movq %rcx, %rax
; NoVLX-NEXT: shrq $32, %rax
-; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm5, %xmm5
; NoVLX-NEXT: vpextrq $1, %xmm0, %rax
; NoVLX-NEXT: shrq $48, %rcx
-; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm0
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm5, %xmm0
; NoVLX-NEXT: movl %eax, %ecx
; NoVLX-NEXT: shrl $16, %ecx
; NoVLX-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0
@@ -16471,69 +16028,72 @@ define zeroext i64 @test_masked_vpcmpsgtw_v32i1_v64i1_mask(i32 zeroext %__u, <8
; NoVLX-NEXT: movq %rax, %rcx
; NoVLX-NEXT: shrq $32, %rcx
; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: vmovq %xmm7, %rcx
+; NoVLX-NEXT: vextracti32x4 $3, %zmm1, %xmm5
+; NoVLX-NEXT: vmovq %xmm5, %rcx
; NoVLX-NEXT: shrq $48, %rax
; NoVLX-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0
; NoVLX-NEXT: movl %ecx, %eax
; NoVLX-NEXT: shrl $16, %eax
-; NoVLX-NEXT: vmovd %ecx, %xmm3
-; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vmovd %ecx, %xmm6
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm6, %xmm6
; NoVLX-NEXT: movq %rcx, %rax
; NoVLX-NEXT: shrq $32, %rax
-; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3
-; NoVLX-NEXT: vpextrq $1, %xmm7, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm6, %xmm6
+; NoVLX-NEXT: vpextrq $1, %xmm5, %rax
; NoVLX-NEXT: shrq $48, %rcx
-; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm6, %xmm5
; NoVLX-NEXT: movl %eax, %ecx
; NoVLX-NEXT: shrl $16, %ecx
-; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3
-; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm5, %xmm5
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm5, %xmm5
; NoVLX-NEXT: movq %rax, %rcx
; NoVLX-NEXT: shrq $32, %rcx
-; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3
-; NoVLX-NEXT: vmovq %xmm5, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm5, %xmm5
+; NoVLX-NEXT: vextracti32x4 $2, %zmm1, %xmm6
+; NoVLX-NEXT: vmovq %xmm6, %rcx
; NoVLX-NEXT: shrq $48, %rax
-; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm7
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm5, %xmm5
; NoVLX-NEXT: movl %ecx, %eax
; NoVLX-NEXT: shrl $16, %eax
-; NoVLX-NEXT: vmovd %ecx, %xmm3
-; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vmovd %ecx, %xmm7
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm7, %xmm7
; NoVLX-NEXT: movq %rcx, %rax
; NoVLX-NEXT: shrq $32, %rax
-; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3
-; NoVLX-NEXT: vpextrq $1, %xmm5, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm7, %xmm7
+; NoVLX-NEXT: vpextrq $1, %xmm6, %rax
; NoVLX-NEXT: shrq $48, %rcx
-; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm7, %xmm6
; NoVLX-NEXT: movl %eax, %ecx
; NoVLX-NEXT: shrl $16, %ecx
-; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3
-; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm6, %xmm6
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm6, %xmm6
; NoVLX-NEXT: movq %rax, %rcx
; NoVLX-NEXT: shrq $32, %rcx
-; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3
-; NoVLX-NEXT: vmovq %xmm8, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm6, %xmm6
+; NoVLX-NEXT: vextracti32x4 $1, %zmm1, %xmm7
+; NoVLX-NEXT: vmovq %xmm7, %rcx
; NoVLX-NEXT: shrq $48, %rax
-; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm6, %xmm6
; NoVLX-NEXT: movl %ecx, %eax
; NoVLX-NEXT: shrl $16, %eax
-; NoVLX-NEXT: vmovd %ecx, %xmm5
-; NoVLX-NEXT: vpinsrw $1, %eax, %xmm5, %xmm5
+; NoVLX-NEXT: vmovd %ecx, %xmm2
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2
; NoVLX-NEXT: movq %rcx, %rax
; NoVLX-NEXT: shrq $32, %rax
-; NoVLX-NEXT: vpinsrw $2, %eax, %xmm5, %xmm5
-; NoVLX-NEXT: vpextrq $1, %xmm8, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpextrq $1, %xmm7, %rax
; NoVLX-NEXT: shrq $48, %rcx
-; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm5, %xmm5
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm2
; NoVLX-NEXT: movl %eax, %ecx
; NoVLX-NEXT: shrl $16, %ecx
-; NoVLX-NEXT: vpinsrw $4, %eax, %xmm5, %xmm5
-; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm5, %xmm5
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2
; NoVLX-NEXT: movq %rax, %rcx
; NoVLX-NEXT: shrq $32, %rcx
-; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm5, %xmm5
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2
; NoVLX-NEXT: vmovq %xmm1, %rcx
; NoVLX-NEXT: shrq $48, %rax
-; NoVLX-NEXT: vpinsrw $7, %eax, %xmm5, %xmm5
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm7
; NoVLX-NEXT: movl %ecx, %eax
; NoVLX-NEXT: shrl $16, %eax
; NoVLX-NEXT: vmovd %ecx, %xmm2
@@ -16542,18 +16102,13 @@ define zeroext i64 @test_masked_vpcmpsgtw_v32i1_v64i1_mask(i32 zeroext %__u, <8
; NoVLX-NEXT: shrq $32, %rax
; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2
; NoVLX-NEXT: vpextrq $1, %xmm1, %rax
-; NoVLX-NEXT: vinserti128 $1, %xmm9, %ymm4, %ymm1
-; NoVLX-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm4
+; NoVLX-NEXT: vinserti128 $1, %xmm8, %ymm3, %ymm1
+; NoVLX-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm3
; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k1
-; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k2
-; NoVLX-NEXT: vinserti128 $1, %xmm7, %ymm3, %ymm3
-; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
-; NoVLX-NEXT: vpcmpgtw %ymm3, %ymm1, %ymm3
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k2} {z}
-; NoVLX-NEXT: vpmovdb %zmm1, %xmm1
; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vinserti128 $1, %xmm5, %ymm6, %ymm4
; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: movl %eax, %ecx
; NoVLX-NEXT: shrl $16, %ecx
; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2
@@ -16562,147 +16117,152 @@ define zeroext i64 @test_masked_vpcmpsgtw_v32i1_v64i1_mask(i32 zeroext %__u, <8
; NoVLX-NEXT: shrq $48, %rax
; NoVLX-NEXT: shrq $32, %rcx
; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2
-; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: vinserti128 $1, %xmm5, %ymm2, %ymm2
-; NoVLX-NEXT: vpcmpgtw %ymm2, %ymm4, %ymm2
-; NoVLX-NEXT: vpmovsxwd %ymm2, %zmm2
-; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2
-; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm5
+; NoVLX-NEXT: vpcmpgtw %ymm4, %ymm1, %ymm2
+; NoVLX-NEXT: vinserti128 $1, %xmm7, %ymm5, %ymm1
+; NoVLX-NEXT: vpcmpgtw %ymm1, %ymm3, %ymm1
+; NoVLX-NEXT: vpmovsxwd %ymm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $15, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm2
-; NoVLX-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kshiftlw $13, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kshiftlw $11, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kshiftlw $10, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kshiftlw $9, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kshiftlw $8, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kshiftlw $7, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kshiftlw $6, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kshiftlw $5, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kshiftlw $4, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kshiftlw $3, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kshiftlw $2, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kshiftlw $1, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: vpmovsxwd %ymm3, %zmm3
-; NoVLX-NEXT: vpslld $31, %zmm3, %zmm3
-; NoVLX-NEXT: vptestmd %zmm3, %zmm3, %k0
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpmovsxwd %ymm2, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2
+; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $15, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm3
-; NoVLX-NEXT: vpinsrb $1, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vmovd %ecx, %xmm2
+; NoVLX-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2
; NoVLX-NEXT: kshiftlw $13, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $2, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $3, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2
; NoVLX-NEXT: kshiftlw $11, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2
; NoVLX-NEXT: kshiftlw $10, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2
; NoVLX-NEXT: kshiftlw $9, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2
; NoVLX-NEXT: kshiftlw $8, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2
; NoVLX-NEXT: kshiftlw $7, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2
; NoVLX-NEXT: kshiftlw $6, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $9, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2
; NoVLX-NEXT: kshiftlw $5, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $10, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2
; NoVLX-NEXT: kshiftlw $4, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2
; NoVLX-NEXT: kshiftlw $3, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $12, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2
; NoVLX-NEXT: kshiftlw $2, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $13, %eax, %xmm3, %xmm3
; NoVLX-NEXT: kshiftlw $1, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k1
+; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k1} {z}
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $15, %eax, %xmm3, %xmm3
-; NoVLX-NEXT: vpand %xmm1, %xmm3, %xmm1
-; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpmovdb %zmm3, %xmm3
+; NoVLX-NEXT: vpand %xmm3, %xmm2, %xmm2
+; NoVLX-NEXT: vpmovsxbd %xmm2, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2
+; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpand %xmm0, %xmm2, %xmm0
+; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -16740,12 +16300,12 @@ define zeroext i64 @test_masked_vpcmpsgtw_v32i1_v64i1_mask_mem(i32 zeroext %__u,
; NoVLX-LABEL: test_masked_vpcmpsgtw_v32i1_v64i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi625:
+; NoVLX-NEXT: .Lcfi515:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi626:
+; NoVLX-NEXT: .Lcfi516:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi627:
+; NoVLX-NEXT: .Lcfi517:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $96, %rsp
@@ -16757,8 +16317,6 @@ define zeroext i64 @test_masked_vpcmpsgtw_v32i1_v64i1_mask_mem(i32 zeroext %__u,
; NoVLX-NEXT: vmovd %eax, %xmm2
; NoVLX-NEXT: shrl $16, %eax
; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: vextracti32x4 $1, %zmm0, %xmm3
-; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm4
; NoVLX-NEXT: shrq $32, %rdx
; NoVLX-NEXT: vpinsrw $2, %edx, %xmm2, %xmm2
; NoVLX-NEXT: vpextrq $1, %xmm1, %rax
@@ -16771,19 +16329,20 @@ define zeroext i64 @test_masked_vpcmpsgtw_v32i1_v64i1_mask_mem(i32 zeroext %__u,
; NoVLX-NEXT: movq %rax, %rcx
; NoVLX-NEXT: shrq $32, %rcx
; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1
-; NoVLX-NEXT: vmovq %xmm4, %rcx
+; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm2
+; NoVLX-NEXT: vmovq %xmm2, %rcx
; NoVLX-NEXT: shrq $48, %rax
; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1
; NoVLX-NEXT: movl %ecx, %eax
; NoVLX-NEXT: shrl $16, %eax
-; NoVLX-NEXT: vmovd %ecx, %xmm2
-; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vmovd %ecx, %xmm3
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3
; NoVLX-NEXT: movq %rcx, %rax
; NoVLX-NEXT: shrq $32, %rax
-; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: vpextrq $1, %xmm4, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpextrq $1, %xmm2, %rax
; NoVLX-NEXT: shrq $48, %rcx
-; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm2
; NoVLX-NEXT: movl %eax, %ecx
; NoVLX-NEXT: shrl $16, %ecx
; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2
@@ -16791,6 +16350,7 @@ define zeroext i64 @test_masked_vpcmpsgtw_v32i1_v64i1_mask_mem(i32 zeroext %__u,
; NoVLX-NEXT: movq %rax, %rcx
; NoVLX-NEXT: shrq $32, %rcx
; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: vextracti32x4 $1, %zmm0, %xmm3
; NoVLX-NEXT: vmovq %xmm3, %rcx
; NoVLX-NEXT: shrq $48, %rax
; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm2
@@ -16813,174 +16373,174 @@ define zeroext i64 @test_masked_vpcmpsgtw_v32i1_v64i1_mask_mem(i32 zeroext %__u,
; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3
; NoVLX-NEXT: vmovq %xmm0, %rcx
; NoVLX-NEXT: shrq $48, %rax
-; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm4
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm3
; NoVLX-NEXT: movl %ecx, %eax
; NoVLX-NEXT: shrl $16, %eax
-; NoVLX-NEXT: vmovd %ecx, %xmm3
-; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vmovd %ecx, %xmm4
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm4, %xmm4
; NoVLX-NEXT: movq %rcx, %rax
; NoVLX-NEXT: shrq $32, %rax
-; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm4, %xmm4
; NoVLX-NEXT: vpextrq $1, %xmm0, %rax
; NoVLX-NEXT: shrq $48, %rcx
-; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm0
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm4, %xmm0
; NoVLX-NEXT: movl %eax, %ecx
; NoVLX-NEXT: shrl $16, %ecx
; NoVLX-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: movq %rax, %rcx
; NoVLX-NEXT: shrq $32, %rcx
-; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm5
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k1
-; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k2
-; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
-; NoVLX-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm3
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k2} {z}
-; NoVLX-NEXT: vpmovdb %zmm1, %xmm1
; NoVLX-NEXT: shrq $48, %rax
-; NoVLX-NEXT: vpinsrw $7, %eax, %xmm5, %xmm2
-; NoVLX-NEXT: vinserti128 $1, %xmm4, %ymm2, %ymm2
-; NoVLX-NEXT: vpcmpgtw (%rsi), %ymm2, %ymm2
-; NoVLX-NEXT: vpmovsxwd %ymm2, %zmm2
-; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2
-; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm0, %xmm4
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vinserti128 $1, %xmm3, %ymm4, %ymm3
+; NoVLX-NEXT: vpcmpgtw (%rsi), %ymm3, %ymm3
+; NoVLX-NEXT: vpmovsxwd %ymm3, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm3
+; NoVLX-NEXT: vptestmd %zmm3, %zmm3, %k0
; NoVLX-NEXT: kshiftlw $15, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %eax, %xmm2
+; NoVLX-NEXT: vmovd %eax, %xmm3
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrb $1, %eax, %xmm3, %xmm3
; NoVLX-NEXT: kshiftlw $13, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm3, %xmm3
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrb $3, %eax, %xmm3, %xmm3
; NoVLX-NEXT: kshiftlw $11, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm3, %xmm3
; NoVLX-NEXT: kshiftlw $10, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm3, %xmm3
; NoVLX-NEXT: kshiftlw $9, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrb $6, %eax, %xmm3, %xmm3
; NoVLX-NEXT: kshiftlw $8, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm3, %xmm3
; NoVLX-NEXT: kshiftlw $7, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm3, %xmm3
; NoVLX-NEXT: kshiftlw $6, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrb $9, %eax, %xmm3, %xmm3
; NoVLX-NEXT: kshiftlw $5, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrb $10, %eax, %xmm3, %xmm3
; NoVLX-NEXT: kshiftlw $4, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm3, %xmm3
; NoVLX-NEXT: kshiftlw $3, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm3, %xmm3
; NoVLX-NEXT: kshiftlw $2, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrb $13, %eax, %xmm3, %xmm3
; NoVLX-NEXT: kshiftlw $1, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm2
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: vpcmpgtw 32(%rsi), %ymm3, %ymm3
-; NoVLX-NEXT: vpmovsxwd %ymm3, %zmm3
-; NoVLX-NEXT: vpslld $31, %zmm3, %zmm3
-; NoVLX-NEXT: vptestmd %zmm3, %zmm3, %k0
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm3, %xmm1
+; NoVLX-NEXT: vpcmpgtw 32(%rsi), %ymm2, %ymm2
+; NoVLX-NEXT: vpmovsxwd %ymm2, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2
+; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $15, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm3
-; NoVLX-NEXT: vpinsrb $1, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vmovd %ecx, %xmm2
+; NoVLX-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2
; NoVLX-NEXT: kshiftlw $13, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $2, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $3, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2
; NoVLX-NEXT: kshiftlw $11, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2
; NoVLX-NEXT: kshiftlw $10, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2
; NoVLX-NEXT: kshiftlw $9, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2
; NoVLX-NEXT: kshiftlw $8, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2
; NoVLX-NEXT: kshiftlw $7, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2
; NoVLX-NEXT: kshiftlw $6, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $9, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2
; NoVLX-NEXT: kshiftlw $5, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $10, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2
; NoVLX-NEXT: kshiftlw $4, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2
; NoVLX-NEXT: kshiftlw $3, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $12, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2
; NoVLX-NEXT: kshiftlw $2, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $13, %eax, %xmm3, %xmm3
; NoVLX-NEXT: kshiftlw $1, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k1
+; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k1} {z}
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $15, %eax, %xmm3, %xmm3
-; NoVLX-NEXT: vpand %xmm1, %xmm3, %xmm1
-; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpmovdb %zmm3, %xmm3
+; NoVLX-NEXT: vpand %xmm3, %xmm2, %xmm2
+; NoVLX-NEXT: vpmovsxbd %xmm2, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2
+; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpand %xmm0, %xmm2, %xmm0
+; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -17035,8 +16595,8 @@ define zeroext i8 @test_vpcmpsgtd_v4i1_v8i1_mask(<2 x i64> %__a, <2 x i64> %__b)
; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: kmovw %eax, %k1
; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
@@ -17090,8 +16650,8 @@ define zeroext i8 @test_vpcmpsgtd_v4i1_v8i1_mask_mem(<2 x i64> %__a, <2 x i64>*
; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: kmovw %eax, %k1
; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
@@ -17130,7 +16690,6 @@ define zeroext i8 @test_masked_vpcmpsgtd_v4i1_v8i1_mask(i8 zeroext %__u, <2 x i6
;
; NoVLX-LABEL: test_masked_vpcmpsgtd_v4i1_v8i1_mask:
; NoVLX: # BB#0: # %entry
-; NoVLX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
@@ -17142,13 +16701,14 @@ define zeroext i8 @test_masked_vpcmpsgtd_v4i1_v8i1_mask(i8 zeroext %__u, <2 x i6
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: kmovw %k3, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vmovd %ecx, %xmm2
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2
; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpand %xmm2, %xmm0, %xmm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
; NoVLX-NEXT: kmovw %eax, %k1
; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
@@ -17165,8 +16725,8 @@ define zeroext i8 @test_masked_vpcmpsgtd_v4i1_v8i1_mask(i8 zeroext %__u, <2 x i6
; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: kmovw %eax, %k1
; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
@@ -17242,8 +16802,8 @@ define zeroext i8 @test_masked_vpcmpsgtd_v4i1_v8i1_mask_mem(i8 zeroext %__u, <2
; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: kmovw %eax, %k1
; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
@@ -17303,8 +16863,8 @@ define zeroext i8 @test_vpcmpsgtd_v4i1_v8i1_mask_mem_b(<2 x i64> %__a, i32* %__b
; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: kmovw %eax, %k1
; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
@@ -17345,7 +16905,6 @@ define zeroext i8 @test_masked_vpcmpsgtd_v4i1_v8i1_mask_mem_b(i8 zeroext %__u, <
; NoVLX-LABEL: test_masked_vpcmpsgtd_v4i1_v8i1_mask_mem_b:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: vpbroadcastd (%rsi), %xmm1
-; NoVLX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
@@ -17357,13 +16916,14 @@ define zeroext i8 @test_masked_vpcmpsgtd_v4i1_v8i1_mask_mem_b(i8 zeroext %__u, <
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: kmovw %k3, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vmovd %ecx, %xmm2
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2
; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpand %xmm0, %xmm2, %xmm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
; NoVLX-NEXT: kmovw %eax, %k1
; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
@@ -17380,8 +16940,8 @@ define zeroext i8 @test_masked_vpcmpsgtd_v4i1_v8i1_mask_mem_b(i8 zeroext %__u, <
; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: kmovw %eax, %k1
; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
@@ -17440,8 +17000,8 @@ define zeroext i16 @test_vpcmpsgtd_v4i1_v16i1_mask(<2 x i64> %__a, <2 x i64> %__
; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: kmovw %eax, %k1
; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
@@ -17494,8 +17054,8 @@ define zeroext i16 @test_vpcmpsgtd_v4i1_v16i1_mask_mem(<2 x i64> %__a, <2 x i64>
; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: kmovw %eax, %k1
; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
@@ -17534,7 +17094,6 @@ define zeroext i16 @test_masked_vpcmpsgtd_v4i1_v16i1_mask(i8 zeroext %__u, <2 x
;
; NoVLX-LABEL: test_masked_vpcmpsgtd_v4i1_v16i1_mask:
; NoVLX: # BB#0: # %entry
-; NoVLX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
@@ -17546,13 +17105,14 @@ define zeroext i16 @test_masked_vpcmpsgtd_v4i1_v16i1_mask(i8 zeroext %__u, <2 x
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: kmovw %k3, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vmovd %ecx, %xmm2
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2
; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpand %xmm2, %xmm0, %xmm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
; NoVLX-NEXT: kmovw %eax, %k1
; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
@@ -17568,8 +17128,8 @@ define zeroext i16 @test_masked_vpcmpsgtd_v4i1_v16i1_mask(i8 zeroext %__u, <2 x
; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: kmovw %eax, %k1
; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
@@ -17644,8 +17204,8 @@ define zeroext i16 @test_masked_vpcmpsgtd_v4i1_v16i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: kmovw %eax, %k1
; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
@@ -17704,8 +17264,8 @@ define zeroext i16 @test_vpcmpsgtd_v4i1_v16i1_mask_mem_b(<2 x i64> %__a, i32* %_
; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: kmovw %eax, %k1
; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
@@ -17746,7 +17306,6 @@ define zeroext i16 @test_masked_vpcmpsgtd_v4i1_v16i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-LABEL: test_masked_vpcmpsgtd_v4i1_v16i1_mask_mem_b:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: vpbroadcastd (%rsi), %xmm1
-; NoVLX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
@@ -17758,13 +17317,14 @@ define zeroext i16 @test_masked_vpcmpsgtd_v4i1_v16i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: kmovw %k3, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vmovd %ecx, %xmm2
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2
; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpand %xmm0, %xmm2, %xmm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
; NoVLX-NEXT: kmovw %eax, %k1
; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
@@ -17780,8 +17340,8 @@ define zeroext i16 @test_masked_vpcmpsgtd_v4i1_v16i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: kmovw %eax, %k1
; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
@@ -17824,12 +17384,12 @@ define zeroext i32 @test_vpcmpsgtd_v4i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__
; NoVLX-LABEL: test_vpcmpsgtd_v4i1_v32i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi628:
+; NoVLX-NEXT: .Lcfi518:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi629:
+; NoVLX-NEXT: .Lcfi519:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi630:
+; NoVLX-NEXT: .Lcfi520:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -17867,12 +17427,12 @@ define zeroext i32 @test_vpcmpsgtd_v4i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>
; NoVLX-LABEL: test_vpcmpsgtd_v4i1_v32i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi631:
+; NoVLX-NEXT: .Lcfi521:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi632:
+; NoVLX-NEXT: .Lcfi522:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi633:
+; NoVLX-NEXT: .Lcfi523:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -17912,16 +17472,15 @@ define zeroext i32 @test_masked_vpcmpsgtd_v4i1_v32i1_mask(i8 zeroext %__u, <2 x
; NoVLX-LABEL: test_masked_vpcmpsgtd_v4i1_v32i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi634:
+; NoVLX-NEXT: .Lcfi524:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi635:
+; NoVLX-NEXT: .Lcfi525:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi636:
+; NoVLX-NEXT: .Lcfi526:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
-; NoVLX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
@@ -17933,13 +17492,14 @@ define zeroext i32 @test_masked_vpcmpsgtd_v4i1_v32i1_mask(i8 zeroext %__u, <2 x
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: kmovw %k3, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vmovd %ecx, %xmm2
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2
; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpand %xmm2, %xmm0, %xmm0
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
@@ -17977,12 +17537,12 @@ define zeroext i32 @test_masked_vpcmpsgtd_v4i1_v32i1_mask_mem(i8 zeroext %__u, <
; NoVLX-LABEL: test_masked_vpcmpsgtd_v4i1_v32i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi637:
+; NoVLX-NEXT: .Lcfi527:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi638:
+; NoVLX-NEXT: .Lcfi528:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi639:
+; NoVLX-NEXT: .Lcfi529:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -18043,12 +17603,12 @@ define zeroext i32 @test_vpcmpsgtd_v4i1_v32i1_mask_mem_b(<2 x i64> %__a, i32* %_
; NoVLX-LABEL: test_vpcmpsgtd_v4i1_v32i1_mask_mem_b:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi640:
+; NoVLX-NEXT: .Lcfi530:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi641:
+; NoVLX-NEXT: .Lcfi531:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi642:
+; NoVLX-NEXT: .Lcfi532:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -18090,17 +17650,16 @@ define zeroext i32 @test_masked_vpcmpsgtd_v4i1_v32i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-LABEL: test_masked_vpcmpsgtd_v4i1_v32i1_mask_mem_b:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi643:
+; NoVLX-NEXT: .Lcfi533:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi644:
+; NoVLX-NEXT: .Lcfi534:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi645:
+; NoVLX-NEXT: .Lcfi535:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
; NoVLX-NEXT: vpbroadcastd (%rsi), %xmm1
-; NoVLX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
@@ -18112,13 +17671,14 @@ define zeroext i32 @test_masked_vpcmpsgtd_v4i1_v32i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: kmovw %k3, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vmovd %ecx, %xmm2
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2
; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpand %xmm0, %xmm2, %xmm0
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
@@ -18158,20 +17718,20 @@ define zeroext i64 @test_vpcmpsgtd_v4i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__
; NoVLX-LABEL: test_vpcmpsgtd_v4i1_v64i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi646:
+; NoVLX-NEXT: .Lcfi536:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi647:
+; NoVLX-NEXT: .Lcfi537:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi648:
+; NoVLX-NEXT: .Lcfi538:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
@@ -18207,20 +17767,20 @@ define zeroext i64 @test_vpcmpsgtd_v4i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>
; NoVLX-LABEL: test_vpcmpsgtd_v4i1_v64i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi649:
+; NoVLX-NEXT: .Lcfi539:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi650:
+; NoVLX-NEXT: .Lcfi540:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi651:
+; NoVLX-NEXT: .Lcfi541:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: vpcmpgtd (%rdi), %xmm0, %xmm0
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
@@ -18258,19 +17818,18 @@ define zeroext i64 @test_masked_vpcmpsgtd_v4i1_v64i1_mask(i8 zeroext %__u, <2 x
; NoVLX-LABEL: test_masked_vpcmpsgtd_v4i1_v64i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi652:
+; NoVLX-NEXT: .Lcfi542:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi653:
+; NoVLX-NEXT: .Lcfi543:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi654:
+; NoVLX-NEXT: .Lcfi544:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kshiftlw $13, %k0, %k2
; NoVLX-NEXT: kshiftrw $15, %k2, %k2
; NoVLX-NEXT: kshiftlw $15, %k0, %k3
@@ -18283,13 +17842,14 @@ define zeroext i64 @test_masked_vpcmpsgtd_v4i1_v64i1_mask(i8 zeroext %__u, <2 x
; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kmovw %k2, %eax
; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $15, %k1, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
@@ -18329,19 +17889,18 @@ define zeroext i64 @test_masked_vpcmpsgtd_v4i1_v64i1_mask_mem(i8 zeroext %__u, <
; NoVLX-LABEL: test_masked_vpcmpsgtd_v4i1_v64i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi655:
+; NoVLX-NEXT: .Lcfi545:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi656:
+; NoVLX-NEXT: .Lcfi546:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi657:
+; NoVLX-NEXT: .Lcfi547:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: vpcmpgtd (%rsi), %xmm0, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kshiftlw $13, %k0, %k2
; NoVLX-NEXT: kshiftrw $15, %k2, %k2
; NoVLX-NEXT: kshiftlw $15, %k0, %k3
@@ -18354,13 +17913,14 @@ define zeroext i64 @test_masked_vpcmpsgtd_v4i1_v64i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kmovw %k2, %eax
; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $15, %k1, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
@@ -18401,12 +17961,12 @@ define zeroext i64 @test_vpcmpsgtd_v4i1_v64i1_mask_mem_b(<2 x i64> %__a, i32* %_
; NoVLX-LABEL: test_vpcmpsgtd_v4i1_v64i1_mask_mem_b:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi658:
+; NoVLX-NEXT: .Lcfi548:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi659:
+; NoVLX-NEXT: .Lcfi549:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi660:
+; NoVLX-NEXT: .Lcfi550:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
@@ -18414,8 +17974,8 @@ define zeroext i64 @test_vpcmpsgtd_v4i1_v64i1_mask_mem_b(<2 x i64> %__a, i32* %_
; NoVLX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
@@ -18454,12 +18014,12 @@ define zeroext i64 @test_masked_vpcmpsgtd_v4i1_v64i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-LABEL: test_masked_vpcmpsgtd_v4i1_v64i1_mask_mem_b:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi661:
+; NoVLX-NEXT: .Lcfi551:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi662:
+; NoVLX-NEXT: .Lcfi552:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi663:
+; NoVLX-NEXT: .Lcfi553:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
@@ -18467,7 +18027,6 @@ define zeroext i64 @test_masked_vpcmpsgtd_v4i1_v64i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kshiftlw $13, %k0, %k2
; NoVLX-NEXT: kshiftrw $15, %k2, %k2
; NoVLX-NEXT: kshiftlw $15, %k0, %k3
@@ -18480,13 +18039,14 @@ define zeroext i64 @test_masked_vpcmpsgtd_v4i1_v64i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kmovw %k2, %eax
; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $15, %k1, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
@@ -18717,12 +18277,12 @@ define zeroext i32 @test_vpcmpsgtd_v8i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__
; NoVLX-LABEL: test_vpcmpsgtd_v8i1_v32i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi664:
+; NoVLX-NEXT: .Lcfi554:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi665:
+; NoVLX-NEXT: .Lcfi555:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi666:
+; NoVLX-NEXT: .Lcfi556:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -18792,12 +18352,12 @@ define zeroext i32 @test_vpcmpsgtd_v8i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64>
; NoVLX-LABEL: test_vpcmpsgtd_v8i1_v32i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi667:
+; NoVLX-NEXT: .Lcfi557:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi668:
+; NoVLX-NEXT: .Lcfi558:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi669:
+; NoVLX-NEXT: .Lcfi559:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -18869,12 +18429,12 @@ define zeroext i32 @test_masked_vpcmpsgtd_v8i1_v32i1_mask(i8 zeroext %__u, <4 x
; NoVLX-LABEL: test_masked_vpcmpsgtd_v8i1_v32i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi670:
+; NoVLX-NEXT: .Lcfi560:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi671:
+; NoVLX-NEXT: .Lcfi561:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi672:
+; NoVLX-NEXT: .Lcfi562:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -18949,12 +18509,12 @@ define zeroext i32 @test_masked_vpcmpsgtd_v8i1_v32i1_mask_mem(i8 zeroext %__u, <
; NoVLX-LABEL: test_masked_vpcmpsgtd_v8i1_v32i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi673:
+; NoVLX-NEXT: .Lcfi563:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi674:
+; NoVLX-NEXT: .Lcfi564:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi675:
+; NoVLX-NEXT: .Lcfi565:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -19030,12 +18590,12 @@ define zeroext i32 @test_vpcmpsgtd_v8i1_v32i1_mask_mem_b(<4 x i64> %__a, i32* %_
; NoVLX-LABEL: test_vpcmpsgtd_v8i1_v32i1_mask_mem_b:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi676:
+; NoVLX-NEXT: .Lcfi566:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi677:
+; NoVLX-NEXT: .Lcfi567:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi678:
+; NoVLX-NEXT: .Lcfi568:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -19108,12 +18668,12 @@ define zeroext i32 @test_masked_vpcmpsgtd_v8i1_v32i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-LABEL: test_masked_vpcmpsgtd_v8i1_v32i1_mask_mem_b:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi679:
+; NoVLX-NEXT: .Lcfi569:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi680:
+; NoVLX-NEXT: .Lcfi570:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi681:
+; NoVLX-NEXT: .Lcfi571:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -19190,55 +18750,55 @@ define zeroext i64 @test_vpcmpsgtd_v8i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__
; NoVLX-LABEL: test_vpcmpsgtd_v8i1_v64i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi682:
+; NoVLX-NEXT: .Lcfi572:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi683:
+; NoVLX-NEXT: .Lcfi573:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi684:
+; NoVLX-NEXT: .Lcfi574:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $15, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %ecx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -19270,55 +18830,55 @@ define zeroext i64 @test_vpcmpsgtd_v8i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>
; NoVLX-LABEL: test_vpcmpsgtd_v8i1_v64i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi685:
+; NoVLX-NEXT: .Lcfi575:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi686:
+; NoVLX-NEXT: .Lcfi576:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi687:
+; NoVLX-NEXT: .Lcfi577:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; NoVLX-NEXT: vmovdqa (%rdi), %ymm1
; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $15, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %ecx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -19352,12 +18912,12 @@ define zeroext i64 @test_masked_vpcmpsgtd_v8i1_v64i1_mask(i8 zeroext %__u, <4 x
; NoVLX-LABEL: test_masked_vpcmpsgtd_v8i1_v64i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi688:
+; NoVLX-NEXT: .Lcfi578:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi689:
+; NoVLX-NEXT: .Lcfi579:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi690:
+; NoVLX-NEXT: .Lcfi580:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
@@ -19366,43 +18926,43 @@ define zeroext i64 @test_masked_vpcmpsgtd_v8i1_v64i1_mask(i8 zeroext %__u, <4 x
; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: kandw %k1, %k0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $15, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %ecx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -19437,12 +18997,12 @@ define zeroext i64 @test_masked_vpcmpsgtd_v8i1_v64i1_mask_mem(i8 zeroext %__u, <
; NoVLX-LABEL: test_masked_vpcmpsgtd_v8i1_v64i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi691:
+; NoVLX-NEXT: .Lcfi581:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi692:
+; NoVLX-NEXT: .Lcfi582:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi693:
+; NoVLX-NEXT: .Lcfi583:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
@@ -19451,43 +19011,43 @@ define zeroext i64 @test_masked_vpcmpsgtd_v8i1_v64i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: kandw %k1, %k0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $15, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %ecx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -19523,55 +19083,55 @@ define zeroext i64 @test_vpcmpsgtd_v8i1_v64i1_mask_mem_b(<4 x i64> %__a, i32* %_
; NoVLX-LABEL: test_vpcmpsgtd_v8i1_v64i1_mask_mem_b:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi694:
+; NoVLX-NEXT: .Lcfi584:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi695:
+; NoVLX-NEXT: .Lcfi585:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi696:
+; NoVLX-NEXT: .Lcfi586:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; NoVLX-NEXT: vpbroadcastd (%rdi), %ymm1
; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $15, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %ecx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -19606,12 +19166,12 @@ define zeroext i64 @test_masked_vpcmpsgtd_v8i1_v64i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-LABEL: test_masked_vpcmpsgtd_v8i1_v64i1_mask_mem_b:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi697:
+; NoVLX-NEXT: .Lcfi587:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi698:
+; NoVLX-NEXT: .Lcfi588:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi699:
+; NoVLX-NEXT: .Lcfi589:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
@@ -19620,43 +19180,43 @@ define zeroext i64 @test_masked_vpcmpsgtd_v8i1_v64i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: kandw %k0, %k1, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $15, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %ecx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -19693,93 +19253,78 @@ define zeroext i32 @test_vpcmpsgtd_v16i1_v32i1_mask(<8 x i64> %__a, <8 x i64> %_
; NoVLX-LABEL: test_vpcmpsgtd_v16i1_v32i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi700:
+; NoVLX-NEXT: .Lcfi590:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi701:
+; NoVLX-NEXT: .Lcfi591:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi702:
+; NoVLX-NEXT: .Lcfi592:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
-; NoVLX-NEXT: pushq %r15
-; NoVLX-NEXT: pushq %r14
-; NoVLX-NEXT: pushq %r13
-; NoVLX-NEXT: pushq %r12
-; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
-; NoVLX-NEXT: .Lcfi703:
-; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi704:
-; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi705:
-; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi706:
-; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi707:
-; NoVLX-NEXT: .cfi_offset %r15, -24
; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0
; NoVLX-NEXT: kxorw %k0, %k0, %k1
; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $15, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $13, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: vmovd %ecx, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
@@ -19788,12 +19333,7 @@ define zeroext i32 @test_vpcmpsgtd_v16i1_v32i1_mask(<8 x i64> %__a, <8 x i64> %_
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
-; NoVLX-NEXT: leaq -40(%rbp), %rsp
-; NoVLX-NEXT: popq %rbx
-; NoVLX-NEXT: popq %r12
-; NoVLX-NEXT: popq %r13
-; NoVLX-NEXT: popq %r14
-; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
; NoVLX-NEXT: retq
entry:
@@ -19816,93 +19356,78 @@ define zeroext i32 @test_vpcmpsgtd_v16i1_v32i1_mask_mem(<8 x i64> %__a, <8 x i64
; NoVLX-LABEL: test_vpcmpsgtd_v16i1_v32i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi708:
+; NoVLX-NEXT: .Lcfi593:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi709:
+; NoVLX-NEXT: .Lcfi594:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi710:
+; NoVLX-NEXT: .Lcfi595:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
-; NoVLX-NEXT: pushq %r15
-; NoVLX-NEXT: pushq %r14
-; NoVLX-NEXT: pushq %r13
-; NoVLX-NEXT: pushq %r12
-; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
-; NoVLX-NEXT: .Lcfi711:
-; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi712:
-; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi713:
-; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi714:
-; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi715:
-; NoVLX-NEXT: .cfi_offset %r15, -24
; NoVLX-NEXT: vpcmpgtd (%rdi), %zmm0, %k0
; NoVLX-NEXT: kxorw %k0, %k0, %k1
; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $15, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $13, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: vmovd %ecx, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
@@ -19911,12 +19436,7 @@ define zeroext i32 @test_vpcmpsgtd_v16i1_v32i1_mask_mem(<8 x i64> %__a, <8 x i64
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
-; NoVLX-NEXT: leaq -40(%rbp), %rsp
-; NoVLX-NEXT: popq %rbx
-; NoVLX-NEXT: popq %r12
-; NoVLX-NEXT: popq %r13
-; NoVLX-NEXT: popq %r14
-; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
; NoVLX-NEXT: retq
entry:
@@ -19941,94 +19461,79 @@ define zeroext i32 @test_masked_vpcmpsgtd_v16i1_v32i1_mask(i16 zeroext %__u, <8
; NoVLX-LABEL: test_masked_vpcmpsgtd_v16i1_v32i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi716:
+; NoVLX-NEXT: .Lcfi596:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi717:
+; NoVLX-NEXT: .Lcfi597:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi718:
+; NoVLX-NEXT: .Lcfi598:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
-; NoVLX-NEXT: pushq %r15
-; NoVLX-NEXT: pushq %r14
-; NoVLX-NEXT: pushq %r13
-; NoVLX-NEXT: pushq %r12
-; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
-; NoVLX-NEXT: .Lcfi719:
-; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi720:
-; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi721:
-; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi722:
-; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi723:
-; NoVLX-NEXT: .cfi_offset %r15, -24
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 {%k1}
; NoVLX-NEXT: kxorw %k0, %k0, %k1
; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $15, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $13, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: vmovd %ecx, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
@@ -20037,12 +19542,7 @@ define zeroext i32 @test_masked_vpcmpsgtd_v16i1_v32i1_mask(i16 zeroext %__u, <8
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
-; NoVLX-NEXT: leaq -40(%rbp), %rsp
-; NoVLX-NEXT: popq %rbx
-; NoVLX-NEXT: popq %r12
-; NoVLX-NEXT: popq %r13
-; NoVLX-NEXT: popq %r14
-; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
; NoVLX-NEXT: retq
entry:
@@ -20068,94 +19568,79 @@ define zeroext i32 @test_masked_vpcmpsgtd_v16i1_v32i1_mask_mem(i16 zeroext %__u,
; NoVLX-LABEL: test_masked_vpcmpsgtd_v16i1_v32i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi724:
+; NoVLX-NEXT: .Lcfi599:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi725:
+; NoVLX-NEXT: .Lcfi600:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi726:
+; NoVLX-NEXT: .Lcfi601:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
-; NoVLX-NEXT: pushq %r15
-; NoVLX-NEXT: pushq %r14
-; NoVLX-NEXT: pushq %r13
-; NoVLX-NEXT: pushq %r12
-; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
-; NoVLX-NEXT: .Lcfi727:
-; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi728:
-; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi729:
-; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi730:
-; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi731:
-; NoVLX-NEXT: .cfi_offset %r15, -24
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpcmpgtd (%rsi), %zmm0, %k0 {%k1}
; NoVLX-NEXT: kxorw %k0, %k0, %k1
; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $15, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $13, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: vmovd %ecx, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
@@ -20164,12 +19649,7 @@ define zeroext i32 @test_masked_vpcmpsgtd_v16i1_v32i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
-; NoVLX-NEXT: leaq -40(%rbp), %rsp
-; NoVLX-NEXT: popq %rbx
-; NoVLX-NEXT: popq %r12
-; NoVLX-NEXT: popq %r13
-; NoVLX-NEXT: popq %r14
-; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
; NoVLX-NEXT: retq
entry:
@@ -20196,93 +19676,78 @@ define zeroext i32 @test_vpcmpsgtd_v16i1_v32i1_mask_mem_b(<8 x i64> %__a, i32* %
; NoVLX-LABEL: test_vpcmpsgtd_v16i1_v32i1_mask_mem_b:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi732:
+; NoVLX-NEXT: .Lcfi602:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi733:
+; NoVLX-NEXT: .Lcfi603:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi734:
+; NoVLX-NEXT: .Lcfi604:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
-; NoVLX-NEXT: pushq %r15
-; NoVLX-NEXT: pushq %r14
-; NoVLX-NEXT: pushq %r13
-; NoVLX-NEXT: pushq %r12
-; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
-; NoVLX-NEXT: .Lcfi735:
-; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi736:
-; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi737:
-; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi738:
-; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi739:
-; NoVLX-NEXT: .cfi_offset %r15, -24
; NoVLX-NEXT: vpcmpgtd (%rdi){1to16}, %zmm0, %k0
; NoVLX-NEXT: kxorw %k0, %k0, %k1
; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $15, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $13, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: vmovd %ecx, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
@@ -20291,12 +19756,7 @@ define zeroext i32 @test_vpcmpsgtd_v16i1_v32i1_mask_mem_b(<8 x i64> %__a, i32* %
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
-; NoVLX-NEXT: leaq -40(%rbp), %rsp
-; NoVLX-NEXT: popq %rbx
-; NoVLX-NEXT: popq %r12
-; NoVLX-NEXT: popq %r13
-; NoVLX-NEXT: popq %r14
-; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
; NoVLX-NEXT: retq
entry:
@@ -20322,94 +19782,79 @@ define zeroext i32 @test_masked_vpcmpsgtd_v16i1_v32i1_mask_mem_b(i16 zeroext %__
; NoVLX-LABEL: test_masked_vpcmpsgtd_v16i1_v32i1_mask_mem_b:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi740:
+; NoVLX-NEXT: .Lcfi605:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi741:
+; NoVLX-NEXT: .Lcfi606:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi742:
+; NoVLX-NEXT: .Lcfi607:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
-; NoVLX-NEXT: pushq %r15
-; NoVLX-NEXT: pushq %r14
-; NoVLX-NEXT: pushq %r13
-; NoVLX-NEXT: pushq %r12
-; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
-; NoVLX-NEXT: .Lcfi743:
-; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi744:
-; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi745:
-; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi746:
-; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi747:
-; NoVLX-NEXT: .cfi_offset %r15, -24
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpcmpgtd (%rsi){1to16}, %zmm0, %k0 {%k1}
; NoVLX-NEXT: kxorw %k0, %k0, %k1
; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $15, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $13, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: vmovd %ecx, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
@@ -20418,12 +19863,7 @@ define zeroext i32 @test_masked_vpcmpsgtd_v16i1_v32i1_mask_mem_b(i16 zeroext %__
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
-; NoVLX-NEXT: leaq -40(%rbp), %rsp
-; NoVLX-NEXT: popq %rbx
-; NoVLX-NEXT: popq %r12
-; NoVLX-NEXT: popq %r13
-; NoVLX-NEXT: popq %r14
-; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
; NoVLX-NEXT: retq
entry:
@@ -20451,12 +19891,12 @@ define zeroext i64 @test_vpcmpsgtd_v16i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %_
; NoVLX-LABEL: test_vpcmpsgtd_v16i1_v64i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi748:
+; NoVLX-NEXT: .Lcfi608:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi749:
+; NoVLX-NEXT: .Lcfi609:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi750:
+; NoVLX-NEXT: .Lcfi610:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: pushq %r15
; NoVLX-NEXT: pushq %r14
@@ -20465,21 +19905,17 @@ define zeroext i64 @test_vpcmpsgtd_v16i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %_
; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: .Lcfi751:
+; NoVLX-NEXT: .Lcfi611:
; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi752:
+; NoVLX-NEXT: .Lcfi612:
; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi753:
+; NoVLX-NEXT: .Lcfi613:
; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi754:
+; NoVLX-NEXT: .Lcfi614:
; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi755:
+; NoVLX-NEXT: .Lcfi615:
; NoVLX-NEXT: .cfi_offset %r15, -24
; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
@@ -20522,11 +19958,11 @@ define zeroext i64 @test_vpcmpsgtd_v16i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %_
; NoVLX-NEXT: kshiftlw $2, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $1, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k1, %r8d
; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
@@ -20538,11 +19974,15 @@ define zeroext i64 @test_vpcmpsgtd_v16i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %_
; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpinsrb $14, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -20579,12 +20019,12 @@ define zeroext i64 @test_vpcmpsgtd_v16i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64
; NoVLX-LABEL: test_vpcmpsgtd_v16i1_v64i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi756:
+; NoVLX-NEXT: .Lcfi616:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi757:
+; NoVLX-NEXT: .Lcfi617:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi758:
+; NoVLX-NEXT: .Lcfi618:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: pushq %r15
; NoVLX-NEXT: pushq %r14
@@ -20593,21 +20033,17 @@ define zeroext i64 @test_vpcmpsgtd_v16i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64
; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: .Lcfi759:
+; NoVLX-NEXT: .Lcfi619:
; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi760:
+; NoVLX-NEXT: .Lcfi620:
; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi761:
+; NoVLX-NEXT: .Lcfi621:
; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi762:
+; NoVLX-NEXT: .Lcfi622:
; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi763:
+; NoVLX-NEXT: .Lcfi623:
; NoVLX-NEXT: .cfi_offset %r15, -24
; NoVLX-NEXT: vpcmpgtd (%rdi), %zmm0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
@@ -20650,11 +20086,11 @@ define zeroext i64 @test_vpcmpsgtd_v16i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64
; NoVLX-NEXT: kshiftlw $2, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $1, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k1, %r8d
; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
@@ -20666,11 +20102,15 @@ define zeroext i64 @test_vpcmpsgtd_v16i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64
; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpinsrb $14, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -20709,12 +20149,12 @@ define zeroext i64 @test_masked_vpcmpsgtd_v16i1_v64i1_mask(i16 zeroext %__u, <8
; NoVLX-LABEL: test_masked_vpcmpsgtd_v16i1_v64i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi764:
+; NoVLX-NEXT: .Lcfi624:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi765:
+; NoVLX-NEXT: .Lcfi625:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi766:
+; NoVLX-NEXT: .Lcfi626:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: pushq %r15
; NoVLX-NEXT: pushq %r14
@@ -20723,22 +20163,18 @@ define zeroext i64 @test_masked_vpcmpsgtd_v16i1_v64i1_mask(i16 zeroext %__u, <8
; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: .Lcfi767:
+; NoVLX-NEXT: .Lcfi627:
; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi768:
+; NoVLX-NEXT: .Lcfi628:
; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi769:
+; NoVLX-NEXT: .Lcfi629:
; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi770:
+; NoVLX-NEXT: .Lcfi630:
; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi771:
+; NoVLX-NEXT: .Lcfi631:
; NoVLX-NEXT: .cfi_offset %r15, -24
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
@@ -20781,11 +20217,11 @@ define zeroext i64 @test_masked_vpcmpsgtd_v16i1_v64i1_mask(i16 zeroext %__u, <8
; NoVLX-NEXT: kshiftlw $2, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $1, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k1, %r8d
; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
@@ -20797,11 +20233,15 @@ define zeroext i64 @test_masked_vpcmpsgtd_v16i1_v64i1_mask(i16 zeroext %__u, <8
; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpinsrb $14, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -20841,12 +20281,12 @@ define zeroext i64 @test_masked_vpcmpsgtd_v16i1_v64i1_mask_mem(i16 zeroext %__u,
; NoVLX-LABEL: test_masked_vpcmpsgtd_v16i1_v64i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi772:
+; NoVLX-NEXT: .Lcfi632:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi773:
+; NoVLX-NEXT: .Lcfi633:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi774:
+; NoVLX-NEXT: .Lcfi634:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: pushq %r15
; NoVLX-NEXT: pushq %r14
@@ -20855,22 +20295,18 @@ define zeroext i64 @test_masked_vpcmpsgtd_v16i1_v64i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: .Lcfi775:
+; NoVLX-NEXT: .Lcfi635:
; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi776:
+; NoVLX-NEXT: .Lcfi636:
; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi777:
+; NoVLX-NEXT: .Lcfi637:
; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi778:
+; NoVLX-NEXT: .Lcfi638:
; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi779:
+; NoVLX-NEXT: .Lcfi639:
; NoVLX-NEXT: .cfi_offset %r15, -24
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpcmpgtd (%rsi), %zmm0, %k0 {%k1}
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
@@ -20913,11 +20349,11 @@ define zeroext i64 @test_masked_vpcmpsgtd_v16i1_v64i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: kshiftlw $2, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $1, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k1, %r8d
; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
@@ -20929,11 +20365,15 @@ define zeroext i64 @test_masked_vpcmpsgtd_v16i1_v64i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpinsrb $14, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -20974,12 +20414,12 @@ define zeroext i64 @test_vpcmpsgtd_v16i1_v64i1_mask_mem_b(<8 x i64> %__a, i32* %
; NoVLX-LABEL: test_vpcmpsgtd_v16i1_v64i1_mask_mem_b:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi780:
+; NoVLX-NEXT: .Lcfi640:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi781:
+; NoVLX-NEXT: .Lcfi641:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi782:
+; NoVLX-NEXT: .Lcfi642:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: pushq %r15
; NoVLX-NEXT: pushq %r14
@@ -20988,21 +20428,17 @@ define zeroext i64 @test_vpcmpsgtd_v16i1_v64i1_mask_mem_b(<8 x i64> %__a, i32* %
; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: .Lcfi783:
+; NoVLX-NEXT: .Lcfi643:
; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi784:
+; NoVLX-NEXT: .Lcfi644:
; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi785:
+; NoVLX-NEXT: .Lcfi645:
; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi786:
+; NoVLX-NEXT: .Lcfi646:
; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi787:
+; NoVLX-NEXT: .Lcfi647:
; NoVLX-NEXT: .cfi_offset %r15, -24
; NoVLX-NEXT: vpcmpgtd (%rdi){1to16}, %zmm0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
@@ -21045,11 +20481,11 @@ define zeroext i64 @test_vpcmpsgtd_v16i1_v64i1_mask_mem_b(<8 x i64> %__a, i32* %
; NoVLX-NEXT: kshiftlw $2, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $1, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k1, %r8d
; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
@@ -21061,11 +20497,15 @@ define zeroext i64 @test_vpcmpsgtd_v16i1_v64i1_mask_mem_b(<8 x i64> %__a, i32* %
; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpinsrb $14, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -21105,12 +20545,12 @@ define zeroext i64 @test_masked_vpcmpsgtd_v16i1_v64i1_mask_mem_b(i16 zeroext %__
; NoVLX-LABEL: test_masked_vpcmpsgtd_v16i1_v64i1_mask_mem_b:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi788:
+; NoVLX-NEXT: .Lcfi648:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi789:
+; NoVLX-NEXT: .Lcfi649:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi790:
+; NoVLX-NEXT: .Lcfi650:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: pushq %r15
; NoVLX-NEXT: pushq %r14
@@ -21119,22 +20559,18 @@ define zeroext i64 @test_masked_vpcmpsgtd_v16i1_v64i1_mask_mem_b(i16 zeroext %__
; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: .Lcfi791:
+; NoVLX-NEXT: .Lcfi651:
; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi792:
+; NoVLX-NEXT: .Lcfi652:
; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi793:
+; NoVLX-NEXT: .Lcfi653:
; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi794:
+; NoVLX-NEXT: .Lcfi654:
; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi795:
+; NoVLX-NEXT: .Lcfi655:
; NoVLX-NEXT: .cfi_offset %r15, -24
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpcmpgtd (%rsi){1to16}, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
@@ -21177,11 +20613,11 @@ define zeroext i64 @test_masked_vpcmpsgtd_v16i1_v64i1_mask_mem_b(i16 zeroext %__
; NoVLX-NEXT: kshiftlw $2, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $1, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k1, %r8d
; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
@@ -21193,11 +20629,15 @@ define zeroext i64 @test_masked_vpcmpsgtd_v16i1_v64i1_mask_mem_b(i16 zeroext %__
; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpinsrb $14, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -21294,7 +20734,6 @@ define zeroext i4 @test_masked_vpcmpsgtq_v2i1_v4i1_mask(i8 zeroext %__u, <2 x i6
;
; NoVLX-LABEL: test_masked_vpcmpsgtq_v2i1_v4i1_mask:
; NoVLX: # BB#0: # %entry
-; NoVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftlw $15, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
@@ -21302,9 +20741,10 @@ define zeroext i4 @test_masked_vpcmpsgtq_v2i1_v4i1_mask(i8 zeroext %__u, <2 x i6
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vmovd %ecx, %xmm2
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpand %xmm2, %xmm0, %xmm0
; NoVLX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
; NoVLX-NEXT: vpslld $31, %ymm0, %ymm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -21409,7 +20849,6 @@ define zeroext i4 @test_masked_vpcmpsgtq_v2i1_v4i1_mask_mem_b(i8 zeroext %__u, <
; NoVLX-LABEL: test_masked_vpcmpsgtq_v2i1_v4i1_mask_mem_b:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: vpbroadcastq (%rsi), %xmm1
-; NoVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftlw $15, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
@@ -21417,9 +20856,10 @@ define zeroext i4 @test_masked_vpcmpsgtq_v2i1_v4i1_mask_mem_b(i8 zeroext %__u, <
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vmovd %ecx, %xmm2
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpand %xmm0, %xmm2, %xmm0
; NoVLX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
; NoVLX-NEXT: vpslld $31, %ymm0, %ymm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -22018,12 +21458,12 @@ define zeroext i32 @test_vpcmpsgtq_v2i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__
; NoVLX-LABEL: test_vpcmpsgtq_v2i1_v32i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi796:
+; NoVLX-NEXT: .Lcfi656:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi797:
+; NoVLX-NEXT: .Lcfi657:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi798:
+; NoVLX-NEXT: .Lcfi658:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -22061,12 +21501,12 @@ define zeroext i32 @test_vpcmpsgtq_v2i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>
; NoVLX-LABEL: test_vpcmpsgtq_v2i1_v32i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi799:
+; NoVLX-NEXT: .Lcfi659:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi800:
+; NoVLX-NEXT: .Lcfi660:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi801:
+; NoVLX-NEXT: .Lcfi661:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -22106,16 +21546,15 @@ define zeroext i32 @test_masked_vpcmpsgtq_v2i1_v32i1_mask(i8 zeroext %__u, <2 x
; NoVLX-LABEL: test_masked_vpcmpsgtq_v2i1_v32i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi802:
+; NoVLX-NEXT: .Lcfi662:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi803:
+; NoVLX-NEXT: .Lcfi663:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi804:
+; NoVLX-NEXT: .Lcfi664:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
-; NoVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftlw $15, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
@@ -22123,9 +21562,10 @@ define zeroext i32 @test_masked_vpcmpsgtq_v2i1_v32i1_mask(i8 zeroext %__u, <2 x
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vmovd %ecx, %xmm2
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpand %xmm2, %xmm0, %xmm0
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
@@ -22163,12 +21603,12 @@ define zeroext i32 @test_masked_vpcmpsgtq_v2i1_v32i1_mask_mem(i8 zeroext %__u, <
; NoVLX-LABEL: test_masked_vpcmpsgtq_v2i1_v32i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi805:
+; NoVLX-NEXT: .Lcfi665:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi806:
+; NoVLX-NEXT: .Lcfi666:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi807:
+; NoVLX-NEXT: .Lcfi667:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -22221,12 +21661,12 @@ define zeroext i32 @test_vpcmpsgtq_v2i1_v32i1_mask_mem_b(<2 x i64> %__a, i64* %_
; NoVLX-LABEL: test_vpcmpsgtq_v2i1_v32i1_mask_mem_b:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi808:
+; NoVLX-NEXT: .Lcfi668:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi809:
+; NoVLX-NEXT: .Lcfi669:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi810:
+; NoVLX-NEXT: .Lcfi670:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -22268,17 +21708,16 @@ define zeroext i32 @test_masked_vpcmpsgtq_v2i1_v32i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-LABEL: test_masked_vpcmpsgtq_v2i1_v32i1_mask_mem_b:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi811:
+; NoVLX-NEXT: .Lcfi671:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi812:
+; NoVLX-NEXT: .Lcfi672:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi813:
+; NoVLX-NEXT: .Lcfi673:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
; NoVLX-NEXT: vpbroadcastq (%rsi), %xmm1
-; NoVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftlw $15, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
@@ -22286,9 +21725,10 @@ define zeroext i32 @test_masked_vpcmpsgtq_v2i1_v32i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vmovd %ecx, %xmm2
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpand %xmm0, %xmm2, %xmm0
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
@@ -22328,20 +21768,20 @@ define zeroext i64 @test_vpcmpsgtq_v2i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__
; NoVLX-LABEL: test_vpcmpsgtq_v2i1_v64i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi814:
+; NoVLX-NEXT: .Lcfi674:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi815:
+; NoVLX-NEXT: .Lcfi675:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi816:
+; NoVLX-NEXT: .Lcfi676:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
@@ -22377,20 +21817,20 @@ define zeroext i64 @test_vpcmpsgtq_v2i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>
; NoVLX-LABEL: test_vpcmpsgtq_v2i1_v64i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi817:
+; NoVLX-NEXT: .Lcfi677:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi818:
+; NoVLX-NEXT: .Lcfi678:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi819:
+; NoVLX-NEXT: .Lcfi679:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: vpcmpgtq (%rdi), %xmm0, %xmm0
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
@@ -22428,12 +21868,12 @@ define zeroext i64 @test_masked_vpcmpsgtq_v2i1_v64i1_mask(i8 zeroext %__u, <2 x
; NoVLX-LABEL: test_masked_vpcmpsgtq_v2i1_v64i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi820:
+; NoVLX-NEXT: .Lcfi680:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi821:
+; NoVLX-NEXT: .Lcfi681:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi822:
+; NoVLX-NEXT: .Lcfi682:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
@@ -22450,8 +21890,8 @@ define zeroext i64 @test_masked_vpcmpsgtq_v2i1_v64i1_mask(i8 zeroext %__u, <2 x
; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
@@ -22491,12 +21931,12 @@ define zeroext i64 @test_masked_vpcmpsgtq_v2i1_v64i1_mask_mem(i8 zeroext %__u, <
; NoVLX-LABEL: test_masked_vpcmpsgtq_v2i1_v64i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi823:
+; NoVLX-NEXT: .Lcfi683:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi824:
+; NoVLX-NEXT: .Lcfi684:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi825:
+; NoVLX-NEXT: .Lcfi685:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
@@ -22513,8 +21953,8 @@ define zeroext i64 @test_masked_vpcmpsgtq_v2i1_v64i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
@@ -22555,12 +21995,12 @@ define zeroext i64 @test_vpcmpsgtq_v2i1_v64i1_mask_mem_b(<2 x i64> %__a, i64* %_
; NoVLX-LABEL: test_vpcmpsgtq_v2i1_v64i1_mask_mem_b:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi826:
+; NoVLX-NEXT: .Lcfi686:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi827:
+; NoVLX-NEXT: .Lcfi687:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi828:
+; NoVLX-NEXT: .Lcfi688:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
@@ -22568,8 +22008,8 @@ define zeroext i64 @test_vpcmpsgtq_v2i1_v64i1_mask_mem_b(<2 x i64> %__a, i64* %_
; NoVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
@@ -22608,12 +22048,12 @@ define zeroext i64 @test_masked_vpcmpsgtq_v2i1_v64i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-LABEL: test_masked_vpcmpsgtq_v2i1_v64i1_mask_mem_b:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi829:
+; NoVLX-NEXT: .Lcfi689:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi830:
+; NoVLX-NEXT: .Lcfi690:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi831:
+; NoVLX-NEXT: .Lcfi691:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
@@ -22631,8 +22071,8 @@ define zeroext i64 @test_masked_vpcmpsgtq_v2i1_v64i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
@@ -22693,8 +22133,8 @@ define zeroext i8 @test_vpcmpsgtq_v4i1_v8i1_mask(<4 x i64> %__a, <4 x i64> %__b)
; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: kmovw %eax, %k1
; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
@@ -22750,8 +22190,8 @@ define zeroext i8 @test_vpcmpsgtq_v4i1_v8i1_mask_mem(<4 x i64> %__a, <4 x i64>*
; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: kmovw %eax, %k1
; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
@@ -22792,7 +22232,6 @@ define zeroext i8 @test_masked_vpcmpsgtq_v4i1_v8i1_mask(i8 zeroext %__u, <4 x i6
; NoVLX-LABEL: test_masked_vpcmpsgtq_v4i1_v8i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0
-; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
@@ -22810,6 +22249,7 @@ define zeroext i8 @test_masked_vpcmpsgtq_v4i1_v8i1_mask(i8 zeroext %__u, <4 x i6
; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
; NoVLX-NEXT: kmovw %eax, %k1
@@ -22827,8 +22267,8 @@ define zeroext i8 @test_masked_vpcmpsgtq_v4i1_v8i1_mask(i8 zeroext %__u, <4 x i6
; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: kmovw %eax, %k1
; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
@@ -22871,7 +22311,6 @@ define zeroext i8 @test_masked_vpcmpsgtq_v4i1_v8i1_mask_mem(i8 zeroext %__u, <4
; NoVLX-LABEL: test_masked_vpcmpsgtq_v4i1_v8i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: vpcmpgtq (%rsi), %ymm0, %ymm0
-; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
@@ -22889,6 +22328,7 @@ define zeroext i8 @test_masked_vpcmpsgtq_v4i1_v8i1_mask_mem(i8 zeroext %__u, <4
; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
; NoVLX-NEXT: kmovw %eax, %k1
@@ -22906,8 +22346,8 @@ define zeroext i8 @test_masked_vpcmpsgtq_v4i1_v8i1_mask_mem(i8 zeroext %__u, <4
; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: kmovw %eax, %k1
; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
@@ -22969,8 +22409,8 @@ define zeroext i8 @test_vpcmpsgtq_v4i1_v8i1_mask_mem_b(<4 x i64> %__a, i64* %__b
; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: kmovw %eax, %k1
; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
@@ -23013,7 +22453,6 @@ define zeroext i8 @test_masked_vpcmpsgtq_v4i1_v8i1_mask_mem_b(i8 zeroext %__u, <
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: vpbroadcastq (%rsi), %ymm1
; NoVLX-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0
-; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
@@ -23031,6 +22470,7 @@ define zeroext i8 @test_masked_vpcmpsgtq_v4i1_v8i1_mask_mem_b(i8 zeroext %__u, <
; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
; NoVLX-NEXT: kmovw %eax, %k1
@@ -23048,8 +22488,8 @@ define zeroext i8 @test_masked_vpcmpsgtq_v4i1_v8i1_mask_mem_b(i8 zeroext %__u, <
; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: kmovw %eax, %k1
; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
@@ -23110,8 +22550,8 @@ define zeroext i16 @test_vpcmpsgtq_v4i1_v16i1_mask(<4 x i64> %__a, <4 x i64> %__
; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: kmovw %eax, %k1
; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
@@ -23166,8 +22606,8 @@ define zeroext i16 @test_vpcmpsgtq_v4i1_v16i1_mask_mem(<4 x i64> %__a, <4 x i64>
; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: kmovw %eax, %k1
; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
@@ -23208,7 +22648,6 @@ define zeroext i16 @test_masked_vpcmpsgtq_v4i1_v16i1_mask(i8 zeroext %__u, <4 x
; NoVLX-LABEL: test_masked_vpcmpsgtq_v4i1_v16i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0
-; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
@@ -23226,6 +22665,7 @@ define zeroext i16 @test_masked_vpcmpsgtq_v4i1_v16i1_mask(i8 zeroext %__u, <4 x
; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
; NoVLX-NEXT: kmovw %eax, %k1
@@ -23242,8 +22682,8 @@ define zeroext i16 @test_masked_vpcmpsgtq_v4i1_v16i1_mask(i8 zeroext %__u, <4 x
; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: kmovw %eax, %k1
; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
@@ -23286,7 +22726,6 @@ define zeroext i16 @test_masked_vpcmpsgtq_v4i1_v16i1_mask_mem(i8 zeroext %__u, <
; NoVLX-LABEL: test_masked_vpcmpsgtq_v4i1_v16i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: vpcmpgtq (%rsi), %ymm0, %ymm0
-; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
@@ -23304,6 +22743,7 @@ define zeroext i16 @test_masked_vpcmpsgtq_v4i1_v16i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
; NoVLX-NEXT: kmovw %eax, %k1
@@ -23320,8 +22760,8 @@ define zeroext i16 @test_masked_vpcmpsgtq_v4i1_v16i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: kmovw %eax, %k1
; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
@@ -23382,8 +22822,8 @@ define zeroext i16 @test_vpcmpsgtq_v4i1_v16i1_mask_mem_b(<4 x i64> %__a, i64* %_
; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: kmovw %eax, %k1
; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
@@ -23426,7 +22866,6 @@ define zeroext i16 @test_masked_vpcmpsgtq_v4i1_v16i1_mask_mem_b(i8 zeroext %__u,
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: vpbroadcastq (%rsi), %ymm1
; NoVLX-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0
-; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
@@ -23444,6 +22883,7 @@ define zeroext i16 @test_masked_vpcmpsgtq_v4i1_v16i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
; NoVLX-NEXT: kmovw %eax, %k1
@@ -23460,8 +22900,8 @@ define zeroext i16 @test_masked_vpcmpsgtq_v4i1_v16i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: kmovw %eax, %k1
; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
@@ -23505,12 +22945,12 @@ define zeroext i32 @test_vpcmpsgtq_v4i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__
; NoVLX-LABEL: test_vpcmpsgtq_v4i1_v32i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi832:
+; NoVLX-NEXT: .Lcfi692:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi833:
+; NoVLX-NEXT: .Lcfi693:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi834:
+; NoVLX-NEXT: .Lcfi694:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -23550,12 +22990,12 @@ define zeroext i32 @test_vpcmpsgtq_v4i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64>
; NoVLX-LABEL: test_vpcmpsgtq_v4i1_v32i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi835:
+; NoVLX-NEXT: .Lcfi695:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi836:
+; NoVLX-NEXT: .Lcfi696:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi837:
+; NoVLX-NEXT: .Lcfi697:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -23597,17 +23037,16 @@ define zeroext i32 @test_masked_vpcmpsgtq_v4i1_v32i1_mask(i8 zeroext %__u, <4 x
; NoVLX-LABEL: test_masked_vpcmpsgtq_v4i1_v32i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi838:
+; NoVLX-NEXT: .Lcfi698:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi839:
+; NoVLX-NEXT: .Lcfi699:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi840:
+; NoVLX-NEXT: .Lcfi700:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
; NoVLX-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0
-; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
@@ -23625,6 +23064,7 @@ define zeroext i32 @test_masked_vpcmpsgtq_v4i1_v32i1_mask(i8 zeroext %__u, <4 x
; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
@@ -23664,17 +23104,16 @@ define zeroext i32 @test_masked_vpcmpsgtq_v4i1_v32i1_mask_mem(i8 zeroext %__u, <
; NoVLX-LABEL: test_masked_vpcmpsgtq_v4i1_v32i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi841:
+; NoVLX-NEXT: .Lcfi701:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi842:
+; NoVLX-NEXT: .Lcfi702:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi843:
+; NoVLX-NEXT: .Lcfi703:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
; NoVLX-NEXT: vpcmpgtq (%rsi), %ymm0, %ymm0
-; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
@@ -23692,6 +23131,7 @@ define zeroext i32 @test_masked_vpcmpsgtq_v4i1_v32i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
@@ -23732,12 +23172,12 @@ define zeroext i32 @test_vpcmpsgtq_v4i1_v32i1_mask_mem_b(<4 x i64> %__a, i64* %_
; NoVLX-LABEL: test_vpcmpsgtq_v4i1_v32i1_mask_mem_b:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi844:
+; NoVLX-NEXT: .Lcfi704:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi845:
+; NoVLX-NEXT: .Lcfi705:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi846:
+; NoVLX-NEXT: .Lcfi706:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -23781,18 +23221,17 @@ define zeroext i32 @test_masked_vpcmpsgtq_v4i1_v32i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-LABEL: test_masked_vpcmpsgtq_v4i1_v32i1_mask_mem_b:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi847:
+; NoVLX-NEXT: .Lcfi707:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi848:
+; NoVLX-NEXT: .Lcfi708:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi849:
+; NoVLX-NEXT: .Lcfi709:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
; NoVLX-NEXT: vpbroadcastq (%rsi), %ymm1
; NoVLX-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0
-; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
@@ -23810,6 +23249,7 @@ define zeroext i32 @test_masked_vpcmpsgtq_v4i1_v32i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
@@ -23851,12 +23291,12 @@ define zeroext i64 @test_vpcmpsgtq_v4i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__
; NoVLX-LABEL: test_vpcmpsgtq_v4i1_v64i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi850:
+; NoVLX-NEXT: .Lcfi710:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi851:
+; NoVLX-NEXT: .Lcfi711:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi852:
+; NoVLX-NEXT: .Lcfi712:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
@@ -23864,8 +23304,8 @@ define zeroext i64 @test_vpcmpsgtq_v4i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__
; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
@@ -23902,12 +23342,12 @@ define zeroext i64 @test_vpcmpsgtq_v4i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>
; NoVLX-LABEL: test_vpcmpsgtq_v4i1_v64i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi853:
+; NoVLX-NEXT: .Lcfi713:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi854:
+; NoVLX-NEXT: .Lcfi714:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi855:
+; NoVLX-NEXT: .Lcfi715:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
@@ -23915,8 +23355,8 @@ define zeroext i64 @test_vpcmpsgtq_v4i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>
; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
@@ -23955,12 +23395,12 @@ define zeroext i64 @test_masked_vpcmpsgtq_v4i1_v64i1_mask(i8 zeroext %__u, <4 x
; NoVLX-LABEL: test_masked_vpcmpsgtq_v4i1_v64i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi856:
+; NoVLX-NEXT: .Lcfi716:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi857:
+; NoVLX-NEXT: .Lcfi717:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi858:
+; NoVLX-NEXT: .Lcfi718:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
@@ -23968,7 +23408,6 @@ define zeroext i64 @test_masked_vpcmpsgtq_v4i1_v64i1_mask(i8 zeroext %__u, <4 x
; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kshiftlw $13, %k0, %k2
; NoVLX-NEXT: kshiftrw $15, %k2, %k2
; NoVLX-NEXT: kshiftlw $15, %k0, %k3
@@ -23981,13 +23420,14 @@ define zeroext i64 @test_masked_vpcmpsgtq_v4i1_v64i1_mask(i8 zeroext %__u, <4 x
; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kmovw %k2, %eax
; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $15, %k1, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
@@ -24028,12 +23468,12 @@ define zeroext i64 @test_masked_vpcmpsgtq_v4i1_v64i1_mask_mem(i8 zeroext %__u, <
; NoVLX-LABEL: test_masked_vpcmpsgtq_v4i1_v64i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi859:
+; NoVLX-NEXT: .Lcfi719:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi860:
+; NoVLX-NEXT: .Lcfi720:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi861:
+; NoVLX-NEXT: .Lcfi721:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
@@ -24041,7 +23481,6 @@ define zeroext i64 @test_masked_vpcmpsgtq_v4i1_v64i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kshiftlw $13, %k0, %k2
; NoVLX-NEXT: kshiftrw $15, %k2, %k2
; NoVLX-NEXT: kshiftlw $15, %k0, %k3
@@ -24054,13 +23493,14 @@ define zeroext i64 @test_masked_vpcmpsgtq_v4i1_v64i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kmovw %k2, %eax
; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $15, %k1, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
@@ -24102,12 +23542,12 @@ define zeroext i64 @test_vpcmpsgtq_v4i1_v64i1_mask_mem_b(<4 x i64> %__a, i64* %_
; NoVLX-LABEL: test_vpcmpsgtq_v4i1_v64i1_mask_mem_b:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi862:
+; NoVLX-NEXT: .Lcfi722:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi863:
+; NoVLX-NEXT: .Lcfi723:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi864:
+; NoVLX-NEXT: .Lcfi724:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
@@ -24116,8 +23556,8 @@ define zeroext i64 @test_vpcmpsgtq_v4i1_v64i1_mask_mem_b(<4 x i64> %__a, i64* %_
; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
@@ -24157,12 +23597,12 @@ define zeroext i64 @test_masked_vpcmpsgtq_v4i1_v64i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-LABEL: test_masked_vpcmpsgtq_v4i1_v64i1_mask_mem_b:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi865:
+; NoVLX-NEXT: .Lcfi725:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi866:
+; NoVLX-NEXT: .Lcfi726:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi867:
+; NoVLX-NEXT: .Lcfi727:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
@@ -24171,7 +23611,6 @@ define zeroext i64 @test_masked_vpcmpsgtq_v4i1_v64i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kshiftlw $13, %k0, %k2
; NoVLX-NEXT: kshiftrw $15, %k2, %k2
; NoVLX-NEXT: kshiftlw $15, %k0, %k3
@@ -24184,13 +23623,14 @@ define zeroext i64 @test_masked_vpcmpsgtq_v4i1_v64i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kmovw %k2, %eax
; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $15, %k1, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
@@ -24397,12 +23837,12 @@ define zeroext i32 @test_vpcmpsgtq_v8i1_v32i1_mask(<8 x i64> %__a, <8 x i64> %__
; NoVLX-LABEL: test_vpcmpsgtq_v8i1_v32i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi868:
+; NoVLX-NEXT: .Lcfi728:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi869:
+; NoVLX-NEXT: .Lcfi729:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi870:
+; NoVLX-NEXT: .Lcfi730:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -24470,12 +23910,12 @@ define zeroext i32 @test_vpcmpsgtq_v8i1_v32i1_mask_mem(<8 x i64> %__a, <8 x i64>
; NoVLX-LABEL: test_vpcmpsgtq_v8i1_v32i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi871:
+; NoVLX-NEXT: .Lcfi731:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi872:
+; NoVLX-NEXT: .Lcfi732:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi873:
+; NoVLX-NEXT: .Lcfi733:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -24545,12 +23985,12 @@ define zeroext i32 @test_masked_vpcmpsgtq_v8i1_v32i1_mask(i8 zeroext %__u, <8 x
; NoVLX-LABEL: test_masked_vpcmpsgtq_v8i1_v32i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi874:
+; NoVLX-NEXT: .Lcfi734:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi875:
+; NoVLX-NEXT: .Lcfi735:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi876:
+; NoVLX-NEXT: .Lcfi736:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -24622,12 +24062,12 @@ define zeroext i32 @test_masked_vpcmpsgtq_v8i1_v32i1_mask_mem(i8 zeroext %__u, <
; NoVLX-LABEL: test_masked_vpcmpsgtq_v8i1_v32i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi877:
+; NoVLX-NEXT: .Lcfi737:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi878:
+; NoVLX-NEXT: .Lcfi738:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi879:
+; NoVLX-NEXT: .Lcfi739:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -24700,12 +24140,12 @@ define zeroext i32 @test_vpcmpsgtq_v8i1_v32i1_mask_mem_b(<8 x i64> %__a, i64* %_
; NoVLX-LABEL: test_vpcmpsgtq_v8i1_v32i1_mask_mem_b:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi880:
+; NoVLX-NEXT: .Lcfi740:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi881:
+; NoVLX-NEXT: .Lcfi741:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi882:
+; NoVLX-NEXT: .Lcfi742:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -24776,12 +24216,12 @@ define zeroext i32 @test_masked_vpcmpsgtq_v8i1_v32i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-LABEL: test_masked_vpcmpsgtq_v8i1_v32i1_mask_mem_b:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi883:
+; NoVLX-NEXT: .Lcfi743:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi884:
+; NoVLX-NEXT: .Lcfi744:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi885:
+; NoVLX-NEXT: .Lcfi745:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -24855,53 +24295,53 @@ define zeroext i64 @test_vpcmpsgtq_v8i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %__
; NoVLX-LABEL: test_vpcmpsgtq_v8i1_v64i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi886:
+; NoVLX-NEXT: .Lcfi746:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi887:
+; NoVLX-NEXT: .Lcfi747:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi888:
+; NoVLX-NEXT: .Lcfi748:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: vpcmpgtq %zmm1, %zmm0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $15, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %ecx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -24933,53 +24373,53 @@ define zeroext i64 @test_vpcmpsgtq_v8i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64>
; NoVLX-LABEL: test_vpcmpsgtq_v8i1_v64i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi889:
+; NoVLX-NEXT: .Lcfi749:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi890:
+; NoVLX-NEXT: .Lcfi750:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi891:
+; NoVLX-NEXT: .Lcfi751:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: vpcmpgtq (%rdi), %zmm0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $15, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %ecx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -25013,54 +24453,54 @@ define zeroext i64 @test_masked_vpcmpsgtq_v8i1_v64i1_mask(i8 zeroext %__u, <8 x
; NoVLX-LABEL: test_masked_vpcmpsgtq_v8i1_v64i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi892:
+; NoVLX-NEXT: .Lcfi752:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi893:
+; NoVLX-NEXT: .Lcfi753:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi894:
+; NoVLX-NEXT: .Lcfi754:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpcmpgtq %zmm1, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $15, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %ecx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -25095,54 +24535,54 @@ define zeroext i64 @test_masked_vpcmpsgtq_v8i1_v64i1_mask_mem(i8 zeroext %__u, <
; NoVLX-LABEL: test_masked_vpcmpsgtq_v8i1_v64i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi895:
+; NoVLX-NEXT: .Lcfi755:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi896:
+; NoVLX-NEXT: .Lcfi756:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi897:
+; NoVLX-NEXT: .Lcfi757:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpcmpgtq (%rsi), %zmm0, %k0 {%k1}
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $15, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %ecx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -25178,53 +24618,53 @@ define zeroext i64 @test_vpcmpsgtq_v8i1_v64i1_mask_mem_b(<8 x i64> %__a, i64* %_
; NoVLX-LABEL: test_vpcmpsgtq_v8i1_v64i1_mask_mem_b:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi898:
+; NoVLX-NEXT: .Lcfi758:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi899:
+; NoVLX-NEXT: .Lcfi759:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi900:
+; NoVLX-NEXT: .Lcfi760:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: vpcmpgtq (%rdi){1to8}, %zmm0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $15, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %ecx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -25259,54 +24699,54 @@ define zeroext i64 @test_masked_vpcmpsgtq_v8i1_v64i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-LABEL: test_masked_vpcmpsgtq_v8i1_v64i1_mask_mem_b:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi901:
+; NoVLX-NEXT: .Lcfi761:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi902:
+; NoVLX-NEXT: .Lcfi762:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi903:
+; NoVLX-NEXT: .Lcfi763:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpcmpgtq (%rsi){1to8}, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $15, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %ecx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -25342,30 +24782,15 @@ define zeroext i32 @test_vpcmpsgeb_v16i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %_
; NoVLX-LABEL: test_vpcmpsgeb_v16i1_v32i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi904:
+; NoVLX-NEXT: .Lcfi764:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi905:
+; NoVLX-NEXT: .Lcfi765:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi906:
+; NoVLX-NEXT: .Lcfi766:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
-; NoVLX-NEXT: pushq %r15
-; NoVLX-NEXT: pushq %r14
-; NoVLX-NEXT: pushq %r13
-; NoVLX-NEXT: pushq %r12
-; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
-; NoVLX-NEXT: .Lcfi907:
-; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi908:
-; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi909:
-; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi910:
-; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi911:
-; NoVLX-NEXT: .cfi_offset %r15, -24
; NoVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
@@ -25376,64 +24801,64 @@ define zeroext i32 @test_vpcmpsgeb_v16i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %_
; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $15, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $13, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: vmovd %ecx, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
@@ -25442,12 +24867,7 @@ define zeroext i32 @test_vpcmpsgeb_v16i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %_
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
-; NoVLX-NEXT: leaq -40(%rbp), %rsp
-; NoVLX-NEXT: popq %rbx
-; NoVLX-NEXT: popq %r12
-; NoVLX-NEXT: popq %r13
-; NoVLX-NEXT: popq %r14
-; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
; NoVLX-NEXT: retq
entry:
@@ -25469,30 +24889,15 @@ define zeroext i32 @test_vpcmpsgeb_v16i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64
; NoVLX-LABEL: test_vpcmpsgeb_v16i1_v32i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi912:
+; NoVLX-NEXT: .Lcfi767:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi913:
+; NoVLX-NEXT: .Lcfi768:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi914:
+; NoVLX-NEXT: .Lcfi769:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
-; NoVLX-NEXT: pushq %r15
-; NoVLX-NEXT: pushq %r14
-; NoVLX-NEXT: pushq %r13
-; NoVLX-NEXT: pushq %r12
-; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
-; NoVLX-NEXT: .Lcfi915:
-; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi916:
-; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi917:
-; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi918:
-; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi919:
-; NoVLX-NEXT: .cfi_offset %r15, -24
; NoVLX-NEXT: vmovdqa (%rdi), %xmm1
; NoVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
@@ -25504,64 +24909,64 @@ define zeroext i32 @test_vpcmpsgeb_v16i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64
; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $15, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $13, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: vmovd %ecx, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
@@ -25570,12 +24975,7 @@ define zeroext i32 @test_vpcmpsgeb_v16i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
-; NoVLX-NEXT: leaq -40(%rbp), %rsp
-; NoVLX-NEXT: popq %rbx
-; NoVLX-NEXT: popq %r12
-; NoVLX-NEXT: popq %r13
-; NoVLX-NEXT: popq %r14
-; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
; NoVLX-NEXT: retq
entry:
@@ -25599,30 +24999,15 @@ define zeroext i32 @test_masked_vpcmpsgeb_v16i1_v32i1_mask(i16 zeroext %__u, <2
; NoVLX-LABEL: test_masked_vpcmpsgeb_v16i1_v32i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi920:
+; NoVLX-NEXT: .Lcfi770:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi921:
+; NoVLX-NEXT: .Lcfi771:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi922:
+; NoVLX-NEXT: .Lcfi772:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
-; NoVLX-NEXT: pushq %r15
-; NoVLX-NEXT: pushq %r14
-; NoVLX-NEXT: pushq %r13
-; NoVLX-NEXT: pushq %r12
-; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
-; NoVLX-NEXT: .Lcfi923:
-; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi924:
-; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi925:
-; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi926:
-; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi927:
-; NoVLX-NEXT: .cfi_offset %r15, -24
; NoVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
@@ -25634,64 +25019,64 @@ define zeroext i32 @test_masked_vpcmpsgeb_v16i1_v32i1_mask(i16 zeroext %__u, <2
; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $15, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $13, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: vmovd %ecx, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
@@ -25700,12 +25085,7 @@ define zeroext i32 @test_masked_vpcmpsgeb_v16i1_v32i1_mask(i16 zeroext %__u, <2
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
-; NoVLX-NEXT: leaq -40(%rbp), %rsp
-; NoVLX-NEXT: popq %rbx
-; NoVLX-NEXT: popq %r12
-; NoVLX-NEXT: popq %r13
-; NoVLX-NEXT: popq %r14
-; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
; NoVLX-NEXT: retq
entry:
@@ -25730,30 +25110,15 @@ define zeroext i32 @test_masked_vpcmpsgeb_v16i1_v32i1_mask_mem(i16 zeroext %__u,
; NoVLX-LABEL: test_masked_vpcmpsgeb_v16i1_v32i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi928:
+; NoVLX-NEXT: .Lcfi773:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi929:
+; NoVLX-NEXT: .Lcfi774:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi930:
+; NoVLX-NEXT: .Lcfi775:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
-; NoVLX-NEXT: pushq %r15
-; NoVLX-NEXT: pushq %r14
-; NoVLX-NEXT: pushq %r13
-; NoVLX-NEXT: pushq %r12
-; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
-; NoVLX-NEXT: .Lcfi931:
-; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi932:
-; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi933:
-; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi934:
-; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi935:
-; NoVLX-NEXT: .cfi_offset %r15, -24
; NoVLX-NEXT: vmovdqa (%rsi), %xmm1
; NoVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
@@ -25766,64 +25131,64 @@ define zeroext i32 @test_masked_vpcmpsgeb_v16i1_v32i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $15, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $13, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: vmovd %ecx, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
@@ -25832,12 +25197,7 @@ define zeroext i32 @test_masked_vpcmpsgeb_v16i1_v32i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
-; NoVLX-NEXT: leaq -40(%rbp), %rsp
-; NoVLX-NEXT: popq %rbx
-; NoVLX-NEXT: popq %r12
-; NoVLX-NEXT: popq %r13
-; NoVLX-NEXT: popq %r14
-; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
; NoVLX-NEXT: retq
entry:
@@ -25863,12 +25223,12 @@ define zeroext i64 @test_vpcmpsgeb_v16i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %_
; NoVLX-LABEL: test_vpcmpsgeb_v16i1_v64i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi936:
+; NoVLX-NEXT: .Lcfi776:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi937:
+; NoVLX-NEXT: .Lcfi777:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi938:
+; NoVLX-NEXT: .Lcfi778:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: pushq %r15
; NoVLX-NEXT: pushq %r14
@@ -25877,15 +25237,15 @@ define zeroext i64 @test_vpcmpsgeb_v16i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %_
; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: .Lcfi939:
+; NoVLX-NEXT: .Lcfi779:
; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi940:
+; NoVLX-NEXT: .Lcfi780:
; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi941:
+; NoVLX-NEXT: .Lcfi781:
; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi942:
+; NoVLX-NEXT: .Lcfi782:
; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi943:
+; NoVLX-NEXT: .Lcfi783:
; NoVLX-NEXT: .cfi_offset %r15, -24
; NoVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
@@ -25893,10 +25253,6 @@ define zeroext i64 @test_vpcmpsgeb_v16i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %_
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
@@ -25939,11 +25295,11 @@ define zeroext i64 @test_vpcmpsgeb_v16i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %_
; NoVLX-NEXT: kshiftlw $2, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $1, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k1, %r8d
; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
@@ -25955,11 +25311,15 @@ define zeroext i64 @test_vpcmpsgeb_v16i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %_
; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpinsrb $14, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -25995,12 +25355,12 @@ define zeroext i64 @test_vpcmpsgeb_v16i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64
; NoVLX-LABEL: test_vpcmpsgeb_v16i1_v64i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi944:
+; NoVLX-NEXT: .Lcfi784:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi945:
+; NoVLX-NEXT: .Lcfi785:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi946:
+; NoVLX-NEXT: .Lcfi786:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: pushq %r15
; NoVLX-NEXT: pushq %r14
@@ -26009,15 +25369,15 @@ define zeroext i64 @test_vpcmpsgeb_v16i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64
; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: .Lcfi947:
+; NoVLX-NEXT: .Lcfi787:
; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi948:
+; NoVLX-NEXT: .Lcfi788:
; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi949:
+; NoVLX-NEXT: .Lcfi789:
; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi950:
+; NoVLX-NEXT: .Lcfi790:
; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi951:
+; NoVLX-NEXT: .Lcfi791:
; NoVLX-NEXT: .cfi_offset %r15, -24
; NoVLX-NEXT: vmovdqa (%rdi), %xmm1
; NoVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0
@@ -26026,10 +25386,6 @@ define zeroext i64 @test_vpcmpsgeb_v16i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
@@ -26072,11 +25428,11 @@ define zeroext i64 @test_vpcmpsgeb_v16i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64
; NoVLX-NEXT: kshiftlw $2, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $1, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k1, %r8d
; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
@@ -26088,11 +25444,15 @@ define zeroext i64 @test_vpcmpsgeb_v16i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64
; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpinsrb $14, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -26130,12 +25490,12 @@ define zeroext i64 @test_masked_vpcmpsgeb_v16i1_v64i1_mask(i16 zeroext %__u, <2
; NoVLX-LABEL: test_masked_vpcmpsgeb_v16i1_v64i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi952:
+; NoVLX-NEXT: .Lcfi792:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi953:
+; NoVLX-NEXT: .Lcfi793:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi954:
+; NoVLX-NEXT: .Lcfi794:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: pushq %r15
; NoVLX-NEXT: pushq %r14
@@ -26144,15 +25504,15 @@ define zeroext i64 @test_masked_vpcmpsgeb_v16i1_v64i1_mask(i16 zeroext %__u, <2
; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: .Lcfi955:
+; NoVLX-NEXT: .Lcfi795:
; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi956:
+; NoVLX-NEXT: .Lcfi796:
; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi957:
+; NoVLX-NEXT: .Lcfi797:
; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi958:
+; NoVLX-NEXT: .Lcfi798:
; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi959:
+; NoVLX-NEXT: .Lcfi799:
; NoVLX-NEXT: .cfi_offset %r15, -24
; NoVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
@@ -26161,10 +25521,6 @@ define zeroext i64 @test_masked_vpcmpsgeb_v16i1_v64i1_mask(i16 zeroext %__u, <2
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
@@ -26207,11 +25563,11 @@ define zeroext i64 @test_masked_vpcmpsgeb_v16i1_v64i1_mask(i16 zeroext %__u, <2
; NoVLX-NEXT: kshiftlw $2, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $1, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k1, %r8d
; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
@@ -26223,11 +25579,15 @@ define zeroext i64 @test_masked_vpcmpsgeb_v16i1_v64i1_mask(i16 zeroext %__u, <2
; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpinsrb $14, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -26266,12 +25626,12 @@ define zeroext i64 @test_masked_vpcmpsgeb_v16i1_v64i1_mask_mem(i16 zeroext %__u,
; NoVLX-LABEL: test_masked_vpcmpsgeb_v16i1_v64i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi960:
+; NoVLX-NEXT: .Lcfi800:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi961:
+; NoVLX-NEXT: .Lcfi801:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi962:
+; NoVLX-NEXT: .Lcfi802:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: pushq %r15
; NoVLX-NEXT: pushq %r14
@@ -26280,15 +25640,15 @@ define zeroext i64 @test_masked_vpcmpsgeb_v16i1_v64i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: .Lcfi963:
+; NoVLX-NEXT: .Lcfi803:
; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi964:
+; NoVLX-NEXT: .Lcfi804:
; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi965:
+; NoVLX-NEXT: .Lcfi805:
; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi966:
+; NoVLX-NEXT: .Lcfi806:
; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi967:
+; NoVLX-NEXT: .Lcfi807:
; NoVLX-NEXT: .cfi_offset %r15, -24
; NoVLX-NEXT: vmovdqa (%rsi), %xmm1
; NoVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0
@@ -26298,10 +25658,6 @@ define zeroext i64 @test_masked_vpcmpsgeb_v16i1_v64i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
@@ -26344,11 +25700,11 @@ define zeroext i64 @test_masked_vpcmpsgeb_v16i1_v64i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: kshiftlw $2, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $1, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k1, %r8d
; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
@@ -26360,11 +25716,15 @@ define zeroext i64 @test_masked_vpcmpsgeb_v16i1_v64i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpinsrb $14, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -26405,12 +25765,12 @@ define zeroext i64 @test_vpcmpsgeb_v32i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %_
; NoVLX-LABEL: test_vpcmpsgeb_v32i1_v64i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi968:
+; NoVLX-NEXT: .Lcfi808:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi969:
+; NoVLX-NEXT: .Lcfi809:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi970:
+; NoVLX-NEXT: .Lcfi810:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
@@ -26456,12 +25816,12 @@ define zeroext i64 @test_vpcmpsgeb_v32i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64
; NoVLX-LABEL: test_vpcmpsgeb_v32i1_v64i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi971:
+; NoVLX-NEXT: .Lcfi811:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi972:
+; NoVLX-NEXT: .Lcfi812:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi973:
+; NoVLX-NEXT: .Lcfi813:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
@@ -26510,12 +25870,12 @@ define zeroext i64 @test_masked_vpcmpsgeb_v32i1_v64i1_mask(i32 zeroext %__u, <4
; NoVLX-LABEL: test_masked_vpcmpsgeb_v32i1_v64i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi974:
+; NoVLX-NEXT: .Lcfi814:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi975:
+; NoVLX-NEXT: .Lcfi815:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi976:
+; NoVLX-NEXT: .Lcfi816:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $96, %rsp
@@ -26573,12 +25933,12 @@ define zeroext i64 @test_masked_vpcmpsgeb_v32i1_v64i1_mask_mem(i32 zeroext %__u,
; NoVLX-LABEL: test_masked_vpcmpsgeb_v32i1_v64i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi977:
+; NoVLX-NEXT: .Lcfi817:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi978:
+; NoVLX-NEXT: .Lcfi818:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi979:
+; NoVLX-NEXT: .Lcfi819:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $96, %rsp
@@ -26770,12 +26130,12 @@ define zeroext i32 @test_vpcmpsgew_v8i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__
; NoVLX-LABEL: test_vpcmpsgew_v8i1_v32i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi980:
+; NoVLX-NEXT: .Lcfi820:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi981:
+; NoVLX-NEXT: .Lcfi821:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi982:
+; NoVLX-NEXT: .Lcfi822:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -26847,12 +26207,12 @@ define zeroext i32 @test_vpcmpsgew_v8i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>
; NoVLX-LABEL: test_vpcmpsgew_v8i1_v32i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi983:
+; NoVLX-NEXT: .Lcfi823:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi984:
+; NoVLX-NEXT: .Lcfi824:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi985:
+; NoVLX-NEXT: .Lcfi825:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -26927,12 +26287,12 @@ define zeroext i32 @test_masked_vpcmpsgew_v8i1_v32i1_mask(i8 zeroext %__u, <2 x
; NoVLX-LABEL: test_masked_vpcmpsgew_v8i1_v32i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi986:
+; NoVLX-NEXT: .Lcfi826:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi987:
+; NoVLX-NEXT: .Lcfi827:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi988:
+; NoVLX-NEXT: .Lcfi828:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -27008,12 +26368,12 @@ define zeroext i32 @test_masked_vpcmpsgew_v8i1_v32i1_mask_mem(i8 zeroext %__u, <
; NoVLX-LABEL: test_masked_vpcmpsgew_v8i1_v32i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi989:
+; NoVLX-NEXT: .Lcfi829:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi990:
+; NoVLX-NEXT: .Lcfi830:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi991:
+; NoVLX-NEXT: .Lcfi831:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -27091,12 +26451,12 @@ define zeroext i64 @test_vpcmpsgew_v8i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__
; NoVLX-LABEL: test_vpcmpsgew_v8i1_v64i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi992:
+; NoVLX-NEXT: .Lcfi832:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi993:
+; NoVLX-NEXT: .Lcfi833:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi994:
+; NoVLX-NEXT: .Lcfi834:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
@@ -27106,43 +26466,43 @@ define zeroext i64 @test_vpcmpsgew_v8i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__
; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $15, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %ecx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -27173,12 +26533,12 @@ define zeroext i64 @test_vpcmpsgew_v8i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>
; NoVLX-LABEL: test_vpcmpsgew_v8i1_v64i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi995:
+; NoVLX-NEXT: .Lcfi835:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi996:
+; NoVLX-NEXT: .Lcfi836:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi997:
+; NoVLX-NEXT: .Lcfi837:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
@@ -27189,43 +26549,43 @@ define zeroext i64 @test_vpcmpsgew_v8i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>
; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $15, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %ecx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -27258,12 +26618,12 @@ define zeroext i64 @test_masked_vpcmpsgew_v8i1_v64i1_mask(i8 zeroext %__u, <2 x
; NoVLX-LABEL: test_masked_vpcmpsgew_v8i1_v64i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi998:
+; NoVLX-NEXT: .Lcfi838:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi999:
+; NoVLX-NEXT: .Lcfi839:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1000:
+; NoVLX-NEXT: .Lcfi840:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
@@ -27274,43 +26634,43 @@ define zeroext i64 @test_masked_vpcmpsgew_v8i1_v64i1_mask(i8 zeroext %__u, <2 x
; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $15, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %ecx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -27344,12 +26704,12 @@ define zeroext i64 @test_masked_vpcmpsgew_v8i1_v64i1_mask_mem(i8 zeroext %__u, <
; NoVLX-LABEL: test_masked_vpcmpsgew_v8i1_v64i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1001:
+; NoVLX-NEXT: .Lcfi841:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1002:
+; NoVLX-NEXT: .Lcfi842:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1003:
+; NoVLX-NEXT: .Lcfi843:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
@@ -27361,43 +26721,43 @@ define zeroext i64 @test_masked_vpcmpsgew_v8i1_v64i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $15, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %ecx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -27433,30 +26793,15 @@ define zeroext i32 @test_vpcmpsgew_v16i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %_
; NoVLX-LABEL: test_vpcmpsgew_v16i1_v32i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1004:
+; NoVLX-NEXT: .Lcfi844:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1005:
+; NoVLX-NEXT: .Lcfi845:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1006:
+; NoVLX-NEXT: .Lcfi846:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
-; NoVLX-NEXT: pushq %r15
-; NoVLX-NEXT: pushq %r14
-; NoVLX-NEXT: pushq %r13
-; NoVLX-NEXT: pushq %r12
-; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
-; NoVLX-NEXT: .Lcfi1007:
-; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi1008:
-; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi1009:
-; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi1010:
-; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi1011:
-; NoVLX-NEXT: .cfi_offset %r15, -24
; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0
@@ -27467,64 +26812,64 @@ define zeroext i32 @test_vpcmpsgew_v16i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %_
; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $15, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $13, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: vmovd %ecx, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
@@ -27533,12 +26878,7 @@ define zeroext i32 @test_vpcmpsgew_v16i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %_
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
-; NoVLX-NEXT: leaq -40(%rbp), %rsp
-; NoVLX-NEXT: popq %rbx
-; NoVLX-NEXT: popq %r12
-; NoVLX-NEXT: popq %r13
-; NoVLX-NEXT: popq %r14
-; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
; NoVLX-NEXT: retq
entry:
@@ -27561,30 +26901,15 @@ define zeroext i32 @test_vpcmpsgew_v16i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64
; NoVLX-LABEL: test_vpcmpsgew_v16i1_v32i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1012:
+; NoVLX-NEXT: .Lcfi847:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1013:
+; NoVLX-NEXT: .Lcfi848:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1014:
+; NoVLX-NEXT: .Lcfi849:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
-; NoVLX-NEXT: pushq %r15
-; NoVLX-NEXT: pushq %r14
-; NoVLX-NEXT: pushq %r13
-; NoVLX-NEXT: pushq %r12
-; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
-; NoVLX-NEXT: .Lcfi1015:
-; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi1016:
-; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi1017:
-; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi1018:
-; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi1019:
-; NoVLX-NEXT: .cfi_offset %r15, -24
; NoVLX-NEXT: vmovdqa (%rdi), %ymm1
; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
@@ -27596,64 +26921,64 @@ define zeroext i32 @test_vpcmpsgew_v16i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64
; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $15, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $13, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: vmovd %ecx, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
@@ -27662,12 +26987,7 @@ define zeroext i32 @test_vpcmpsgew_v16i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
-; NoVLX-NEXT: leaq -40(%rbp), %rsp
-; NoVLX-NEXT: popq %rbx
-; NoVLX-NEXT: popq %r12
-; NoVLX-NEXT: popq %r13
-; NoVLX-NEXT: popq %r14
-; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
; NoVLX-NEXT: retq
entry:
@@ -27692,30 +27012,15 @@ define zeroext i32 @test_masked_vpcmpsgew_v16i1_v32i1_mask(i16 zeroext %__u, <4
; NoVLX-LABEL: test_masked_vpcmpsgew_v16i1_v32i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1020:
+; NoVLX-NEXT: .Lcfi850:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1021:
+; NoVLX-NEXT: .Lcfi851:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1022:
+; NoVLX-NEXT: .Lcfi852:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
-; NoVLX-NEXT: pushq %r15
-; NoVLX-NEXT: pushq %r14
-; NoVLX-NEXT: pushq %r13
-; NoVLX-NEXT: pushq %r12
-; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
-; NoVLX-NEXT: .Lcfi1023:
-; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi1024:
-; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi1025:
-; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi1026:
-; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi1027:
-; NoVLX-NEXT: .cfi_offset %r15, -24
; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0
@@ -27727,64 +27032,64 @@ define zeroext i32 @test_masked_vpcmpsgew_v16i1_v32i1_mask(i16 zeroext %__u, <4
; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $15, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $13, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: vmovd %ecx, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
@@ -27793,12 +27098,7 @@ define zeroext i32 @test_masked_vpcmpsgew_v16i1_v32i1_mask(i16 zeroext %__u, <4
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
-; NoVLX-NEXT: leaq -40(%rbp), %rsp
-; NoVLX-NEXT: popq %rbx
-; NoVLX-NEXT: popq %r12
-; NoVLX-NEXT: popq %r13
-; NoVLX-NEXT: popq %r14
-; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
; NoVLX-NEXT: retq
entry:
@@ -27824,30 +27124,15 @@ define zeroext i32 @test_masked_vpcmpsgew_v16i1_v32i1_mask_mem(i16 zeroext %__u,
; NoVLX-LABEL: test_masked_vpcmpsgew_v16i1_v32i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1028:
+; NoVLX-NEXT: .Lcfi853:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1029:
+; NoVLX-NEXT: .Lcfi854:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1030:
+; NoVLX-NEXT: .Lcfi855:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
-; NoVLX-NEXT: pushq %r15
-; NoVLX-NEXT: pushq %r14
-; NoVLX-NEXT: pushq %r13
-; NoVLX-NEXT: pushq %r12
-; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
-; NoVLX-NEXT: .Lcfi1031:
-; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi1032:
-; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi1033:
-; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi1034:
-; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi1035:
-; NoVLX-NEXT: .cfi_offset %r15, -24
; NoVLX-NEXT: vmovdqa (%rsi), %ymm1
; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
@@ -27860,64 +27145,64 @@ define zeroext i32 @test_masked_vpcmpsgew_v16i1_v32i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $15, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $13, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: vmovd %ecx, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
@@ -27926,12 +27211,7 @@ define zeroext i32 @test_masked_vpcmpsgew_v16i1_v32i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
-; NoVLX-NEXT: leaq -40(%rbp), %rsp
-; NoVLX-NEXT: popq %rbx
-; NoVLX-NEXT: popq %r12
-; NoVLX-NEXT: popq %r13
-; NoVLX-NEXT: popq %r14
-; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
; NoVLX-NEXT: retq
entry:
@@ -27958,12 +27238,12 @@ define zeroext i64 @test_vpcmpsgew_v16i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %_
; NoVLX-LABEL: test_vpcmpsgew_v16i1_v64i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1036:
+; NoVLX-NEXT: .Lcfi856:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1037:
+; NoVLX-NEXT: .Lcfi857:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1038:
+; NoVLX-NEXT: .Lcfi858:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: pushq %r15
; NoVLX-NEXT: pushq %r14
@@ -27972,15 +27252,15 @@ define zeroext i64 @test_vpcmpsgew_v16i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %_
; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: .Lcfi1039:
+; NoVLX-NEXT: .Lcfi859:
; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi1040:
+; NoVLX-NEXT: .Lcfi860:
; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi1041:
+; NoVLX-NEXT: .Lcfi861:
; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi1042:
+; NoVLX-NEXT: .Lcfi862:
; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi1043:
+; NoVLX-NEXT: .Lcfi863:
; NoVLX-NEXT: .cfi_offset %r15, -24
; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
@@ -27988,10 +27268,6 @@ define zeroext i64 @test_vpcmpsgew_v16i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %_
; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
@@ -28034,11 +27310,11 @@ define zeroext i64 @test_vpcmpsgew_v16i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %_
; NoVLX-NEXT: kshiftlw $2, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $1, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k1, %r8d
; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
@@ -28050,11 +27326,15 @@ define zeroext i64 @test_vpcmpsgew_v16i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %_
; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpinsrb $14, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -28091,12 +27371,12 @@ define zeroext i64 @test_vpcmpsgew_v16i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64
; NoVLX-LABEL: test_vpcmpsgew_v16i1_v64i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1044:
+; NoVLX-NEXT: .Lcfi864:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1045:
+; NoVLX-NEXT: .Lcfi865:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1046:
+; NoVLX-NEXT: .Lcfi866:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: pushq %r15
; NoVLX-NEXT: pushq %r14
@@ -28105,15 +27385,15 @@ define zeroext i64 @test_vpcmpsgew_v16i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64
; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: .Lcfi1047:
+; NoVLX-NEXT: .Lcfi867:
; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi1048:
+; NoVLX-NEXT: .Lcfi868:
; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi1049:
+; NoVLX-NEXT: .Lcfi869:
; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi1050:
+; NoVLX-NEXT: .Lcfi870:
; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi1051:
+; NoVLX-NEXT: .Lcfi871:
; NoVLX-NEXT: .cfi_offset %r15, -24
; NoVLX-NEXT: vmovdqa (%rdi), %ymm1
; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
@@ -28122,10 +27402,6 @@ define zeroext i64 @test_vpcmpsgew_v16i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64
; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
@@ -28168,11 +27444,11 @@ define zeroext i64 @test_vpcmpsgew_v16i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64
; NoVLX-NEXT: kshiftlw $2, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $1, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k1, %r8d
; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
@@ -28184,11 +27460,15 @@ define zeroext i64 @test_vpcmpsgew_v16i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64
; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpinsrb $14, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -28227,12 +27507,12 @@ define zeroext i64 @test_masked_vpcmpsgew_v16i1_v64i1_mask(i16 zeroext %__u, <4
; NoVLX-LABEL: test_masked_vpcmpsgew_v16i1_v64i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1052:
+; NoVLX-NEXT: .Lcfi872:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1053:
+; NoVLX-NEXT: .Lcfi873:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1054:
+; NoVLX-NEXT: .Lcfi874:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: pushq %r15
; NoVLX-NEXT: pushq %r14
@@ -28241,15 +27521,15 @@ define zeroext i64 @test_masked_vpcmpsgew_v16i1_v64i1_mask(i16 zeroext %__u, <4
; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: .Lcfi1055:
+; NoVLX-NEXT: .Lcfi875:
; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi1056:
+; NoVLX-NEXT: .Lcfi876:
; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi1057:
+; NoVLX-NEXT: .Lcfi877:
; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi1058:
+; NoVLX-NEXT: .Lcfi878:
; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi1059:
+; NoVLX-NEXT: .Lcfi879:
; NoVLX-NEXT: .cfi_offset %r15, -24
; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
@@ -28258,10 +27538,6 @@ define zeroext i64 @test_masked_vpcmpsgew_v16i1_v64i1_mask(i16 zeroext %__u, <4
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
@@ -28304,11 +27580,11 @@ define zeroext i64 @test_masked_vpcmpsgew_v16i1_v64i1_mask(i16 zeroext %__u, <4
; NoVLX-NEXT: kshiftlw $2, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $1, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k1, %r8d
; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
@@ -28320,11 +27596,15 @@ define zeroext i64 @test_masked_vpcmpsgew_v16i1_v64i1_mask(i16 zeroext %__u, <4
; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpinsrb $14, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -28364,12 +27644,12 @@ define zeroext i64 @test_masked_vpcmpsgew_v16i1_v64i1_mask_mem(i16 zeroext %__u,
; NoVLX-LABEL: test_masked_vpcmpsgew_v16i1_v64i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1060:
+; NoVLX-NEXT: .Lcfi880:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1061:
+; NoVLX-NEXT: .Lcfi881:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1062:
+; NoVLX-NEXT: .Lcfi882:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: pushq %r15
; NoVLX-NEXT: pushq %r14
@@ -28378,15 +27658,15 @@ define zeroext i64 @test_masked_vpcmpsgew_v16i1_v64i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: .Lcfi1063:
+; NoVLX-NEXT: .Lcfi883:
; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi1064:
+; NoVLX-NEXT: .Lcfi884:
; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi1065:
+; NoVLX-NEXT: .Lcfi885:
; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi1066:
+; NoVLX-NEXT: .Lcfi886:
; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi1067:
+; NoVLX-NEXT: .Lcfi887:
; NoVLX-NEXT: .cfi_offset %r15, -24
; NoVLX-NEXT: vmovdqa (%rsi), %ymm1
; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
@@ -28396,10 +27676,6 @@ define zeroext i64 @test_masked_vpcmpsgew_v16i1_v64i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
@@ -28442,11 +27718,11 @@ define zeroext i64 @test_masked_vpcmpsgew_v16i1_v64i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: kshiftlw $2, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $1, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k1, %r8d
; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
@@ -28458,11 +27734,15 @@ define zeroext i64 @test_masked_vpcmpsgew_v16i1_v64i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpinsrb $14, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -28503,62 +27783,58 @@ define zeroext i64 @test_vpcmpsgew_v32i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %_
; NoVLX-LABEL: test_vpcmpsgew_v32i1_v64i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1068:
+; NoVLX-NEXT: .Lcfi888:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1069:
+; NoVLX-NEXT: .Lcfi889:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1070:
+; NoVLX-NEXT: .Lcfi890:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: vextracti32x4 $1, %zmm0, %xmm3
-; NoVLX-NEXT: vmovq %xmm3, %rax
+; NoVLX-NEXT: vextracti32x4 $1, %zmm0, %xmm2
+; NoVLX-NEXT: vmovq %xmm2, %rax
; NoVLX-NEXT: movq %rax, %rcx
; NoVLX-NEXT: movq %rax, %rdx
-; NoVLX-NEXT: vmovd %eax, %xmm2
+; NoVLX-NEXT: vmovd %eax, %xmm3
; NoVLX-NEXT: shrl $16, %eax
-; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm5
-; NoVLX-NEXT: vextracti32x4 $2, %zmm1, %xmm8
-; NoVLX-NEXT: vextracti32x4 $3, %zmm1, %xmm4
-; NoVLX-NEXT: vextracti32x4 $1, %zmm1, %xmm6
-; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm7
-; NoVLX-NEXT: vextracti32x4 $3, %zmm0, %xmm2
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3
; NoVLX-NEXT: shrq $32, %rdx
-; NoVLX-NEXT: vpinsrw $2, %edx, %xmm5, %xmm5
-; NoVLX-NEXT: vpextrq $1, %xmm3, %rax
+; NoVLX-NEXT: vpinsrw $2, %edx, %xmm3, %xmm3
+; NoVLX-NEXT: vpextrq $1, %xmm2, %rax
; NoVLX-NEXT: shrq $48, %rcx
-; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm5, %xmm3
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm2
; NoVLX-NEXT: movl %eax, %ecx
; NoVLX-NEXT: shrl $16, %ecx
-; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3
-; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2
; NoVLX-NEXT: movq %rax, %rcx
; NoVLX-NEXT: shrq $32, %rcx
-; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2
; NoVLX-NEXT: vmovq %xmm0, %rcx
; NoVLX-NEXT: shrq $48, %rax
-; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm8
; NoVLX-NEXT: movl %ecx, %eax
; NoVLX-NEXT: shrl $16, %eax
-; NoVLX-NEXT: vmovd %ecx, %xmm5
-; NoVLX-NEXT: vpinsrw $1, %eax, %xmm5, %xmm5
+; NoVLX-NEXT: vmovd %ecx, %xmm3
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3
; NoVLX-NEXT: movq %rcx, %rax
; NoVLX-NEXT: shrq $32, %rax
-; NoVLX-NEXT: vpinsrw $2, %eax, %xmm5, %xmm5
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3
; NoVLX-NEXT: vpextrq $1, %xmm0, %rax
; NoVLX-NEXT: shrq $48, %rcx
-; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm5, %xmm0
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm3
; NoVLX-NEXT: movl %eax, %ecx
; NoVLX-NEXT: shrl $16, %ecx
-; NoVLX-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3
; NoVLX-NEXT: movq %rax, %rcx
; NoVLX-NEXT: shrq $32, %rcx
-; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: vmovq %xmm2, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: vextracti32x4 $3, %zmm0, %xmm4
+; NoVLX-NEXT: vmovq %xmm4, %rcx
; NoVLX-NEXT: shrq $48, %rax
-; NoVLX-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm3
; NoVLX-NEXT: movl %ecx, %eax
; NoVLX-NEXT: shrl $16, %eax
; NoVLX-NEXT: vmovd %ecx, %xmm5
@@ -28566,79 +27842,82 @@ define zeroext i64 @test_vpcmpsgew_v32i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %_
; NoVLX-NEXT: movq %rcx, %rax
; NoVLX-NEXT: shrq $32, %rax
; NoVLX-NEXT: vpinsrw $2, %eax, %xmm5, %xmm5
-; NoVLX-NEXT: vpextrq $1, %xmm2, %rax
+; NoVLX-NEXT: vpextrq $1, %xmm4, %rax
; NoVLX-NEXT: shrq $48, %rcx
-; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm5, %xmm2
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm5, %xmm4
; NoVLX-NEXT: movl %eax, %ecx
; NoVLX-NEXT: shrl $16, %ecx
-; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm4, %xmm4
; NoVLX-NEXT: movq %rax, %rcx
; NoVLX-NEXT: shrq $32, %rcx
-; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2
-; NoVLX-NEXT: vmovq %xmm7, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm4, %xmm4
+; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm5
+; NoVLX-NEXT: vmovq %xmm5, %rcx
; NoVLX-NEXT: shrq $48, %rax
-; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm5
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm4, %xmm0
; NoVLX-NEXT: movl %ecx, %eax
; NoVLX-NEXT: shrl $16, %eax
-; NoVLX-NEXT: vmovd %ecx, %xmm2
-; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vmovd %ecx, %xmm4
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm4, %xmm4
; NoVLX-NEXT: movq %rcx, %rax
; NoVLX-NEXT: shrq $32, %rax
-; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: vpextrq $1, %xmm7, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: vpextrq $1, %xmm5, %rax
; NoVLX-NEXT: shrq $48, %rcx
-; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm4, %xmm4
; NoVLX-NEXT: movl %eax, %ecx
; NoVLX-NEXT: shrl $16, %ecx
-; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm4, %xmm4
; NoVLX-NEXT: movq %rax, %rcx
; NoVLX-NEXT: shrq $32, %rcx
-; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2
-; NoVLX-NEXT: vmovq %xmm6, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm4, %xmm4
+; NoVLX-NEXT: vextracti32x4 $1, %zmm1, %xmm5
+; NoVLX-NEXT: vmovq %xmm5, %rcx
; NoVLX-NEXT: shrq $48, %rax
-; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm7
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm4, %xmm4
; NoVLX-NEXT: movl %ecx, %eax
; NoVLX-NEXT: shrl $16, %eax
-; NoVLX-NEXT: vmovd %ecx, %xmm2
-; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vmovd %ecx, %xmm6
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm6, %xmm6
; NoVLX-NEXT: movq %rcx, %rax
; NoVLX-NEXT: shrq $32, %rax
-; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: vpextrq $1, %xmm6, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm6, %xmm6
+; NoVLX-NEXT: vpextrq $1, %xmm5, %rax
; NoVLX-NEXT: shrq $48, %rcx
-; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm6, %xmm5
; NoVLX-NEXT: movl %eax, %ecx
; NoVLX-NEXT: shrl $16, %ecx
-; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm5, %xmm5
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm5, %xmm5
; NoVLX-NEXT: movq %rax, %rcx
; NoVLX-NEXT: shrq $32, %rcx
-; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm5, %xmm5
; NoVLX-NEXT: vmovq %xmm1, %rcx
; NoVLX-NEXT: shrq $48, %rax
-; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm6
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm5, %xmm5
; NoVLX-NEXT: movl %ecx, %eax
; NoVLX-NEXT: shrl $16, %eax
-; NoVLX-NEXT: vmovd %ecx, %xmm2
-; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vmovd %ecx, %xmm6
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm6, %xmm6
; NoVLX-NEXT: movq %rcx, %rax
; NoVLX-NEXT: shrq $32, %rax
-; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm6, %xmm6
; NoVLX-NEXT: vpextrq $1, %xmm1, %rax
; NoVLX-NEXT: shrq $48, %rcx
-; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm1
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm6, %xmm6
; NoVLX-NEXT: movl %eax, %ecx
; NoVLX-NEXT: shrl $16, %ecx
-; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm6, %xmm6
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm6, %xmm6
; NoVLX-NEXT: movq %rax, %rcx
; NoVLX-NEXT: shrq $32, %rcx
-; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1
-; NoVLX-NEXT: vmovq %xmm4, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm6, %xmm6
+; NoVLX-NEXT: vextracti32x4 $3, %zmm1, %xmm7
+; NoVLX-NEXT: vmovq %xmm7, %rcx
; NoVLX-NEXT: shrq $48, %rax
-; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm6, %xmm6
; NoVLX-NEXT: movl %ecx, %eax
; NoVLX-NEXT: shrl $16, %eax
; NoVLX-NEXT: vmovd %ecx, %xmm2
@@ -28646,7 +27925,7 @@ define zeroext i64 @test_vpcmpsgew_v32i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %_
; NoVLX-NEXT: movq %rcx, %rax
; NoVLX-NEXT: shrq $32, %rax
; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: vpextrq $1, %xmm4, %rax
+; NoVLX-NEXT: vpextrq $1, %xmm7, %rax
; NoVLX-NEXT: shrq $48, %rcx
; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm2
; NoVLX-NEXT: movl %eax, %ecx
@@ -28654,34 +27933,35 @@ define zeroext i64 @test_vpcmpsgew_v32i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %_
; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2
; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2
; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $48, %rax
; NoVLX-NEXT: shrq $32, %rcx
; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2
-; NoVLX-NEXT: vmovq %xmm8, %rcx
-; NoVLX-NEXT: shrq $48, %rax
; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: movl %ecx, %eax
-; NoVLX-NEXT: shrl $16, %eax
-; NoVLX-NEXT: vmovd %ecx, %xmm4
-; NoVLX-NEXT: vpinsrw $1, %eax, %xmm4, %xmm4
-; NoVLX-NEXT: movq %rcx, %rax
-; NoVLX-NEXT: shrq $32, %rax
-; NoVLX-NEXT: vpinsrw $2, %eax, %xmm4, %xmm4
-; NoVLX-NEXT: vpextrq $1, %xmm8, %rax
-; NoVLX-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0
-; NoVLX-NEXT: vinserti128 $1, %xmm5, %ymm7, %ymm3
-; NoVLX-NEXT: vinserti128 $1, %xmm6, %ymm1, %ymm1
-; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
-; NoVLX-NEXT: shrq $48, %rcx
-; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm4, %xmm1
+; NoVLX-NEXT: vextracti32x4 $2, %zmm1, %xmm1
+; NoVLX-NEXT: vmovq %xmm1, %rax
; NoVLX-NEXT: movl %eax, %ecx
; NoVLX-NEXT: shrl $16, %ecx
-; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT: vmovd %eax, %xmm7
+; NoVLX-NEXT: vpinsrw $1, %ecx, %xmm7, %xmm7
; NoVLX-NEXT: movq %rax, %rcx
-; NoVLX-NEXT: shrq $48, %rax
; NoVLX-NEXT: shrq $32, %rcx
-; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1
-; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrw $2, %ecx, %xmm7, %xmm7
+; NoVLX-NEXT: vpextrq $1, %xmm1, %rcx
+; NoVLX-NEXT: vinserti128 $1, %xmm8, %ymm3, %ymm1
+; NoVLX-NEXT: vinserti128 $1, %xmm0, %ymm4, %ymm3
+; NoVLX-NEXT: vinserti128 $1, %xmm5, %ymm6, %ymm0
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpinsrw $3, %eax, %xmm7, %xmm4
+; NoVLX-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: movl %ecx, %eax
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vpinsrw $4, %ecx, %xmm4, %xmm1
+; NoVLX-NEXT: vpinsrw $5, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: movq %rcx, %rax
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: shrq $32, %rax
+; NoVLX-NEXT: vpinsrw $6, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrw $7, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
; NoVLX-NEXT: vpcmpgtw %ymm3, %ymm1, %ymm2
; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
@@ -28857,69 +28137,68 @@ define zeroext i64 @test_vpcmpsgew_v32i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64
; NoVLX-LABEL: test_vpcmpsgew_v32i1_v64i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1071:
+; NoVLX-NEXT: .Lcfi891:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1072:
+; NoVLX-NEXT: .Lcfi892:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1073:
+; NoVLX-NEXT: .Lcfi893:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: vextracti32x4 $1, %zmm0, %xmm2
-; NoVLX-NEXT: vmovq %xmm2, %rax
+; NoVLX-NEXT: vextracti32x4 $1, %zmm0, %xmm1
+; NoVLX-NEXT: vmovq %xmm1, %rax
; NoVLX-NEXT: movq %rax, %rcx
; NoVLX-NEXT: movq %rax, %rdx
-; NoVLX-NEXT: vmovd %eax, %xmm1
+; NoVLX-NEXT: vmovd %eax, %xmm2
; NoVLX-NEXT: shrl $16, %eax
-; NoVLX-NEXT: vpinsrw $1, %eax, %xmm1, %xmm3
-; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm1
-; NoVLX-NEXT: vextracti32x4 $3, %zmm0, %xmm4
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2
; NoVLX-NEXT: shrq $32, %rdx
-; NoVLX-NEXT: vpinsrw $2, %edx, %xmm3, %xmm3
-; NoVLX-NEXT: vpextrq $1, %xmm2, %rax
+; NoVLX-NEXT: vpinsrw $2, %edx, %xmm2, %xmm2
+; NoVLX-NEXT: vpextrq $1, %xmm1, %rax
; NoVLX-NEXT: shrq $48, %rcx
-; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm2
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm1
; NoVLX-NEXT: movl %eax, %ecx
; NoVLX-NEXT: shrl $16, %ecx
-; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: movq %rax, %rcx
; NoVLX-NEXT: shrq $32, %rcx
-; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vmovq %xmm0, %rcx
; NoVLX-NEXT: shrq $48, %rax
-; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1
; NoVLX-NEXT: movl %ecx, %eax
; NoVLX-NEXT: shrl $16, %eax
-; NoVLX-NEXT: vmovd %ecx, %xmm3
-; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vmovd %ecx, %xmm2
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2
; NoVLX-NEXT: movq %rcx, %rax
; NoVLX-NEXT: shrq $32, %rax
-; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2
; NoVLX-NEXT: vpextrq $1, %xmm0, %rax
; NoVLX-NEXT: shrq $48, %rcx
-; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm0
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm2
; NoVLX-NEXT: movl %eax, %ecx
; NoVLX-NEXT: shrl $16, %ecx
-; NoVLX-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2
; NoVLX-NEXT: movq %rax, %rcx
; NoVLX-NEXT: shrq $32, %rcx
-; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: vmovq %xmm4, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: vextracti32x4 $3, %zmm0, %xmm3
+; NoVLX-NEXT: vmovq %xmm3, %rcx
; NoVLX-NEXT: shrq $48, %rax
-; NoVLX-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm2
; NoVLX-NEXT: movl %ecx, %eax
; NoVLX-NEXT: shrl $16, %eax
-; NoVLX-NEXT: vmovd %ecx, %xmm3
-; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vmovd %ecx, %xmm4
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm4, %xmm4
; NoVLX-NEXT: movq %rcx, %rax
; NoVLX-NEXT: shrq $32, %rax
-; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3
-; NoVLX-NEXT: vpextrq $1, %xmm4, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: vpextrq $1, %xmm3, %rax
; NoVLX-NEXT: shrq $48, %rcx
-; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm4, %xmm3
; NoVLX-NEXT: movl %eax, %ecx
; NoVLX-NEXT: shrl $16, %ecx
; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3
@@ -28927,7 +28206,8 @@ define zeroext i64 @test_vpcmpsgew_v32i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64
; NoVLX-NEXT: movq %rax, %rcx
; NoVLX-NEXT: shrq $32, %rcx
; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3
-; NoVLX-NEXT: vmovq %xmm1, %rcx
+; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm0
+; NoVLX-NEXT: vmovq %xmm0, %rcx
; NoVLX-NEXT: shrq $48, %rax
; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm3
; NoVLX-NEXT: movl %ecx, %eax
@@ -28937,24 +28217,24 @@ define zeroext i64 @test_vpcmpsgew_v32i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64
; NoVLX-NEXT: movq %rcx, %rax
; NoVLX-NEXT: shrq $32, %rax
; NoVLX-NEXT: vpinsrw $2, %eax, %xmm4, %xmm4
-; NoVLX-NEXT: vpextrq $1, %xmm1, %rax
+; NoVLX-NEXT: vpextrq $1, %xmm0, %rax
; NoVLX-NEXT: shrq $48, %rcx
-; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm4, %xmm1
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm4, %xmm0
; NoVLX-NEXT: movl %eax, %ecx
; NoVLX-NEXT: shrl $16, %ecx
-; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: movq %rax, %rcx
; NoVLX-NEXT: shrq $32, %rcx
-; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1
-; NoVLX-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: shrq $48, %rax
-; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
-; NoVLX-NEXT: vmovdqa (%rdi), %ymm2
-; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm2, %ymm0
-; NoVLX-NEXT: vmovdqa 32(%rdi), %ymm2
-; NoVLX-NEXT: vpcmpgtw %ymm1, %ymm2, %ymm2
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
+; NoVLX-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm2
+; NoVLX-NEXT: vmovdqa (%rdi), %ymm0
+; NoVLX-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vmovdqa 32(%rdi), %ymm1
+; NoVLX-NEXT: vpcmpgtw %ymm2, %ymm1, %ymm2
; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
; NoVLX-NEXT: vpxor %ymm1, %ymm2, %ymm2
; NoVLX-NEXT: vpmovsxwd %ymm2, %zmm2
@@ -29130,12 +28410,12 @@ define zeroext i64 @test_masked_vpcmpsgew_v32i1_v64i1_mask(i32 zeroext %__u, <8
; NoVLX-LABEL: test_masked_vpcmpsgew_v32i1_v64i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1074:
+; NoVLX-NEXT: .Lcfi894:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1075:
+; NoVLX-NEXT: .Lcfi895:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1076:
+; NoVLX-NEXT: .Lcfi896:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $96, %rsp
@@ -29146,17 +28426,12 @@ define zeroext i64 @test_masked_vpcmpsgew_v32i1_v64i1_mask(i32 zeroext %__u, <8
; NoVLX-NEXT: movq %rax, %rdx
; NoVLX-NEXT: vmovd %eax, %xmm3
; NoVLX-NEXT: shrl $16, %eax
-; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm4
-; NoVLX-NEXT: vextracti32x4 $1, %zmm1, %xmm8
-; NoVLX-NEXT: vextracti32x4 $2, %zmm1, %xmm5
-; NoVLX-NEXT: vextracti32x4 $3, %zmm1, %xmm7
-; NoVLX-NEXT: vextracti32x4 $1, %zmm0, %xmm6
-; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm3
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3
; NoVLX-NEXT: shrq $32, %rdx
-; NoVLX-NEXT: vpinsrw $2, %edx, %xmm4, %xmm4
+; NoVLX-NEXT: vpinsrw $2, %edx, %xmm3, %xmm3
; NoVLX-NEXT: vpextrq $1, %xmm2, %rax
; NoVLX-NEXT: shrq $48, %rcx
-; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm4, %xmm2
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm2
; NoVLX-NEXT: movl %eax, %ecx
; NoVLX-NEXT: shrl $16, %ecx
; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2
@@ -29164,59 +28439,61 @@ define zeroext i64 @test_masked_vpcmpsgew_v32i1_v64i1_mask(i32 zeroext %__u, <8
; NoVLX-NEXT: movq %rax, %rcx
; NoVLX-NEXT: shrq $32, %rcx
; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm3
; NoVLX-NEXT: vmovq %xmm3, %rcx
; NoVLX-NEXT: shrq $48, %rax
-; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm9
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm4
; NoVLX-NEXT: movl %ecx, %eax
; NoVLX-NEXT: shrl $16, %eax
-; NoVLX-NEXT: vmovd %ecx, %xmm4
-; NoVLX-NEXT: vpinsrw $1, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: vmovd %ecx, %xmm2
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2
; NoVLX-NEXT: movq %rcx, %rax
; NoVLX-NEXT: shrq $32, %rax
-; NoVLX-NEXT: vpinsrw $2, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2
; NoVLX-NEXT: vpextrq $1, %xmm3, %rax
; NoVLX-NEXT: shrq $48, %rcx
-; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm4, %xmm3
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm2
; NoVLX-NEXT: movl %eax, %ecx
; NoVLX-NEXT: shrl $16, %ecx
-; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3
-; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2
; NoVLX-NEXT: movq %rax, %rcx
; NoVLX-NEXT: shrq $32, %rcx
-; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3
-; NoVLX-NEXT: vmovq %xmm6, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: vextracti32x4 $1, %zmm0, %xmm3
+; NoVLX-NEXT: vmovq %xmm3, %rcx
; NoVLX-NEXT: shrq $48, %rax
-; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm4
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm5
; NoVLX-NEXT: movl %ecx, %eax
; NoVLX-NEXT: shrl $16, %eax
-; NoVLX-NEXT: vmovd %ecx, %xmm3
-; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vmovd %ecx, %xmm2
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2
; NoVLX-NEXT: movq %rcx, %rax
; NoVLX-NEXT: shrq $32, %rax
-; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3
-; NoVLX-NEXT: vpextrq $1, %xmm6, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpextrq $1, %xmm3, %rax
; NoVLX-NEXT: shrq $48, %rcx
-; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm2
; NoVLX-NEXT: movl %eax, %ecx
; NoVLX-NEXT: shrl $16, %ecx
-; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3
-; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2
; NoVLX-NEXT: movq %rax, %rcx
; NoVLX-NEXT: shrq $32, %rcx
-; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2
; NoVLX-NEXT: vmovq %xmm0, %rcx
; NoVLX-NEXT: shrq $48, %rax
-; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm6
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm6
; NoVLX-NEXT: movl %ecx, %eax
; NoVLX-NEXT: shrl $16, %eax
-; NoVLX-NEXT: vmovd %ecx, %xmm3
-; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vmovd %ecx, %xmm2
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2
; NoVLX-NEXT: movq %rcx, %rax
; NoVLX-NEXT: shrq $32, %rax
-; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2
; NoVLX-NEXT: vpextrq $1, %xmm0, %rax
; NoVLX-NEXT: shrq $48, %rcx
-; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm0
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm0
; NoVLX-NEXT: movl %eax, %ecx
; NoVLX-NEXT: shrl $16, %ecx
; NoVLX-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0
@@ -29224,7 +28501,8 @@ define zeroext i64 @test_masked_vpcmpsgew_v32i1_v64i1_mask(i32 zeroext %__u, <8
; NoVLX-NEXT: movq %rax, %rcx
; NoVLX-NEXT: shrq $32, %rcx
; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: vmovq %xmm7, %rcx
+; NoVLX-NEXT: vextracti32x4 $3, %zmm1, %xmm2
+; NoVLX-NEXT: vmovq %xmm2, %rcx
; NoVLX-NEXT: shrq $48, %rax
; NoVLX-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0
; NoVLX-NEXT: movl %ecx, %eax
@@ -29234,29 +28512,30 @@ define zeroext i64 @test_masked_vpcmpsgew_v32i1_v64i1_mask(i32 zeroext %__u, <8
; NoVLX-NEXT: movq %rcx, %rax
; NoVLX-NEXT: shrq $32, %rax
; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3
-; NoVLX-NEXT: vpextrq $1, %xmm7, %rax
+; NoVLX-NEXT: vpextrq $1, %xmm2, %rax
; NoVLX-NEXT: shrq $48, %rcx
-; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm2
; NoVLX-NEXT: movl %eax, %ecx
; NoVLX-NEXT: shrl $16, %ecx
-; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3
-; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2
; NoVLX-NEXT: movq %rax, %rcx
; NoVLX-NEXT: shrq $32, %rcx
-; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3
-; NoVLX-NEXT: vmovq %xmm5, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: vextracti32x4 $2, %zmm1, %xmm3
+; NoVLX-NEXT: vmovq %xmm3, %rcx
; NoVLX-NEXT: shrq $48, %rax
-; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm7
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm8
; NoVLX-NEXT: movl %ecx, %eax
; NoVLX-NEXT: shrl $16, %eax
-; NoVLX-NEXT: vmovd %ecx, %xmm3
-; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vmovd %ecx, %xmm7
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm7, %xmm7
; NoVLX-NEXT: movq %rcx, %rax
; NoVLX-NEXT: shrq $32, %rax
-; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3
-; NoVLX-NEXT: vpextrq $1, %xmm5, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm7, %xmm7
+; NoVLX-NEXT: vpextrq $1, %xmm3, %rax
; NoVLX-NEXT: shrq $48, %rcx
-; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm7, %xmm3
; NoVLX-NEXT: movl %eax, %ecx
; NoVLX-NEXT: shrl $16, %ecx
; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3
@@ -29264,47 +28543,18 @@ define zeroext i64 @test_masked_vpcmpsgew_v32i1_v64i1_mask(i32 zeroext %__u, <8
; NoVLX-NEXT: movq %rax, %rcx
; NoVLX-NEXT: shrq $32, %rcx
; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3
-; NoVLX-NEXT: vmovq %xmm8, %rcx
+; NoVLX-NEXT: vextracti32x4 $1, %zmm1, %xmm7
+; NoVLX-NEXT: vmovq %xmm7, %rcx
; NoVLX-NEXT: shrq $48, %rax
; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm3
; NoVLX-NEXT: movl %ecx, %eax
; NoVLX-NEXT: shrl $16, %eax
-; NoVLX-NEXT: vmovd %ecx, %xmm5
-; NoVLX-NEXT: vpinsrw $1, %eax, %xmm5, %xmm5
-; NoVLX-NEXT: movq %rcx, %rax
-; NoVLX-NEXT: shrq $32, %rax
-; NoVLX-NEXT: vpinsrw $2, %eax, %xmm5, %xmm5
-; NoVLX-NEXT: vpextrq $1, %xmm8, %rax
-; NoVLX-NEXT: shrq $48, %rcx
-; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm5, %xmm5
-; NoVLX-NEXT: movl %eax, %ecx
-; NoVLX-NEXT: shrl $16, %ecx
-; NoVLX-NEXT: vpinsrw $4, %eax, %xmm5, %xmm5
-; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm5, %xmm5
-; NoVLX-NEXT: movq %rax, %rcx
-; NoVLX-NEXT: shrq $32, %rcx
-; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm5, %xmm5
-; NoVLX-NEXT: vmovq %xmm1, %rcx
-; NoVLX-NEXT: shrq $48, %rax
-; NoVLX-NEXT: vpinsrw $7, %eax, %xmm5, %xmm5
-; NoVLX-NEXT: movl %ecx, %eax
-; NoVLX-NEXT: shrl $16, %eax
; NoVLX-NEXT: vmovd %ecx, %xmm2
; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2
; NoVLX-NEXT: movq %rcx, %rax
; NoVLX-NEXT: shrq $32, %rax
; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: vpextrq $1, %xmm1, %rax
-; NoVLX-NEXT: vinserti128 $1, %xmm9, %ymm4, %ymm1
-; NoVLX-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm4
-; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k1
-; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k2
-; NoVLX-NEXT: vinserti128 $1, %xmm7, %ymm3, %ymm3
-; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
-; NoVLX-NEXT: vpcmpgtw %ymm1, %ymm3, %ymm3
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k2} {z}
-; NoVLX-NEXT: vpmovdb %zmm1, %xmm1
+; NoVLX-NEXT: vpextrq $1, %xmm7, %rax
; NoVLX-NEXT: shrq $48, %rcx
; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm2
; NoVLX-NEXT: movl %eax, %ecx
@@ -29316,7 +28566,31 @@ define zeroext i64 @test_masked_vpcmpsgew_v32i1_v64i1_mask(i32 zeroext %__u, <8
; NoVLX-NEXT: shrq $32, %rcx
; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2
; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: vinserti128 $1, %xmm5, %ymm2, %ymm2
+; NoVLX-NEXT: vmovq %xmm1, %rax
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vmovd %eax, %xmm7
+; NoVLX-NEXT: vpinsrw $1, %ecx, %xmm7, %xmm7
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $2, %ecx, %xmm7, %xmm7
+; NoVLX-NEXT: vpextrq $1, %xmm1, %rcx
+; NoVLX-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm1
+; NoVLX-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm4
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpinsrw $3, %eax, %xmm7, %xmm0
+; NoVLX-NEXT: movl %ecx, %eax
+; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vpinsrw $4, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k1
+; NoVLX-NEXT: movq %rcx, %rax
+; NoVLX-NEXT: shrq $32, %rax
+; NoVLX-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: shrq $48, %rcx
+; NoVLX-NEXT: vpinsrw $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vpcmpgtw %ymm4, %ymm2, %ymm2
; NoVLX-NEXT: vpcmpeqd %ymm4, %ymm4, %ymm4
; NoVLX-NEXT: vpxor %ymm4, %ymm2, %ymm2
@@ -29383,77 +28657,83 @@ define zeroext i64 @test_masked_vpcmpsgew_v32i1_v64i1_mask(i32 zeroext %__u, <8
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vinserti128 $1, %xmm8, %ymm3, %ymm3
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: vpxor %ymm4, %ymm3, %ymm3
-; NoVLX-NEXT: vpmovsxwd %ymm3, %zmm3
-; NoVLX-NEXT: vpslld $31, %zmm3, %zmm3
-; NoVLX-NEXT: vptestmd %zmm3, %zmm3, %k0
+; NoVLX-NEXT: vpcmpgtw %ymm1, %ymm3, %ymm1
+; NoVLX-NEXT: vpxor %ymm4, %ymm1, %ymm1
+; NoVLX-NEXT: vpmovsxwd %ymm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $15, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm3
-; NoVLX-NEXT: vpinsrb $1, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kshiftlw $13, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $2, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $3, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kshiftlw $11, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kshiftlw $10, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kshiftlw $9, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kshiftlw $8, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kshiftlw $7, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kshiftlw $6, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $9, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kshiftlw $5, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $10, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kshiftlw $4, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kshiftlw $3, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $12, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kshiftlw $2, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $13, %eax, %xmm3, %xmm3
; NoVLX-NEXT: kshiftlw $1, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k1
+; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k1} {z}
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $15, %eax, %xmm3, %xmm3
-; NoVLX-NEXT: vpand %xmm1, %xmm3, %xmm1
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpmovdb %zmm3, %xmm3
+; NoVLX-NEXT: vpand %xmm3, %xmm1, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
@@ -29496,12 +28776,12 @@ define zeroext i64 @test_masked_vpcmpsgew_v32i1_v64i1_mask_mem(i32 zeroext %__u,
; NoVLX-LABEL: test_masked_vpcmpsgew_v32i1_v64i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1077:
+; NoVLX-NEXT: .Lcfi897:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1078:
+; NoVLX-NEXT: .Lcfi898:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1079:
+; NoVLX-NEXT: .Lcfi899:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $96, %rsp
@@ -29513,8 +28793,6 @@ define zeroext i64 @test_masked_vpcmpsgew_v32i1_v64i1_mask_mem(i32 zeroext %__u,
; NoVLX-NEXT: vmovd %eax, %xmm2
; NoVLX-NEXT: shrl $16, %eax
; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: vextracti32x4 $1, %zmm0, %xmm3
-; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm4
; NoVLX-NEXT: shrq $32, %rdx
; NoVLX-NEXT: vpinsrw $2, %edx, %xmm2, %xmm2
; NoVLX-NEXT: vpextrq $1, %xmm1, %rax
@@ -29527,19 +28805,20 @@ define zeroext i64 @test_masked_vpcmpsgew_v32i1_v64i1_mask_mem(i32 zeroext %__u,
; NoVLX-NEXT: movq %rax, %rcx
; NoVLX-NEXT: shrq $32, %rcx
; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1
-; NoVLX-NEXT: vmovq %xmm4, %rcx
+; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm2
+; NoVLX-NEXT: vmovq %xmm2, %rcx
; NoVLX-NEXT: shrq $48, %rax
; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1
; NoVLX-NEXT: movl %ecx, %eax
; NoVLX-NEXT: shrl $16, %eax
-; NoVLX-NEXT: vmovd %ecx, %xmm2
-; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vmovd %ecx, %xmm3
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3
; NoVLX-NEXT: movq %rcx, %rax
; NoVLX-NEXT: shrq $32, %rax
-; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: vpextrq $1, %xmm4, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpextrq $1, %xmm2, %rax
; NoVLX-NEXT: shrq $48, %rcx
-; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm2
; NoVLX-NEXT: movl %eax, %ecx
; NoVLX-NEXT: shrl $16, %ecx
; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2
@@ -29547,6 +28826,7 @@ define zeroext i64 @test_masked_vpcmpsgew_v32i1_v64i1_mask_mem(i32 zeroext %__u,
; NoVLX-NEXT: movq %rax, %rcx
; NoVLX-NEXT: shrq $32, %rcx
; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: vextracti32x4 $1, %zmm0, %xmm3
; NoVLX-NEXT: vmovq %xmm3, %rcx
; NoVLX-NEXT: shrq $48, %rax
; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm2
@@ -29588,160 +28868,160 @@ define zeroext i64 @test_masked_vpcmpsgew_v32i1_v64i1_mask_mem(i32 zeroext %__u,
; NoVLX-NEXT: shrq $32, %rcx
; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm4
; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k1
-; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k2
; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
-; NoVLX-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm2
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k2} {z}
-; NoVLX-NEXT: vpmovdb %zmm1, %xmm1
; NoVLX-NEXT: shrq $48, %rax
-; NoVLX-NEXT: vpinsrw $7, %eax, %xmm4, %xmm4
-; NoVLX-NEXT: vinserti128 $1, %xmm3, %ymm4, %ymm3
-; NoVLX-NEXT: vmovdqa (%rsi), %ymm4
-; NoVLX-NEXT: vpcmpgtw %ymm3, %ymm4, %ymm5
-; NoVLX-NEXT: vmovdqa 32(%rsi), %ymm3
-; NoVLX-NEXT: vpcmpgtw %ymm2, %ymm3, %ymm3
-; NoVLX-NEXT: vpcmpeqd %ymm4, %ymm4, %ymm4
-; NoVLX-NEXT: vpxor %ymm4, %ymm5, %ymm2
-; NoVLX-NEXT: vpmovsxwd %ymm2, %zmm2
-; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2
-; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm2
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm4, %xmm1
+; NoVLX-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
+; NoVLX-NEXT: vmovdqa (%rsi), %ymm3
+; NoVLX-NEXT: vpcmpgtw %ymm1, %ymm3, %ymm1
+; NoVLX-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3
+; NoVLX-NEXT: vpxor %ymm3, %ymm1, %ymm1
+; NoVLX-NEXT: vpmovsxwd %ymm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm2
-; NoVLX-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vmovd %eax, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kshiftlw $13, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kshiftlw $11, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kshiftlw $10, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kshiftlw $9, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kshiftlw $8, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kshiftlw $7, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kshiftlw $6, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kshiftlw $5, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kshiftlw $4, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kshiftlw $3, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kshiftlw $2, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kshiftlw $1, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vmovdqa 32(%rsi), %ymm4
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: vpxor %ymm4, %ymm3, %ymm3
-; NoVLX-NEXT: vpmovsxwd %ymm3, %zmm3
-; NoVLX-NEXT: vpslld $31, %zmm3, %zmm3
-; NoVLX-NEXT: vptestmd %zmm3, %zmm3, %k0
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpcmpgtw %ymm2, %ymm4, %ymm2
+; NoVLX-NEXT: vpxor %ymm3, %ymm2, %ymm2
+; NoVLX-NEXT: vpmovsxwd %ymm2, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2
+; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $15, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm3
-; NoVLX-NEXT: vpinsrb $1, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vmovd %ecx, %xmm2
+; NoVLX-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2
; NoVLX-NEXT: kshiftlw $13, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $2, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $3, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2
; NoVLX-NEXT: kshiftlw $11, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2
; NoVLX-NEXT: kshiftlw $10, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2
; NoVLX-NEXT: kshiftlw $9, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2
; NoVLX-NEXT: kshiftlw $8, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2
; NoVLX-NEXT: kshiftlw $7, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2
; NoVLX-NEXT: kshiftlw $6, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $9, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2
; NoVLX-NEXT: kshiftlw $5, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $10, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2
; NoVLX-NEXT: kshiftlw $4, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2
; NoVLX-NEXT: kshiftlw $3, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $12, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2
; NoVLX-NEXT: kshiftlw $2, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $13, %eax, %xmm3, %xmm3
; NoVLX-NEXT: kshiftlw $1, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k1
+; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k1} {z}
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $15, %eax, %xmm3, %xmm3
-; NoVLX-NEXT: vpand %xmm1, %xmm3, %xmm1
-; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpmovdb %zmm3, %xmm3
+; NoVLX-NEXT: vpand %xmm3, %xmm2, %xmm2
+; NoVLX-NEXT: vpmovsxbd %xmm2, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2
+; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpand %xmm0, %xmm2, %xmm0
+; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -29798,8 +29078,8 @@ define zeroext i8 @test_vpcmpsged_v4i1_v8i1_mask(<2 x i64> %__a, <2 x i64> %__b)
; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: kmovw %eax, %k1
; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
@@ -29856,8 +29136,8 @@ define zeroext i8 @test_vpcmpsged_v4i1_v8i1_mask_mem(<2 x i64> %__a, <2 x i64>*
; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: kmovw %eax, %k1
; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
@@ -29896,7 +29176,6 @@ define zeroext i8 @test_masked_vpcmpsged_v4i1_v8i1_mask(i8 zeroext %__u, <2 x i6
;
; NoVLX-LABEL: test_masked_vpcmpsged_v4i1_v8i1_mask:
; NoVLX: # BB#0: # %entry
-; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
@@ -29908,13 +29187,14 @@ define zeroext i8 @test_masked_vpcmpsged_v4i1_v8i1_mask(i8 zeroext %__u, <2 x i6
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: kmovw %k3, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vmovd %ecx, %xmm2
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2
; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpandn %xmm2, %xmm0, %xmm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
; NoVLX-NEXT: kmovw %eax, %k1
; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
@@ -29931,8 +29211,8 @@ define zeroext i8 @test_masked_vpcmpsged_v4i1_v8i1_mask(i8 zeroext %__u, <2 x i6
; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: kmovw %eax, %k1
; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
@@ -29974,7 +29254,6 @@ define zeroext i8 @test_masked_vpcmpsged_v4i1_v8i1_mask_mem(i8 zeroext %__u, <2
; NoVLX-LABEL: test_masked_vpcmpsged_v4i1_v8i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: vmovdqa (%rsi), %xmm1
-; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
@@ -29986,13 +29265,14 @@ define zeroext i8 @test_masked_vpcmpsged_v4i1_v8i1_mask_mem(i8 zeroext %__u, <2
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: kmovw %k3, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vmovd %ecx, %xmm2
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2
; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpandn %xmm2, %xmm0, %xmm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
; NoVLX-NEXT: kmovw %eax, %k1
; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
@@ -30009,8 +29289,8 @@ define zeroext i8 @test_masked_vpcmpsged_v4i1_v8i1_mask_mem(i8 zeroext %__u, <2
; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: kmovw %eax, %k1
; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
@@ -30073,8 +29353,8 @@ define zeroext i8 @test_vpcmpsged_v4i1_v8i1_mask_mem_b(<2 x i64> %__a, i32* %__b
; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: kmovw %eax, %k1
; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
@@ -30116,7 +29396,6 @@ define zeroext i8 @test_masked_vpcmpsged_v4i1_v8i1_mask_mem_b(i8 zeroext %__u, <
; NoVLX-LABEL: test_masked_vpcmpsged_v4i1_v8i1_mask_mem_b:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: vpbroadcastd (%rsi), %xmm1
-; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
@@ -30128,13 +29407,14 @@ define zeroext i8 @test_masked_vpcmpsged_v4i1_v8i1_mask_mem_b(i8 zeroext %__u, <
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: kmovw %k3, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vmovd %ecx, %xmm2
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2
; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpandn %xmm2, %xmm0, %xmm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
; NoVLX-NEXT: kmovw %eax, %k1
; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
@@ -30151,8 +29431,8 @@ define zeroext i8 @test_masked_vpcmpsged_v4i1_v8i1_mask_mem_b(i8 zeroext %__u, <
; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: kmovw %eax, %k1
; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
@@ -30213,8 +29493,8 @@ define zeroext i16 @test_vpcmpsged_v4i1_v16i1_mask(<2 x i64> %__a, <2 x i64> %__
; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: kmovw %eax, %k1
; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
@@ -30270,8 +29550,8 @@ define zeroext i16 @test_vpcmpsged_v4i1_v16i1_mask_mem(<2 x i64> %__a, <2 x i64>
; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: kmovw %eax, %k1
; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
@@ -30310,7 +29590,6 @@ define zeroext i16 @test_masked_vpcmpsged_v4i1_v16i1_mask(i8 zeroext %__u, <2 x
;
; NoVLX-LABEL: test_masked_vpcmpsged_v4i1_v16i1_mask:
; NoVLX: # BB#0: # %entry
-; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
@@ -30322,13 +29601,14 @@ define zeroext i16 @test_masked_vpcmpsged_v4i1_v16i1_mask(i8 zeroext %__u, <2 x
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: kmovw %k3, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vmovd %ecx, %xmm2
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2
; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpandn %xmm2, %xmm0, %xmm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
; NoVLX-NEXT: kmovw %eax, %k1
; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
@@ -30344,8 +29624,8 @@ define zeroext i16 @test_masked_vpcmpsged_v4i1_v16i1_mask(i8 zeroext %__u, <2 x
; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: kmovw %eax, %k1
; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
@@ -30387,7 +29667,6 @@ define zeroext i16 @test_masked_vpcmpsged_v4i1_v16i1_mask_mem(i8 zeroext %__u, <
; NoVLX-LABEL: test_masked_vpcmpsged_v4i1_v16i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: vmovdqa (%rsi), %xmm1
-; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
@@ -30399,13 +29678,14 @@ define zeroext i16 @test_masked_vpcmpsged_v4i1_v16i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: kmovw %k3, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vmovd %ecx, %xmm2
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2
; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpandn %xmm2, %xmm0, %xmm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
; NoVLX-NEXT: kmovw %eax, %k1
; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
@@ -30421,8 +29701,8 @@ define zeroext i16 @test_masked_vpcmpsged_v4i1_v16i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: kmovw %eax, %k1
; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
@@ -30484,8 +29764,8 @@ define zeroext i16 @test_vpcmpsged_v4i1_v16i1_mask_mem_b(<2 x i64> %__a, i32* %_
; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: kmovw %eax, %k1
; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
@@ -30527,7 +29807,6 @@ define zeroext i16 @test_masked_vpcmpsged_v4i1_v16i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-LABEL: test_masked_vpcmpsged_v4i1_v16i1_mask_mem_b:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: vpbroadcastd (%rsi), %xmm1
-; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
@@ -30539,13 +29818,14 @@ define zeroext i16 @test_masked_vpcmpsged_v4i1_v16i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: kmovw %k3, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vmovd %ecx, %xmm2
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2
; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpandn %xmm2, %xmm0, %xmm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
; NoVLX-NEXT: kmovw %eax, %k1
; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
@@ -30561,8 +29841,8 @@ define zeroext i16 @test_masked_vpcmpsged_v4i1_v16i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: kmovw %eax, %k1
; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
@@ -30605,12 +29885,12 @@ define zeroext i32 @test_vpcmpsged_v4i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__
; NoVLX-LABEL: test_vpcmpsged_v4i1_v32i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1080:
+; NoVLX-NEXT: .Lcfi900:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1081:
+; NoVLX-NEXT: .Lcfi901:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1082:
+; NoVLX-NEXT: .Lcfi902:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -30650,12 +29930,12 @@ define zeroext i32 @test_vpcmpsged_v4i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>
; NoVLX-LABEL: test_vpcmpsged_v4i1_v32i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1083:
+; NoVLX-NEXT: .Lcfi903:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1084:
+; NoVLX-NEXT: .Lcfi904:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1085:
+; NoVLX-NEXT: .Lcfi905:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -30698,16 +29978,15 @@ define zeroext i32 @test_masked_vpcmpsged_v4i1_v32i1_mask(i8 zeroext %__u, <2 x
; NoVLX-LABEL: test_masked_vpcmpsged_v4i1_v32i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1086:
+; NoVLX-NEXT: .Lcfi906:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1087:
+; NoVLX-NEXT: .Lcfi907:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1088:
+; NoVLX-NEXT: .Lcfi908:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
-; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
@@ -30719,13 +29998,14 @@ define zeroext i32 @test_masked_vpcmpsged_v4i1_v32i1_mask(i8 zeroext %__u, <2 x
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: kmovw %k3, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vmovd %ecx, %xmm2
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2
; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpandn %xmm2, %xmm0, %xmm0
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
@@ -30763,17 +30043,16 @@ define zeroext i32 @test_masked_vpcmpsged_v4i1_v32i1_mask_mem(i8 zeroext %__u, <
; NoVLX-LABEL: test_masked_vpcmpsged_v4i1_v32i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1089:
+; NoVLX-NEXT: .Lcfi909:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1090:
+; NoVLX-NEXT: .Lcfi910:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1091:
+; NoVLX-NEXT: .Lcfi911:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
; NoVLX-NEXT: vmovdqa (%rsi), %xmm1
-; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
@@ -30785,13 +30064,14 @@ define zeroext i32 @test_masked_vpcmpsged_v4i1_v32i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: kmovw %k3, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vmovd %ecx, %xmm2
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2
; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpandn %xmm2, %xmm0, %xmm0
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
@@ -30831,12 +30111,12 @@ define zeroext i32 @test_vpcmpsged_v4i1_v32i1_mask_mem_b(<2 x i64> %__a, i32* %_
; NoVLX-LABEL: test_vpcmpsged_v4i1_v32i1_mask_mem_b:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1092:
+; NoVLX-NEXT: .Lcfi912:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1093:
+; NoVLX-NEXT: .Lcfi913:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1094:
+; NoVLX-NEXT: .Lcfi914:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -30881,17 +30161,16 @@ define zeroext i32 @test_masked_vpcmpsged_v4i1_v32i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-LABEL: test_masked_vpcmpsged_v4i1_v32i1_mask_mem_b:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1095:
+; NoVLX-NEXT: .Lcfi915:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1096:
+; NoVLX-NEXT: .Lcfi916:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1097:
+; NoVLX-NEXT: .Lcfi917:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
; NoVLX-NEXT: vpbroadcastd (%rsi), %xmm1
-; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
@@ -30903,13 +30182,14 @@ define zeroext i32 @test_masked_vpcmpsged_v4i1_v32i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: kmovw %k3, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vmovd %ecx, %xmm2
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2
; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpandn %xmm2, %xmm0, %xmm0
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
@@ -30949,12 +30229,12 @@ define zeroext i64 @test_vpcmpsged_v4i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__
; NoVLX-LABEL: test_vpcmpsged_v4i1_v64i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1098:
+; NoVLX-NEXT: .Lcfi918:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1099:
+; NoVLX-NEXT: .Lcfi919:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1100:
+; NoVLX-NEXT: .Lcfi920:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
@@ -30963,8 +30243,8 @@ define zeroext i64 @test_vpcmpsged_v4i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__
; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
@@ -31000,12 +30280,12 @@ define zeroext i64 @test_vpcmpsged_v4i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>
; NoVLX-LABEL: test_vpcmpsged_v4i1_v64i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1101:
+; NoVLX-NEXT: .Lcfi921:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1102:
+; NoVLX-NEXT: .Lcfi922:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1103:
+; NoVLX-NEXT: .Lcfi923:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
@@ -31015,8 +30295,8 @@ define zeroext i64 @test_vpcmpsged_v4i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>
; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
@@ -31054,19 +30334,18 @@ define zeroext i64 @test_masked_vpcmpsged_v4i1_v64i1_mask(i8 zeroext %__u, <2 x
; NoVLX-LABEL: test_masked_vpcmpsged_v4i1_v64i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1104:
+; NoVLX-NEXT: .Lcfi924:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1105:
+; NoVLX-NEXT: .Lcfi925:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1106:
+; NoVLX-NEXT: .Lcfi926:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kshiftlw $13, %k0, %k2
; NoVLX-NEXT: kshiftrw $15, %k2, %k2
; NoVLX-NEXT: kshiftlw $15, %k0, %k3
@@ -31079,13 +30358,14 @@ define zeroext i64 @test_masked_vpcmpsged_v4i1_v64i1_mask(i8 zeroext %__u, <2 x
; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kmovw %k2, %eax
; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $15, %k1, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
@@ -31125,12 +30405,12 @@ define zeroext i64 @test_masked_vpcmpsged_v4i1_v64i1_mask_mem(i8 zeroext %__u, <
; NoVLX-LABEL: test_masked_vpcmpsged_v4i1_v64i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1107:
+; NoVLX-NEXT: .Lcfi927:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1108:
+; NoVLX-NEXT: .Lcfi928:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1109:
+; NoVLX-NEXT: .Lcfi929:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
@@ -31138,7 +30418,6 @@ define zeroext i64 @test_masked_vpcmpsged_v4i1_v64i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kshiftlw $13, %k0, %k2
; NoVLX-NEXT: kshiftrw $15, %k2, %k2
; NoVLX-NEXT: kshiftlw $15, %k0, %k3
@@ -31151,13 +30430,14 @@ define zeroext i64 @test_masked_vpcmpsged_v4i1_v64i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kmovw %k2, %eax
; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $15, %k1, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
@@ -31199,12 +30479,12 @@ define zeroext i64 @test_vpcmpsged_v4i1_v64i1_mask_mem_b(<2 x i64> %__a, i32* %_
; NoVLX-LABEL: test_vpcmpsged_v4i1_v64i1_mask_mem_b:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1110:
+; NoVLX-NEXT: .Lcfi930:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1111:
+; NoVLX-NEXT: .Lcfi931:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1112:
+; NoVLX-NEXT: .Lcfi932:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
@@ -31214,8 +30494,8 @@ define zeroext i64 @test_vpcmpsged_v4i1_v64i1_mask_mem_b(<2 x i64> %__a, i32* %_
; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
@@ -31255,12 +30535,12 @@ define zeroext i64 @test_masked_vpcmpsged_v4i1_v64i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-LABEL: test_masked_vpcmpsged_v4i1_v64i1_mask_mem_b:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1113:
+; NoVLX-NEXT: .Lcfi933:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1114:
+; NoVLX-NEXT: .Lcfi934:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1115:
+; NoVLX-NEXT: .Lcfi935:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
@@ -31268,7 +30548,6 @@ define zeroext i64 @test_masked_vpcmpsged_v4i1_v64i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kshiftlw $13, %k0, %k2
; NoVLX-NEXT: kshiftrw $15, %k2, %k2
; NoVLX-NEXT: kshiftlw $15, %k0, %k3
@@ -31281,13 +30560,14 @@ define zeroext i64 @test_masked_vpcmpsged_v4i1_v64i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kmovw %k2, %eax
; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $15, %k1, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
@@ -31520,12 +30800,12 @@ define zeroext i32 @test_vpcmpsged_v8i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__
; NoVLX-LABEL: test_vpcmpsged_v8i1_v32i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1116:
+; NoVLX-NEXT: .Lcfi936:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1117:
+; NoVLX-NEXT: .Lcfi937:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1118:
+; NoVLX-NEXT: .Lcfi938:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -31595,12 +30875,12 @@ define zeroext i32 @test_vpcmpsged_v8i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64>
; NoVLX-LABEL: test_vpcmpsged_v8i1_v32i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1119:
+; NoVLX-NEXT: .Lcfi939:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1120:
+; NoVLX-NEXT: .Lcfi940:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1121:
+; NoVLX-NEXT: .Lcfi941:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -31672,12 +30952,12 @@ define zeroext i32 @test_masked_vpcmpsged_v8i1_v32i1_mask(i8 zeroext %__u, <4 x
; NoVLX-LABEL: test_masked_vpcmpsged_v8i1_v32i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1122:
+; NoVLX-NEXT: .Lcfi942:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1123:
+; NoVLX-NEXT: .Lcfi943:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1124:
+; NoVLX-NEXT: .Lcfi944:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -31752,12 +31032,12 @@ define zeroext i32 @test_masked_vpcmpsged_v8i1_v32i1_mask_mem(i8 zeroext %__u, <
; NoVLX-LABEL: test_masked_vpcmpsged_v8i1_v32i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1125:
+; NoVLX-NEXT: .Lcfi945:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1126:
+; NoVLX-NEXT: .Lcfi946:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1127:
+; NoVLX-NEXT: .Lcfi947:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -31834,12 +31114,12 @@ define zeroext i32 @test_vpcmpsged_v8i1_v32i1_mask_mem_b(<4 x i64> %__a, i32* %_
; NoVLX-LABEL: test_vpcmpsged_v8i1_v32i1_mask_mem_b:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1128:
+; NoVLX-NEXT: .Lcfi948:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1129:
+; NoVLX-NEXT: .Lcfi949:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1130:
+; NoVLX-NEXT: .Lcfi950:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -31913,12 +31193,12 @@ define zeroext i32 @test_masked_vpcmpsged_v8i1_v32i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-LABEL: test_masked_vpcmpsged_v8i1_v32i1_mask_mem_b:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1131:
+; NoVLX-NEXT: .Lcfi951:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1132:
+; NoVLX-NEXT: .Lcfi952:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1133:
+; NoVLX-NEXT: .Lcfi953:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -31995,55 +31275,55 @@ define zeroext i64 @test_vpcmpsged_v8i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__
; NoVLX-LABEL: test_vpcmpsged_v8i1_v64i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1134:
+; NoVLX-NEXT: .Lcfi954:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1135:
+; NoVLX-NEXT: .Lcfi955:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1136:
+; NoVLX-NEXT: .Lcfi956:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $15, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %ecx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -32075,55 +31355,55 @@ define zeroext i64 @test_vpcmpsged_v8i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>
; NoVLX-LABEL: test_vpcmpsged_v8i1_v64i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1137:
+; NoVLX-NEXT: .Lcfi957:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1138:
+; NoVLX-NEXT: .Lcfi958:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1139:
+; NoVLX-NEXT: .Lcfi959:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; NoVLX-NEXT: vmovdqa (%rdi), %ymm1
; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $15, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %ecx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -32157,12 +31437,12 @@ define zeroext i64 @test_masked_vpcmpsged_v8i1_v64i1_mask(i8 zeroext %__u, <4 x
; NoVLX-LABEL: test_masked_vpcmpsged_v8i1_v64i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1140:
+; NoVLX-NEXT: .Lcfi960:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1141:
+; NoVLX-NEXT: .Lcfi961:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1142:
+; NoVLX-NEXT: .Lcfi962:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
@@ -32171,43 +31451,43 @@ define zeroext i64 @test_masked_vpcmpsged_v8i1_v64i1_mask(i8 zeroext %__u, <4 x
; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: kandw %k1, %k0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $15, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %ecx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -32242,12 +31522,12 @@ define zeroext i64 @test_masked_vpcmpsged_v8i1_v64i1_mask_mem(i8 zeroext %__u, <
; NoVLX-LABEL: test_masked_vpcmpsged_v8i1_v64i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1143:
+; NoVLX-NEXT: .Lcfi963:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1144:
+; NoVLX-NEXT: .Lcfi964:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1145:
+; NoVLX-NEXT: .Lcfi965:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
@@ -32256,43 +31536,43 @@ define zeroext i64 @test_masked_vpcmpsged_v8i1_v64i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: kandw %k1, %k0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $15, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %ecx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -32329,55 +31609,55 @@ define zeroext i64 @test_vpcmpsged_v8i1_v64i1_mask_mem_b(<4 x i64> %__a, i32* %_
; NoVLX-LABEL: test_vpcmpsged_v8i1_v64i1_mask_mem_b:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1146:
+; NoVLX-NEXT: .Lcfi966:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1147:
+; NoVLX-NEXT: .Lcfi967:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1148:
+; NoVLX-NEXT: .Lcfi968:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; NoVLX-NEXT: vpbroadcastd (%rdi), %ymm1
; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $15, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %ecx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -32413,12 +31693,12 @@ define zeroext i64 @test_masked_vpcmpsged_v8i1_v64i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-LABEL: test_masked_vpcmpsged_v8i1_v64i1_mask_mem_b:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1149:
+; NoVLX-NEXT: .Lcfi969:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1150:
+; NoVLX-NEXT: .Lcfi970:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1151:
+; NoVLX-NEXT: .Lcfi971:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
@@ -32427,43 +31707,43 @@ define zeroext i64 @test_masked_vpcmpsged_v8i1_v64i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: kandw %k0, %k1, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $15, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %ecx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -32500,93 +31780,78 @@ define zeroext i32 @test_vpcmpsged_v16i1_v32i1_mask(<8 x i64> %__a, <8 x i64> %_
; NoVLX-LABEL: test_vpcmpsged_v16i1_v32i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1152:
+; NoVLX-NEXT: .Lcfi972:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1153:
+; NoVLX-NEXT: .Lcfi973:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1154:
+; NoVLX-NEXT: .Lcfi974:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
-; NoVLX-NEXT: pushq %r15
-; NoVLX-NEXT: pushq %r14
-; NoVLX-NEXT: pushq %r13
-; NoVLX-NEXT: pushq %r12
-; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
-; NoVLX-NEXT: .Lcfi1155:
-; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi1156:
-; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi1157:
-; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi1158:
-; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi1159:
-; NoVLX-NEXT: .cfi_offset %r15, -24
; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0
; NoVLX-NEXT: kxorw %k0, %k0, %k1
; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $15, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $13, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: vmovd %ecx, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
@@ -32595,12 +31860,7 @@ define zeroext i32 @test_vpcmpsged_v16i1_v32i1_mask(<8 x i64> %__a, <8 x i64> %_
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
-; NoVLX-NEXT: leaq -40(%rbp), %rsp
-; NoVLX-NEXT: popq %rbx
-; NoVLX-NEXT: popq %r12
-; NoVLX-NEXT: popq %r13
-; NoVLX-NEXT: popq %r14
-; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
; NoVLX-NEXT: retq
entry:
@@ -32623,93 +31883,78 @@ define zeroext i32 @test_vpcmpsged_v16i1_v32i1_mask_mem(<8 x i64> %__a, <8 x i64
; NoVLX-LABEL: test_vpcmpsged_v16i1_v32i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1160:
+; NoVLX-NEXT: .Lcfi975:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1161:
+; NoVLX-NEXT: .Lcfi976:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1162:
+; NoVLX-NEXT: .Lcfi977:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
-; NoVLX-NEXT: pushq %r15
-; NoVLX-NEXT: pushq %r14
-; NoVLX-NEXT: pushq %r13
-; NoVLX-NEXT: pushq %r12
-; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
-; NoVLX-NEXT: .Lcfi1163:
-; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi1164:
-; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi1165:
-; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi1166:
-; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi1167:
-; NoVLX-NEXT: .cfi_offset %r15, -24
; NoVLX-NEXT: vpcmpnltd (%rdi), %zmm0, %k0
; NoVLX-NEXT: kxorw %k0, %k0, %k1
; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $15, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $13, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: vmovd %ecx, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
@@ -32718,12 +31963,7 @@ define zeroext i32 @test_vpcmpsged_v16i1_v32i1_mask_mem(<8 x i64> %__a, <8 x i64
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
-; NoVLX-NEXT: leaq -40(%rbp), %rsp
-; NoVLX-NEXT: popq %rbx
-; NoVLX-NEXT: popq %r12
-; NoVLX-NEXT: popq %r13
-; NoVLX-NEXT: popq %r14
-; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
; NoVLX-NEXT: retq
entry:
@@ -32748,94 +31988,79 @@ define zeroext i32 @test_masked_vpcmpsged_v16i1_v32i1_mask(i16 zeroext %__u, <8
; NoVLX-LABEL: test_masked_vpcmpsged_v16i1_v32i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1168:
+; NoVLX-NEXT: .Lcfi978:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1169:
+; NoVLX-NEXT: .Lcfi979:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1170:
+; NoVLX-NEXT: .Lcfi980:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
-; NoVLX-NEXT: pushq %r15
-; NoVLX-NEXT: pushq %r14
-; NoVLX-NEXT: pushq %r13
-; NoVLX-NEXT: pushq %r12
-; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
-; NoVLX-NEXT: .Lcfi1171:
-; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi1172:
-; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi1173:
-; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi1174:
-; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi1175:
-; NoVLX-NEXT: .cfi_offset %r15, -24
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0 {%k1}
; NoVLX-NEXT: kxorw %k0, %k0, %k1
; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $15, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $13, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: vmovd %ecx, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
@@ -32844,12 +32069,7 @@ define zeroext i32 @test_masked_vpcmpsged_v16i1_v32i1_mask(i16 zeroext %__u, <8
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
-; NoVLX-NEXT: leaq -40(%rbp), %rsp
-; NoVLX-NEXT: popq %rbx
-; NoVLX-NEXT: popq %r12
-; NoVLX-NEXT: popq %r13
-; NoVLX-NEXT: popq %r14
-; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
; NoVLX-NEXT: retq
entry:
@@ -32875,94 +32095,79 @@ define zeroext i32 @test_masked_vpcmpsged_v16i1_v32i1_mask_mem(i16 zeroext %__u,
; NoVLX-LABEL: test_masked_vpcmpsged_v16i1_v32i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1176:
+; NoVLX-NEXT: .Lcfi981:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1177:
+; NoVLX-NEXT: .Lcfi982:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1178:
+; NoVLX-NEXT: .Lcfi983:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
-; NoVLX-NEXT: pushq %r15
-; NoVLX-NEXT: pushq %r14
-; NoVLX-NEXT: pushq %r13
-; NoVLX-NEXT: pushq %r12
-; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
-; NoVLX-NEXT: .Lcfi1179:
-; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi1180:
-; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi1181:
-; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi1182:
-; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi1183:
-; NoVLX-NEXT: .cfi_offset %r15, -24
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpcmpnltd (%rsi), %zmm0, %k0 {%k1}
; NoVLX-NEXT: kxorw %k0, %k0, %k1
; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $15, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $13, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: vmovd %ecx, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
@@ -32971,12 +32176,7 @@ define zeroext i32 @test_masked_vpcmpsged_v16i1_v32i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
-; NoVLX-NEXT: leaq -40(%rbp), %rsp
-; NoVLX-NEXT: popq %rbx
-; NoVLX-NEXT: popq %r12
-; NoVLX-NEXT: popq %r13
-; NoVLX-NEXT: popq %r14
-; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
; NoVLX-NEXT: retq
entry:
@@ -33004,94 +32204,79 @@ define zeroext i32 @test_vpcmpsged_v16i1_v32i1_mask_mem_b(<8 x i64> %__a, i32* %
; NoVLX-LABEL: test_vpcmpsged_v16i1_v32i1_mask_mem_b:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1184:
+; NoVLX-NEXT: .Lcfi984:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1185:
+; NoVLX-NEXT: .Lcfi985:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1186:
+; NoVLX-NEXT: .Lcfi986:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
-; NoVLX-NEXT: pushq %r15
-; NoVLX-NEXT: pushq %r14
-; NoVLX-NEXT: pushq %r13
-; NoVLX-NEXT: pushq %r12
-; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
-; NoVLX-NEXT: .Lcfi1187:
-; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi1188:
-; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi1189:
-; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi1190:
-; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi1191:
-; NoVLX-NEXT: .cfi_offset %r15, -24
; NoVLX-NEXT: vpbroadcastd (%rdi), %zmm1
; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0
; NoVLX-NEXT: kxorw %k0, %k0, %k1
; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $15, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $13, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: vmovd %ecx, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
@@ -33100,12 +32285,7 @@ define zeroext i32 @test_vpcmpsged_v16i1_v32i1_mask_mem_b(<8 x i64> %__a, i32* %
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
-; NoVLX-NEXT: leaq -40(%rbp), %rsp
-; NoVLX-NEXT: popq %rbx
-; NoVLX-NEXT: popq %r12
-; NoVLX-NEXT: popq %r13
-; NoVLX-NEXT: popq %r14
-; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
; NoVLX-NEXT: retq
entry:
@@ -33132,30 +32312,15 @@ define zeroext i32 @test_masked_vpcmpsged_v16i1_v32i1_mask_mem_b(i16 zeroext %__
; NoVLX-LABEL: test_masked_vpcmpsged_v16i1_v32i1_mask_mem_b:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1192:
+; NoVLX-NEXT: .Lcfi987:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1193:
+; NoVLX-NEXT: .Lcfi988:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1194:
+; NoVLX-NEXT: .Lcfi989:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
-; NoVLX-NEXT: pushq %r15
-; NoVLX-NEXT: pushq %r14
-; NoVLX-NEXT: pushq %r13
-; NoVLX-NEXT: pushq %r12
-; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
-; NoVLX-NEXT: .Lcfi1195:
-; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi1196:
-; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi1197:
-; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi1198:
-; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi1199:
-; NoVLX-NEXT: .cfi_offset %r15, -24
; NoVLX-NEXT: vpbroadcastd (%rsi), %zmm1
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0 {%k1}
@@ -33163,64 +32328,64 @@ define zeroext i32 @test_masked_vpcmpsged_v16i1_v32i1_mask_mem_b(i16 zeroext %__
; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $15, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $13, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: vmovd %ecx, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
@@ -33229,12 +32394,7 @@ define zeroext i32 @test_masked_vpcmpsged_v16i1_v32i1_mask_mem_b(i16 zeroext %__
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
-; NoVLX-NEXT: leaq -40(%rbp), %rsp
-; NoVLX-NEXT: popq %rbx
-; NoVLX-NEXT: popq %r12
-; NoVLX-NEXT: popq %r13
-; NoVLX-NEXT: popq %r14
-; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
; NoVLX-NEXT: retq
entry:
@@ -33262,12 +32422,12 @@ define zeroext i64 @test_vpcmpsged_v16i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %_
; NoVLX-LABEL: test_vpcmpsged_v16i1_v64i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1200:
+; NoVLX-NEXT: .Lcfi990:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1201:
+; NoVLX-NEXT: .Lcfi991:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1202:
+; NoVLX-NEXT: .Lcfi992:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: pushq %r15
; NoVLX-NEXT: pushq %r14
@@ -33276,21 +32436,17 @@ define zeroext i64 @test_vpcmpsged_v16i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %_
; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: .Lcfi1203:
+; NoVLX-NEXT: .Lcfi993:
; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi1204:
+; NoVLX-NEXT: .Lcfi994:
; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi1205:
+; NoVLX-NEXT: .Lcfi995:
; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi1206:
+; NoVLX-NEXT: .Lcfi996:
; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi1207:
+; NoVLX-NEXT: .Lcfi997:
; NoVLX-NEXT: .cfi_offset %r15, -24
; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
@@ -33333,11 +32489,11 @@ define zeroext i64 @test_vpcmpsged_v16i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %_
; NoVLX-NEXT: kshiftlw $2, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $1, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k1, %r8d
; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
@@ -33349,11 +32505,15 @@ define zeroext i64 @test_vpcmpsged_v16i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %_
; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpinsrb $14, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -33390,12 +32550,12 @@ define zeroext i64 @test_vpcmpsged_v16i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64
; NoVLX-LABEL: test_vpcmpsged_v16i1_v64i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1208:
+; NoVLX-NEXT: .Lcfi998:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1209:
+; NoVLX-NEXT: .Lcfi999:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1210:
+; NoVLX-NEXT: .Lcfi1000:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: pushq %r15
; NoVLX-NEXT: pushq %r14
@@ -33404,21 +32564,17 @@ define zeroext i64 @test_vpcmpsged_v16i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64
; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: .Lcfi1211:
+; NoVLX-NEXT: .Lcfi1001:
; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi1212:
+; NoVLX-NEXT: .Lcfi1002:
; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi1213:
+; NoVLX-NEXT: .Lcfi1003:
; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi1214:
+; NoVLX-NEXT: .Lcfi1004:
; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi1215:
+; NoVLX-NEXT: .Lcfi1005:
; NoVLX-NEXT: .cfi_offset %r15, -24
; NoVLX-NEXT: vpcmpnltd (%rdi), %zmm0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
@@ -33461,11 +32617,11 @@ define zeroext i64 @test_vpcmpsged_v16i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64
; NoVLX-NEXT: kshiftlw $2, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $1, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k1, %r8d
; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
@@ -33477,11 +32633,15 @@ define zeroext i64 @test_vpcmpsged_v16i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64
; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpinsrb $14, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -33520,12 +32680,12 @@ define zeroext i64 @test_masked_vpcmpsged_v16i1_v64i1_mask(i16 zeroext %__u, <8
; NoVLX-LABEL: test_masked_vpcmpsged_v16i1_v64i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1216:
+; NoVLX-NEXT: .Lcfi1006:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1217:
+; NoVLX-NEXT: .Lcfi1007:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1218:
+; NoVLX-NEXT: .Lcfi1008:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: pushq %r15
; NoVLX-NEXT: pushq %r14
@@ -33534,22 +32694,18 @@ define zeroext i64 @test_masked_vpcmpsged_v16i1_v64i1_mask(i16 zeroext %__u, <8
; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: .Lcfi1219:
+; NoVLX-NEXT: .Lcfi1009:
; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi1220:
+; NoVLX-NEXT: .Lcfi1010:
; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi1221:
+; NoVLX-NEXT: .Lcfi1011:
; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi1222:
+; NoVLX-NEXT: .Lcfi1012:
; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi1223:
+; NoVLX-NEXT: .Lcfi1013:
; NoVLX-NEXT: .cfi_offset %r15, -24
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0 {%k1}
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
@@ -33592,11 +32748,11 @@ define zeroext i64 @test_masked_vpcmpsged_v16i1_v64i1_mask(i16 zeroext %__u, <8
; NoVLX-NEXT: kshiftlw $2, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $1, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k1, %r8d
; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
@@ -33608,11 +32764,15 @@ define zeroext i64 @test_masked_vpcmpsged_v16i1_v64i1_mask(i16 zeroext %__u, <8
; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpinsrb $14, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -33652,12 +32812,12 @@ define zeroext i64 @test_masked_vpcmpsged_v16i1_v64i1_mask_mem(i16 zeroext %__u,
; NoVLX-LABEL: test_masked_vpcmpsged_v16i1_v64i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1224:
+; NoVLX-NEXT: .Lcfi1014:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1225:
+; NoVLX-NEXT: .Lcfi1015:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1226:
+; NoVLX-NEXT: .Lcfi1016:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: pushq %r15
; NoVLX-NEXT: pushq %r14
@@ -33666,22 +32826,18 @@ define zeroext i64 @test_masked_vpcmpsged_v16i1_v64i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: .Lcfi1227:
+; NoVLX-NEXT: .Lcfi1017:
; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi1228:
+; NoVLX-NEXT: .Lcfi1018:
; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi1229:
+; NoVLX-NEXT: .Lcfi1019:
; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi1230:
+; NoVLX-NEXT: .Lcfi1020:
; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi1231:
+; NoVLX-NEXT: .Lcfi1021:
; NoVLX-NEXT: .cfi_offset %r15, -24
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpcmpnltd (%rsi), %zmm0, %k0 {%k1}
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
@@ -33724,11 +32880,11 @@ define zeroext i64 @test_masked_vpcmpsged_v16i1_v64i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: kshiftlw $2, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $1, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k1, %r8d
; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
@@ -33740,11 +32896,15 @@ define zeroext i64 @test_masked_vpcmpsged_v16i1_v64i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpinsrb $14, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -33786,12 +32946,12 @@ define zeroext i64 @test_vpcmpsged_v16i1_v64i1_mask_mem_b(<8 x i64> %__a, i32* %
; NoVLX-LABEL: test_vpcmpsged_v16i1_v64i1_mask_mem_b:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1232:
+; NoVLX-NEXT: .Lcfi1022:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1233:
+; NoVLX-NEXT: .Lcfi1023:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1234:
+; NoVLX-NEXT: .Lcfi1024:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: pushq %r15
; NoVLX-NEXT: pushq %r14
@@ -33800,22 +32960,18 @@ define zeroext i64 @test_vpcmpsged_v16i1_v64i1_mask_mem_b(<8 x i64> %__a, i32* %
; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: .Lcfi1235:
+; NoVLX-NEXT: .Lcfi1025:
; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi1236:
+; NoVLX-NEXT: .Lcfi1026:
; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi1237:
+; NoVLX-NEXT: .Lcfi1027:
; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi1238:
+; NoVLX-NEXT: .Lcfi1028:
; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi1239:
+; NoVLX-NEXT: .Lcfi1029:
; NoVLX-NEXT: .cfi_offset %r15, -24
; NoVLX-NEXT: vpbroadcastd (%rdi), %zmm1
; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
@@ -33858,11 +33014,11 @@ define zeroext i64 @test_vpcmpsged_v16i1_v64i1_mask_mem_b(<8 x i64> %__a, i32* %
; NoVLX-NEXT: kshiftlw $2, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $1, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k1, %r8d
; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
@@ -33874,11 +33030,15 @@ define zeroext i64 @test_vpcmpsged_v16i1_v64i1_mask_mem_b(<8 x i64> %__a, i32* %
; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpinsrb $14, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -33919,12 +33079,12 @@ define zeroext i64 @test_masked_vpcmpsged_v16i1_v64i1_mask_mem_b(i16 zeroext %__
; NoVLX-LABEL: test_masked_vpcmpsged_v16i1_v64i1_mask_mem_b:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1240:
+; NoVLX-NEXT: .Lcfi1030:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1241:
+; NoVLX-NEXT: .Lcfi1031:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1242:
+; NoVLX-NEXT: .Lcfi1032:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: pushq %r15
; NoVLX-NEXT: pushq %r14
@@ -33933,23 +33093,19 @@ define zeroext i64 @test_masked_vpcmpsged_v16i1_v64i1_mask_mem_b(i16 zeroext %__
; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: .Lcfi1243:
+; NoVLX-NEXT: .Lcfi1033:
; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi1244:
+; NoVLX-NEXT: .Lcfi1034:
; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi1245:
+; NoVLX-NEXT: .Lcfi1035:
; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi1246:
+; NoVLX-NEXT: .Lcfi1036:
; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi1247:
+; NoVLX-NEXT: .Lcfi1037:
; NoVLX-NEXT: .cfi_offset %r15, -24
; NoVLX-NEXT: vpbroadcastd (%rsi), %zmm1
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0 {%k1}
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
@@ -33992,11 +33148,11 @@ define zeroext i64 @test_masked_vpcmpsged_v16i1_v64i1_mask_mem_b(i16 zeroext %__
; NoVLX-NEXT: kshiftlw $2, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $1, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k1, %r8d
; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
@@ -34008,11 +33164,15 @@ define zeroext i64 @test_masked_vpcmpsged_v16i1_v64i1_mask_mem_b(i16 zeroext %__
; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpinsrb $14, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -34114,7 +33274,6 @@ define zeroext i4 @test_masked_vpcmpsgeq_v2i1_v4i1_mask(i8 zeroext %__u, <2 x i6
;
; NoVLX-LABEL: test_masked_vpcmpsgeq_v2i1_v4i1_mask:
; NoVLX: # BB#0: # %entry
-; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftlw $15, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
@@ -34122,9 +33281,10 @@ define zeroext i4 @test_masked_vpcmpsgeq_v2i1_v4i1_mask(i8 zeroext %__u, <2 x i6
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vmovd %ecx, %xmm2
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpandn %xmm2, %xmm0, %xmm0
; NoVLX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
; NoVLX-NEXT: vpslld $31, %ymm0, %ymm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -34156,7 +33316,6 @@ define zeroext i4 @test_masked_vpcmpsgeq_v2i1_v4i1_mask_mem(i8 zeroext %__u, <2
; NoVLX-LABEL: test_masked_vpcmpsgeq_v2i1_v4i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: vmovdqa (%rsi), %xmm1
-; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftlw $15, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
@@ -34164,9 +33323,10 @@ define zeroext i4 @test_masked_vpcmpsgeq_v2i1_v4i1_mask_mem(i8 zeroext %__u, <2
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vmovd %ecx, %xmm2
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpandn %xmm2, %xmm0, %xmm0
; NoVLX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
; NoVLX-NEXT: vpslld $31, %ymm0, %ymm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -34234,7 +33394,6 @@ define zeroext i4 @test_masked_vpcmpsgeq_v2i1_v4i1_mask_mem_b(i8 zeroext %__u, <
; NoVLX-LABEL: test_masked_vpcmpsgeq_v2i1_v4i1_mask_mem_b:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: vpbroadcastq (%rsi), %xmm1
-; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftlw $15, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
@@ -34242,9 +33401,10 @@ define zeroext i4 @test_masked_vpcmpsgeq_v2i1_v4i1_mask_mem_b(i8 zeroext %__u, <
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vmovd %ecx, %xmm2
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpandn %xmm2, %xmm0, %xmm0
; NoVLX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
; NoVLX-NEXT: vpslld $31, %ymm0, %ymm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -34863,12 +34023,12 @@ define zeroext i32 @test_vpcmpsgeq_v2i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__
; NoVLX-LABEL: test_vpcmpsgeq_v2i1_v32i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1248:
+; NoVLX-NEXT: .Lcfi1038:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1249:
+; NoVLX-NEXT: .Lcfi1039:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1250:
+; NoVLX-NEXT: .Lcfi1040:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -34908,12 +34068,12 @@ define zeroext i32 @test_vpcmpsgeq_v2i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>
; NoVLX-LABEL: test_vpcmpsgeq_v2i1_v32i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1251:
+; NoVLX-NEXT: .Lcfi1041:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1252:
+; NoVLX-NEXT: .Lcfi1042:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1253:
+; NoVLX-NEXT: .Lcfi1043:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -34956,16 +34116,15 @@ define zeroext i32 @test_masked_vpcmpsgeq_v2i1_v32i1_mask(i8 zeroext %__u, <2 x
; NoVLX-LABEL: test_masked_vpcmpsgeq_v2i1_v32i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1254:
+; NoVLX-NEXT: .Lcfi1044:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1255:
+; NoVLX-NEXT: .Lcfi1045:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1256:
+; NoVLX-NEXT: .Lcfi1046:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
-; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftlw $15, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
@@ -34973,9 +34132,10 @@ define zeroext i32 @test_masked_vpcmpsgeq_v2i1_v32i1_mask(i8 zeroext %__u, <2 x
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vmovd %ecx, %xmm2
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpandn %xmm2, %xmm0, %xmm0
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
@@ -35013,17 +34173,16 @@ define zeroext i32 @test_masked_vpcmpsgeq_v2i1_v32i1_mask_mem(i8 zeroext %__u, <
; NoVLX-LABEL: test_masked_vpcmpsgeq_v2i1_v32i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1257:
+; NoVLX-NEXT: .Lcfi1047:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1258:
+; NoVLX-NEXT: .Lcfi1048:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1259:
+; NoVLX-NEXT: .Lcfi1049:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
; NoVLX-NEXT: vmovdqa (%rsi), %xmm1
-; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftlw $15, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
@@ -35031,9 +34190,10 @@ define zeroext i32 @test_masked_vpcmpsgeq_v2i1_v32i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vmovd %ecx, %xmm2
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpandn %xmm2, %xmm0, %xmm0
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
@@ -35073,12 +34233,12 @@ define zeroext i32 @test_vpcmpsgeq_v2i1_v32i1_mask_mem_b(<2 x i64> %__a, i64* %_
; NoVLX-LABEL: test_vpcmpsgeq_v2i1_v32i1_mask_mem_b:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1260:
+; NoVLX-NEXT: .Lcfi1050:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1261:
+; NoVLX-NEXT: .Lcfi1051:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1262:
+; NoVLX-NEXT: .Lcfi1052:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -35123,17 +34283,16 @@ define zeroext i32 @test_masked_vpcmpsgeq_v2i1_v32i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-LABEL: test_masked_vpcmpsgeq_v2i1_v32i1_mask_mem_b:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1263:
+; NoVLX-NEXT: .Lcfi1053:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1264:
+; NoVLX-NEXT: .Lcfi1054:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1265:
+; NoVLX-NEXT: .Lcfi1055:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
; NoVLX-NEXT: vpbroadcastq (%rsi), %xmm1
-; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftlw $15, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
@@ -35141,9 +34300,10 @@ define zeroext i32 @test_masked_vpcmpsgeq_v2i1_v32i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vmovd %ecx, %xmm2
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpandn %xmm2, %xmm0, %xmm0
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
@@ -35183,12 +34343,12 @@ define zeroext i64 @test_vpcmpsgeq_v2i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__
; NoVLX-LABEL: test_vpcmpsgeq_v2i1_v64i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1266:
+; NoVLX-NEXT: .Lcfi1056:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1267:
+; NoVLX-NEXT: .Lcfi1057:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1268:
+; NoVLX-NEXT: .Lcfi1058:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
@@ -35197,8 +34357,8 @@ define zeroext i64 @test_vpcmpsgeq_v2i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__
; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
@@ -35234,12 +34394,12 @@ define zeroext i64 @test_vpcmpsgeq_v2i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>
; NoVLX-LABEL: test_vpcmpsgeq_v2i1_v64i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1269:
+; NoVLX-NEXT: .Lcfi1059:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1270:
+; NoVLX-NEXT: .Lcfi1060:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1271:
+; NoVLX-NEXT: .Lcfi1061:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
@@ -35249,8 +34409,8 @@ define zeroext i64 @test_vpcmpsgeq_v2i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>
; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
@@ -35288,12 +34448,12 @@ define zeroext i64 @test_masked_vpcmpsgeq_v2i1_v64i1_mask(i8 zeroext %__u, <2 x
; NoVLX-LABEL: test_masked_vpcmpsgeq_v2i1_v64i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1272:
+; NoVLX-NEXT: .Lcfi1062:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1273:
+; NoVLX-NEXT: .Lcfi1063:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1274:
+; NoVLX-NEXT: .Lcfi1064:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
@@ -35310,8 +34470,8 @@ define zeroext i64 @test_masked_vpcmpsgeq_v2i1_v64i1_mask(i8 zeroext %__u, <2 x
; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
@@ -35351,12 +34511,12 @@ define zeroext i64 @test_masked_vpcmpsgeq_v2i1_v64i1_mask_mem(i8 zeroext %__u, <
; NoVLX-LABEL: test_masked_vpcmpsgeq_v2i1_v64i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1275:
+; NoVLX-NEXT: .Lcfi1065:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1276:
+; NoVLX-NEXT: .Lcfi1066:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1277:
+; NoVLX-NEXT: .Lcfi1067:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
@@ -35374,8 +34534,8 @@ define zeroext i64 @test_masked_vpcmpsgeq_v2i1_v64i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
@@ -35417,12 +34577,12 @@ define zeroext i64 @test_vpcmpsgeq_v2i1_v64i1_mask_mem_b(<2 x i64> %__a, i64* %_
; NoVLX-LABEL: test_vpcmpsgeq_v2i1_v64i1_mask_mem_b:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1278:
+; NoVLX-NEXT: .Lcfi1068:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1279:
+; NoVLX-NEXT: .Lcfi1069:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1280:
+; NoVLX-NEXT: .Lcfi1070:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
@@ -35432,8 +34592,8 @@ define zeroext i64 @test_vpcmpsgeq_v2i1_v64i1_mask_mem_b(<2 x i64> %__a, i64* %_
; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
@@ -35473,12 +34633,12 @@ define zeroext i64 @test_masked_vpcmpsgeq_v2i1_v64i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-LABEL: test_masked_vpcmpsgeq_v2i1_v64i1_mask_mem_b:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1281:
+; NoVLX-NEXT: .Lcfi1071:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1282:
+; NoVLX-NEXT: .Lcfi1072:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1283:
+; NoVLX-NEXT: .Lcfi1073:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
@@ -35496,8 +34656,8 @@ define zeroext i64 @test_masked_vpcmpsgeq_v2i1_v64i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
@@ -35560,8 +34720,8 @@ define zeroext i8 @test_vpcmpsgeq_v4i1_v8i1_mask(<4 x i64> %__a, <4 x i64> %__b)
; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: kmovw %eax, %k1
; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
@@ -35620,8 +34780,8 @@ define zeroext i8 @test_vpcmpsgeq_v4i1_v8i1_mask_mem(<4 x i64> %__a, <4 x i64>*
; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: kmovw %eax, %k1
; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
@@ -35664,7 +34824,6 @@ define zeroext i8 @test_masked_vpcmpsgeq_v4i1_v8i1_mask(i8 zeroext %__u, <4 x i6
; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0
; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0
-; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
@@ -35682,6 +34841,7 @@ define zeroext i8 @test_masked_vpcmpsgeq_v4i1_v8i1_mask(i8 zeroext %__u, <4 x i6
; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
; NoVLX-NEXT: kmovw %eax, %k1
@@ -35699,8 +34859,8 @@ define zeroext i8 @test_masked_vpcmpsgeq_v4i1_v8i1_mask(i8 zeroext %__u, <4 x i6
; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: kmovw %eax, %k1
; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
@@ -35746,7 +34906,6 @@ define zeroext i8 @test_masked_vpcmpsgeq_v4i1_v8i1_mask_mem(i8 zeroext %__u, <4
; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0
; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0
-; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
@@ -35764,6 +34923,7 @@ define zeroext i8 @test_masked_vpcmpsgeq_v4i1_v8i1_mask_mem(i8 zeroext %__u, <4
; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
; NoVLX-NEXT: kmovw %eax, %k1
@@ -35781,8 +34941,8 @@ define zeroext i8 @test_masked_vpcmpsgeq_v4i1_v8i1_mask_mem(i8 zeroext %__u, <4
; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: kmovw %eax, %k1
; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
@@ -35847,8 +35007,8 @@ define zeroext i8 @test_vpcmpsgeq_v4i1_v8i1_mask_mem_b(<4 x i64> %__a, i64* %__b
; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: kmovw %eax, %k1
; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
@@ -35894,7 +35054,6 @@ define zeroext i8 @test_masked_vpcmpsgeq_v4i1_v8i1_mask_mem_b(i8 zeroext %__u, <
; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0
; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0
-; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
@@ -35912,6 +35071,7 @@ define zeroext i8 @test_masked_vpcmpsgeq_v4i1_v8i1_mask_mem_b(i8 zeroext %__u, <
; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
; NoVLX-NEXT: kmovw %eax, %k1
@@ -35929,8 +35089,8 @@ define zeroext i8 @test_masked_vpcmpsgeq_v4i1_v8i1_mask_mem_b(i8 zeroext %__u, <
; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: kmovw %eax, %k1
; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
@@ -35993,8 +35153,8 @@ define zeroext i16 @test_vpcmpsgeq_v4i1_v16i1_mask(<4 x i64> %__a, <4 x i64> %__
; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: kmovw %eax, %k1
; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
@@ -36052,8 +35212,8 @@ define zeroext i16 @test_vpcmpsgeq_v4i1_v16i1_mask_mem(<4 x i64> %__a, <4 x i64>
; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: kmovw %eax, %k1
; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
@@ -36096,7 +35256,6 @@ define zeroext i16 @test_masked_vpcmpsgeq_v4i1_v16i1_mask(i8 zeroext %__u, <4 x
; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0
; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0
-; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
@@ -36114,6 +35273,7 @@ define zeroext i16 @test_masked_vpcmpsgeq_v4i1_v16i1_mask(i8 zeroext %__u, <4 x
; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
; NoVLX-NEXT: kmovw %eax, %k1
@@ -36130,8 +35290,8 @@ define zeroext i16 @test_masked_vpcmpsgeq_v4i1_v16i1_mask(i8 zeroext %__u, <4 x
; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: kmovw %eax, %k1
; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
@@ -36177,7 +35337,6 @@ define zeroext i16 @test_masked_vpcmpsgeq_v4i1_v16i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0
; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0
-; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
@@ -36195,6 +35354,7 @@ define zeroext i16 @test_masked_vpcmpsgeq_v4i1_v16i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
; NoVLX-NEXT: kmovw %eax, %k1
@@ -36211,8 +35371,8 @@ define zeroext i16 @test_masked_vpcmpsgeq_v4i1_v16i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: kmovw %eax, %k1
; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
@@ -36276,8 +35436,8 @@ define zeroext i16 @test_vpcmpsgeq_v4i1_v16i1_mask_mem_b(<4 x i64> %__a, i64* %_
; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: kmovw %eax, %k1
; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
@@ -36323,7 +35483,6 @@ define zeroext i16 @test_masked_vpcmpsgeq_v4i1_v16i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0
; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0
-; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
@@ -36341,6 +35500,7 @@ define zeroext i16 @test_masked_vpcmpsgeq_v4i1_v16i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
; NoVLX-NEXT: kmovw %eax, %k1
@@ -36357,8 +35517,8 @@ define zeroext i16 @test_masked_vpcmpsgeq_v4i1_v16i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: kmovw %eax, %k1
; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
@@ -36402,12 +35562,12 @@ define zeroext i32 @test_vpcmpsgeq_v4i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__
; NoVLX-LABEL: test_vpcmpsgeq_v4i1_v32i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1284:
+; NoVLX-NEXT: .Lcfi1074:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1285:
+; NoVLX-NEXT: .Lcfi1075:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1286:
+; NoVLX-NEXT: .Lcfi1076:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -36449,12 +35609,12 @@ define zeroext i32 @test_vpcmpsgeq_v4i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64>
; NoVLX-LABEL: test_vpcmpsgeq_v4i1_v32i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1287:
+; NoVLX-NEXT: .Lcfi1077:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1288:
+; NoVLX-NEXT: .Lcfi1078:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1289:
+; NoVLX-NEXT: .Lcfi1079:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -36499,19 +35659,18 @@ define zeroext i32 @test_masked_vpcmpsgeq_v4i1_v32i1_mask(i8 zeroext %__u, <4 x
; NoVLX-LABEL: test_masked_vpcmpsgeq_v4i1_v32i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1290:
+; NoVLX-NEXT: .Lcfi1080:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1291:
+; NoVLX-NEXT: .Lcfi1081:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1292:
+; NoVLX-NEXT: .Lcfi1082:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0
; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0
-; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
@@ -36529,6 +35688,7 @@ define zeroext i32 @test_masked_vpcmpsgeq_v4i1_v32i1_mask(i8 zeroext %__u, <4 x
; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
@@ -36568,12 +35728,12 @@ define zeroext i32 @test_masked_vpcmpsgeq_v4i1_v32i1_mask_mem(i8 zeroext %__u, <
; NoVLX-LABEL: test_masked_vpcmpsgeq_v4i1_v32i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1293:
+; NoVLX-NEXT: .Lcfi1083:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1294:
+; NoVLX-NEXT: .Lcfi1084:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1295:
+; NoVLX-NEXT: .Lcfi1085:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -36581,7 +35741,6 @@ define zeroext i32 @test_masked_vpcmpsgeq_v4i1_v32i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0
; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0
-; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
@@ -36599,6 +35758,7 @@ define zeroext i32 @test_masked_vpcmpsgeq_v4i1_v32i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
@@ -36640,12 +35800,12 @@ define zeroext i32 @test_vpcmpsgeq_v4i1_v32i1_mask_mem_b(<4 x i64> %__a, i64* %_
; NoVLX-LABEL: test_vpcmpsgeq_v4i1_v32i1_mask_mem_b:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1296:
+; NoVLX-NEXT: .Lcfi1086:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1297:
+; NoVLX-NEXT: .Lcfi1087:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1298:
+; NoVLX-NEXT: .Lcfi1088:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -36692,12 +35852,12 @@ define zeroext i32 @test_masked_vpcmpsgeq_v4i1_v32i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-LABEL: test_masked_vpcmpsgeq_v4i1_v32i1_mask_mem_b:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1299:
+; NoVLX-NEXT: .Lcfi1089:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1300:
+; NoVLX-NEXT: .Lcfi1090:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1301:
+; NoVLX-NEXT: .Lcfi1091:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -36705,7 +35865,6 @@ define zeroext i32 @test_masked_vpcmpsgeq_v4i1_v32i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0
; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0
-; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
@@ -36723,6 +35882,7 @@ define zeroext i32 @test_masked_vpcmpsgeq_v4i1_v32i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
@@ -36764,12 +35924,12 @@ define zeroext i64 @test_vpcmpsgeq_v4i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__
; NoVLX-LABEL: test_vpcmpsgeq_v4i1_v64i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1302:
+; NoVLX-NEXT: .Lcfi1092:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1303:
+; NoVLX-NEXT: .Lcfi1093:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1304:
+; NoVLX-NEXT: .Lcfi1094:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
@@ -36779,8 +35939,8 @@ define zeroext i64 @test_vpcmpsgeq_v4i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__
; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
@@ -36817,12 +35977,12 @@ define zeroext i64 @test_vpcmpsgeq_v4i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>
; NoVLX-LABEL: test_vpcmpsgeq_v4i1_v64i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1305:
+; NoVLX-NEXT: .Lcfi1095:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1306:
+; NoVLX-NEXT: .Lcfi1096:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1307:
+; NoVLX-NEXT: .Lcfi1097:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
@@ -36833,8 +35993,8 @@ define zeroext i64 @test_vpcmpsgeq_v4i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>
; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
@@ -36873,12 +36033,12 @@ define zeroext i64 @test_masked_vpcmpsgeq_v4i1_v64i1_mask(i8 zeroext %__u, <4 x
; NoVLX-LABEL: test_masked_vpcmpsgeq_v4i1_v64i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1308:
+; NoVLX-NEXT: .Lcfi1098:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1309:
+; NoVLX-NEXT: .Lcfi1099:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1310:
+; NoVLX-NEXT: .Lcfi1100:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
@@ -36888,7 +36048,6 @@ define zeroext i64 @test_masked_vpcmpsgeq_v4i1_v64i1_mask(i8 zeroext %__u, <4 x
; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kshiftlw $13, %k0, %k2
; NoVLX-NEXT: kshiftrw $15, %k2, %k2
; NoVLX-NEXT: kshiftlw $15, %k0, %k3
@@ -36901,13 +36060,14 @@ define zeroext i64 @test_masked_vpcmpsgeq_v4i1_v64i1_mask(i8 zeroext %__u, <4 x
; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kmovw %k2, %eax
; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $15, %k1, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
@@ -36948,12 +36108,12 @@ define zeroext i64 @test_masked_vpcmpsgeq_v4i1_v64i1_mask_mem(i8 zeroext %__u, <
; NoVLX-LABEL: test_masked_vpcmpsgeq_v4i1_v64i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1311:
+; NoVLX-NEXT: .Lcfi1101:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1312:
+; NoVLX-NEXT: .Lcfi1102:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1313:
+; NoVLX-NEXT: .Lcfi1103:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
@@ -36964,7 +36124,6 @@ define zeroext i64 @test_masked_vpcmpsgeq_v4i1_v64i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kshiftlw $13, %k0, %k2
; NoVLX-NEXT: kshiftrw $15, %k2, %k2
; NoVLX-NEXT: kshiftlw $15, %k0, %k3
@@ -36977,13 +36136,14 @@ define zeroext i64 @test_masked_vpcmpsgeq_v4i1_v64i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kmovw %k2, %eax
; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $15, %k1, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
@@ -37026,12 +36186,12 @@ define zeroext i64 @test_vpcmpsgeq_v4i1_v64i1_mask_mem_b(<4 x i64> %__a, i64* %_
; NoVLX-LABEL: test_vpcmpsgeq_v4i1_v64i1_mask_mem_b:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1314:
+; NoVLX-NEXT: .Lcfi1104:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1315:
+; NoVLX-NEXT: .Lcfi1105:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1316:
+; NoVLX-NEXT: .Lcfi1106:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
@@ -37042,8 +36202,8 @@ define zeroext i64 @test_vpcmpsgeq_v4i1_v64i1_mask_mem_b(<4 x i64> %__a, i64* %_
; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
@@ -37084,12 +36244,12 @@ define zeroext i64 @test_masked_vpcmpsgeq_v4i1_v64i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-LABEL: test_masked_vpcmpsgeq_v4i1_v64i1_mask_mem_b:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1317:
+; NoVLX-NEXT: .Lcfi1107:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1318:
+; NoVLX-NEXT: .Lcfi1108:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1319:
+; NoVLX-NEXT: .Lcfi1109:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
@@ -37100,7 +36260,6 @@ define zeroext i64 @test_masked_vpcmpsgeq_v4i1_v64i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kshiftlw $13, %k0, %k2
; NoVLX-NEXT: kshiftrw $15, %k2, %k2
; NoVLX-NEXT: kshiftlw $15, %k0, %k3
@@ -37113,13 +36272,14 @@ define zeroext i64 @test_masked_vpcmpsgeq_v4i1_v64i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kmovw %k2, %eax
; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $15, %k1, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
@@ -37330,12 +36490,12 @@ define zeroext i32 @test_vpcmpsgeq_v8i1_v32i1_mask(<8 x i64> %__a, <8 x i64> %__
; NoVLX-LABEL: test_vpcmpsgeq_v8i1_v32i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1320:
+; NoVLX-NEXT: .Lcfi1110:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1321:
+; NoVLX-NEXT: .Lcfi1111:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1322:
+; NoVLX-NEXT: .Lcfi1112:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -37403,12 +36563,12 @@ define zeroext i32 @test_vpcmpsgeq_v8i1_v32i1_mask_mem(<8 x i64> %__a, <8 x i64>
; NoVLX-LABEL: test_vpcmpsgeq_v8i1_v32i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1323:
+; NoVLX-NEXT: .Lcfi1113:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1324:
+; NoVLX-NEXT: .Lcfi1114:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1325:
+; NoVLX-NEXT: .Lcfi1115:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -37478,12 +36638,12 @@ define zeroext i32 @test_masked_vpcmpsgeq_v8i1_v32i1_mask(i8 zeroext %__u, <8 x
; NoVLX-LABEL: test_masked_vpcmpsgeq_v8i1_v32i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1326:
+; NoVLX-NEXT: .Lcfi1116:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1327:
+; NoVLX-NEXT: .Lcfi1117:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1328:
+; NoVLX-NEXT: .Lcfi1118:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -37555,12 +36715,12 @@ define zeroext i32 @test_masked_vpcmpsgeq_v8i1_v32i1_mask_mem(i8 zeroext %__u, <
; NoVLX-LABEL: test_masked_vpcmpsgeq_v8i1_v32i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1329:
+; NoVLX-NEXT: .Lcfi1119:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1330:
+; NoVLX-NEXT: .Lcfi1120:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1331:
+; NoVLX-NEXT: .Lcfi1121:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -37634,12 +36794,12 @@ define zeroext i32 @test_vpcmpsgeq_v8i1_v32i1_mask_mem_b(<8 x i64> %__a, i64* %_
; NoVLX-LABEL: test_vpcmpsgeq_v8i1_v32i1_mask_mem_b:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1332:
+; NoVLX-NEXT: .Lcfi1122:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1333:
+; NoVLX-NEXT: .Lcfi1123:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1334:
+; NoVLX-NEXT: .Lcfi1124:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -37712,12 +36872,12 @@ define zeroext i32 @test_masked_vpcmpsgeq_v8i1_v32i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-LABEL: test_masked_vpcmpsgeq_v8i1_v32i1_mask_mem_b:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1335:
+; NoVLX-NEXT: .Lcfi1125:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1336:
+; NoVLX-NEXT: .Lcfi1126:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1337:
+; NoVLX-NEXT: .Lcfi1127:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -37792,53 +36952,53 @@ define zeroext i64 @test_vpcmpsgeq_v8i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %__
; NoVLX-LABEL: test_vpcmpsgeq_v8i1_v64i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1338:
+; NoVLX-NEXT: .Lcfi1128:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1339:
+; NoVLX-NEXT: .Lcfi1129:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1340:
+; NoVLX-NEXT: .Lcfi1130:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: vpcmpleq %zmm0, %zmm1, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $15, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %ecx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -37870,53 +37030,53 @@ define zeroext i64 @test_vpcmpsgeq_v8i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64>
; NoVLX-LABEL: test_vpcmpsgeq_v8i1_v64i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1341:
+; NoVLX-NEXT: .Lcfi1131:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1342:
+; NoVLX-NEXT: .Lcfi1132:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1343:
+; NoVLX-NEXT: .Lcfi1133:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: vpcmpnltq (%rdi), %zmm0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $15, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %ecx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -37950,54 +37110,54 @@ define zeroext i64 @test_masked_vpcmpsgeq_v8i1_v64i1_mask(i8 zeroext %__u, <8 x
; NoVLX-LABEL: test_masked_vpcmpsgeq_v8i1_v64i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1344:
+; NoVLX-NEXT: .Lcfi1134:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1345:
+; NoVLX-NEXT: .Lcfi1135:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1346:
+; NoVLX-NEXT: .Lcfi1136:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpcmpleq %zmm0, %zmm1, %k0 {%k1}
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $15, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %ecx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -38032,54 +37192,54 @@ define zeroext i64 @test_masked_vpcmpsgeq_v8i1_v64i1_mask_mem(i8 zeroext %__u, <
; NoVLX-LABEL: test_masked_vpcmpsgeq_v8i1_v64i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1347:
+; NoVLX-NEXT: .Lcfi1137:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1348:
+; NoVLX-NEXT: .Lcfi1138:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1349:
+; NoVLX-NEXT: .Lcfi1139:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpcmpnltq (%rsi), %zmm0, %k0 {%k1}
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $15, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %ecx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -38116,54 +37276,54 @@ define zeroext i64 @test_vpcmpsgeq_v8i1_v64i1_mask_mem_b(<8 x i64> %__a, i64* %_
; NoVLX-LABEL: test_vpcmpsgeq_v8i1_v64i1_mask_mem_b:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1350:
+; NoVLX-NEXT: .Lcfi1140:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1351:
+; NoVLX-NEXT: .Lcfi1141:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1352:
+; NoVLX-NEXT: .Lcfi1142:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: vpbroadcastq (%rdi), %zmm1
; NoVLX-NEXT: vpcmpleq %zmm0, %zmm1, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $15, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %ecx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -38199,55 +37359,55 @@ define zeroext i64 @test_masked_vpcmpsgeq_v8i1_v64i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-LABEL: test_masked_vpcmpsgeq_v8i1_v64i1_mask_mem_b:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1353:
+; NoVLX-NEXT: .Lcfi1143:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1354:
+; NoVLX-NEXT: .Lcfi1144:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1355:
+; NoVLX-NEXT: .Lcfi1145:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: vpbroadcastq (%rsi), %zmm1
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpcmpleq %zmm0, %zmm1, %k0 {%k1}
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $15, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %ecx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -38283,30 +37443,15 @@ define zeroext i32 @test_vpcmpultb_v16i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %_
; NoVLX-LABEL: test_vpcmpultb_v16i1_v32i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1356:
+; NoVLX-NEXT: .Lcfi1146:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1357:
+; NoVLX-NEXT: .Lcfi1147:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1358:
+; NoVLX-NEXT: .Lcfi1148:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
-; NoVLX-NEXT: pushq %r15
-; NoVLX-NEXT: pushq %r14
-; NoVLX-NEXT: pushq %r13
-; NoVLX-NEXT: pushq %r12
-; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
-; NoVLX-NEXT: .Lcfi1359:
-; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi1360:
-; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi1361:
-; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi1362:
-; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi1363:
-; NoVLX-NEXT: .cfi_offset %r15, -24
; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0
; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
@@ -38318,64 +37463,64 @@ define zeroext i32 @test_vpcmpultb_v16i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %_
; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $15, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $13, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: vmovd %ecx, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
@@ -38384,12 +37529,7 @@ define zeroext i32 @test_vpcmpultb_v16i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %_
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
-; NoVLX-NEXT: leaq -40(%rbp), %rsp
-; NoVLX-NEXT: popq %rbx
-; NoVLX-NEXT: popq %r12
-; NoVLX-NEXT: popq %r13
-; NoVLX-NEXT: popq %r14
-; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
; NoVLX-NEXT: retq
entry:
@@ -38411,30 +37551,15 @@ define zeroext i32 @test_vpcmpultb_v16i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64
; NoVLX-LABEL: test_vpcmpultb_v16i1_v32i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1364:
+; NoVLX-NEXT: .Lcfi1149:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1365:
+; NoVLX-NEXT: .Lcfi1150:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1366:
+; NoVLX-NEXT: .Lcfi1151:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
-; NoVLX-NEXT: pushq %r15
-; NoVLX-NEXT: pushq %r14
-; NoVLX-NEXT: pushq %r13
-; NoVLX-NEXT: pushq %r12
-; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
-; NoVLX-NEXT: .Lcfi1367:
-; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi1368:
-; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi1369:
-; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi1370:
-; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi1371:
-; NoVLX-NEXT: .cfi_offset %r15, -24
; NoVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpxor (%rdi), %xmm1, %xmm1
@@ -38446,64 +37571,64 @@ define zeroext i32 @test_vpcmpultb_v16i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64
; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $15, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $13, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: vmovd %ecx, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
@@ -38512,12 +37637,7 @@ define zeroext i32 @test_vpcmpultb_v16i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
-; NoVLX-NEXT: leaq -40(%rbp), %rsp
-; NoVLX-NEXT: popq %rbx
-; NoVLX-NEXT: popq %r12
-; NoVLX-NEXT: popq %r13
-; NoVLX-NEXT: popq %r14
-; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
; NoVLX-NEXT: retq
entry:
@@ -38541,30 +37661,15 @@ define zeroext i32 @test_masked_vpcmpultb_v16i1_v32i1_mask(i16 zeroext %__u, <2
; NoVLX-LABEL: test_masked_vpcmpultb_v16i1_v32i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1372:
+; NoVLX-NEXT: .Lcfi1152:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1373:
+; NoVLX-NEXT: .Lcfi1153:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1374:
+; NoVLX-NEXT: .Lcfi1154:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
-; NoVLX-NEXT: pushq %r15
-; NoVLX-NEXT: pushq %r14
-; NoVLX-NEXT: pushq %r13
-; NoVLX-NEXT: pushq %r12
-; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
-; NoVLX-NEXT: .Lcfi1375:
-; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi1376:
-; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi1377:
-; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi1378:
-; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi1379:
-; NoVLX-NEXT: .cfi_offset %r15, -24
; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0
; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
@@ -38577,64 +37682,64 @@ define zeroext i32 @test_masked_vpcmpultb_v16i1_v32i1_mask(i16 zeroext %__u, <2
; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $15, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $13, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: vmovd %ecx, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
@@ -38643,12 +37748,7 @@ define zeroext i32 @test_masked_vpcmpultb_v16i1_v32i1_mask(i16 zeroext %__u, <2
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
-; NoVLX-NEXT: leaq -40(%rbp), %rsp
-; NoVLX-NEXT: popq %rbx
-; NoVLX-NEXT: popq %r12
-; NoVLX-NEXT: popq %r13
-; NoVLX-NEXT: popq %r14
-; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
; NoVLX-NEXT: retq
entry:
@@ -38673,30 +37773,15 @@ define zeroext i32 @test_masked_vpcmpultb_v16i1_v32i1_mask_mem(i16 zeroext %__u,
; NoVLX-LABEL: test_masked_vpcmpultb_v16i1_v32i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1380:
+; NoVLX-NEXT: .Lcfi1155:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1381:
+; NoVLX-NEXT: .Lcfi1156:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1382:
+; NoVLX-NEXT: .Lcfi1157:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
-; NoVLX-NEXT: pushq %r15
-; NoVLX-NEXT: pushq %r14
-; NoVLX-NEXT: pushq %r13
-; NoVLX-NEXT: pushq %r12
-; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
-; NoVLX-NEXT: .Lcfi1383:
-; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi1384:
-; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi1385:
-; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi1386:
-; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi1387:
-; NoVLX-NEXT: .cfi_offset %r15, -24
; NoVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpxor (%rsi), %xmm1, %xmm1
@@ -38709,64 +37794,64 @@ define zeroext i32 @test_masked_vpcmpultb_v16i1_v32i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $15, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $13, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: vmovd %ecx, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
@@ -38775,12 +37860,7 @@ define zeroext i32 @test_masked_vpcmpultb_v16i1_v32i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
-; NoVLX-NEXT: leaq -40(%rbp), %rsp
-; NoVLX-NEXT: popq %rbx
-; NoVLX-NEXT: popq %r12
-; NoVLX-NEXT: popq %r13
-; NoVLX-NEXT: popq %r14
-; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
; NoVLX-NEXT: retq
entry:
@@ -38806,12 +37886,12 @@ define zeroext i64 @test_vpcmpultb_v16i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %_
; NoVLX-LABEL: test_vpcmpultb_v16i1_v64i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1388:
+; NoVLX-NEXT: .Lcfi1158:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1389:
+; NoVLX-NEXT: .Lcfi1159:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1390:
+; NoVLX-NEXT: .Lcfi1160:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: pushq %r15
; NoVLX-NEXT: pushq %r14
@@ -38820,15 +37900,15 @@ define zeroext i64 @test_vpcmpultb_v16i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %_
; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: .Lcfi1391:
+; NoVLX-NEXT: .Lcfi1161:
; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi1392:
+; NoVLX-NEXT: .Lcfi1162:
; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi1393:
+; NoVLX-NEXT: .Lcfi1163:
; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi1394:
+; NoVLX-NEXT: .Lcfi1164:
; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi1395:
+; NoVLX-NEXT: .Lcfi1165:
; NoVLX-NEXT: .cfi_offset %r15, -24
; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0
@@ -38837,10 +37917,6 @@ define zeroext i64 @test_vpcmpultb_v16i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %_
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
@@ -38883,11 +37959,11 @@ define zeroext i64 @test_vpcmpultb_v16i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %_
; NoVLX-NEXT: kshiftlw $2, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $1, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k1, %r8d
; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
@@ -38899,11 +37975,15 @@ define zeroext i64 @test_vpcmpultb_v16i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %_
; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpinsrb $14, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -38939,12 +38019,12 @@ define zeroext i64 @test_vpcmpultb_v16i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64
; NoVLX-LABEL: test_vpcmpultb_v16i1_v64i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1396:
+; NoVLX-NEXT: .Lcfi1166:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1397:
+; NoVLX-NEXT: .Lcfi1167:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1398:
+; NoVLX-NEXT: .Lcfi1168:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: pushq %r15
; NoVLX-NEXT: pushq %r14
@@ -38953,27 +38033,23 @@ define zeroext i64 @test_vpcmpultb_v16i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64
; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: .Lcfi1399:
+; NoVLX-NEXT: .Lcfi1169:
; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi1400:
+; NoVLX-NEXT: .Lcfi1170:
; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi1401:
+; NoVLX-NEXT: .Lcfi1171:
; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi1402:
+; NoVLX-NEXT: .Lcfi1172:
; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi1403:
+; NoVLX-NEXT: .Lcfi1173:
; NoVLX-NEXT: .cfi_offset %r15, -24
; NoVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; NoVLX-NEXT: vpxor (%rdi), %xmm1, %xmm2
; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; NoVLX-NEXT: vpxor (%rdi), %xmm1, %xmm1
-; NoVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
@@ -39016,11 +38092,11 @@ define zeroext i64 @test_vpcmpultb_v16i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64
; NoVLX-NEXT: kshiftlw $2, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $1, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k1, %r8d
; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
@@ -39032,11 +38108,15 @@ define zeroext i64 @test_vpcmpultb_v16i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64
; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpinsrb $14, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -39074,12 +38154,12 @@ define zeroext i64 @test_masked_vpcmpultb_v16i1_v64i1_mask(i16 zeroext %__u, <2
; NoVLX-LABEL: test_masked_vpcmpultb_v16i1_v64i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1404:
+; NoVLX-NEXT: .Lcfi1174:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1405:
+; NoVLX-NEXT: .Lcfi1175:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1406:
+; NoVLX-NEXT: .Lcfi1176:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: pushq %r15
; NoVLX-NEXT: pushq %r14
@@ -39088,15 +38168,15 @@ define zeroext i64 @test_masked_vpcmpultb_v16i1_v64i1_mask(i16 zeroext %__u, <2
; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: .Lcfi1407:
+; NoVLX-NEXT: .Lcfi1177:
; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi1408:
+; NoVLX-NEXT: .Lcfi1178:
; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi1409:
+; NoVLX-NEXT: .Lcfi1179:
; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi1410:
+; NoVLX-NEXT: .Lcfi1180:
; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi1411:
+; NoVLX-NEXT: .Lcfi1181:
; NoVLX-NEXT: .cfi_offset %r15, -24
; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0
@@ -39106,10 +38186,6 @@ define zeroext i64 @test_masked_vpcmpultb_v16i1_v64i1_mask(i16 zeroext %__u, <2
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
@@ -39152,11 +38228,11 @@ define zeroext i64 @test_masked_vpcmpultb_v16i1_v64i1_mask(i16 zeroext %__u, <2
; NoVLX-NEXT: kshiftlw $2, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $1, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k1, %r8d
; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
@@ -39168,11 +38244,15 @@ define zeroext i64 @test_masked_vpcmpultb_v16i1_v64i1_mask(i16 zeroext %__u, <2
; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpinsrb $14, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -39211,12 +38291,12 @@ define zeroext i64 @test_masked_vpcmpultb_v16i1_v64i1_mask_mem(i16 zeroext %__u,
; NoVLX-LABEL: test_masked_vpcmpultb_v16i1_v64i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1412:
+; NoVLX-NEXT: .Lcfi1182:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1413:
+; NoVLX-NEXT: .Lcfi1183:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1414:
+; NoVLX-NEXT: .Lcfi1184:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: pushq %r15
; NoVLX-NEXT: pushq %r14
@@ -39225,28 +38305,24 @@ define zeroext i64 @test_masked_vpcmpultb_v16i1_v64i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: .Lcfi1415:
+; NoVLX-NEXT: .Lcfi1185:
; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi1416:
+; NoVLX-NEXT: .Lcfi1186:
; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi1417:
+; NoVLX-NEXT: .Lcfi1187:
; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi1418:
+; NoVLX-NEXT: .Lcfi1188:
; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi1419:
+; NoVLX-NEXT: .Lcfi1189:
; NoVLX-NEXT: .cfi_offset %r15, -24
; NoVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; NoVLX-NEXT: vpxor (%rsi), %xmm1, %xmm2
; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; NoVLX-NEXT: vpxor (%rsi), %xmm1, %xmm1
-; NoVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
@@ -39289,11 +38365,11 @@ define zeroext i64 @test_masked_vpcmpultb_v16i1_v64i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: kshiftlw $2, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $1, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k1, %r8d
; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
@@ -39305,11 +38381,15 @@ define zeroext i64 @test_masked_vpcmpultb_v16i1_v64i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpinsrb $14, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -39350,12 +38430,12 @@ define zeroext i64 @test_vpcmpultb_v32i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %_
; NoVLX-LABEL: test_vpcmpultb_v32i1_v64i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1420:
+; NoVLX-NEXT: .Lcfi1190:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1421:
+; NoVLX-NEXT: .Lcfi1191:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1422:
+; NoVLX-NEXT: .Lcfi1192:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
@@ -39402,19 +38482,19 @@ define zeroext i64 @test_vpcmpultb_v32i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64
; NoVLX-LABEL: test_vpcmpultb_v32i1_v64i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1423:
+; NoVLX-NEXT: .Lcfi1193:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1424:
+; NoVLX-NEXT: .Lcfi1194:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1425:
+; NoVLX-NEXT: .Lcfi1195:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; NoVLX-NEXT: vpxor (%rdi), %ymm1, %ymm2
; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0
-; NoVLX-NEXT: vpxor (%rdi), %ymm1, %ymm1
-; NoVLX-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT: vpcmpgtb %ymm0, %ymm2, %ymm0
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
@@ -39456,12 +38536,12 @@ define zeroext i64 @test_masked_vpcmpultb_v32i1_v64i1_mask(i32 zeroext %__u, <4
; NoVLX-LABEL: test_masked_vpcmpultb_v32i1_v64i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1426:
+; NoVLX-NEXT: .Lcfi1196:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1427:
+; NoVLX-NEXT: .Lcfi1197:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1428:
+; NoVLX-NEXT: .Lcfi1198:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $96, %rsp
@@ -39520,12 +38600,12 @@ define zeroext i64 @test_masked_vpcmpultb_v32i1_v64i1_mask_mem(i32 zeroext %__u,
; NoVLX-LABEL: test_masked_vpcmpultb_v32i1_v64i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1429:
+; NoVLX-NEXT: .Lcfi1199:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1430:
+; NoVLX-NEXT: .Lcfi1200:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1431:
+; NoVLX-NEXT: .Lcfi1201:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $96, %rsp
@@ -39719,12 +38799,12 @@ define zeroext i32 @test_vpcmpultw_v8i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__
; NoVLX-LABEL: test_vpcmpultw_v8i1_v32i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1432:
+; NoVLX-NEXT: .Lcfi1202:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1433:
+; NoVLX-NEXT: .Lcfi1203:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1434:
+; NoVLX-NEXT: .Lcfi1204:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -39797,12 +38877,12 @@ define zeroext i32 @test_vpcmpultw_v8i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>
; NoVLX-LABEL: test_vpcmpultw_v8i1_v32i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1435:
+; NoVLX-NEXT: .Lcfi1205:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1436:
+; NoVLX-NEXT: .Lcfi1206:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1437:
+; NoVLX-NEXT: .Lcfi1207:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -39877,12 +38957,12 @@ define zeroext i32 @test_masked_vpcmpultw_v8i1_v32i1_mask(i8 zeroext %__u, <2 x
; NoVLX-LABEL: test_masked_vpcmpultw_v8i1_v32i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1438:
+; NoVLX-NEXT: .Lcfi1208:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1439:
+; NoVLX-NEXT: .Lcfi1209:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1440:
+; NoVLX-NEXT: .Lcfi1210:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -39959,12 +39039,12 @@ define zeroext i32 @test_masked_vpcmpultw_v8i1_v32i1_mask_mem(i8 zeroext %__u, <
; NoVLX-LABEL: test_masked_vpcmpultw_v8i1_v32i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1441:
+; NoVLX-NEXT: .Lcfi1211:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1442:
+; NoVLX-NEXT: .Lcfi1212:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1443:
+; NoVLX-NEXT: .Lcfi1213:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -40042,12 +39122,12 @@ define zeroext i64 @test_vpcmpultw_v8i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__
; NoVLX-LABEL: test_vpcmpultw_v8i1_v64i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1444:
+; NoVLX-NEXT: .Lcfi1214:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1445:
+; NoVLX-NEXT: .Lcfi1215:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1446:
+; NoVLX-NEXT: .Lcfi1216:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
@@ -40058,43 +39138,43 @@ define zeroext i64 @test_vpcmpultw_v8i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__
; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $15, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %ecx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -40125,59 +39205,59 @@ define zeroext i64 @test_vpcmpultw_v8i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>
; NoVLX-LABEL: test_vpcmpultw_v8i1_v64i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1447:
+; NoVLX-NEXT: .Lcfi1217:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1448:
+; NoVLX-NEXT: .Lcfi1218:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1449:
+; NoVLX-NEXT: .Lcfi1219:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; NoVLX-NEXT: vpxor (%rdi), %xmm1, %xmm2
; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; NoVLX-NEXT: vpxor (%rdi), %xmm1, %xmm1
-; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm2, %xmm0
; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $15, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %ecx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -40210,12 +39290,12 @@ define zeroext i64 @test_masked_vpcmpultw_v8i1_v64i1_mask(i8 zeroext %__u, <2 x
; NoVLX-LABEL: test_masked_vpcmpultw_v8i1_v64i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1450:
+; NoVLX-NEXT: .Lcfi1220:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1451:
+; NoVLX-NEXT: .Lcfi1221:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1452:
+; NoVLX-NEXT: .Lcfi1222:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
@@ -40227,43 +39307,43 @@ define zeroext i64 @test_masked_vpcmpultw_v8i1_v64i1_mask(i8 zeroext %__u, <2 x
; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $15, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %ecx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -40297,60 +39377,60 @@ define zeroext i64 @test_masked_vpcmpultw_v8i1_v64i1_mask_mem(i8 zeroext %__u, <
; NoVLX-LABEL: test_masked_vpcmpultw_v8i1_v64i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1453:
+; NoVLX-NEXT: .Lcfi1223:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1454:
+; NoVLX-NEXT: .Lcfi1224:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1455:
+; NoVLX-NEXT: .Lcfi1225:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; NoVLX-NEXT: vpxor (%rsi), %xmm1, %xmm2
; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; NoVLX-NEXT: vpxor (%rsi), %xmm1, %xmm1
-; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm2, %xmm0
; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $15, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %ecx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -40386,30 +39466,15 @@ define zeroext i32 @test_vpcmpultw_v16i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %_
; NoVLX-LABEL: test_vpcmpultw_v16i1_v32i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1456:
+; NoVLX-NEXT: .Lcfi1226:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1457:
+; NoVLX-NEXT: .Lcfi1227:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1458:
+; NoVLX-NEXT: .Lcfi1228:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
-; NoVLX-NEXT: pushq %r15
-; NoVLX-NEXT: pushq %r14
-; NoVLX-NEXT: pushq %r13
-; NoVLX-NEXT: pushq %r12
-; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
-; NoVLX-NEXT: .Lcfi1459:
-; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi1460:
-; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi1461:
-; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi1462:
-; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi1463:
-; NoVLX-NEXT: .cfi_offset %r15, -24
; NoVLX-NEXT: vmovdqa {{.*#+}} ymm2 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768]
; NoVLX-NEXT: vpxor %ymm2, %ymm0, %ymm0
; NoVLX-NEXT: vpxor %ymm2, %ymm1, %ymm1
@@ -40421,64 +39486,64 @@ define zeroext i32 @test_vpcmpultw_v16i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %_
; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $15, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $13, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: vmovd %ecx, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
@@ -40487,12 +39552,7 @@ define zeroext i32 @test_vpcmpultw_v16i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %_
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
-; NoVLX-NEXT: leaq -40(%rbp), %rsp
-; NoVLX-NEXT: popq %rbx
-; NoVLX-NEXT: popq %r12
-; NoVLX-NEXT: popq %r13
-; NoVLX-NEXT: popq %r14
-; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
; NoVLX-NEXT: retq
entry:
@@ -40515,30 +39575,15 @@ define zeroext i32 @test_vpcmpultw_v16i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64
; NoVLX-LABEL: test_vpcmpultw_v16i1_v32i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1464:
+; NoVLX-NEXT: .Lcfi1229:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1465:
+; NoVLX-NEXT: .Lcfi1230:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1466:
+; NoVLX-NEXT: .Lcfi1231:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
-; NoVLX-NEXT: pushq %r15
-; NoVLX-NEXT: pushq %r14
-; NoVLX-NEXT: pushq %r13
-; NoVLX-NEXT: pushq %r12
-; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
-; NoVLX-NEXT: .Lcfi1467:
-; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi1468:
-; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi1469:
-; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi1470:
-; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi1471:
-; NoVLX-NEXT: .cfi_offset %r15, -24
; NoVLX-NEXT: vmovdqa {{.*#+}} ymm1 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768]
; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0
; NoVLX-NEXT: vpxor (%rdi), %ymm1, %ymm1
@@ -40550,64 +39595,64 @@ define zeroext i32 @test_vpcmpultw_v16i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64
; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $15, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $13, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: vmovd %ecx, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
@@ -40616,12 +39661,7 @@ define zeroext i32 @test_vpcmpultw_v16i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
-; NoVLX-NEXT: leaq -40(%rbp), %rsp
-; NoVLX-NEXT: popq %rbx
-; NoVLX-NEXT: popq %r12
-; NoVLX-NEXT: popq %r13
-; NoVLX-NEXT: popq %r14
-; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
; NoVLX-NEXT: retq
entry:
@@ -40646,30 +39686,15 @@ define zeroext i32 @test_masked_vpcmpultw_v16i1_v32i1_mask(i16 zeroext %__u, <4
; NoVLX-LABEL: test_masked_vpcmpultw_v16i1_v32i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1472:
+; NoVLX-NEXT: .Lcfi1232:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1473:
+; NoVLX-NEXT: .Lcfi1233:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1474:
+; NoVLX-NEXT: .Lcfi1234:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
-; NoVLX-NEXT: pushq %r15
-; NoVLX-NEXT: pushq %r14
-; NoVLX-NEXT: pushq %r13
-; NoVLX-NEXT: pushq %r12
-; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
-; NoVLX-NEXT: .Lcfi1475:
-; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi1476:
-; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi1477:
-; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi1478:
-; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi1479:
-; NoVLX-NEXT: .cfi_offset %r15, -24
; NoVLX-NEXT: vmovdqa {{.*#+}} ymm2 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768]
; NoVLX-NEXT: vpxor %ymm2, %ymm0, %ymm0
; NoVLX-NEXT: vpxor %ymm2, %ymm1, %ymm1
@@ -40682,64 +39707,64 @@ define zeroext i32 @test_masked_vpcmpultw_v16i1_v32i1_mask(i16 zeroext %__u, <4
; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $15, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $13, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: vmovd %ecx, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
@@ -40748,12 +39773,7 @@ define zeroext i32 @test_masked_vpcmpultw_v16i1_v32i1_mask(i16 zeroext %__u, <4
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
-; NoVLX-NEXT: leaq -40(%rbp), %rsp
-; NoVLX-NEXT: popq %rbx
-; NoVLX-NEXT: popq %r12
-; NoVLX-NEXT: popq %r13
-; NoVLX-NEXT: popq %r14
-; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
; NoVLX-NEXT: retq
entry:
@@ -40779,30 +39799,15 @@ define zeroext i32 @test_masked_vpcmpultw_v16i1_v32i1_mask_mem(i16 zeroext %__u,
; NoVLX-LABEL: test_masked_vpcmpultw_v16i1_v32i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1480:
+; NoVLX-NEXT: .Lcfi1235:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1481:
+; NoVLX-NEXT: .Lcfi1236:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1482:
+; NoVLX-NEXT: .Lcfi1237:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
-; NoVLX-NEXT: pushq %r15
-; NoVLX-NEXT: pushq %r14
-; NoVLX-NEXT: pushq %r13
-; NoVLX-NEXT: pushq %r12
-; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
-; NoVLX-NEXT: .Lcfi1483:
-; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi1484:
-; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi1485:
-; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi1486:
-; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi1487:
-; NoVLX-NEXT: .cfi_offset %r15, -24
; NoVLX-NEXT: vmovdqa {{.*#+}} ymm1 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768]
; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0
; NoVLX-NEXT: vpxor (%rsi), %ymm1, %ymm1
@@ -40815,64 +39820,64 @@ define zeroext i32 @test_masked_vpcmpultw_v16i1_v32i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $15, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $13, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: vmovd %ecx, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
@@ -40881,12 +39886,7 @@ define zeroext i32 @test_masked_vpcmpultw_v16i1_v32i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
-; NoVLX-NEXT: leaq -40(%rbp), %rsp
-; NoVLX-NEXT: popq %rbx
-; NoVLX-NEXT: popq %r12
-; NoVLX-NEXT: popq %r13
-; NoVLX-NEXT: popq %r14
-; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
; NoVLX-NEXT: retq
entry:
@@ -40913,12 +39913,12 @@ define zeroext i64 @test_vpcmpultw_v16i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %_
; NoVLX-LABEL: test_vpcmpultw_v16i1_v64i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1488:
+; NoVLX-NEXT: .Lcfi1238:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1489:
+; NoVLX-NEXT: .Lcfi1239:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1490:
+; NoVLX-NEXT: .Lcfi1240:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: pushq %r15
; NoVLX-NEXT: pushq %r14
@@ -40927,15 +39927,15 @@ define zeroext i64 @test_vpcmpultw_v16i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %_
; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: .Lcfi1491:
+; NoVLX-NEXT: .Lcfi1241:
; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi1492:
+; NoVLX-NEXT: .Lcfi1242:
; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi1493:
+; NoVLX-NEXT: .Lcfi1243:
; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi1494:
+; NoVLX-NEXT: .Lcfi1244:
; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi1495:
+; NoVLX-NEXT: .Lcfi1245:
; NoVLX-NEXT: .cfi_offset %r15, -24
; NoVLX-NEXT: vmovdqa {{.*#+}} ymm2 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768]
; NoVLX-NEXT: vpxor %ymm2, %ymm0, %ymm0
@@ -40944,10 +39944,6 @@ define zeroext i64 @test_vpcmpultw_v16i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %_
; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
@@ -40990,11 +39986,11 @@ define zeroext i64 @test_vpcmpultw_v16i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %_
; NoVLX-NEXT: kshiftlw $2, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $1, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k1, %r8d
; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
@@ -41006,11 +40002,15 @@ define zeroext i64 @test_vpcmpultw_v16i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %_
; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpinsrb $14, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -41047,12 +40047,12 @@ define zeroext i64 @test_vpcmpultw_v16i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64
; NoVLX-LABEL: test_vpcmpultw_v16i1_v64i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1496:
+; NoVLX-NEXT: .Lcfi1246:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1497:
+; NoVLX-NEXT: .Lcfi1247:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1498:
+; NoVLX-NEXT: .Lcfi1248:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: pushq %r15
; NoVLX-NEXT: pushq %r14
@@ -41061,27 +40061,23 @@ define zeroext i64 @test_vpcmpultw_v16i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64
; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: .Lcfi1499:
+; NoVLX-NEXT: .Lcfi1249:
; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi1500:
+; NoVLX-NEXT: .Lcfi1250:
; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi1501:
+; NoVLX-NEXT: .Lcfi1251:
; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi1502:
+; NoVLX-NEXT: .Lcfi1252:
; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi1503:
+; NoVLX-NEXT: .Lcfi1253:
; NoVLX-NEXT: .cfi_offset %r15, -24
; NoVLX-NEXT: vmovdqa {{.*#+}} ymm1 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768]
+; NoVLX-NEXT: vpxor (%rdi), %ymm1, %ymm2
; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0
-; NoVLX-NEXT: vpxor (%rdi), %ymm1, %ymm1
-; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm2, %ymm0
; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
@@ -41124,11 +40120,11 @@ define zeroext i64 @test_vpcmpultw_v16i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64
; NoVLX-NEXT: kshiftlw $2, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $1, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k1, %r8d
; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
@@ -41140,11 +40136,15 @@ define zeroext i64 @test_vpcmpultw_v16i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64
; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpinsrb $14, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -41183,12 +40183,12 @@ define zeroext i64 @test_masked_vpcmpultw_v16i1_v64i1_mask(i16 zeroext %__u, <4
; NoVLX-LABEL: test_masked_vpcmpultw_v16i1_v64i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1504:
+; NoVLX-NEXT: .Lcfi1254:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1505:
+; NoVLX-NEXT: .Lcfi1255:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1506:
+; NoVLX-NEXT: .Lcfi1256:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: pushq %r15
; NoVLX-NEXT: pushq %r14
@@ -41197,15 +40197,15 @@ define zeroext i64 @test_masked_vpcmpultw_v16i1_v64i1_mask(i16 zeroext %__u, <4
; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: .Lcfi1507:
+; NoVLX-NEXT: .Lcfi1257:
; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi1508:
+; NoVLX-NEXT: .Lcfi1258:
; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi1509:
+; NoVLX-NEXT: .Lcfi1259:
; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi1510:
+; NoVLX-NEXT: .Lcfi1260:
; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi1511:
+; NoVLX-NEXT: .Lcfi1261:
; NoVLX-NEXT: .cfi_offset %r15, -24
; NoVLX-NEXT: vmovdqa {{.*#+}} ymm2 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768]
; NoVLX-NEXT: vpxor %ymm2, %ymm0, %ymm0
@@ -41215,10 +40215,6 @@ define zeroext i64 @test_masked_vpcmpultw_v16i1_v64i1_mask(i16 zeroext %__u, <4
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
@@ -41261,11 +40257,11 @@ define zeroext i64 @test_masked_vpcmpultw_v16i1_v64i1_mask(i16 zeroext %__u, <4
; NoVLX-NEXT: kshiftlw $2, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $1, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k1, %r8d
; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
@@ -41277,11 +40273,15 @@ define zeroext i64 @test_masked_vpcmpultw_v16i1_v64i1_mask(i16 zeroext %__u, <4
; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpinsrb $14, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -41321,12 +40321,12 @@ define zeroext i64 @test_masked_vpcmpultw_v16i1_v64i1_mask_mem(i16 zeroext %__u,
; NoVLX-LABEL: test_masked_vpcmpultw_v16i1_v64i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1512:
+; NoVLX-NEXT: .Lcfi1262:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1513:
+; NoVLX-NEXT: .Lcfi1263:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1514:
+; NoVLX-NEXT: .Lcfi1264:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: pushq %r15
; NoVLX-NEXT: pushq %r14
@@ -41335,28 +40335,24 @@ define zeroext i64 @test_masked_vpcmpultw_v16i1_v64i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: .Lcfi1515:
+; NoVLX-NEXT: .Lcfi1265:
; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi1516:
+; NoVLX-NEXT: .Lcfi1266:
; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi1517:
+; NoVLX-NEXT: .Lcfi1267:
; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi1518:
+; NoVLX-NEXT: .Lcfi1268:
; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi1519:
+; NoVLX-NEXT: .Lcfi1269:
; NoVLX-NEXT: .cfi_offset %r15, -24
; NoVLX-NEXT: vmovdqa {{.*#+}} ymm1 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768]
+; NoVLX-NEXT: vpxor (%rsi), %ymm1, %ymm2
; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0
-; NoVLX-NEXT: vpxor (%rsi), %ymm1, %ymm1
-; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm2, %ymm0
; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
@@ -41399,11 +40395,11 @@ define zeroext i64 @test_masked_vpcmpultw_v16i1_v64i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: kshiftlw $2, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $1, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k1, %r8d
; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
@@ -41415,11 +40411,15 @@ define zeroext i64 @test_masked_vpcmpultw_v16i1_v64i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpinsrb $14, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -41460,62 +40460,58 @@ define zeroext i64 @test_vpcmpultw_v32i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %_
; NoVLX-LABEL: test_vpcmpultw_v32i1_v64i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1520:
+; NoVLX-NEXT: .Lcfi1270:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1521:
+; NoVLX-NEXT: .Lcfi1271:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1522:
+; NoVLX-NEXT: .Lcfi1272:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: vextracti32x4 $1, %zmm0, %xmm3
-; NoVLX-NEXT: vmovq %xmm3, %rax
+; NoVLX-NEXT: vextracti32x4 $1, %zmm0, %xmm2
+; NoVLX-NEXT: vmovq %xmm2, %rax
; NoVLX-NEXT: movq %rax, %rcx
; NoVLX-NEXT: movq %rax, %rdx
-; NoVLX-NEXT: vmovd %eax, %xmm2
+; NoVLX-NEXT: vmovd %eax, %xmm3
; NoVLX-NEXT: shrl $16, %eax
-; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm5
-; NoVLX-NEXT: vextracti32x4 $2, %zmm1, %xmm8
-; NoVLX-NEXT: vextracti32x4 $3, %zmm1, %xmm4
-; NoVLX-NEXT: vextracti32x4 $1, %zmm1, %xmm6
-; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm7
-; NoVLX-NEXT: vextracti32x4 $3, %zmm0, %xmm2
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3
; NoVLX-NEXT: shrq $32, %rdx
-; NoVLX-NEXT: vpinsrw $2, %edx, %xmm5, %xmm5
-; NoVLX-NEXT: vpextrq $1, %xmm3, %rax
+; NoVLX-NEXT: vpinsrw $2, %edx, %xmm3, %xmm3
+; NoVLX-NEXT: vpextrq $1, %xmm2, %rax
; NoVLX-NEXT: shrq $48, %rcx
-; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm5, %xmm3
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm2
; NoVLX-NEXT: movl %eax, %ecx
; NoVLX-NEXT: shrl $16, %ecx
-; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3
-; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2
; NoVLX-NEXT: movq %rax, %rcx
; NoVLX-NEXT: shrq $32, %rcx
-; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2
; NoVLX-NEXT: vmovq %xmm0, %rcx
; NoVLX-NEXT: shrq $48, %rax
-; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm8
; NoVLX-NEXT: movl %ecx, %eax
; NoVLX-NEXT: shrl $16, %eax
-; NoVLX-NEXT: vmovd %ecx, %xmm5
-; NoVLX-NEXT: vpinsrw $1, %eax, %xmm5, %xmm5
+; NoVLX-NEXT: vmovd %ecx, %xmm3
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3
; NoVLX-NEXT: movq %rcx, %rax
; NoVLX-NEXT: shrq $32, %rax
-; NoVLX-NEXT: vpinsrw $2, %eax, %xmm5, %xmm5
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3
; NoVLX-NEXT: vpextrq $1, %xmm0, %rax
; NoVLX-NEXT: shrq $48, %rcx
-; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm5, %xmm0
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm3
; NoVLX-NEXT: movl %eax, %ecx
; NoVLX-NEXT: shrl $16, %ecx
-; NoVLX-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3
; NoVLX-NEXT: movq %rax, %rcx
; NoVLX-NEXT: shrq $32, %rcx
-; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: vmovq %xmm2, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: vextracti32x4 $3, %zmm0, %xmm4
+; NoVLX-NEXT: vmovq %xmm4, %rcx
; NoVLX-NEXT: shrq $48, %rax
-; NoVLX-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm9
; NoVLX-NEXT: movl %ecx, %eax
; NoVLX-NEXT: shrl $16, %eax
; NoVLX-NEXT: vmovd %ecx, %xmm5
@@ -41523,79 +40519,72 @@ define zeroext i64 @test_vpcmpultw_v32i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %_
; NoVLX-NEXT: movq %rcx, %rax
; NoVLX-NEXT: shrq $32, %rax
; NoVLX-NEXT: vpinsrw $2, %eax, %xmm5, %xmm5
-; NoVLX-NEXT: vpextrq $1, %xmm2, %rax
+; NoVLX-NEXT: vpextrq $1, %xmm4, %rax
; NoVLX-NEXT: shrq $48, %rcx
-; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm5, %xmm2
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm5, %xmm4
; NoVLX-NEXT: movl %eax, %ecx
; NoVLX-NEXT: shrl $16, %ecx
-; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm4, %xmm4
; NoVLX-NEXT: movq %rax, %rcx
; NoVLX-NEXT: shrq $32, %rcx
-; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2
-; NoVLX-NEXT: vmovq %xmm7, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm4, %xmm4
+; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm0
+; NoVLX-NEXT: vmovq %xmm0, %rcx
; NoVLX-NEXT: shrq $48, %rax
-; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm5
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm4, %xmm4
; NoVLX-NEXT: movl %ecx, %eax
; NoVLX-NEXT: shrl $16, %eax
-; NoVLX-NEXT: vmovd %ecx, %xmm2
-; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vmovd %ecx, %xmm5
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm5, %xmm5
; NoVLX-NEXT: movq %rcx, %rax
; NoVLX-NEXT: shrq $32, %rax
-; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: vpextrq $1, %xmm7, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm5, %xmm5
+; NoVLX-NEXT: vpextrq $1, %xmm0, %rax
; NoVLX-NEXT: shrq $48, %rcx
-; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm5, %xmm0
; NoVLX-NEXT: movl %eax, %ecx
; NoVLX-NEXT: shrl $16, %ecx
-; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: movq %rax, %rcx
; NoVLX-NEXT: shrq $32, %rcx
-; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vextracti32x4 $1, %zmm1, %xmm6
; NoVLX-NEXT: vmovq %xmm6, %rcx
; NoVLX-NEXT: shrq $48, %rax
-; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm7
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm0, %xmm5
; NoVLX-NEXT: movl %ecx, %eax
; NoVLX-NEXT: shrl $16, %eax
-; NoVLX-NEXT: vmovd %ecx, %xmm2
-; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vmovd %ecx, %xmm0
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0
; NoVLX-NEXT: movq %rcx, %rax
; NoVLX-NEXT: shrq $32, %rax
-; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0
; NoVLX-NEXT: vpextrq $1, %xmm6, %rax
; NoVLX-NEXT: shrq $48, %rcx
-; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: movl %eax, %ecx
; NoVLX-NEXT: shrl $16, %ecx
-; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: movq %rax, %rcx
-; NoVLX-NEXT: shrq $32, %rcx
-; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2
-; NoVLX-NEXT: vmovq %xmm1, %rcx
; NoVLX-NEXT: shrq $48, %rax
-; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm6
-; NoVLX-NEXT: movl %ecx, %eax
-; NoVLX-NEXT: shrl $16, %eax
-; NoVLX-NEXT: vmovd %ecx, %xmm2
-; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: movq %rcx, %rax
-; NoVLX-NEXT: shrq $32, %rax
-; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: vpextrq $1, %xmm1, %rax
-; NoVLX-NEXT: shrq $48, %rcx
-; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm1
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vmovq %xmm1, %rax
; NoVLX-NEXT: movl %eax, %ecx
; NoVLX-NEXT: shrl $16, %ecx
-; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT: vmovd %eax, %xmm6
+; NoVLX-NEXT: vpinsrw $1, %ecx, %xmm6, %xmm6
; NoVLX-NEXT: movq %rax, %rcx
; NoVLX-NEXT: shrq $32, %rcx
-; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1
-; NoVLX-NEXT: vmovq %xmm4, %rcx
+; NoVLX-NEXT: vpinsrw $2, %ecx, %xmm6, %xmm6
+; NoVLX-NEXT: vextracti32x4 $3, %zmm1, %xmm7
+; NoVLX-NEXT: vmovq %xmm7, %rcx
; NoVLX-NEXT: shrq $48, %rax
-; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrw $3, %eax, %xmm6, %xmm6
; NoVLX-NEXT: movl %ecx, %eax
; NoVLX-NEXT: shrl $16, %eax
; NoVLX-NEXT: vmovd %ecx, %xmm2
@@ -41603,7 +40592,7 @@ define zeroext i64 @test_vpcmpultw_v32i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %_
; NoVLX-NEXT: movq %rcx, %rax
; NoVLX-NEXT: shrq $32, %rax
; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: vpextrq $1, %xmm4, %rax
+; NoVLX-NEXT: vpextrq $1, %xmm7, %rax
; NoVLX-NEXT: shrq $48, %rcx
; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm2
; NoVLX-NEXT: movl %eax, %ecx
@@ -41611,111 +40600,122 @@ define zeroext i64 @test_vpcmpultw_v32i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %_
; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2
; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2
; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $48, %rax
; NoVLX-NEXT: shrq $32, %rcx
; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2
-; NoVLX-NEXT: vmovq %xmm8, %rcx
-; NoVLX-NEXT: shrq $48, %rax
; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vextracti32x4 $2, %zmm1, %xmm7
+; NoVLX-NEXT: vmovq %xmm7, %rax
+; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: shrl $16, %ecx
+; NoVLX-NEXT: vmovd %eax, %xmm3
+; NoVLX-NEXT: vpinsrw $1, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: shrq $32, %rcx
+; NoVLX-NEXT: vpinsrw $2, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: vpextrq $1, %xmm7, %rcx
+; NoVLX-NEXT: shrq $48, %rax
+; NoVLX-NEXT: vpinsrw $3, %eax, %xmm3, %xmm3
; NoVLX-NEXT: movl %ecx, %eax
+; NoVLX-NEXT: vpinsrw $4, %ecx, %xmm3, %xmm3
; NoVLX-NEXT: shrl $16, %eax
-; NoVLX-NEXT: vmovd %ecx, %xmm4
-; NoVLX-NEXT: vpinsrw $1, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: vpinsrw $5, %eax, %xmm3, %xmm3
; NoVLX-NEXT: movq %rcx, %rax
; NoVLX-NEXT: shrq $32, %rax
-; NoVLX-NEXT: vpinsrw $2, %eax, %xmm4, %xmm4
-; NoVLX-NEXT: vpextrq $1, %xmm8, %rax
-; NoVLX-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0
-; NoVLX-NEXT: vinserti128 $1, %xmm5, %ymm7, %ymm3
-; NoVLX-NEXT: vinserti128 $1, %xmm6, %ymm1, %ymm1
+; NoVLX-NEXT: vpinsrw $6, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpextrq $1, %xmm1, %rax
+; NoVLX-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm4
; NoVLX-NEXT: shrq $48, %rcx
-; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm4, %xmm4
+; NoVLX-NEXT: vpinsrw $7, %ecx, %xmm3, %xmm1
+; NoVLX-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm2
+; NoVLX-NEXT: vmovdqa {{.*#+}} ymm1 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768]
+; NoVLX-NEXT: vpxor %ymm1, %ymm4, %ymm3
+; NoVLX-NEXT: vpxor %ymm1, %ymm2, %ymm2
+; NoVLX-NEXT: vpcmpgtw %ymm3, %ymm2, %ymm2
; NoVLX-NEXT: movl %eax, %ecx
; NoVLX-NEXT: shrl $16, %ecx
-; NoVLX-NEXT: vpinsrw $4, %eax, %xmm4, %xmm4
-; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm4, %xmm4
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm6, %xmm3
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3
; NoVLX-NEXT: movq %rax, %rcx
; NoVLX-NEXT: shrq $48, %rax
; NoVLX-NEXT: shrq $32, %rcx
-; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm4, %xmm4
-; NoVLX-NEXT: vpinsrw $7, %eax, %xmm4, %xmm4
-; NoVLX-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm4
-; NoVLX-NEXT: vmovdqa {{.*#+}} ymm2 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768]
-; NoVLX-NEXT: vpxor %ymm2, %ymm3, %ymm3
-; NoVLX-NEXT: vpxor %ymm2, %ymm4, %ymm4
-; NoVLX-NEXT: vpcmpgtw %ymm3, %ymm4, %ymm3
-; NoVLX-NEXT: vpmovsxwd %ymm3, %zmm3
-; NoVLX-NEXT: vpslld $31, %zmm3, %zmm3
-; NoVLX-NEXT: vptestmd %zmm3, %zmm3, %k0
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm4
+; NoVLX-NEXT: vpmovsxwd %ymm2, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2
+; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $15, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm3
-; NoVLX-NEXT: vpinsrb $1, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vmovd %ecx, %xmm2
+; NoVLX-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2
; NoVLX-NEXT: kshiftlw $13, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $2, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $3, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2
; NoVLX-NEXT: kshiftlw $11, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2
; NoVLX-NEXT: kshiftlw $10, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2
; NoVLX-NEXT: kshiftlw $9, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2
; NoVLX-NEXT: kshiftlw $8, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2
; NoVLX-NEXT: kshiftlw $7, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2
; NoVLX-NEXT: kshiftlw $6, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $9, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2
; NoVLX-NEXT: kshiftlw $5, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $10, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2
; NoVLX-NEXT: kshiftlw $4, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2
; NoVLX-NEXT: kshiftlw $3, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $12, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2
; NoVLX-NEXT: kshiftlw $2, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $13, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2
; NoVLX-NEXT: kshiftlw $1, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vinserti128 $1, %xmm8, %ymm9, %ymm3
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $15, %eax, %xmm3, %xmm3
-; NoVLX-NEXT: vpmovsxbd %xmm3, %zmm3
-; NoVLX-NEXT: vpslld $31, %zmm3, %zmm3
-; NoVLX-NEXT: vptestmd %zmm3, %zmm3, %k0
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vinserti128 $1, %xmm0, %ymm4, %ymm0
+; NoVLX-NEXT: vpmovsxbd %xmm2, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2
+; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpxor %ymm2, %ymm0, %ymm0
-; NoVLX-NEXT: vpxor %ymm2, %ymm1, %ymm1
-; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT: vpxor %ymm1, %ymm3, %ymm2
+; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpcmpgtw %ymm2, %ymm0, %ymm0
; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -41816,177 +40816,177 @@ define zeroext i64 @test_vpcmpultw_v32i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64
; NoVLX-LABEL: test_vpcmpultw_v32i1_v64i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1523:
+; NoVLX-NEXT: .Lcfi1273:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1524:
+; NoVLX-NEXT: .Lcfi1274:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1525:
+; NoVLX-NEXT: .Lcfi1275:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: vextracti32x4 $1, %zmm0, %xmm2
-; NoVLX-NEXT: vmovq %xmm2, %rax
+; NoVLX-NEXT: vextracti32x4 $1, %zmm0, %xmm1
+; NoVLX-NEXT: vmovq %xmm1, %rax
; NoVLX-NEXT: movq %rax, %rcx
; NoVLX-NEXT: movq %rax, %rdx
-; NoVLX-NEXT: vmovd %eax, %xmm1
+; NoVLX-NEXT: vmovd %eax, %xmm2
; NoVLX-NEXT: shrl $16, %eax
-; NoVLX-NEXT: vpinsrw $1, %eax, %xmm1, %xmm3
-; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm1
-; NoVLX-NEXT: vextracti32x4 $3, %zmm0, %xmm4
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpextrq $1, %xmm0, %rax
; NoVLX-NEXT: shrq $32, %rdx
-; NoVLX-NEXT: vpinsrw $2, %edx, %xmm3, %xmm3
-; NoVLX-NEXT: vpextrq $1, %xmm2, %rax
+; NoVLX-NEXT: vpinsrw $2, %edx, %xmm2, %xmm2
+; NoVLX-NEXT: vpextrq $1, %xmm1, %rdx
; NoVLX-NEXT: shrq $48, %rcx
-; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm2
-; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm1
+; NoVLX-NEXT: movl %edx, %ecx
; NoVLX-NEXT: shrl $16, %ecx
-; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2
-; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: vpinsrw $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT: movq %rdx, %rcx
+; NoVLX-NEXT: shrq $48, %rdx
; NoVLX-NEXT: shrq $32, %rcx
-; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrw $7, %edx, %xmm1, %xmm1
; NoVLX-NEXT: vmovq %xmm0, %rcx
-; NoVLX-NEXT: shrq $48, %rax
-; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: movl %ecx, %eax
-; NoVLX-NEXT: shrl $16, %eax
-; NoVLX-NEXT: vmovd %ecx, %xmm3
-; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3
-; NoVLX-NEXT: movq %rcx, %rax
-; NoVLX-NEXT: shrq $32, %rax
-; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3
-; NoVLX-NEXT: vpextrq $1, %xmm0, %rax
+; NoVLX-NEXT: movl %ecx, %edx
+; NoVLX-NEXT: shrl $16, %edx
+; NoVLX-NEXT: vmovd %ecx, %xmm2
+; NoVLX-NEXT: vpinsrw $1, %edx, %xmm2, %xmm2
+; NoVLX-NEXT: movq %rcx, %rdx
+; NoVLX-NEXT: shrq $32, %rdx
+; NoVLX-NEXT: vpinsrw $2, %edx, %xmm2, %xmm2
+; NoVLX-NEXT: movl %eax, %edx
+; NoVLX-NEXT: shrl $16, %edx
; NoVLX-NEXT: shrq $48, %rcx
-; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm0
-; NoVLX-NEXT: movl %eax, %ecx
-; NoVLX-NEXT: shrl $16, %ecx
-; NoVLX-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: movq %rax, %rcx
-; NoVLX-NEXT: shrq $32, %rcx
-; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: vmovq %xmm4, %rcx
-; NoVLX-NEXT: shrq $48, %rax
-; NoVLX-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: movl %ecx, %eax
-; NoVLX-NEXT: shrl $16, %eax
-; NoVLX-NEXT: vmovd %ecx, %xmm3
-; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3
-; NoVLX-NEXT: movq %rcx, %rax
-; NoVLX-NEXT: shrq $32, %rax
-; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3
-; NoVLX-NEXT: vpextrq $1, %xmm4, %rax
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrw $5, %edx, %xmm2, %xmm2
+; NoVLX-NEXT: vextracti32x4 $3, %zmm0, %xmm3
+; NoVLX-NEXT: vmovq %xmm3, %rcx
+; NoVLX-NEXT: movq %rax, %rdx
+; NoVLX-NEXT: shrq $32, %rdx
+; NoVLX-NEXT: vpinsrw $6, %edx, %xmm2, %xmm2
+; NoVLX-NEXT: movl %ecx, %edx
+; NoVLX-NEXT: shrl $16, %edx
+; NoVLX-NEXT: vmovd %ecx, %xmm4
+; NoVLX-NEXT: vpinsrw $1, %edx, %xmm4, %xmm4
+; NoVLX-NEXT: movq %rcx, %rdx
+; NoVLX-NEXT: shrq $32, %rdx
+; NoVLX-NEXT: vpinsrw $2, %edx, %xmm4, %xmm4
+; NoVLX-NEXT: vpextrq $1, %xmm3, %rdx
; NoVLX-NEXT: shrq $48, %rcx
-; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm3
-; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm4, %xmm3
+; NoVLX-NEXT: movl %edx, %ecx
; NoVLX-NEXT: shrl $16, %ecx
-; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrw $4, %edx, %xmm3, %xmm3
; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3
-; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: movq %rdx, %rcx
; NoVLX-NEXT: shrq $32, %rcx
; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3
-; NoVLX-NEXT: vmovq %xmm1, %rcx
-; NoVLX-NEXT: shrq $48, %rax
-; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm3
-; NoVLX-NEXT: movl %ecx, %eax
-; NoVLX-NEXT: shrl $16, %eax
+; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm0
+; NoVLX-NEXT: vmovq %xmm0, %rcx
+; NoVLX-NEXT: shrq $48, %rdx
+; NoVLX-NEXT: vpinsrw $7, %edx, %xmm3, %xmm3
+; NoVLX-NEXT: movl %ecx, %edx
+; NoVLX-NEXT: shrl $16, %edx
; NoVLX-NEXT: vmovd %ecx, %xmm4
-; NoVLX-NEXT: vpinsrw $1, %eax, %xmm4, %xmm4
-; NoVLX-NEXT: movq %rcx, %rax
-; NoVLX-NEXT: shrq $32, %rax
-; NoVLX-NEXT: vpinsrw $2, %eax, %xmm4, %xmm4
-; NoVLX-NEXT: vpextrq $1, %xmm1, %rax
+; NoVLX-NEXT: vpinsrw $1, %edx, %xmm4, %xmm4
+; NoVLX-NEXT: movq %rcx, %rdx
+; NoVLX-NEXT: shrq $32, %rdx
+; NoVLX-NEXT: vpinsrw $2, %edx, %xmm4, %xmm4
+; NoVLX-NEXT: vpextrq $1, %xmm0, %rdx
; NoVLX-NEXT: shrq $48, %rcx
-; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm4, %xmm1
-; NoVLX-NEXT: movl %eax, %ecx
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm4, %xmm0
+; NoVLX-NEXT: movl %edx, %ecx
; NoVLX-NEXT: shrl $16, %ecx
-; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1
-; NoVLX-NEXT: movq %rax, %rcx
+; NoVLX-NEXT: vpinsrw $4, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: movq %rdx, %rcx
; NoVLX-NEXT: shrq $32, %rcx
-; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: shrq $48, %rdx
+; NoVLX-NEXT: vpinsrw $7, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
+; NoVLX-NEXT: vmovdqa {{.*#+}} ymm0 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768]
+; NoVLX-NEXT: vpxor %ymm0, %ymm3, %ymm3
+; NoVLX-NEXT: vpxor 32(%rdi), %ymm0, %ymm4
+; NoVLX-NEXT: vpcmpgtw %ymm3, %ymm4, %ymm3
; NoVLX-NEXT: shrq $48, %rax
-; NoVLX-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
-; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm2
-; NoVLX-NEXT: vmovdqa {{.*#+}} ymm1 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768]
-; NoVLX-NEXT: vpxor %ymm1, %ymm2, %ymm2
-; NoVLX-NEXT: vpxor 32(%rdi), %ymm1, %ymm3
-; NoVLX-NEXT: vpcmpgtw %ymm2, %ymm3, %ymm2
-; NoVLX-NEXT: vpmovsxwd %ymm2, %zmm2
-; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2
-; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpmovsxwd %ymm3, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm3
+; NoVLX-NEXT: vptestmd %zmm3, %zmm3, %k0
; NoVLX-NEXT: kshiftlw $15, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %eax, %xmm2
+; NoVLX-NEXT: vmovd %eax, %xmm3
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrb $1, %eax, %xmm3, %xmm3
; NoVLX-NEXT: kshiftlw $13, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm3, %xmm3
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrb $3, %eax, %xmm3, %xmm3
; NoVLX-NEXT: kshiftlw $11, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm3, %xmm3
; NoVLX-NEXT: kshiftlw $10, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm3, %xmm3
; NoVLX-NEXT: kshiftlw $9, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrb $6, %eax, %xmm3, %xmm3
; NoVLX-NEXT: kshiftlw $8, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm3, %xmm3
; NoVLX-NEXT: kshiftlw $7, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm3, %xmm3
; NoVLX-NEXT: kshiftlw $6, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrb $9, %eax, %xmm3, %xmm3
; NoVLX-NEXT: kshiftlw $5, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrb $10, %eax, %xmm3, %xmm3
; NoVLX-NEXT: kshiftlw $4, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm3, %xmm3
; NoVLX-NEXT: kshiftlw $3, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm3, %xmm3
; NoVLX-NEXT: kshiftlw $2, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrb $13, %eax, %xmm3, %xmm3
; NoVLX-NEXT: kshiftlw $1, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm3, %xmm2
; NoVLX-NEXT: vpmovsxbd %xmm2, %zmm2
; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2
; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0
-; NoVLX-NEXT: vpxor (%rdi), %ymm1, %ymm1
-; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT: vpxor %ymm0, %ymm1, %ymm1
+; NoVLX-NEXT: vpxor (%rdi), %ymm0, %ymm0
+; NoVLX-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0
; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -42089,12 +41089,12 @@ define zeroext i64 @test_masked_vpcmpultw_v32i1_v64i1_mask(i32 zeroext %__u, <8
; NoVLX-LABEL: test_masked_vpcmpultw_v32i1_v64i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1526:
+; NoVLX-NEXT: .Lcfi1276:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1527:
+; NoVLX-NEXT: .Lcfi1277:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1528:
+; NoVLX-NEXT: .Lcfi1278:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $96, %rsp
@@ -42105,17 +41105,12 @@ define zeroext i64 @test_masked_vpcmpultw_v32i1_v64i1_mask(i32 zeroext %__u, <8
; NoVLX-NEXT: movq %rax, %rdx
; NoVLX-NEXT: vmovd %eax, %xmm3
; NoVLX-NEXT: shrl $16, %eax
-; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm4
-; NoVLX-NEXT: vextracti32x4 $1, %zmm1, %xmm8
-; NoVLX-NEXT: vextracti32x4 $2, %zmm1, %xmm5
-; NoVLX-NEXT: vextracti32x4 $3, %zmm1, %xmm7
-; NoVLX-NEXT: vextracti32x4 $1, %zmm0, %xmm6
-; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm3
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3
; NoVLX-NEXT: shrq $32, %rdx
-; NoVLX-NEXT: vpinsrw $2, %edx, %xmm4, %xmm4
+; NoVLX-NEXT: vpinsrw $2, %edx, %xmm3, %xmm3
; NoVLX-NEXT: vpextrq $1, %xmm2, %rax
; NoVLX-NEXT: shrq $48, %rcx
-; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm4, %xmm2
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm2
; NoVLX-NEXT: movl %eax, %ecx
; NoVLX-NEXT: shrl $16, %ecx
; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2
@@ -42123,9 +41118,10 @@ define zeroext i64 @test_masked_vpcmpultw_v32i1_v64i1_mask(i32 zeroext %__u, <8
; NoVLX-NEXT: movq %rax, %rcx
; NoVLX-NEXT: shrq $32, %rcx
; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm3
; NoVLX-NEXT: vmovq %xmm3, %rcx
; NoVLX-NEXT: shrq $48, %rax
-; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm9
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm8
; NoVLX-NEXT: movl %ecx, %eax
; NoVLX-NEXT: shrl $16, %eax
; NoVLX-NEXT: vmovd %ecx, %xmm4
@@ -42143,39 +41139,40 @@ define zeroext i64 @test_masked_vpcmpultw_v32i1_v64i1_mask(i32 zeroext %__u, <8
; NoVLX-NEXT: movq %rax, %rcx
; NoVLX-NEXT: shrq $32, %rcx
; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3
-; NoVLX-NEXT: vmovq %xmm6, %rcx
+; NoVLX-NEXT: vextracti32x4 $1, %zmm0, %xmm4
+; NoVLX-NEXT: vmovq %xmm4, %rcx
; NoVLX-NEXT: shrq $48, %rax
-; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm4
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm9
; NoVLX-NEXT: movl %ecx, %eax
; NoVLX-NEXT: shrl $16, %eax
-; NoVLX-NEXT: vmovd %ecx, %xmm3
-; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vmovd %ecx, %xmm5
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm5, %xmm5
; NoVLX-NEXT: movq %rcx, %rax
; NoVLX-NEXT: shrq $32, %rax
-; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3
-; NoVLX-NEXT: vpextrq $1, %xmm6, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm5, %xmm5
+; NoVLX-NEXT: vpextrq $1, %xmm4, %rax
; NoVLX-NEXT: shrq $48, %rcx
-; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm5, %xmm4
; NoVLX-NEXT: movl %eax, %ecx
; NoVLX-NEXT: shrl $16, %ecx
-; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3
-; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm4, %xmm4
; NoVLX-NEXT: movq %rax, %rcx
; NoVLX-NEXT: shrq $32, %rcx
-; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm4, %xmm4
; NoVLX-NEXT: vmovq %xmm0, %rcx
; NoVLX-NEXT: shrq $48, %rax
-; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm6
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm4, %xmm4
; NoVLX-NEXT: movl %ecx, %eax
; NoVLX-NEXT: shrl $16, %eax
-; NoVLX-NEXT: vmovd %ecx, %xmm3
-; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vmovd %ecx, %xmm5
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm5, %xmm5
; NoVLX-NEXT: movq %rcx, %rax
; NoVLX-NEXT: shrq $32, %rax
-; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm5, %xmm5
; NoVLX-NEXT: vpextrq $1, %xmm0, %rax
; NoVLX-NEXT: shrq $48, %rcx
-; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm0
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm5, %xmm0
; NoVLX-NEXT: movl %eax, %ecx
; NoVLX-NEXT: shrl $16, %ecx
; NoVLX-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0
@@ -42183,171 +41180,170 @@ define zeroext i64 @test_masked_vpcmpultw_v32i1_v64i1_mask(i32 zeroext %__u, <8
; NoVLX-NEXT: movq %rax, %rcx
; NoVLX-NEXT: shrq $32, %rcx
; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: vmovq %xmm7, %rcx
+; NoVLX-NEXT: vextracti32x4 $3, %zmm1, %xmm5
+; NoVLX-NEXT: vmovq %xmm5, %rcx
; NoVLX-NEXT: shrq $48, %rax
; NoVLX-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0
; NoVLX-NEXT: movl %ecx, %eax
; NoVLX-NEXT: shrl $16, %eax
-; NoVLX-NEXT: vmovd %ecx, %xmm3
-; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vmovd %ecx, %xmm6
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm6, %xmm6
; NoVLX-NEXT: movq %rcx, %rax
; NoVLX-NEXT: shrq $32, %rax
-; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3
-; NoVLX-NEXT: vpextrq $1, %xmm7, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm6, %xmm6
+; NoVLX-NEXT: vpextrq $1, %xmm5, %rax
; NoVLX-NEXT: shrq $48, %rcx
-; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm6, %xmm5
; NoVLX-NEXT: movl %eax, %ecx
; NoVLX-NEXT: shrl $16, %ecx
-; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3
-; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm5, %xmm5
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm5, %xmm5
; NoVLX-NEXT: movq %rax, %rcx
; NoVLX-NEXT: shrq $32, %rcx
-; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3
-; NoVLX-NEXT: vmovq %xmm5, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm5, %xmm5
+; NoVLX-NEXT: vextracti32x4 $2, %zmm1, %xmm6
+; NoVLX-NEXT: vmovq %xmm6, %rcx
; NoVLX-NEXT: shrq $48, %rax
-; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm7
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm5, %xmm5
; NoVLX-NEXT: movl %ecx, %eax
; NoVLX-NEXT: shrl $16, %eax
-; NoVLX-NEXT: vmovd %ecx, %xmm3
-; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vmovd %ecx, %xmm7
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm7, %xmm7
; NoVLX-NEXT: movq %rcx, %rax
; NoVLX-NEXT: shrq $32, %rax
-; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3
-; NoVLX-NEXT: vpextrq $1, %xmm5, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm7, %xmm7
+; NoVLX-NEXT: vpextrq $1, %xmm6, %rax
; NoVLX-NEXT: shrq $48, %rcx
-; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm7, %xmm6
; NoVLX-NEXT: movl %eax, %ecx
; NoVLX-NEXT: shrl $16, %ecx
-; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3
-; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm6, %xmm6
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm6, %xmm6
; NoVLX-NEXT: movq %rax, %rcx
; NoVLX-NEXT: shrq $32, %rcx
-; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3
-; NoVLX-NEXT: vmovq %xmm8, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm6, %xmm6
+; NoVLX-NEXT: vextracti32x4 $1, %zmm1, %xmm7
+; NoVLX-NEXT: vmovq %xmm7, %rcx
; NoVLX-NEXT: shrq $48, %rax
-; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm6, %xmm6
; NoVLX-NEXT: movl %ecx, %eax
; NoVLX-NEXT: shrl $16, %eax
-; NoVLX-NEXT: vmovd %ecx, %xmm5
-; NoVLX-NEXT: vpinsrw $1, %eax, %xmm5, %xmm5
+; NoVLX-NEXT: vmovd %ecx, %xmm2
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2
; NoVLX-NEXT: movq %rcx, %rax
; NoVLX-NEXT: shrq $32, %rax
-; NoVLX-NEXT: vpinsrw $2, %eax, %xmm5, %xmm5
-; NoVLX-NEXT: vpextrq $1, %xmm8, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpextrq $1, %xmm7, %rax
; NoVLX-NEXT: shrq $48, %rcx
-; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm5, %xmm5
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm2
; NoVLX-NEXT: movl %eax, %ecx
; NoVLX-NEXT: shrl $16, %ecx
-; NoVLX-NEXT: vpinsrw $4, %eax, %xmm5, %xmm5
-; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm5, %xmm5
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2
; NoVLX-NEXT: movq %rax, %rcx
; NoVLX-NEXT: shrq $32, %rcx
-; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm5, %xmm5
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2
; NoVLX-NEXT: vmovq %xmm1, %rcx
; NoVLX-NEXT: shrq $48, %rax
-; NoVLX-NEXT: vpinsrw $7, %eax, %xmm5, %xmm5
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm7
; NoVLX-NEXT: movl %ecx, %eax
; NoVLX-NEXT: shrl $16, %eax
; NoVLX-NEXT: vmovd %ecx, %xmm2
; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2
; NoVLX-NEXT: movq %rcx, %rax
; NoVLX-NEXT: shrq $32, %rax
-; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm3
; NoVLX-NEXT: vpextrq $1, %xmm1, %rax
-; NoVLX-NEXT: vinserti128 $1, %xmm9, %ymm4, %ymm8
+; NoVLX-NEXT: vinserti128 $1, %xmm8, %ymm9, %ymm2
; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k1
-; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k2
-; NoVLX-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
+; NoVLX-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm1
; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
-; NoVLX-NEXT: vinserti128 $1, %xmm7, %ymm3, %ymm4
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k2} {z}
-; NoVLX-NEXT: vpmovdb %zmm1, %xmm1
; NoVLX-NEXT: shrq $48, %rcx
-; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm3
; NoVLX-NEXT: movl %eax, %ecx
; NoVLX-NEXT: shrl $16, %ecx
-; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3
; NoVLX-NEXT: movq %rax, %rcx
; NoVLX-NEXT: shrq $48, %rax
; NoVLX-NEXT: shrq $32, %rcx
-; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2
-; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: vinserti128 $1, %xmm5, %ymm2, %ymm2
-; NoVLX-NEXT: vmovdqa {{.*#+}} ymm5 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768]
-; NoVLX-NEXT: vpxor %ymm5, %ymm6, %ymm3
-; NoVLX-NEXT: vpxor %ymm5, %ymm2, %ymm2
-; NoVLX-NEXT: vpcmpgtw %ymm3, %ymm2, %ymm2
-; NoVLX-NEXT: vpmovsxwd %ymm2, %zmm2
-; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2
-; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm4
+; NoVLX-NEXT: vinserti128 $1, %xmm5, %ymm6, %ymm3
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: vinserti128 $1, %xmm7, %ymm4, %ymm5
+; NoVLX-NEXT: vmovdqa {{.*#+}} ymm4 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768]
+; NoVLX-NEXT: vpxor %ymm4, %ymm1, %ymm1
+; NoVLX-NEXT: vpxor %ymm4, %ymm5, %ymm5
+; NoVLX-NEXT: vpcmpgtw %ymm1, %ymm5, %ymm1
+; NoVLX-NEXT: vpmovsxwd %ymm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $15, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm2
-; NoVLX-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kshiftlw $13, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kshiftlw $11, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kshiftlw $10, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kshiftlw $9, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kshiftlw $8, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kshiftlw $7, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kshiftlw $6, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kshiftlw $5, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kshiftlw $4, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kshiftlw $3, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kshiftlw $2, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kshiftlw $1, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $15, %eax, %xmm2, %xmm3
-; NoVLX-NEXT: vpxor %ymm5, %ymm8, %ymm2
-; NoVLX-NEXT: vpxor %ymm5, %ymm4, %ymm4
-; NoVLX-NEXT: vpcmpgtw %ymm2, %ymm4, %ymm2
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpxor %ymm4, %ymm2, %ymm2
+; NoVLX-NEXT: vpxor %ymm4, %ymm3, %ymm3
+; NoVLX-NEXT: vpcmpgtw %ymm2, %ymm3, %ymm2
; NoVLX-NEXT: vpmovsxwd %ymm2, %zmm2
; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2
; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0
@@ -42406,20 +41402,24 @@ define zeroext i64 @test_masked_vpcmpultw_v32i1_v64i1_mask(i32 zeroext %__u, <8
; NoVLX-NEXT: kshiftlw $2, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2
; NoVLX-NEXT: kshiftlw $1, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k1
+; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k1} {z}
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: vpand %xmm1, %xmm2, %xmm1
-; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: vpmovdb %zmm3, %xmm3
+; NoVLX-NEXT: vpand %xmm3, %xmm2, %xmm2
+; NoVLX-NEXT: vpmovsxbd %xmm2, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2
+; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpand %xmm0, %xmm3, %xmm0
+; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -42457,12 +41457,12 @@ define zeroext i64 @test_masked_vpcmpultw_v32i1_v64i1_mask_mem(i32 zeroext %__u,
; NoVLX-LABEL: test_masked_vpcmpultw_v32i1_v64i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1529:
+; NoVLX-NEXT: .Lcfi1279:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1530:
+; NoVLX-NEXT: .Lcfi1280:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1531:
+; NoVLX-NEXT: .Lcfi1281:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $96, %rsp
@@ -42474,8 +41474,6 @@ define zeroext i64 @test_masked_vpcmpultw_v32i1_v64i1_mask_mem(i32 zeroext %__u,
; NoVLX-NEXT: vmovd %eax, %xmm2
; NoVLX-NEXT: shrl $16, %eax
; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: vextracti32x4 $1, %zmm0, %xmm3
-; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm4
; NoVLX-NEXT: shrq $32, %rdx
; NoVLX-NEXT: vpinsrw $2, %edx, %xmm2, %xmm2
; NoVLX-NEXT: vpextrq $1, %xmm1, %rax
@@ -42488,221 +41486,223 @@ define zeroext i64 @test_masked_vpcmpultw_v32i1_v64i1_mask_mem(i32 zeroext %__u,
; NoVLX-NEXT: movq %rax, %rcx
; NoVLX-NEXT: shrq $32, %rcx
; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1
-; NoVLX-NEXT: vmovq %xmm4, %rcx
+; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm2
+; NoVLX-NEXT: vmovq %xmm2, %rcx
; NoVLX-NEXT: shrq $48, %rax
-; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm2
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1
; NoVLX-NEXT: movl %ecx, %eax
; NoVLX-NEXT: shrl $16, %eax
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vmovd %ecx, %xmm3
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3
; NoVLX-NEXT: movq %rcx, %rax
; NoVLX-NEXT: shrq $32, %rax
-; NoVLX-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: vpextrq $1, %xmm4, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpextrq $1, %xmm2, %rax
; NoVLX-NEXT: shrq $48, %rcx
-; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm2
; NoVLX-NEXT: movl %eax, %ecx
; NoVLX-NEXT: shrl $16, %ecx
-; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2
; NoVLX-NEXT: movq %rax, %rcx
; NoVLX-NEXT: shrq $32, %rcx
-; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: vextracti32x4 $1, %zmm0, %xmm3
; NoVLX-NEXT: vmovq %xmm3, %rcx
; NoVLX-NEXT: shrq $48, %rax
-; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm4
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm2
; NoVLX-NEXT: movl %ecx, %eax
; NoVLX-NEXT: shrl $16, %eax
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vmovd %ecx, %xmm4
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm4, %xmm4
; NoVLX-NEXT: movq %rcx, %rax
; NoVLX-NEXT: shrq $32, %rax
-; NoVLX-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm4, %xmm4
; NoVLX-NEXT: vpextrq $1, %xmm3, %rax
; NoVLX-NEXT: shrq $48, %rcx
-; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm4, %xmm3
; NoVLX-NEXT: movl %eax, %ecx
; NoVLX-NEXT: shrl $16, %ecx
-; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3
; NoVLX-NEXT: movq %rax, %rcx
; NoVLX-NEXT: shrq $32, %rcx
-; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3
; NoVLX-NEXT: vmovq %xmm0, %rcx
; NoVLX-NEXT: shrq $48, %rax
-; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm5
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm3
; NoVLX-NEXT: movl %ecx, %eax
; NoVLX-NEXT: shrl $16, %eax
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vmovd %ecx, %xmm4
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm4, %xmm4
; NoVLX-NEXT: movq %rcx, %rax
; NoVLX-NEXT: shrq $32, %rax
-; NoVLX-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm4, %xmm4
; NoVLX-NEXT: vpextrq $1, %xmm0, %rax
; NoVLX-NEXT: shrq $48, %rcx
-; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm1, %xmm0
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm4, %xmm0
; NoVLX-NEXT: movl %eax, %ecx
; NoVLX-NEXT: shrl $16, %ecx
; NoVLX-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: movq %rax, %rcx
; NoVLX-NEXT: shrq $32, %rcx
-; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm6
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm4
; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k1
-; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k2
; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k2} {z}
-; NoVLX-NEXT: vpmovdb %zmm1, %xmm1
; NoVLX-NEXT: shrq $48, %rax
-; NoVLX-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm3
-; NoVLX-NEXT: vpinsrw $7, %eax, %xmm6, %xmm2
-; NoVLX-NEXT: vinserti128 $1, %xmm5, %ymm2, %ymm2
-; NoVLX-NEXT: vmovdqa {{.*#+}} ymm4 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768]
-; NoVLX-NEXT: vpxor %ymm4, %ymm2, %ymm2
-; NoVLX-NEXT: vpxor (%rsi), %ymm4, %ymm5
-; NoVLX-NEXT: vpcmpgtw %ymm2, %ymm5, %ymm2
-; NoVLX-NEXT: vpmovsxwd %ymm2, %zmm2
-; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2
-; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm2
+; NoVLX-NEXT: vinserti128 $1, %xmm3, %ymm4, %ymm1
+; NoVLX-NEXT: vmovdqa {{.*#+}} ymm3 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768]
+; NoVLX-NEXT: vpxor %ymm3, %ymm1, %ymm1
+; NoVLX-NEXT: vpxor (%rsi), %ymm3, %ymm4
+; NoVLX-NEXT: vpcmpgtw %ymm1, %ymm4, %ymm1
+; NoVLX-NEXT: vpmovsxwd %ymm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kshiftlw $15, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %eax, %xmm2
+; NoVLX-NEXT: vmovd %eax, %xmm1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kshiftlw $13, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kshiftlw $11, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kshiftlw $10, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kshiftlw $9, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kshiftlw $8, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kshiftlw $7, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kshiftlw $6, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kshiftlw $5, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kshiftlw $4, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kshiftlw $3, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kshiftlw $2, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kshiftlw $1, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: vpxor %ymm4, %ymm3, %ymm3
-; NoVLX-NEXT: vpxor 32(%rsi), %ymm4, %ymm4
-; NoVLX-NEXT: vpcmpgtw %ymm3, %ymm4, %ymm3
-; NoVLX-NEXT: vpmovsxwd %ymm3, %zmm3
-; NoVLX-NEXT: vpslld $31, %zmm3, %zmm3
-; NoVLX-NEXT: vptestmd %zmm3, %zmm3, %k0
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpxor 32(%rsi), %ymm3, %ymm4
+; NoVLX-NEXT: vpxor %ymm3, %ymm2, %ymm2
+; NoVLX-NEXT: vpcmpgtw %ymm2, %ymm4, %ymm2
+; NoVLX-NEXT: vpmovsxwd %ymm2, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2
+; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $15, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm3
-; NoVLX-NEXT: vpinsrb $1, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vmovd %ecx, %xmm2
+; NoVLX-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2
; NoVLX-NEXT: kshiftlw $13, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $2, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $3, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2
; NoVLX-NEXT: kshiftlw $11, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2
; NoVLX-NEXT: kshiftlw $10, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2
; NoVLX-NEXT: kshiftlw $9, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2
; NoVLX-NEXT: kshiftlw $8, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2
; NoVLX-NEXT: kshiftlw $7, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2
; NoVLX-NEXT: kshiftlw $6, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $9, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2
; NoVLX-NEXT: kshiftlw $5, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $10, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2
; NoVLX-NEXT: kshiftlw $4, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2
; NoVLX-NEXT: kshiftlw $3, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $12, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2
; NoVLX-NEXT: kshiftlw $2, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $13, %eax, %xmm3, %xmm3
; NoVLX-NEXT: kshiftlw $1, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k1
+; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k1} {z}
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $15, %eax, %xmm3, %xmm3
-; NoVLX-NEXT: vpand %xmm1, %xmm3, %xmm1
-; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpmovdb %zmm3, %xmm3
+; NoVLX-NEXT: vpand %xmm3, %xmm2, %xmm2
+; NoVLX-NEXT: vpmovsxbd %xmm2, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2
+; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpand %xmm0, %xmm2, %xmm0
+; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -42760,8 +41760,8 @@ define zeroext i8 @test_vpcmpultd_v4i1_v8i1_mask(<2 x i64> %__a, <2 x i64> %__b)
; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: kmovw %eax, %k1
; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
@@ -42799,9 +41799,9 @@ define zeroext i8 @test_vpcmpultd_v4i1_v8i1_mask_mem(<2 x i64> %__a, <2 x i64>*
; NoVLX-LABEL: test_vpcmpultd_v4i1_v8i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648]
+; NoVLX-NEXT: vpxor (%rdi), %xmm1, %xmm2
; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; NoVLX-NEXT: vpxor (%rdi), %xmm1, %xmm1
-; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm2, %xmm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
; NoVLX-NEXT: kmovw %eax, %k1
; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
@@ -42818,8 +41818,8 @@ define zeroext i8 @test_vpcmpultd_v4i1_v8i1_mask_mem(<2 x i64> %__a, <2 x i64>*
; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: kmovw %eax, %k1
; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
@@ -42861,7 +41861,6 @@ define zeroext i8 @test_masked_vpcmpultd_v4i1_v8i1_mask(i8 zeroext %__u, <2 x i6
; NoVLX-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0
; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
-; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
@@ -42873,13 +41872,14 @@ define zeroext i8 @test_masked_vpcmpultd_v4i1_v8i1_mask(i8 zeroext %__u, <2 x i6
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: kmovw %k3, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vmovd %ecx, %xmm2
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2
; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpand %xmm2, %xmm0, %xmm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
; NoVLX-NEXT: kmovw %eax, %k1
; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
@@ -42896,8 +41896,8 @@ define zeroext i8 @test_masked_vpcmpultd_v4i1_v8i1_mask(i8 zeroext %__u, <2 x i6
; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: kmovw %eax, %k1
; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
@@ -42941,7 +41941,6 @@ define zeroext i8 @test_masked_vpcmpultd_v4i1_v8i1_mask_mem(i8 zeroext %__u, <2
; NoVLX-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648]
; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpxor (%rsi), %xmm1, %xmm1
-; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
@@ -42953,13 +41952,14 @@ define zeroext i8 @test_masked_vpcmpultd_v4i1_v8i1_mask_mem(i8 zeroext %__u, <2
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: kmovw %k3, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vmovd %ecx, %xmm2
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2
; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpand %xmm2, %xmm0, %xmm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
; NoVLX-NEXT: kmovw %eax, %k1
; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
@@ -42976,8 +41976,8 @@ define zeroext i8 @test_masked_vpcmpultd_v4i1_v8i1_mask_mem(i8 zeroext %__u, <2
; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: kmovw %eax, %k1
; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
@@ -43040,8 +42040,8 @@ define zeroext i8 @test_vpcmpultd_v4i1_v8i1_mask_mem_b(<2 x i64> %__a, i32* %__b
; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: kmovw %eax, %k1
; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
@@ -43085,7 +42085,6 @@ define zeroext i8 @test_masked_vpcmpultd_v4i1_v8i1_mask_mem_b(i8 zeroext %__u, <
; NoVLX-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0
; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
-; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
@@ -43097,13 +42096,14 @@ define zeroext i8 @test_masked_vpcmpultd_v4i1_v8i1_mask_mem_b(i8 zeroext %__u, <
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: kmovw %k3, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vmovd %ecx, %xmm2
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2
; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpand %xmm0, %xmm2, %xmm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
; NoVLX-NEXT: kmovw %eax, %k1
; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
@@ -43120,8 +42120,8 @@ define zeroext i8 @test_masked_vpcmpultd_v4i1_v8i1_mask_mem_b(i8 zeroext %__u, <
; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: kmovw %eax, %k1
; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
@@ -43183,8 +42183,8 @@ define zeroext i16 @test_vpcmpultd_v4i1_v16i1_mask(<2 x i64> %__a, <2 x i64> %__
; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: kmovw %eax, %k1
; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
@@ -43222,9 +42222,9 @@ define zeroext i16 @test_vpcmpultd_v4i1_v16i1_mask_mem(<2 x i64> %__a, <2 x i64>
; NoVLX-LABEL: test_vpcmpultd_v4i1_v16i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648]
+; NoVLX-NEXT: vpxor (%rdi), %xmm1, %xmm2
; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; NoVLX-NEXT: vpxor (%rdi), %xmm1, %xmm1
-; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm2, %xmm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
; NoVLX-NEXT: kmovw %eax, %k1
; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
@@ -43240,8 +42240,8 @@ define zeroext i16 @test_vpcmpultd_v4i1_v16i1_mask_mem(<2 x i64> %__a, <2 x i64>
; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: kmovw %eax, %k1
; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
@@ -43283,7 +42283,6 @@ define zeroext i16 @test_masked_vpcmpultd_v4i1_v16i1_mask(i8 zeroext %__u, <2 x
; NoVLX-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0
; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
-; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
@@ -43295,13 +42294,14 @@ define zeroext i16 @test_masked_vpcmpultd_v4i1_v16i1_mask(i8 zeroext %__u, <2 x
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: kmovw %k3, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vmovd %ecx, %xmm2
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2
; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpand %xmm2, %xmm0, %xmm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
; NoVLX-NEXT: kmovw %eax, %k1
; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
@@ -43317,8 +42317,8 @@ define zeroext i16 @test_masked_vpcmpultd_v4i1_v16i1_mask(i8 zeroext %__u, <2 x
; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: kmovw %eax, %k1
; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
@@ -43362,7 +42362,6 @@ define zeroext i16 @test_masked_vpcmpultd_v4i1_v16i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648]
; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpxor (%rsi), %xmm1, %xmm1
-; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
@@ -43374,13 +42373,14 @@ define zeroext i16 @test_masked_vpcmpultd_v4i1_v16i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: kmovw %k3, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vmovd %ecx, %xmm2
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2
; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpand %xmm2, %xmm0, %xmm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
; NoVLX-NEXT: kmovw %eax, %k1
; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
@@ -43396,8 +42396,8 @@ define zeroext i16 @test_masked_vpcmpultd_v4i1_v16i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: kmovw %eax, %k1
; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
@@ -43459,8 +42459,8 @@ define zeroext i16 @test_vpcmpultd_v4i1_v16i1_mask_mem_b(<2 x i64> %__a, i32* %_
; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: kmovw %eax, %k1
; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
@@ -43504,7 +42504,6 @@ define zeroext i16 @test_masked_vpcmpultd_v4i1_v16i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0
; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
-; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
@@ -43516,13 +42515,14 @@ define zeroext i16 @test_masked_vpcmpultd_v4i1_v16i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: kmovw %k3, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vmovd %ecx, %xmm2
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2
; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpand %xmm0, %xmm2, %xmm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
; NoVLX-NEXT: kmovw %eax, %k1
; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
@@ -43538,8 +42538,8 @@ define zeroext i16 @test_masked_vpcmpultd_v4i1_v16i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: kmovw %eax, %k1
; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
@@ -43582,12 +42582,12 @@ define zeroext i32 @test_vpcmpultd_v4i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__
; NoVLX-LABEL: test_vpcmpultd_v4i1_v32i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1532:
+; NoVLX-NEXT: .Lcfi1282:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1533:
+; NoVLX-NEXT: .Lcfi1283:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1534:
+; NoVLX-NEXT: .Lcfi1284:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -43628,19 +42628,19 @@ define zeroext i32 @test_vpcmpultd_v4i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>
; NoVLX-LABEL: test_vpcmpultd_v4i1_v32i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1535:
+; NoVLX-NEXT: .Lcfi1285:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1536:
+; NoVLX-NEXT: .Lcfi1286:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1537:
+; NoVLX-NEXT: .Lcfi1287:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
; NoVLX-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648]
+; NoVLX-NEXT: vpxor (%rdi), %xmm1, %xmm2
; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; NoVLX-NEXT: vpxor (%rdi), %xmm1, %xmm1
-; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm2, %xmm0
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
@@ -43676,19 +42676,18 @@ define zeroext i32 @test_masked_vpcmpultd_v4i1_v32i1_mask(i8 zeroext %__u, <2 x
; NoVLX-LABEL: test_masked_vpcmpultd_v4i1_v32i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1538:
+; NoVLX-NEXT: .Lcfi1288:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1539:
+; NoVLX-NEXT: .Lcfi1289:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1540:
+; NoVLX-NEXT: .Lcfi1290:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
; NoVLX-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0
; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
-; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
@@ -43700,13 +42699,14 @@ define zeroext i32 @test_masked_vpcmpultd_v4i1_v32i1_mask(i8 zeroext %__u, <2 x
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: kmovw %k3, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vmovd %ecx, %xmm2
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2
; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpand %xmm2, %xmm0, %xmm0
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
@@ -43744,19 +42744,18 @@ define zeroext i32 @test_masked_vpcmpultd_v4i1_v32i1_mask_mem(i8 zeroext %__u, <
; NoVLX-LABEL: test_masked_vpcmpultd_v4i1_v32i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1541:
+; NoVLX-NEXT: .Lcfi1291:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1542:
+; NoVLX-NEXT: .Lcfi1292:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1543:
+; NoVLX-NEXT: .Lcfi1293:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
; NoVLX-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648]
; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpxor (%rsi), %xmm1, %xmm1
-; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
@@ -43768,13 +42767,14 @@ define zeroext i32 @test_masked_vpcmpultd_v4i1_v32i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: kmovw %k3, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vmovd %ecx, %xmm2
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2
; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpand %xmm2, %xmm0, %xmm0
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
@@ -43813,12 +42813,12 @@ define zeroext i32 @test_vpcmpultd_v4i1_v32i1_mask_mem_b(<2 x i64> %__a, i32* %_
; NoVLX-LABEL: test_vpcmpultd_v4i1_v32i1_mask_mem_b:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1544:
+; NoVLX-NEXT: .Lcfi1294:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1545:
+; NoVLX-NEXT: .Lcfi1295:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1546:
+; NoVLX-NEXT: .Lcfi1296:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -43863,12 +42863,12 @@ define zeroext i32 @test_masked_vpcmpultd_v4i1_v32i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-LABEL: test_masked_vpcmpultd_v4i1_v32i1_mask_mem_b:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1547:
+; NoVLX-NEXT: .Lcfi1297:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1548:
+; NoVLX-NEXT: .Lcfi1298:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1549:
+; NoVLX-NEXT: .Lcfi1299:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -43876,7 +42876,6 @@ define zeroext i32 @test_masked_vpcmpultd_v4i1_v32i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0
; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
-; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
@@ -43888,13 +42887,14 @@ define zeroext i32 @test_masked_vpcmpultd_v4i1_v32i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: kmovw %k3, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vmovd %ecx, %xmm2
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2
; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpand %xmm0, %xmm2, %xmm0
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
@@ -43934,12 +42934,12 @@ define zeroext i64 @test_vpcmpultd_v4i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__
; NoVLX-LABEL: test_vpcmpultd_v4i1_v64i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1550:
+; NoVLX-NEXT: .Lcfi1300:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1551:
+; NoVLX-NEXT: .Lcfi1301:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1552:
+; NoVLX-NEXT: .Lcfi1302:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
@@ -43949,8 +42949,8 @@ define zeroext i64 @test_vpcmpultd_v4i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__
; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
@@ -43986,12 +42986,12 @@ define zeroext i64 @test_vpcmpultd_v4i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>
; NoVLX-LABEL: test_vpcmpultd_v4i1_v64i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1553:
+; NoVLX-NEXT: .Lcfi1303:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1554:
+; NoVLX-NEXT: .Lcfi1304:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1555:
+; NoVLX-NEXT: .Lcfi1305:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
@@ -44001,8 +43001,8 @@ define zeroext i64 @test_vpcmpultd_v4i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>
; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
@@ -44040,12 +43040,12 @@ define zeroext i64 @test_masked_vpcmpultd_v4i1_v64i1_mask(i8 zeroext %__u, <2 x
; NoVLX-LABEL: test_masked_vpcmpultd_v4i1_v64i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1556:
+; NoVLX-NEXT: .Lcfi1306:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1557:
+; NoVLX-NEXT: .Lcfi1307:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1558:
+; NoVLX-NEXT: .Lcfi1308:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
@@ -44055,7 +43055,6 @@ define zeroext i64 @test_masked_vpcmpultd_v4i1_v64i1_mask(i8 zeroext %__u, <2 x
; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kshiftlw $13, %k0, %k2
; NoVLX-NEXT: kshiftrw $15, %k2, %k2
; NoVLX-NEXT: kshiftlw $15, %k0, %k3
@@ -44068,13 +43067,14 @@ define zeroext i64 @test_masked_vpcmpultd_v4i1_v64i1_mask(i8 zeroext %__u, <2 x
; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kmovw %k2, %eax
; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $15, %k1, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
@@ -44114,12 +43114,12 @@ define zeroext i64 @test_masked_vpcmpultd_v4i1_v64i1_mask_mem(i8 zeroext %__u, <
; NoVLX-LABEL: test_masked_vpcmpultd_v4i1_v64i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1559:
+; NoVLX-NEXT: .Lcfi1309:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1560:
+; NoVLX-NEXT: .Lcfi1310:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1561:
+; NoVLX-NEXT: .Lcfi1311:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
@@ -44129,7 +43129,6 @@ define zeroext i64 @test_masked_vpcmpultd_v4i1_v64i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kshiftlw $13, %k0, %k2
; NoVLX-NEXT: kshiftrw $15, %k2, %k2
; NoVLX-NEXT: kshiftlw $15, %k0, %k3
@@ -44142,13 +43141,14 @@ define zeroext i64 @test_masked_vpcmpultd_v4i1_v64i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kmovw %k2, %eax
; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $15, %k1, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
@@ -44189,12 +43189,12 @@ define zeroext i64 @test_vpcmpultd_v4i1_v64i1_mask_mem_b(<2 x i64> %__a, i32* %_
; NoVLX-LABEL: test_vpcmpultd_v4i1_v64i1_mask_mem_b:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1562:
+; NoVLX-NEXT: .Lcfi1312:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1563:
+; NoVLX-NEXT: .Lcfi1313:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1564:
+; NoVLX-NEXT: .Lcfi1314:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
@@ -44205,8 +43205,8 @@ define zeroext i64 @test_vpcmpultd_v4i1_v64i1_mask_mem_b(<2 x i64> %__a, i32* %_
; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
@@ -44245,12 +43245,12 @@ define zeroext i64 @test_masked_vpcmpultd_v4i1_v64i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-LABEL: test_masked_vpcmpultd_v4i1_v64i1_mask_mem_b:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1565:
+; NoVLX-NEXT: .Lcfi1315:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1566:
+; NoVLX-NEXT: .Lcfi1316:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1567:
+; NoVLX-NEXT: .Lcfi1317:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
@@ -44261,7 +43261,6 @@ define zeroext i64 @test_masked_vpcmpultd_v4i1_v64i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kshiftlw $13, %k0, %k2
; NoVLX-NEXT: kshiftrw $15, %k2, %k2
; NoVLX-NEXT: kshiftlw $15, %k0, %k3
@@ -44274,13 +43273,14 @@ define zeroext i64 @test_masked_vpcmpultd_v4i1_v64i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kmovw %k2, %eax
; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $15, %k1, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
@@ -44511,12 +43511,12 @@ define zeroext i32 @test_vpcmpultd_v8i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__
; NoVLX-LABEL: test_vpcmpultd_v8i1_v32i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1568:
+; NoVLX-NEXT: .Lcfi1318:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1569:
+; NoVLX-NEXT: .Lcfi1319:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1570:
+; NoVLX-NEXT: .Lcfi1320:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -44586,12 +43586,12 @@ define zeroext i32 @test_vpcmpultd_v8i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64>
; NoVLX-LABEL: test_vpcmpultd_v8i1_v32i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1571:
+; NoVLX-NEXT: .Lcfi1321:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1572:
+; NoVLX-NEXT: .Lcfi1322:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1573:
+; NoVLX-NEXT: .Lcfi1323:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -44663,12 +43663,12 @@ define zeroext i32 @test_masked_vpcmpultd_v8i1_v32i1_mask(i8 zeroext %__u, <4 x
; NoVLX-LABEL: test_masked_vpcmpultd_v8i1_v32i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1574:
+; NoVLX-NEXT: .Lcfi1324:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1575:
+; NoVLX-NEXT: .Lcfi1325:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1576:
+; NoVLX-NEXT: .Lcfi1326:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -44743,12 +43743,12 @@ define zeroext i32 @test_masked_vpcmpultd_v8i1_v32i1_mask_mem(i8 zeroext %__u, <
; NoVLX-LABEL: test_masked_vpcmpultd_v8i1_v32i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1577:
+; NoVLX-NEXT: .Lcfi1327:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1578:
+; NoVLX-NEXT: .Lcfi1328:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1579:
+; NoVLX-NEXT: .Lcfi1329:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -44824,12 +43824,12 @@ define zeroext i32 @test_vpcmpultd_v8i1_v32i1_mask_mem_b(<4 x i64> %__a, i32* %_
; NoVLX-LABEL: test_vpcmpultd_v8i1_v32i1_mask_mem_b:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1580:
+; NoVLX-NEXT: .Lcfi1330:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1581:
+; NoVLX-NEXT: .Lcfi1331:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1582:
+; NoVLX-NEXT: .Lcfi1332:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -44902,12 +43902,12 @@ define zeroext i32 @test_masked_vpcmpultd_v8i1_v32i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-LABEL: test_masked_vpcmpultd_v8i1_v32i1_mask_mem_b:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1583:
+; NoVLX-NEXT: .Lcfi1333:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1584:
+; NoVLX-NEXT: .Lcfi1334:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1585:
+; NoVLX-NEXT: .Lcfi1335:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -44984,55 +43984,55 @@ define zeroext i64 @test_vpcmpultd_v8i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__
; NoVLX-LABEL: test_vpcmpultd_v8i1_v64i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1586:
+; NoVLX-NEXT: .Lcfi1336:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1587:
+; NoVLX-NEXT: .Lcfi1337:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1588:
+; NoVLX-NEXT: .Lcfi1338:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $15, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %ecx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -45064,55 +44064,55 @@ define zeroext i64 @test_vpcmpultd_v8i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>
; NoVLX-LABEL: test_vpcmpultd_v8i1_v64i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1589:
+; NoVLX-NEXT: .Lcfi1339:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1590:
+; NoVLX-NEXT: .Lcfi1340:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1591:
+; NoVLX-NEXT: .Lcfi1341:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; NoVLX-NEXT: vmovdqa (%rdi), %ymm1
; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $15, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %ecx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -45146,12 +44146,12 @@ define zeroext i64 @test_masked_vpcmpultd_v8i1_v64i1_mask(i8 zeroext %__u, <4 x
; NoVLX-LABEL: test_masked_vpcmpultd_v8i1_v64i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1592:
+; NoVLX-NEXT: .Lcfi1342:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1593:
+; NoVLX-NEXT: .Lcfi1343:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1594:
+; NoVLX-NEXT: .Lcfi1344:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
@@ -45160,43 +44160,43 @@ define zeroext i64 @test_masked_vpcmpultd_v8i1_v64i1_mask(i8 zeroext %__u, <4 x
; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: kandw %k1, %k0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $15, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %ecx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -45231,12 +44231,12 @@ define zeroext i64 @test_masked_vpcmpultd_v8i1_v64i1_mask_mem(i8 zeroext %__u, <
; NoVLX-LABEL: test_masked_vpcmpultd_v8i1_v64i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1595:
+; NoVLX-NEXT: .Lcfi1345:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1596:
+; NoVLX-NEXT: .Lcfi1346:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1597:
+; NoVLX-NEXT: .Lcfi1347:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
@@ -45245,43 +44245,43 @@ define zeroext i64 @test_masked_vpcmpultd_v8i1_v64i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: kandw %k1, %k0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $15, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %ecx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -45317,55 +44317,55 @@ define zeroext i64 @test_vpcmpultd_v8i1_v64i1_mask_mem_b(<4 x i64> %__a, i32* %_
; NoVLX-LABEL: test_vpcmpultd_v8i1_v64i1_mask_mem_b:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1598:
+; NoVLX-NEXT: .Lcfi1348:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1599:
+; NoVLX-NEXT: .Lcfi1349:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1600:
+; NoVLX-NEXT: .Lcfi1350:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; NoVLX-NEXT: vpbroadcastd (%rdi), %ymm1
; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $15, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %ecx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -45400,12 +44400,12 @@ define zeroext i64 @test_masked_vpcmpultd_v8i1_v64i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-LABEL: test_masked_vpcmpultd_v8i1_v64i1_mask_mem_b:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1601:
+; NoVLX-NEXT: .Lcfi1351:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1602:
+; NoVLX-NEXT: .Lcfi1352:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1603:
+; NoVLX-NEXT: .Lcfi1353:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
@@ -45414,43 +44414,43 @@ define zeroext i64 @test_masked_vpcmpultd_v8i1_v64i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: kandw %k0, %k1, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $15, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %ecx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -45487,93 +44487,78 @@ define zeroext i32 @test_vpcmpultd_v16i1_v32i1_mask(<8 x i64> %__a, <8 x i64> %_
; NoVLX-LABEL: test_vpcmpultd_v16i1_v32i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1604:
+; NoVLX-NEXT: .Lcfi1354:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1605:
+; NoVLX-NEXT: .Lcfi1355:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1606:
+; NoVLX-NEXT: .Lcfi1356:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
-; NoVLX-NEXT: pushq %r15
-; NoVLX-NEXT: pushq %r14
-; NoVLX-NEXT: pushq %r13
-; NoVLX-NEXT: pushq %r12
-; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
-; NoVLX-NEXT: .Lcfi1607:
-; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi1608:
-; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi1609:
-; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi1610:
-; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi1611:
-; NoVLX-NEXT: .cfi_offset %r15, -24
; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0
; NoVLX-NEXT: kxorw %k0, %k0, %k1
; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $15, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $13, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: vmovd %ecx, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
@@ -45582,12 +44567,7 @@ define zeroext i32 @test_vpcmpultd_v16i1_v32i1_mask(<8 x i64> %__a, <8 x i64> %_
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
-; NoVLX-NEXT: leaq -40(%rbp), %rsp
-; NoVLX-NEXT: popq %rbx
-; NoVLX-NEXT: popq %r12
-; NoVLX-NEXT: popq %r13
-; NoVLX-NEXT: popq %r14
-; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
; NoVLX-NEXT: retq
entry:
@@ -45610,93 +44590,78 @@ define zeroext i32 @test_vpcmpultd_v16i1_v32i1_mask_mem(<8 x i64> %__a, <8 x i64
; NoVLX-LABEL: test_vpcmpultd_v16i1_v32i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1612:
+; NoVLX-NEXT: .Lcfi1357:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1613:
+; NoVLX-NEXT: .Lcfi1358:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1614:
+; NoVLX-NEXT: .Lcfi1359:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
-; NoVLX-NEXT: pushq %r15
-; NoVLX-NEXT: pushq %r14
-; NoVLX-NEXT: pushq %r13
-; NoVLX-NEXT: pushq %r12
-; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
-; NoVLX-NEXT: .Lcfi1615:
-; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi1616:
-; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi1617:
-; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi1618:
-; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi1619:
-; NoVLX-NEXT: .cfi_offset %r15, -24
; NoVLX-NEXT: vpcmpltud (%rdi), %zmm0, %k0
; NoVLX-NEXT: kxorw %k0, %k0, %k1
; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $15, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $13, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: vmovd %ecx, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
@@ -45705,12 +44670,7 @@ define zeroext i32 @test_vpcmpultd_v16i1_v32i1_mask_mem(<8 x i64> %__a, <8 x i64
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
-; NoVLX-NEXT: leaq -40(%rbp), %rsp
-; NoVLX-NEXT: popq %rbx
-; NoVLX-NEXT: popq %r12
-; NoVLX-NEXT: popq %r13
-; NoVLX-NEXT: popq %r14
-; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
; NoVLX-NEXT: retq
entry:
@@ -45735,94 +44695,79 @@ define zeroext i32 @test_masked_vpcmpultd_v16i1_v32i1_mask(i16 zeroext %__u, <8
; NoVLX-LABEL: test_masked_vpcmpultd_v16i1_v32i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1620:
+; NoVLX-NEXT: .Lcfi1360:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1621:
+; NoVLX-NEXT: .Lcfi1361:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1622:
+; NoVLX-NEXT: .Lcfi1362:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
-; NoVLX-NEXT: pushq %r15
-; NoVLX-NEXT: pushq %r14
-; NoVLX-NEXT: pushq %r13
-; NoVLX-NEXT: pushq %r12
-; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
-; NoVLX-NEXT: .Lcfi1623:
-; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi1624:
-; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi1625:
-; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi1626:
-; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi1627:
-; NoVLX-NEXT: .cfi_offset %r15, -24
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0 {%k1}
; NoVLX-NEXT: kxorw %k0, %k0, %k1
; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $15, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $13, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: vmovd %ecx, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
@@ -45831,12 +44776,7 @@ define zeroext i32 @test_masked_vpcmpultd_v16i1_v32i1_mask(i16 zeroext %__u, <8
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
-; NoVLX-NEXT: leaq -40(%rbp), %rsp
-; NoVLX-NEXT: popq %rbx
-; NoVLX-NEXT: popq %r12
-; NoVLX-NEXT: popq %r13
-; NoVLX-NEXT: popq %r14
-; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
; NoVLX-NEXT: retq
entry:
@@ -45862,94 +44802,79 @@ define zeroext i32 @test_masked_vpcmpultd_v16i1_v32i1_mask_mem(i16 zeroext %__u,
; NoVLX-LABEL: test_masked_vpcmpultd_v16i1_v32i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1628:
+; NoVLX-NEXT: .Lcfi1363:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1629:
+; NoVLX-NEXT: .Lcfi1364:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1630:
+; NoVLX-NEXT: .Lcfi1365:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
-; NoVLX-NEXT: pushq %r15
-; NoVLX-NEXT: pushq %r14
-; NoVLX-NEXT: pushq %r13
-; NoVLX-NEXT: pushq %r12
-; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
-; NoVLX-NEXT: .Lcfi1631:
-; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi1632:
-; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi1633:
-; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi1634:
-; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi1635:
-; NoVLX-NEXT: .cfi_offset %r15, -24
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpcmpltud (%rsi), %zmm0, %k0 {%k1}
; NoVLX-NEXT: kxorw %k0, %k0, %k1
; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $15, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $13, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: vmovd %ecx, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
@@ -45958,12 +44883,7 @@ define zeroext i32 @test_masked_vpcmpultd_v16i1_v32i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
-; NoVLX-NEXT: leaq -40(%rbp), %rsp
-; NoVLX-NEXT: popq %rbx
-; NoVLX-NEXT: popq %r12
-; NoVLX-NEXT: popq %r13
-; NoVLX-NEXT: popq %r14
-; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
; NoVLX-NEXT: retq
entry:
@@ -45990,93 +44910,78 @@ define zeroext i32 @test_vpcmpultd_v16i1_v32i1_mask_mem_b(<8 x i64> %__a, i32* %
; NoVLX-LABEL: test_vpcmpultd_v16i1_v32i1_mask_mem_b:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1636:
+; NoVLX-NEXT: .Lcfi1366:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1637:
+; NoVLX-NEXT: .Lcfi1367:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1638:
+; NoVLX-NEXT: .Lcfi1368:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
-; NoVLX-NEXT: pushq %r15
-; NoVLX-NEXT: pushq %r14
-; NoVLX-NEXT: pushq %r13
-; NoVLX-NEXT: pushq %r12
-; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
-; NoVLX-NEXT: .Lcfi1639:
-; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi1640:
-; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi1641:
-; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi1642:
-; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi1643:
-; NoVLX-NEXT: .cfi_offset %r15, -24
; NoVLX-NEXT: vpcmpltud (%rdi){1to16}, %zmm0, %k0
; NoVLX-NEXT: kxorw %k0, %k0, %k1
; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $15, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $13, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: vmovd %ecx, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
@@ -46085,12 +44990,7 @@ define zeroext i32 @test_vpcmpultd_v16i1_v32i1_mask_mem_b(<8 x i64> %__a, i32* %
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
-; NoVLX-NEXT: leaq -40(%rbp), %rsp
-; NoVLX-NEXT: popq %rbx
-; NoVLX-NEXT: popq %r12
-; NoVLX-NEXT: popq %r13
-; NoVLX-NEXT: popq %r14
-; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
; NoVLX-NEXT: retq
entry:
@@ -46116,94 +45016,79 @@ define zeroext i32 @test_masked_vpcmpultd_v16i1_v32i1_mask_mem_b(i16 zeroext %__
; NoVLX-LABEL: test_masked_vpcmpultd_v16i1_v32i1_mask_mem_b:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1644:
+; NoVLX-NEXT: .Lcfi1369:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1645:
+; NoVLX-NEXT: .Lcfi1370:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1646:
+; NoVLX-NEXT: .Lcfi1371:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
-; NoVLX-NEXT: pushq %r15
-; NoVLX-NEXT: pushq %r14
-; NoVLX-NEXT: pushq %r13
-; NoVLX-NEXT: pushq %r12
-; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
-; NoVLX-NEXT: .Lcfi1647:
-; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi1648:
-; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi1649:
-; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi1650:
-; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi1651:
-; NoVLX-NEXT: .cfi_offset %r15, -24
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpcmpltud (%rsi){1to16}, %zmm0, %k0 {%k1}
; NoVLX-NEXT: kxorw %k0, %k0, %k1
; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $15, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $13, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: vmovd %ecx, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
@@ -46212,12 +45097,7 @@ define zeroext i32 @test_masked_vpcmpultd_v16i1_v32i1_mask_mem_b(i16 zeroext %__
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
-; NoVLX-NEXT: leaq -40(%rbp), %rsp
-; NoVLX-NEXT: popq %rbx
-; NoVLX-NEXT: popq %r12
-; NoVLX-NEXT: popq %r13
-; NoVLX-NEXT: popq %r14
-; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
; NoVLX-NEXT: retq
entry:
@@ -46245,12 +45125,12 @@ define zeroext i64 @test_vpcmpultd_v16i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %_
; NoVLX-LABEL: test_vpcmpultd_v16i1_v64i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1652:
+; NoVLX-NEXT: .Lcfi1372:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1653:
+; NoVLX-NEXT: .Lcfi1373:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1654:
+; NoVLX-NEXT: .Lcfi1374:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: pushq %r15
; NoVLX-NEXT: pushq %r14
@@ -46259,21 +45139,17 @@ define zeroext i64 @test_vpcmpultd_v16i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %_
; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: .Lcfi1655:
+; NoVLX-NEXT: .Lcfi1375:
; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi1656:
+; NoVLX-NEXT: .Lcfi1376:
; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi1657:
+; NoVLX-NEXT: .Lcfi1377:
; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi1658:
+; NoVLX-NEXT: .Lcfi1378:
; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi1659:
+; NoVLX-NEXT: .Lcfi1379:
; NoVLX-NEXT: .cfi_offset %r15, -24
; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
@@ -46316,11 +45192,11 @@ define zeroext i64 @test_vpcmpultd_v16i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %_
; NoVLX-NEXT: kshiftlw $2, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $1, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k1, %r8d
; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
@@ -46332,11 +45208,15 @@ define zeroext i64 @test_vpcmpultd_v16i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %_
; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpinsrb $14, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -46373,12 +45253,12 @@ define zeroext i64 @test_vpcmpultd_v16i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64
; NoVLX-LABEL: test_vpcmpultd_v16i1_v64i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1660:
+; NoVLX-NEXT: .Lcfi1380:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1661:
+; NoVLX-NEXT: .Lcfi1381:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1662:
+; NoVLX-NEXT: .Lcfi1382:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: pushq %r15
; NoVLX-NEXT: pushq %r14
@@ -46387,21 +45267,17 @@ define zeroext i64 @test_vpcmpultd_v16i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64
; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: .Lcfi1663:
+; NoVLX-NEXT: .Lcfi1383:
; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi1664:
+; NoVLX-NEXT: .Lcfi1384:
; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi1665:
+; NoVLX-NEXT: .Lcfi1385:
; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi1666:
+; NoVLX-NEXT: .Lcfi1386:
; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi1667:
+; NoVLX-NEXT: .Lcfi1387:
; NoVLX-NEXT: .cfi_offset %r15, -24
; NoVLX-NEXT: vpcmpltud (%rdi), %zmm0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
@@ -46444,11 +45320,11 @@ define zeroext i64 @test_vpcmpultd_v16i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64
; NoVLX-NEXT: kshiftlw $2, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $1, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k1, %r8d
; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
@@ -46460,11 +45336,15 @@ define zeroext i64 @test_vpcmpultd_v16i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64
; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpinsrb $14, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -46503,12 +45383,12 @@ define zeroext i64 @test_masked_vpcmpultd_v16i1_v64i1_mask(i16 zeroext %__u, <8
; NoVLX-LABEL: test_masked_vpcmpultd_v16i1_v64i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1668:
+; NoVLX-NEXT: .Lcfi1388:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1669:
+; NoVLX-NEXT: .Lcfi1389:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1670:
+; NoVLX-NEXT: .Lcfi1390:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: pushq %r15
; NoVLX-NEXT: pushq %r14
@@ -46517,22 +45397,18 @@ define zeroext i64 @test_masked_vpcmpultd_v16i1_v64i1_mask(i16 zeroext %__u, <8
; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: .Lcfi1671:
+; NoVLX-NEXT: .Lcfi1391:
; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi1672:
+; NoVLX-NEXT: .Lcfi1392:
; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi1673:
+; NoVLX-NEXT: .Lcfi1393:
; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi1674:
+; NoVLX-NEXT: .Lcfi1394:
; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi1675:
+; NoVLX-NEXT: .Lcfi1395:
; NoVLX-NEXT: .cfi_offset %r15, -24
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
@@ -46575,11 +45451,11 @@ define zeroext i64 @test_masked_vpcmpultd_v16i1_v64i1_mask(i16 zeroext %__u, <8
; NoVLX-NEXT: kshiftlw $2, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $1, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k1, %r8d
; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
@@ -46591,11 +45467,15 @@ define zeroext i64 @test_masked_vpcmpultd_v16i1_v64i1_mask(i16 zeroext %__u, <8
; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpinsrb $14, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -46635,12 +45515,12 @@ define zeroext i64 @test_masked_vpcmpultd_v16i1_v64i1_mask_mem(i16 zeroext %__u,
; NoVLX-LABEL: test_masked_vpcmpultd_v16i1_v64i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1676:
+; NoVLX-NEXT: .Lcfi1396:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1677:
+; NoVLX-NEXT: .Lcfi1397:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1678:
+; NoVLX-NEXT: .Lcfi1398:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: pushq %r15
; NoVLX-NEXT: pushq %r14
@@ -46649,22 +45529,18 @@ define zeroext i64 @test_masked_vpcmpultd_v16i1_v64i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: .Lcfi1679:
+; NoVLX-NEXT: .Lcfi1399:
; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi1680:
+; NoVLX-NEXT: .Lcfi1400:
; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi1681:
+; NoVLX-NEXT: .Lcfi1401:
; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi1682:
+; NoVLX-NEXT: .Lcfi1402:
; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi1683:
+; NoVLX-NEXT: .Lcfi1403:
; NoVLX-NEXT: .cfi_offset %r15, -24
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpcmpltud (%rsi), %zmm0, %k0 {%k1}
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
@@ -46707,11 +45583,11 @@ define zeroext i64 @test_masked_vpcmpultd_v16i1_v64i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: kshiftlw $2, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $1, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k1, %r8d
; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
@@ -46723,11 +45599,15 @@ define zeroext i64 @test_masked_vpcmpultd_v16i1_v64i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpinsrb $14, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -46768,12 +45648,12 @@ define zeroext i64 @test_vpcmpultd_v16i1_v64i1_mask_mem_b(<8 x i64> %__a, i32* %
; NoVLX-LABEL: test_vpcmpultd_v16i1_v64i1_mask_mem_b:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1684:
+; NoVLX-NEXT: .Lcfi1404:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1685:
+; NoVLX-NEXT: .Lcfi1405:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1686:
+; NoVLX-NEXT: .Lcfi1406:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: pushq %r15
; NoVLX-NEXT: pushq %r14
@@ -46782,21 +45662,17 @@ define zeroext i64 @test_vpcmpultd_v16i1_v64i1_mask_mem_b(<8 x i64> %__a, i32* %
; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: .Lcfi1687:
+; NoVLX-NEXT: .Lcfi1407:
; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi1688:
+; NoVLX-NEXT: .Lcfi1408:
; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi1689:
+; NoVLX-NEXT: .Lcfi1409:
; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi1690:
+; NoVLX-NEXT: .Lcfi1410:
; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi1691:
+; NoVLX-NEXT: .Lcfi1411:
; NoVLX-NEXT: .cfi_offset %r15, -24
; NoVLX-NEXT: vpcmpltud (%rdi){1to16}, %zmm0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
@@ -46839,11 +45715,11 @@ define zeroext i64 @test_vpcmpultd_v16i1_v64i1_mask_mem_b(<8 x i64> %__a, i32* %
; NoVLX-NEXT: kshiftlw $2, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $1, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k1, %r8d
; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
@@ -46855,11 +45731,15 @@ define zeroext i64 @test_vpcmpultd_v16i1_v64i1_mask_mem_b(<8 x i64> %__a, i32* %
; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpinsrb $14, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -46899,12 +45779,12 @@ define zeroext i64 @test_masked_vpcmpultd_v16i1_v64i1_mask_mem_b(i16 zeroext %__
; NoVLX-LABEL: test_masked_vpcmpultd_v16i1_v64i1_mask_mem_b:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1692:
+; NoVLX-NEXT: .Lcfi1412:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1693:
+; NoVLX-NEXT: .Lcfi1413:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1694:
+; NoVLX-NEXT: .Lcfi1414:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: pushq %r15
; NoVLX-NEXT: pushq %r14
@@ -46913,22 +45793,18 @@ define zeroext i64 @test_masked_vpcmpultd_v16i1_v64i1_mask_mem_b(i16 zeroext %__
; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: .Lcfi1695:
+; NoVLX-NEXT: .Lcfi1415:
; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi1696:
+; NoVLX-NEXT: .Lcfi1416:
; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi1697:
+; NoVLX-NEXT: .Lcfi1417:
; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi1698:
+; NoVLX-NEXT: .Lcfi1418:
; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi1699:
+; NoVLX-NEXT: .Lcfi1419:
; NoVLX-NEXT: .cfi_offset %r15, -24
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpcmpltud (%rsi){1to16}, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
@@ -46971,11 +45847,11 @@ define zeroext i64 @test_masked_vpcmpultd_v16i1_v64i1_mask_mem_b(i16 zeroext %__
; NoVLX-NEXT: kshiftlw $2, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $1, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k1, %r8d
; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
@@ -46987,11 +45863,15 @@ define zeroext i64 @test_masked_vpcmpultd_v16i1_v64i1_mask_mem_b(i16 zeroext %__
; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpinsrb $14, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -47063,9 +45943,9 @@ define zeroext i4 @test_vpcmpultq_v2i1_v4i1_mask_mem(<2 x i64> %__a, <2 x i64>*
; NoVLX-LABEL: test_vpcmpultq_v2i1_v4i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808]
+; NoVLX-NEXT: vpxor (%rdi), %xmm1, %xmm2
; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; NoVLX-NEXT: vpxor (%rdi), %xmm1, %xmm1
-; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm0
; NoVLX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
; NoVLX-NEXT: vpslld $31, %ymm0, %ymm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -47097,7 +45977,6 @@ define zeroext i4 @test_masked_vpcmpultq_v2i1_v4i1_mask(i8 zeroext %__u, <2 x i6
; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0
; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
-; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftlw $15, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
@@ -47105,9 +45984,10 @@ define zeroext i4 @test_masked_vpcmpultq_v2i1_v4i1_mask(i8 zeroext %__u, <2 x i6
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vmovd %ecx, %xmm2
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpand %xmm2, %xmm0, %xmm0
; NoVLX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
; NoVLX-NEXT: vpslld $31, %ymm0, %ymm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -47141,7 +46021,6 @@ define zeroext i4 @test_masked_vpcmpultq_v2i1_v4i1_mask_mem(i8 zeroext %__u, <2
; NoVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808]
; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpxor (%rsi), %xmm1, %xmm1
-; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftlw $15, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
@@ -47149,9 +46028,10 @@ define zeroext i4 @test_masked_vpcmpultq_v2i1_v4i1_mask_mem(i8 zeroext %__u, <2
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vmovd %ecx, %xmm2
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpand %xmm2, %xmm0, %xmm0
; NoVLX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
; NoVLX-NEXT: vpslld $31, %ymm0, %ymm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -47221,7 +46101,6 @@ define zeroext i4 @test_masked_vpcmpultq_v2i1_v4i1_mask_mem_b(i8 zeroext %__u, <
; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0
; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
-; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftlw $15, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
@@ -47229,9 +46108,10 @@ define zeroext i4 @test_masked_vpcmpultq_v2i1_v4i1_mask_mem_b(i8 zeroext %__u, <
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vmovd %ecx, %xmm2
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpand %xmm0, %xmm2, %xmm0
; NoVLX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
; NoVLX-NEXT: vpslld $31, %ymm0, %ymm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -47407,9 +46287,9 @@ define zeroext i8 @test_masked_vpcmpultq_v2i1_v8i1_mask_mem(i8 zeroext %__u, <2
; NoVLX-LABEL: test_masked_vpcmpultq_v2i1_v8i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808]
+; NoVLX-NEXT: vpxor (%rsi), %xmm1, %xmm2
; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; NoVLX-NEXT: vpxor (%rsi), %xmm1, %xmm1
-; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftlw $15, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
@@ -47708,9 +46588,9 @@ define zeroext i16 @test_masked_vpcmpultq_v2i1_v16i1_mask_mem(i8 zeroext %__u, <
; NoVLX-LABEL: test_masked_vpcmpultq_v2i1_v16i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808]
+; NoVLX-NEXT: vpxor (%rsi), %xmm1, %xmm2
; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; NoVLX-NEXT: vpxor (%rsi), %xmm1, %xmm1
-; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftlw $15, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
@@ -47866,12 +46746,12 @@ define zeroext i32 @test_vpcmpultq_v2i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__
; NoVLX-LABEL: test_vpcmpultq_v2i1_v32i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1700:
+; NoVLX-NEXT: .Lcfi1420:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1701:
+; NoVLX-NEXT: .Lcfi1421:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1702:
+; NoVLX-NEXT: .Lcfi1422:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -47912,19 +46792,19 @@ define zeroext i32 @test_vpcmpultq_v2i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>
; NoVLX-LABEL: test_vpcmpultq_v2i1_v32i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1703:
+; NoVLX-NEXT: .Lcfi1423:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1704:
+; NoVLX-NEXT: .Lcfi1424:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1705:
+; NoVLX-NEXT: .Lcfi1425:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
; NoVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808]
+; NoVLX-NEXT: vpxor (%rdi), %xmm1, %xmm2
; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; NoVLX-NEXT: vpxor (%rdi), %xmm1, %xmm1
-; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm0
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
@@ -47960,19 +46840,18 @@ define zeroext i32 @test_masked_vpcmpultq_v2i1_v32i1_mask(i8 zeroext %__u, <2 x
; NoVLX-LABEL: test_masked_vpcmpultq_v2i1_v32i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1706:
+; NoVLX-NEXT: .Lcfi1426:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1707:
+; NoVLX-NEXT: .Lcfi1427:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1708:
+; NoVLX-NEXT: .Lcfi1428:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0
; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
-; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftlw $15, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
@@ -47980,9 +46859,10 @@ define zeroext i32 @test_masked_vpcmpultq_v2i1_v32i1_mask(i8 zeroext %__u, <2 x
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vmovd %ecx, %xmm2
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpand %xmm2, %xmm0, %xmm0
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
@@ -48020,19 +46900,18 @@ define zeroext i32 @test_masked_vpcmpultq_v2i1_v32i1_mask_mem(i8 zeroext %__u, <
; NoVLX-LABEL: test_masked_vpcmpultq_v2i1_v32i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1709:
+; NoVLX-NEXT: .Lcfi1429:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1710:
+; NoVLX-NEXT: .Lcfi1430:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1711:
+; NoVLX-NEXT: .Lcfi1431:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
; NoVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808]
; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpxor (%rsi), %xmm1, %xmm1
-; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftlw $15, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
@@ -48040,9 +46919,10 @@ define zeroext i32 @test_masked_vpcmpultq_v2i1_v32i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vmovd %ecx, %xmm2
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpand %xmm2, %xmm0, %xmm0
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
@@ -48081,12 +46961,12 @@ define zeroext i32 @test_vpcmpultq_v2i1_v32i1_mask_mem_b(<2 x i64> %__a, i64* %_
; NoVLX-LABEL: test_vpcmpultq_v2i1_v32i1_mask_mem_b:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1712:
+; NoVLX-NEXT: .Lcfi1432:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1713:
+; NoVLX-NEXT: .Lcfi1433:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1714:
+; NoVLX-NEXT: .Lcfi1434:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -48131,12 +47011,12 @@ define zeroext i32 @test_masked_vpcmpultq_v2i1_v32i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-LABEL: test_masked_vpcmpultq_v2i1_v32i1_mask_mem_b:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1715:
+; NoVLX-NEXT: .Lcfi1435:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1716:
+; NoVLX-NEXT: .Lcfi1436:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1717:
+; NoVLX-NEXT: .Lcfi1437:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -48144,7 +47024,6 @@ define zeroext i32 @test_masked_vpcmpultq_v2i1_v32i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0
; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
-; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftlw $15, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
@@ -48152,9 +47031,10 @@ define zeroext i32 @test_masked_vpcmpultq_v2i1_v32i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vmovd %ecx, %xmm2
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpand %xmm0, %xmm2, %xmm0
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
@@ -48194,12 +47074,12 @@ define zeroext i64 @test_vpcmpultq_v2i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__
; NoVLX-LABEL: test_vpcmpultq_v2i1_v64i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1718:
+; NoVLX-NEXT: .Lcfi1438:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1719:
+; NoVLX-NEXT: .Lcfi1439:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1720:
+; NoVLX-NEXT: .Lcfi1440:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
@@ -48209,8 +47089,8 @@ define zeroext i64 @test_vpcmpultq_v2i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__
; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
@@ -48246,12 +47126,12 @@ define zeroext i64 @test_vpcmpultq_v2i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>
; NoVLX-LABEL: test_vpcmpultq_v2i1_v64i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1721:
+; NoVLX-NEXT: .Lcfi1441:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1722:
+; NoVLX-NEXT: .Lcfi1442:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1723:
+; NoVLX-NEXT: .Lcfi1443:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
@@ -48261,8 +47141,8 @@ define zeroext i64 @test_vpcmpultq_v2i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>
; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
@@ -48300,12 +47180,12 @@ define zeroext i64 @test_masked_vpcmpultq_v2i1_v64i1_mask(i8 zeroext %__u, <2 x
; NoVLX-LABEL: test_masked_vpcmpultq_v2i1_v64i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1724:
+; NoVLX-NEXT: .Lcfi1444:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1725:
+; NoVLX-NEXT: .Lcfi1445:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1726:
+; NoVLX-NEXT: .Lcfi1446:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
@@ -48325,8 +47205,8 @@ define zeroext i64 @test_masked_vpcmpultq_v2i1_v64i1_mask(i8 zeroext %__u, <2 x
; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
@@ -48366,12 +47246,12 @@ define zeroext i64 @test_masked_vpcmpultq_v2i1_v64i1_mask_mem(i8 zeroext %__u, <
; NoVLX-LABEL: test_masked_vpcmpultq_v2i1_v64i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1727:
+; NoVLX-NEXT: .Lcfi1447:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1728:
+; NoVLX-NEXT: .Lcfi1448:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1729:
+; NoVLX-NEXT: .Lcfi1449:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
@@ -48391,8 +47271,8 @@ define zeroext i64 @test_masked_vpcmpultq_v2i1_v64i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
@@ -48433,12 +47313,12 @@ define zeroext i64 @test_vpcmpultq_v2i1_v64i1_mask_mem_b(<2 x i64> %__a, i64* %_
; NoVLX-LABEL: test_vpcmpultq_v2i1_v64i1_mask_mem_b:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1730:
+; NoVLX-NEXT: .Lcfi1450:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1731:
+; NoVLX-NEXT: .Lcfi1451:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1732:
+; NoVLX-NEXT: .Lcfi1452:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
@@ -48449,8 +47329,8 @@ define zeroext i64 @test_vpcmpultq_v2i1_v64i1_mask_mem_b(<2 x i64> %__a, i64* %_
; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
@@ -48489,12 +47369,12 @@ define zeroext i64 @test_masked_vpcmpultq_v2i1_v64i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-LABEL: test_masked_vpcmpultq_v2i1_v64i1_mask_mem_b:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1733:
+; NoVLX-NEXT: .Lcfi1453:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1734:
+; NoVLX-NEXT: .Lcfi1454:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1735:
+; NoVLX-NEXT: .Lcfi1455:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
@@ -48515,8 +47395,8 @@ define zeroext i64 @test_masked_vpcmpultq_v2i1_v64i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
@@ -48580,8 +47460,8 @@ define zeroext i8 @test_vpcmpultq_v4i1_v8i1_mask(<4 x i64> %__a, <4 x i64> %__b)
; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: kmovw %eax, %k1
; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
@@ -48640,8 +47520,8 @@ define zeroext i8 @test_vpcmpultq_v4i1_v8i1_mask_mem(<4 x i64> %__a, <4 x i64>*
; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: kmovw %eax, %k1
; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
@@ -48685,7 +47565,6 @@ define zeroext i8 @test_masked_vpcmpultq_v4i1_v8i1_mask(i8 zeroext %__u, <4 x i6
; NoVLX-NEXT: vpxor %ymm2, %ymm0, %ymm0
; NoVLX-NEXT: vpxor %ymm2, %ymm1, %ymm1
; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0
-; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
@@ -48703,6 +47582,7 @@ define zeroext i8 @test_masked_vpcmpultq_v4i1_v8i1_mask(i8 zeroext %__u, <4 x i6
; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
; NoVLX-NEXT: kmovw %eax, %k1
@@ -48720,8 +47600,8 @@ define zeroext i8 @test_masked_vpcmpultq_v4i1_v8i1_mask(i8 zeroext %__u, <4 x i6
; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: kmovw %eax, %k1
; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
@@ -48764,10 +47644,9 @@ define zeroext i8 @test_masked_vpcmpultq_v4i1_v8i1_mask_mem(i8 zeroext %__u, <4
; NoVLX-LABEL: test_masked_vpcmpultq_v4i1_v8i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
+; NoVLX-NEXT: vpxor (%rsi), %ymm1, %ymm2
; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0
-; NoVLX-NEXT: vpxor (%rsi), %ymm1, %ymm1
-; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0
-; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
@@ -48785,6 +47664,7 @@ define zeroext i8 @test_masked_vpcmpultq_v4i1_v8i1_mask_mem(i8 zeroext %__u, <4
; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
; NoVLX-NEXT: kmovw %eax, %k1
@@ -48802,8 +47682,8 @@ define zeroext i8 @test_masked_vpcmpultq_v4i1_v8i1_mask_mem(i8 zeroext %__u, <4
; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: kmovw %eax, %k1
; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
@@ -48868,8 +47748,8 @@ define zeroext i8 @test_vpcmpultq_v4i1_v8i1_mask_mem_b(<4 x i64> %__a, i64* %__b
; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: kmovw %eax, %k1
; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
@@ -48915,7 +47795,6 @@ define zeroext i8 @test_masked_vpcmpultq_v4i1_v8i1_mask_mem_b(i8 zeroext %__u, <
; NoVLX-NEXT: vpxor %ymm2, %ymm0, %ymm0
; NoVLX-NEXT: vpxor %ymm2, %ymm1, %ymm1
; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0
-; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
@@ -48933,6 +47812,7 @@ define zeroext i8 @test_masked_vpcmpultq_v4i1_v8i1_mask_mem_b(i8 zeroext %__u, <
; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
; NoVLX-NEXT: kmovw %eax, %k1
@@ -48950,8 +47830,8 @@ define zeroext i8 @test_masked_vpcmpultq_v4i1_v8i1_mask_mem_b(i8 zeroext %__u, <
; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: kmovw %eax, %k1
; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
@@ -49015,8 +47895,8 @@ define zeroext i16 @test_vpcmpultq_v4i1_v16i1_mask(<4 x i64> %__a, <4 x i64> %__
; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: kmovw %eax, %k1
; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
@@ -49074,8 +47954,8 @@ define zeroext i16 @test_vpcmpultq_v4i1_v16i1_mask_mem(<4 x i64> %__a, <4 x i64>
; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: kmovw %eax, %k1
; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
@@ -49119,7 +47999,6 @@ define zeroext i16 @test_masked_vpcmpultq_v4i1_v16i1_mask(i8 zeroext %__u, <4 x
; NoVLX-NEXT: vpxor %ymm2, %ymm0, %ymm0
; NoVLX-NEXT: vpxor %ymm2, %ymm1, %ymm1
; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0
-; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
@@ -49137,6 +48016,7 @@ define zeroext i16 @test_masked_vpcmpultq_v4i1_v16i1_mask(i8 zeroext %__u, <4 x
; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
; NoVLX-NEXT: kmovw %eax, %k1
@@ -49153,8 +48033,8 @@ define zeroext i16 @test_masked_vpcmpultq_v4i1_v16i1_mask(i8 zeroext %__u, <4 x
; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: kmovw %eax, %k1
; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
@@ -49197,10 +48077,9 @@ define zeroext i16 @test_masked_vpcmpultq_v4i1_v16i1_mask_mem(i8 zeroext %__u, <
; NoVLX-LABEL: test_masked_vpcmpultq_v4i1_v16i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
+; NoVLX-NEXT: vpxor (%rsi), %ymm1, %ymm2
; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0
-; NoVLX-NEXT: vpxor (%rsi), %ymm1, %ymm1
-; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0
-; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
@@ -49218,6 +48097,7 @@ define zeroext i16 @test_masked_vpcmpultq_v4i1_v16i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
; NoVLX-NEXT: kmovw %eax, %k1
@@ -49234,8 +48114,8 @@ define zeroext i16 @test_masked_vpcmpultq_v4i1_v16i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: kmovw %eax, %k1
; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
@@ -49299,8 +48179,8 @@ define zeroext i16 @test_vpcmpultq_v4i1_v16i1_mask_mem_b(<4 x i64> %__a, i64* %_
; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: kmovw %eax, %k1
; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
@@ -49346,7 +48226,6 @@ define zeroext i16 @test_masked_vpcmpultq_v4i1_v16i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: vpxor %ymm2, %ymm0, %ymm0
; NoVLX-NEXT: vpxor %ymm2, %ymm1, %ymm1
; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0
-; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
@@ -49364,6 +48243,7 @@ define zeroext i16 @test_masked_vpcmpultq_v4i1_v16i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
; NoVLX-NEXT: kmovw %eax, %k1
@@ -49380,8 +48260,8 @@ define zeroext i16 @test_masked_vpcmpultq_v4i1_v16i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: kmovw %eax, %k1
; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
@@ -49425,12 +48305,12 @@ define zeroext i32 @test_vpcmpultq_v4i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__
; NoVLX-LABEL: test_vpcmpultq_v4i1_v32i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1736:
+; NoVLX-NEXT: .Lcfi1456:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1737:
+; NoVLX-NEXT: .Lcfi1457:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1738:
+; NoVLX-NEXT: .Lcfi1458:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -49473,12 +48353,12 @@ define zeroext i32 @test_vpcmpultq_v4i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64>
; NoVLX-LABEL: test_vpcmpultq_v4i1_v32i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1739:
+; NoVLX-NEXT: .Lcfi1459:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1740:
+; NoVLX-NEXT: .Lcfi1460:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1741:
+; NoVLX-NEXT: .Lcfi1461:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -49523,12 +48403,12 @@ define zeroext i32 @test_masked_vpcmpultq_v4i1_v32i1_mask(i8 zeroext %__u, <4 x
; NoVLX-LABEL: test_masked_vpcmpultq_v4i1_v32i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1742:
+; NoVLX-NEXT: .Lcfi1462:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1743:
+; NoVLX-NEXT: .Lcfi1463:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1744:
+; NoVLX-NEXT: .Lcfi1464:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -49536,7 +48416,6 @@ define zeroext i32 @test_masked_vpcmpultq_v4i1_v32i1_mask(i8 zeroext %__u, <4 x
; NoVLX-NEXT: vpxor %ymm2, %ymm0, %ymm0
; NoVLX-NEXT: vpxor %ymm2, %ymm1, %ymm1
; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0
-; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
@@ -49554,6 +48433,7 @@ define zeroext i32 @test_masked_vpcmpultq_v4i1_v32i1_mask(i8 zeroext %__u, <4 x
; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
@@ -49593,20 +48473,19 @@ define zeroext i32 @test_masked_vpcmpultq_v4i1_v32i1_mask_mem(i8 zeroext %__u, <
; NoVLX-LABEL: test_masked_vpcmpultq_v4i1_v32i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1745:
+; NoVLX-NEXT: .Lcfi1465:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1746:
+; NoVLX-NEXT: .Lcfi1466:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1747:
+; NoVLX-NEXT: .Lcfi1467:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
; NoVLX-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
+; NoVLX-NEXT: vpxor (%rsi), %ymm1, %ymm2
; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0
-; NoVLX-NEXT: vpxor (%rsi), %ymm1, %ymm1
-; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0
-; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
@@ -49624,6 +48503,7 @@ define zeroext i32 @test_masked_vpcmpultq_v4i1_v32i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
@@ -49664,12 +48544,12 @@ define zeroext i32 @test_vpcmpultq_v4i1_v32i1_mask_mem_b(<4 x i64> %__a, i64* %_
; NoVLX-LABEL: test_vpcmpultq_v4i1_v32i1_mask_mem_b:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1748:
+; NoVLX-NEXT: .Lcfi1468:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1749:
+; NoVLX-NEXT: .Lcfi1469:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1750:
+; NoVLX-NEXT: .Lcfi1470:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -49716,12 +48596,12 @@ define zeroext i32 @test_masked_vpcmpultq_v4i1_v32i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-LABEL: test_masked_vpcmpultq_v4i1_v32i1_mask_mem_b:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1751:
+; NoVLX-NEXT: .Lcfi1471:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1752:
+; NoVLX-NEXT: .Lcfi1472:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1753:
+; NoVLX-NEXT: .Lcfi1473:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -49730,7 +48610,6 @@ define zeroext i32 @test_masked_vpcmpultq_v4i1_v32i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: vpxor %ymm2, %ymm0, %ymm0
; NoVLX-NEXT: vpxor %ymm2, %ymm1, %ymm1
; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0
-; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
@@ -49748,6 +48627,7 @@ define zeroext i32 @test_masked_vpcmpultq_v4i1_v32i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
@@ -49789,12 +48669,12 @@ define zeroext i64 @test_vpcmpultq_v4i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__
; NoVLX-LABEL: test_vpcmpultq_v4i1_v64i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1754:
+; NoVLX-NEXT: .Lcfi1474:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1755:
+; NoVLX-NEXT: .Lcfi1475:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1756:
+; NoVLX-NEXT: .Lcfi1476:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
@@ -49805,8 +48685,8 @@ define zeroext i64 @test_vpcmpultq_v4i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__
; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
@@ -49843,12 +48723,12 @@ define zeroext i64 @test_vpcmpultq_v4i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>
; NoVLX-LABEL: test_vpcmpultq_v4i1_v64i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1757:
+; NoVLX-NEXT: .Lcfi1477:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1758:
+; NoVLX-NEXT: .Lcfi1478:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1759:
+; NoVLX-NEXT: .Lcfi1479:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
@@ -49859,8 +48739,8 @@ define zeroext i64 @test_vpcmpultq_v4i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>
; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
@@ -49899,12 +48779,12 @@ define zeroext i64 @test_masked_vpcmpultq_v4i1_v64i1_mask(i8 zeroext %__u, <4 x
; NoVLX-LABEL: test_masked_vpcmpultq_v4i1_v64i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1760:
+; NoVLX-NEXT: .Lcfi1480:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1761:
+; NoVLX-NEXT: .Lcfi1481:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1762:
+; NoVLX-NEXT: .Lcfi1482:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
@@ -49915,7 +48795,6 @@ define zeroext i64 @test_masked_vpcmpultq_v4i1_v64i1_mask(i8 zeroext %__u, <4 x
; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kshiftlw $13, %k0, %k2
; NoVLX-NEXT: kshiftrw $15, %k2, %k2
; NoVLX-NEXT: kshiftlw $15, %k0, %k3
@@ -49928,13 +48807,14 @@ define zeroext i64 @test_masked_vpcmpultq_v4i1_v64i1_mask(i8 zeroext %__u, <4 x
; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kmovw %k2, %eax
; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $15, %k1, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
@@ -49975,12 +48855,12 @@ define zeroext i64 @test_masked_vpcmpultq_v4i1_v64i1_mask_mem(i8 zeroext %__u, <
; NoVLX-LABEL: test_masked_vpcmpultq_v4i1_v64i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1763:
+; NoVLX-NEXT: .Lcfi1483:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1764:
+; NoVLX-NEXT: .Lcfi1484:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1765:
+; NoVLX-NEXT: .Lcfi1485:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
@@ -49991,7 +48871,6 @@ define zeroext i64 @test_masked_vpcmpultq_v4i1_v64i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kshiftlw $13, %k0, %k2
; NoVLX-NEXT: kshiftrw $15, %k2, %k2
; NoVLX-NEXT: kshiftlw $15, %k0, %k3
@@ -50004,13 +48883,14 @@ define zeroext i64 @test_masked_vpcmpultq_v4i1_v64i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kmovw %k2, %eax
; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $15, %k1, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
@@ -50052,12 +48932,12 @@ define zeroext i64 @test_vpcmpultq_v4i1_v64i1_mask_mem_b(<4 x i64> %__a, i64* %_
; NoVLX-LABEL: test_vpcmpultq_v4i1_v64i1_mask_mem_b:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1766:
+; NoVLX-NEXT: .Lcfi1486:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1767:
+; NoVLX-NEXT: .Lcfi1487:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1768:
+; NoVLX-NEXT: .Lcfi1488:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
@@ -50069,8 +48949,8 @@ define zeroext i64 @test_vpcmpultq_v4i1_v64i1_mask_mem_b(<4 x i64> %__a, i64* %_
; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
@@ -50110,12 +48990,12 @@ define zeroext i64 @test_masked_vpcmpultq_v4i1_v64i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-LABEL: test_masked_vpcmpultq_v4i1_v64i1_mask_mem_b:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1769:
+; NoVLX-NEXT: .Lcfi1489:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1770:
+; NoVLX-NEXT: .Lcfi1490:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1771:
+; NoVLX-NEXT: .Lcfi1491:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
@@ -50127,7 +49007,6 @@ define zeroext i64 @test_masked_vpcmpultq_v4i1_v64i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: kmovw %edi, %k0
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kshiftlw $13, %k0, %k2
; NoVLX-NEXT: kshiftrw $15, %k2, %k2
; NoVLX-NEXT: kshiftlw $15, %k0, %k3
@@ -50140,13 +49019,14 @@ define zeroext i64 @test_masked_vpcmpultq_v4i1_v64i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
; NoVLX-NEXT: kmovw %k2, %eax
; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $15, %k1, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
@@ -50353,12 +49233,12 @@ define zeroext i32 @test_vpcmpultq_v8i1_v32i1_mask(<8 x i64> %__a, <8 x i64> %__
; NoVLX-LABEL: test_vpcmpultq_v8i1_v32i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1772:
+; NoVLX-NEXT: .Lcfi1492:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1773:
+; NoVLX-NEXT: .Lcfi1493:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1774:
+; NoVLX-NEXT: .Lcfi1494:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -50426,12 +49306,12 @@ define zeroext i32 @test_vpcmpultq_v8i1_v32i1_mask_mem(<8 x i64> %__a, <8 x i64>
; NoVLX-LABEL: test_vpcmpultq_v8i1_v32i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1775:
+; NoVLX-NEXT: .Lcfi1495:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1776:
+; NoVLX-NEXT: .Lcfi1496:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1777:
+; NoVLX-NEXT: .Lcfi1497:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -50501,12 +49381,12 @@ define zeroext i32 @test_masked_vpcmpultq_v8i1_v32i1_mask(i8 zeroext %__u, <8 x
; NoVLX-LABEL: test_masked_vpcmpultq_v8i1_v32i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1778:
+; NoVLX-NEXT: .Lcfi1498:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1779:
+; NoVLX-NEXT: .Lcfi1499:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1780:
+; NoVLX-NEXT: .Lcfi1500:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -50578,12 +49458,12 @@ define zeroext i32 @test_masked_vpcmpultq_v8i1_v32i1_mask_mem(i8 zeroext %__u, <
; NoVLX-LABEL: test_masked_vpcmpultq_v8i1_v32i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1781:
+; NoVLX-NEXT: .Lcfi1501:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1782:
+; NoVLX-NEXT: .Lcfi1502:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1783:
+; NoVLX-NEXT: .Lcfi1503:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -50656,12 +49536,12 @@ define zeroext i32 @test_vpcmpultq_v8i1_v32i1_mask_mem_b(<8 x i64> %__a, i64* %_
; NoVLX-LABEL: test_vpcmpultq_v8i1_v32i1_mask_mem_b:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1784:
+; NoVLX-NEXT: .Lcfi1504:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1785:
+; NoVLX-NEXT: .Lcfi1505:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1786:
+; NoVLX-NEXT: .Lcfi1506:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -50732,12 +49612,12 @@ define zeroext i32 @test_masked_vpcmpultq_v8i1_v32i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-LABEL: test_masked_vpcmpultq_v8i1_v32i1_mask_mem_b:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1787:
+; NoVLX-NEXT: .Lcfi1507:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1788:
+; NoVLX-NEXT: .Lcfi1508:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1789:
+; NoVLX-NEXT: .Lcfi1509:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -50811,53 +49691,53 @@ define zeroext i64 @test_vpcmpultq_v8i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %__
; NoVLX-LABEL: test_vpcmpultq_v8i1_v64i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1790:
+; NoVLX-NEXT: .Lcfi1510:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1791:
+; NoVLX-NEXT: .Lcfi1511:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1792:
+; NoVLX-NEXT: .Lcfi1512:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: vpcmpltuq %zmm1, %zmm0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $15, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %ecx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -50889,53 +49769,53 @@ define zeroext i64 @test_vpcmpultq_v8i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64>
; NoVLX-LABEL: test_vpcmpultq_v8i1_v64i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1793:
+; NoVLX-NEXT: .Lcfi1513:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1794:
+; NoVLX-NEXT: .Lcfi1514:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1795:
+; NoVLX-NEXT: .Lcfi1515:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: vpcmpltuq (%rdi), %zmm0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $15, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %ecx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -50969,54 +49849,54 @@ define zeroext i64 @test_masked_vpcmpultq_v8i1_v64i1_mask(i8 zeroext %__u, <8 x
; NoVLX-LABEL: test_masked_vpcmpultq_v8i1_v64i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1796:
+; NoVLX-NEXT: .Lcfi1516:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1797:
+; NoVLX-NEXT: .Lcfi1517:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1798:
+; NoVLX-NEXT: .Lcfi1518:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpcmpltuq %zmm1, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $15, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %ecx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -51051,54 +49931,54 @@ define zeroext i64 @test_masked_vpcmpultq_v8i1_v64i1_mask_mem(i8 zeroext %__u, <
; NoVLX-LABEL: test_masked_vpcmpultq_v8i1_v64i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1799:
+; NoVLX-NEXT: .Lcfi1519:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1800:
+; NoVLX-NEXT: .Lcfi1520:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1801:
+; NoVLX-NEXT: .Lcfi1521:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpcmpltuq (%rsi), %zmm0, %k0 {%k1}
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $15, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %ecx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -51134,53 +50014,53 @@ define zeroext i64 @test_vpcmpultq_v8i1_v64i1_mask_mem_b(<8 x i64> %__a, i64* %_
; NoVLX-LABEL: test_vpcmpultq_v8i1_v64i1_mask_mem_b:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1802:
+; NoVLX-NEXT: .Lcfi1522:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1803:
+; NoVLX-NEXT: .Lcfi1523:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1804:
+; NoVLX-NEXT: .Lcfi1524:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: vpcmpltuq (%rdi){1to8}, %zmm0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $15, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %ecx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -51215,54 +50095,54 @@ define zeroext i64 @test_masked_vpcmpultq_v8i1_v64i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-LABEL: test_masked_vpcmpultq_v8i1_v64i1_mask_mem_b:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1805:
+; NoVLX-NEXT: .Lcfi1525:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1806:
+; NoVLX-NEXT: .Lcfi1526:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1807:
+; NoVLX-NEXT: .Lcfi1527:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpcmpltuq (%rsi){1to8}, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $15, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %ecx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -51316,8 +50196,8 @@ define zeroext i8 @test_vcmpoeqps_v4i1_v8i1_mask(<2 x i64> %__a, <2 x i64> %__b)
; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: kmovw %eax, %k1
; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
@@ -51371,8 +50251,8 @@ define zeroext i8 @test_vcmpoeqps_v4i1_v8i1_mask_mem(<2 x i64> %__a, <2 x i64>*
; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: kmovw %eax, %k1
; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
@@ -51428,8 +50308,8 @@ define zeroext i8 @test_vcmpoeqps_v4i1_v8i1_mask_mem_b(<2 x i64> %__a, float* %_
; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: kmovw %eax, %k1
; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
@@ -51493,8 +50373,8 @@ define zeroext i8 @test_masked_vcmpoeqps_v4i1_v8i1_mask(i4 zeroext %__u, <2 x i6
; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: kmovw %eax, %k1
; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
@@ -51539,8 +50419,8 @@ define zeroext i8 @test_masked_vcmpoeqps_v4i1_v8i1_mask_mem(i4 zeroext %__u, <2
; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; NoVLX-NEXT: kmovw %eax, %k1
; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpmovqd %zmm1, %ymm1
; NoVLX-NEXT: vcmpeqps (%rsi), %xmm0, %xmm0
+; NoVLX-NEXT: vpmovqd %zmm1, %ymm1
; NoVLX-NEXT: vandps %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
; NoVLX-NEXT: kmovw %eax, %k1
@@ -51558,8 +50438,8 @@ define zeroext i8 @test_masked_vcmpoeqps_v4i1_v8i1_mask_mem(i4 zeroext %__u, <2
; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: kmovw %eax, %k1
; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
@@ -51625,8 +50505,8 @@ define zeroext i8 @test_masked_vcmpoeqps_v4i1_v8i1_mask_mem_b(i4 zeroext %__u, <
; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: kmovw %eax, %k1
; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
@@ -51685,8 +50565,8 @@ define zeroext i16 @test_vcmpoeqps_v4i1_v16i1_mask(<2 x i64> %__a, <2 x i64> %__
; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: kmovw %eax, %k1
; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
@@ -51739,8 +50619,8 @@ define zeroext i16 @test_vcmpoeqps_v4i1_v16i1_mask_mem(<2 x i64> %__a, <2 x i64>
; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: kmovw %eax, %k1
; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
@@ -51795,8 +50675,8 @@ define zeroext i16 @test_vcmpoeqps_v4i1_v16i1_mask_mem_b(<2 x i64> %__a, float*
; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: kmovw %eax, %k1
; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
@@ -51859,8 +50739,8 @@ define zeroext i16 @test_masked_vcmpoeqps_v4i1_v16i1_mask(i4 zeroext %__u, <2 x
; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: kmovw %eax, %k1
; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
@@ -51905,8 +50785,8 @@ define zeroext i16 @test_masked_vcmpoeqps_v4i1_v16i1_mask_mem(i4 zeroext %__u, <
; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; NoVLX-NEXT: kmovw %eax, %k1
; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpmovqd %zmm1, %ymm1
; NoVLX-NEXT: vcmpeqps (%rsi), %xmm0, %xmm0
+; NoVLX-NEXT: vpmovqd %zmm1, %ymm1
; NoVLX-NEXT: vandps %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
; NoVLX-NEXT: kmovw %eax, %k1
@@ -51923,8 +50803,8 @@ define zeroext i16 @test_masked_vcmpoeqps_v4i1_v16i1_mask_mem(i4 zeroext %__u, <
; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: kmovw %eax, %k1
; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
@@ -51989,8 +50869,8 @@ define zeroext i16 @test_masked_vcmpoeqps_v4i1_v16i1_mask_mem_b(i4 zeroext %__u,
; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: kmovw %eax, %k1
; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
@@ -52033,12 +50913,12 @@ define zeroext i32 @test_vcmpoeqps_v4i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__
; NoVLX-LABEL: test_vcmpoeqps_v4i1_v32i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1808:
+; NoVLX-NEXT: .Lcfi1528:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1809:
+; NoVLX-NEXT: .Lcfi1529:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1810:
+; NoVLX-NEXT: .Lcfi1530:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -52076,12 +50956,12 @@ define zeroext i32 @test_vcmpoeqps_v4i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>
; NoVLX-LABEL: test_vcmpoeqps_v4i1_v32i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1811:
+; NoVLX-NEXT: .Lcfi1531:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1812:
+; NoVLX-NEXT: .Lcfi1532:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1813:
+; NoVLX-NEXT: .Lcfi1533:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -52120,12 +51000,12 @@ define zeroext i32 @test_vcmpoeqps_v4i1_v32i1_mask_mem_b(<2 x i64> %__a, float*
; NoVLX-LABEL: test_vcmpoeqps_v4i1_v32i1_mask_mem_b:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1814:
+; NoVLX-NEXT: .Lcfi1534:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1815:
+; NoVLX-NEXT: .Lcfi1535:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1816:
+; NoVLX-NEXT: .Lcfi1536:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -52168,12 +51048,12 @@ define zeroext i32 @test_masked_vcmpoeqps_v4i1_v32i1_mask(i4 zeroext %__u, <2 x
; NoVLX-LABEL: test_masked_vcmpoeqps_v4i1_v32i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1817:
+; NoVLX-NEXT: .Lcfi1537:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1818:
+; NoVLX-NEXT: .Lcfi1538:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1819:
+; NoVLX-NEXT: .Lcfi1539:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
@@ -52221,12 +51101,12 @@ define zeroext i32 @test_masked_vcmpoeqps_v4i1_v32i1_mask_mem(i4 zeroext %__u, <
; NoVLX-LABEL: test_masked_vcmpoeqps_v4i1_v32i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1820:
+; NoVLX-NEXT: .Lcfi1540:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1821:
+; NoVLX-NEXT: .Lcfi1541:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1822:
+; NoVLX-NEXT: .Lcfi1542:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
@@ -52234,8 +51114,8 @@ define zeroext i32 @test_masked_vcmpoeqps_v4i1_v32i1_mask_mem(i4 zeroext %__u, <
; NoVLX-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
; NoVLX-NEXT: kmovw %eax, %k1
; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpmovqd %zmm1, %ymm1
; NoVLX-NEXT: vcmpeqps (%rsi), %xmm0, %xmm0
+; NoVLX-NEXT: vpmovqd %zmm1, %ymm1
; NoVLX-NEXT: vandps %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
@@ -52275,12 +51155,12 @@ define zeroext i32 @test_masked_vcmpoeqps_v4i1_v32i1_mask_mem_b(i4 zeroext %__u,
; NoVLX-LABEL: test_masked_vcmpoeqps_v4i1_v32i1_mask_mem_b:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1823:
+; NoVLX-NEXT: .Lcfi1543:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1824:
+; NoVLX-NEXT: .Lcfi1544:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1825:
+; NoVLX-NEXT: .Lcfi1545:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
@@ -52331,20 +51211,20 @@ define zeroext i64 @test_vcmpoeqps_v4i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__
; NoVLX-LABEL: test_vcmpoeqps_v4i1_v64i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1826:
+; NoVLX-NEXT: .Lcfi1546:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1827:
+; NoVLX-NEXT: .Lcfi1547:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1828:
+; NoVLX-NEXT: .Lcfi1548:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: vcmpeqps %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
@@ -52380,20 +51260,20 @@ define zeroext i64 @test_vcmpoeqps_v4i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>
; NoVLX-LABEL: test_vcmpoeqps_v4i1_v64i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1829:
+; NoVLX-NEXT: .Lcfi1549:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1830:
+; NoVLX-NEXT: .Lcfi1550:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1831:
+; NoVLX-NEXT: .Lcfi1551:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: vcmpeqps (%rdi), %xmm0, %xmm0
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
@@ -52430,12 +51310,12 @@ define zeroext i64 @test_vcmpoeqps_v4i1_v64i1_mask_mem_b(<2 x i64> %__a, float*
; NoVLX-LABEL: test_vcmpoeqps_v4i1_v64i1_mask_mem_b:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1832:
+; NoVLX-NEXT: .Lcfi1552:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1833:
+; NoVLX-NEXT: .Lcfi1553:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1834:
+; NoVLX-NEXT: .Lcfi1554:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
@@ -52443,8 +51323,8 @@ define zeroext i64 @test_vcmpoeqps_v4i1_v64i1_mask_mem_b(<2 x i64> %__a, float*
; NoVLX-NEXT: vcmpeqps %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
@@ -52484,12 +51364,12 @@ define zeroext i64 @test_masked_vcmpoeqps_v4i1_v64i1_mask(i4 zeroext %__u, <2 x
; NoVLX-LABEL: test_masked_vcmpoeqps_v4i1_v64i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1835:
+; NoVLX-NEXT: .Lcfi1555:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1836:
+; NoVLX-NEXT: .Lcfi1556:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1837:
+; NoVLX-NEXT: .Lcfi1557:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $96, %rsp
@@ -52502,8 +51382,8 @@ define zeroext i64 @test_masked_vcmpoeqps_v4i1_v64i1_mask(i4 zeroext %__u, <2 x
; NoVLX-NEXT: vandps %xmm2, %xmm0, %xmm0
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
@@ -52543,12 +51423,12 @@ define zeroext i64 @test_masked_vcmpoeqps_v4i1_v64i1_mask_mem(i4 zeroext %__u, <
; NoVLX-LABEL: test_masked_vcmpoeqps_v4i1_v64i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1838:
+; NoVLX-NEXT: .Lcfi1558:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1839:
+; NoVLX-NEXT: .Lcfi1559:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1840:
+; NoVLX-NEXT: .Lcfi1560:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $96, %rsp
@@ -52561,8 +51441,8 @@ define zeroext i64 @test_masked_vcmpoeqps_v4i1_v64i1_mask_mem(i4 zeroext %__u, <
; NoVLX-NEXT: vandps %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
@@ -52603,12 +51483,12 @@ define zeroext i64 @test_masked_vcmpoeqps_v4i1_v64i1_mask_mem_b(i4 zeroext %__u,
; NoVLX-LABEL: test_masked_vcmpoeqps_v4i1_v64i1_mask_mem_b:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1841:
+; NoVLX-NEXT: .Lcfi1561:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1842:
+; NoVLX-NEXT: .Lcfi1562:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1843:
+; NoVLX-NEXT: .Lcfi1563:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $96, %rsp
@@ -52622,8 +51502,8 @@ define zeroext i64 @test_masked_vcmpoeqps_v4i1_v64i1_mask_mem_b(i4 zeroext %__u,
; NoVLX-NEXT: vandps %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
@@ -52854,12 +51734,12 @@ define zeroext i32 @test_vcmpoeqps_v8i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__
; NoVLX-LABEL: test_vcmpoeqps_v8i1_v32i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1844:
+; NoVLX-NEXT: .Lcfi1564:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1845:
+; NoVLX-NEXT: .Lcfi1565:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1846:
+; NoVLX-NEXT: .Lcfi1566:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -52929,12 +51809,12 @@ define zeroext i32 @test_vcmpoeqps_v8i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64>
; NoVLX-LABEL: test_vcmpoeqps_v8i1_v32i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1847:
+; NoVLX-NEXT: .Lcfi1567:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1848:
+; NoVLX-NEXT: .Lcfi1568:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1849:
+; NoVLX-NEXT: .Lcfi1569:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -53005,12 +51885,12 @@ define zeroext i32 @test_vcmpoeqps_v8i1_v32i1_mask_mem_b(<4 x i64> %__a, float*
; NoVLX-LABEL: test_vcmpoeqps_v8i1_v32i1_mask_mem_b:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1850:
+; NoVLX-NEXT: .Lcfi1570:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1851:
+; NoVLX-NEXT: .Lcfi1571:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1852:
+; NoVLX-NEXT: .Lcfi1572:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -53083,12 +51963,12 @@ define zeroext i32 @test_masked_vcmpoeqps_v8i1_v32i1_mask(i8 zeroext %__u, <4 x
; NoVLX-LABEL: test_masked_vcmpoeqps_v8i1_v32i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1853:
+; NoVLX-NEXT: .Lcfi1573:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1854:
+; NoVLX-NEXT: .Lcfi1574:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1855:
+; NoVLX-NEXT: .Lcfi1575:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -53163,12 +52043,12 @@ define zeroext i32 @test_masked_vcmpoeqps_v8i1_v32i1_mask_mem(i8 zeroext %__u, <
; NoVLX-LABEL: test_masked_vcmpoeqps_v8i1_v32i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1856:
+; NoVLX-NEXT: .Lcfi1576:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1857:
+; NoVLX-NEXT: .Lcfi1577:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1858:
+; NoVLX-NEXT: .Lcfi1578:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -53244,12 +52124,12 @@ define zeroext i32 @test_masked_vcmpoeqps_v8i1_v32i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-LABEL: test_masked_vcmpoeqps_v8i1_v32i1_mask_mem_b:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1859:
+; NoVLX-NEXT: .Lcfi1579:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1860:
+; NoVLX-NEXT: .Lcfi1580:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1861:
+; NoVLX-NEXT: .Lcfi1581:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -53327,55 +52207,55 @@ define zeroext i64 @test_vcmpoeqps_v8i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__
; NoVLX-LABEL: test_vcmpoeqps_v8i1_v64i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1862:
+; NoVLX-NEXT: .Lcfi1582:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1863:
+; NoVLX-NEXT: .Lcfi1583:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1864:
+; NoVLX-NEXT: .Lcfi1584:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $15, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %ecx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -53407,55 +52287,55 @@ define zeroext i64 @test_vcmpoeqps_v8i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>
; NoVLX-LABEL: test_vcmpoeqps_v8i1_v64i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1865:
+; NoVLX-NEXT: .Lcfi1585:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1866:
+; NoVLX-NEXT: .Lcfi1586:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1867:
+; NoVLX-NEXT: .Lcfi1587:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; NoVLX-NEXT: vmovaps (%rdi), %ymm1
; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $15, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %ecx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -53488,55 +52368,55 @@ define zeroext i64 @test_vcmpoeqps_v8i1_v64i1_mask_mem_b(<4 x i64> %__a, float*
; NoVLX-LABEL: test_vcmpoeqps_v8i1_v64i1_mask_mem_b:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1868:
+; NoVLX-NEXT: .Lcfi1588:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1869:
+; NoVLX-NEXT: .Lcfi1589:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1870:
+; NoVLX-NEXT: .Lcfi1590:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; NoVLX-NEXT: vbroadcastss (%rdi), %ymm1
; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $15, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %ecx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -53571,12 +52451,12 @@ define zeroext i64 @test_masked_vcmpoeqps_v8i1_v64i1_mask(i8 zeroext %__u, <4 x
; NoVLX-LABEL: test_masked_vcmpoeqps_v8i1_v64i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1871:
+; NoVLX-NEXT: .Lcfi1591:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1872:
+; NoVLX-NEXT: .Lcfi1592:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1873:
+; NoVLX-NEXT: .Lcfi1593:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
@@ -53585,43 +52465,43 @@ define zeroext i64 @test_masked_vcmpoeqps_v8i1_v64i1_mask(i8 zeroext %__u, <4 x
; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: kandw %k1, %k0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $15, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %ecx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -53656,12 +52536,12 @@ define zeroext i64 @test_masked_vcmpoeqps_v8i1_v64i1_mask_mem(i8 zeroext %__u, <
; NoVLX-LABEL: test_masked_vcmpoeqps_v8i1_v64i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1874:
+; NoVLX-NEXT: .Lcfi1594:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1875:
+; NoVLX-NEXT: .Lcfi1595:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1876:
+; NoVLX-NEXT: .Lcfi1596:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
@@ -53670,43 +52550,43 @@ define zeroext i64 @test_masked_vcmpoeqps_v8i1_v64i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: kandw %k1, %k0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $15, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %ecx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -53742,12 +52622,12 @@ define zeroext i64 @test_masked_vcmpoeqps_v8i1_v64i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-LABEL: test_masked_vcmpoeqps_v8i1_v64i1_mask_mem_b:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1877:
+; NoVLX-NEXT: .Lcfi1597:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1878:
+; NoVLX-NEXT: .Lcfi1598:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1879:
+; NoVLX-NEXT: .Lcfi1599:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
@@ -53756,43 +52636,43 @@ define zeroext i64 @test_masked_vcmpoeqps_v8i1_v64i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: kandw %k1, %k0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $15, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %ecx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -53830,93 +52710,78 @@ define zeroext i32 @test_vcmpoeqps_v16i1_v32i1_mask(<8 x i64> %__a, <8 x i64> %_
; NoVLX-LABEL: test_vcmpoeqps_v16i1_v32i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1880:
+; NoVLX-NEXT: .Lcfi1600:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1881:
+; NoVLX-NEXT: .Lcfi1601:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1882:
+; NoVLX-NEXT: .Lcfi1602:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
-; NoVLX-NEXT: pushq %r15
-; NoVLX-NEXT: pushq %r14
-; NoVLX-NEXT: pushq %r13
-; NoVLX-NEXT: pushq %r12
-; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
-; NoVLX-NEXT: .Lcfi1883:
-; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi1884:
-; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi1885:
-; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi1886:
-; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi1887:
-; NoVLX-NEXT: .cfi_offset %r15, -24
; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0
; NoVLX-NEXT: kxorw %k0, %k0, %k1
; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $15, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $13, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: vmovd %ecx, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
@@ -53925,12 +52790,7 @@ define zeroext i32 @test_vcmpoeqps_v16i1_v32i1_mask(<8 x i64> %__a, <8 x i64> %_
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
-; NoVLX-NEXT: leaq -40(%rbp), %rsp
-; NoVLX-NEXT: popq %rbx
-; NoVLX-NEXT: popq %r12
-; NoVLX-NEXT: popq %r13
-; NoVLX-NEXT: popq %r14
-; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
; NoVLX-NEXT: retq
entry:
@@ -53953,93 +52813,78 @@ define zeroext i32 @test_vcmpoeqps_v16i1_v32i1_mask_mem(<8 x i64> %__a, <8 x i64
; NoVLX-LABEL: test_vcmpoeqps_v16i1_v32i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1888:
+; NoVLX-NEXT: .Lcfi1603:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1889:
+; NoVLX-NEXT: .Lcfi1604:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1890:
+; NoVLX-NEXT: .Lcfi1605:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
-; NoVLX-NEXT: pushq %r15
-; NoVLX-NEXT: pushq %r14
-; NoVLX-NEXT: pushq %r13
-; NoVLX-NEXT: pushq %r12
-; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
-; NoVLX-NEXT: .Lcfi1891:
-; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi1892:
-; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi1893:
-; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi1894:
-; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi1895:
-; NoVLX-NEXT: .cfi_offset %r15, -24
; NoVLX-NEXT: vcmpeqps (%rdi), %zmm0, %k0
; NoVLX-NEXT: kxorw %k0, %k0, %k1
; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $15, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $13, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: vmovd %ecx, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
@@ -54048,12 +52893,7 @@ define zeroext i32 @test_vcmpoeqps_v16i1_v32i1_mask_mem(<8 x i64> %__a, <8 x i64
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
-; NoVLX-NEXT: leaq -40(%rbp), %rsp
-; NoVLX-NEXT: popq %rbx
-; NoVLX-NEXT: popq %r12
-; NoVLX-NEXT: popq %r13
-; NoVLX-NEXT: popq %r14
-; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
; NoVLX-NEXT: retq
entry:
@@ -54077,93 +52917,78 @@ define zeroext i32 @test_vcmpoeqps_v16i1_v32i1_mask_mem_b(<8 x i64> %__a, float*
; NoVLX-LABEL: test_vcmpoeqps_v16i1_v32i1_mask_mem_b:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1896:
+; NoVLX-NEXT: .Lcfi1606:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1897:
+; NoVLX-NEXT: .Lcfi1607:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1898:
+; NoVLX-NEXT: .Lcfi1608:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
-; NoVLX-NEXT: pushq %r15
-; NoVLX-NEXT: pushq %r14
-; NoVLX-NEXT: pushq %r13
-; NoVLX-NEXT: pushq %r12
-; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
-; NoVLX-NEXT: .Lcfi1899:
-; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi1900:
-; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi1901:
-; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi1902:
-; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi1903:
-; NoVLX-NEXT: .cfi_offset %r15, -24
; NoVLX-NEXT: vcmpeqps (%rdi){1to16}, %zmm0, %k0
; NoVLX-NEXT: kxorw %k0, %k0, %k1
; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $15, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $13, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: vmovd %ecx, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
@@ -54172,12 +52997,7 @@ define zeroext i32 @test_vcmpoeqps_v16i1_v32i1_mask_mem_b(<8 x i64> %__a, float*
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
-; NoVLX-NEXT: leaq -40(%rbp), %rsp
-; NoVLX-NEXT: popq %rbx
-; NoVLX-NEXT: popq %r12
-; NoVLX-NEXT: popq %r13
-; NoVLX-NEXT: popq %r14
-; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
; NoVLX-NEXT: retq
entry:
@@ -54203,94 +53023,79 @@ define zeroext i32 @test_masked_vcmpoeqps_v16i1_v32i1_mask(i16 zeroext %__u, <8
; NoVLX-LABEL: test_masked_vcmpoeqps_v16i1_v32i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1904:
+; NoVLX-NEXT: .Lcfi1609:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1905:
+; NoVLX-NEXT: .Lcfi1610:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1906:
+; NoVLX-NEXT: .Lcfi1611:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
-; NoVLX-NEXT: pushq %r15
-; NoVLX-NEXT: pushq %r14
-; NoVLX-NEXT: pushq %r13
-; NoVLX-NEXT: pushq %r12
-; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
-; NoVLX-NEXT: .Lcfi1907:
-; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi1908:
-; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi1909:
-; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi1910:
-; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi1911:
-; NoVLX-NEXT: .cfi_offset %r15, -24
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0 {%k1}
; NoVLX-NEXT: kxorw %k0, %k0, %k1
; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $15, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $13, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: vmovd %ecx, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
@@ -54299,12 +53104,7 @@ define zeroext i32 @test_masked_vcmpoeqps_v16i1_v32i1_mask(i16 zeroext %__u, <8
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
-; NoVLX-NEXT: leaq -40(%rbp), %rsp
-; NoVLX-NEXT: popq %rbx
-; NoVLX-NEXT: popq %r12
-; NoVLX-NEXT: popq %r13
-; NoVLX-NEXT: popq %r14
-; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
; NoVLX-NEXT: retq
entry:
@@ -54330,94 +53130,79 @@ define zeroext i32 @test_masked_vcmpoeqps_v16i1_v32i1_mask_mem(i16 zeroext %__u,
; NoVLX-LABEL: test_masked_vcmpoeqps_v16i1_v32i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1912:
+; NoVLX-NEXT: .Lcfi1612:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1913:
+; NoVLX-NEXT: .Lcfi1613:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1914:
+; NoVLX-NEXT: .Lcfi1614:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
-; NoVLX-NEXT: pushq %r15
-; NoVLX-NEXT: pushq %r14
-; NoVLX-NEXT: pushq %r13
-; NoVLX-NEXT: pushq %r12
-; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
-; NoVLX-NEXT: .Lcfi1915:
-; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi1916:
-; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi1917:
-; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi1918:
-; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi1919:
-; NoVLX-NEXT: .cfi_offset %r15, -24
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vcmpeqps (%rsi), %zmm0, %k0 {%k1}
; NoVLX-NEXT: kxorw %k0, %k0, %k1
; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $15, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $13, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: vmovd %ecx, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
@@ -54426,12 +53211,7 @@ define zeroext i32 @test_masked_vcmpoeqps_v16i1_v32i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
-; NoVLX-NEXT: leaq -40(%rbp), %rsp
-; NoVLX-NEXT: popq %rbx
-; NoVLX-NEXT: popq %r12
-; NoVLX-NEXT: popq %r13
-; NoVLX-NEXT: popq %r14
-; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
; NoVLX-NEXT: retq
entry:
@@ -54458,94 +53238,79 @@ define zeroext i32 @test_masked_vcmpoeqps_v16i1_v32i1_mask_mem_b(i16 zeroext %__
; NoVLX-LABEL: test_masked_vcmpoeqps_v16i1_v32i1_mask_mem_b:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1920:
+; NoVLX-NEXT: .Lcfi1615:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1921:
+; NoVLX-NEXT: .Lcfi1616:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1922:
+; NoVLX-NEXT: .Lcfi1617:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
-; NoVLX-NEXT: pushq %r15
-; NoVLX-NEXT: pushq %r14
-; NoVLX-NEXT: pushq %r13
-; NoVLX-NEXT: pushq %r12
-; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
-; NoVLX-NEXT: .Lcfi1923:
-; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi1924:
-; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi1925:
-; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi1926:
-; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi1927:
-; NoVLX-NEXT: .cfi_offset %r15, -24
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vcmpeqps (%rsi){1to16}, %zmm0, %k0 {%k1}
; NoVLX-NEXT: kxorw %k0, %k0, %k1
; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $15, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $13, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: vmovd %ecx, %xmm0
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r11d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r14d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r15d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r12d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $8, %k0, %k1
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r13d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $7, %k0, %k1
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $6, %k0, %k1
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ebx
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $5, %k0, %k1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $4, %k0, %k1
+; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $3, %k0, %k1
+; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $2, %k0, %k1
+; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $1, %k0, %k1
+; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
@@ -54554,12 +53319,7 @@ define zeroext i32 @test_masked_vcmpoeqps_v16i1_v32i1_mask_mem_b(i16 zeroext %__
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
-; NoVLX-NEXT: leaq -40(%rbp), %rsp
-; NoVLX-NEXT: popq %rbx
-; NoVLX-NEXT: popq %r12
-; NoVLX-NEXT: popq %r13
-; NoVLX-NEXT: popq %r14
-; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
; NoVLX-NEXT: retq
entry:
@@ -54634,12 +53394,12 @@ define zeroext i64 @test_vcmpoeqps_v16i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %_
; NoVLX-LABEL: test_vcmpoeqps_v16i1_v64i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1928:
+; NoVLX-NEXT: .Lcfi1618:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1929:
+; NoVLX-NEXT: .Lcfi1619:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1930:
+; NoVLX-NEXT: .Lcfi1620:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: pushq %r15
; NoVLX-NEXT: pushq %r14
@@ -54648,21 +53408,17 @@ define zeroext i64 @test_vcmpoeqps_v16i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %_
; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: .Lcfi1931:
+; NoVLX-NEXT: .Lcfi1621:
; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi1932:
+; NoVLX-NEXT: .Lcfi1622:
; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi1933:
+; NoVLX-NEXT: .Lcfi1623:
; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi1934:
+; NoVLX-NEXT: .Lcfi1624:
; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi1935:
+; NoVLX-NEXT: .Lcfi1625:
; NoVLX-NEXT: .cfi_offset %r15, -24
; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
@@ -54705,11 +53461,11 @@ define zeroext i64 @test_vcmpoeqps_v16i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %_
; NoVLX-NEXT: kshiftlw $2, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $1, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k1, %r8d
; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
@@ -54721,11 +53477,15 @@ define zeroext i64 @test_vcmpoeqps_v16i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %_
; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpinsrb $14, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -54762,12 +53522,12 @@ define zeroext i64 @test_vcmpoeqps_v16i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64
; NoVLX-LABEL: test_vcmpoeqps_v16i1_v64i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1936:
+; NoVLX-NEXT: .Lcfi1626:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1937:
+; NoVLX-NEXT: .Lcfi1627:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1938:
+; NoVLX-NEXT: .Lcfi1628:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: pushq %r15
; NoVLX-NEXT: pushq %r14
@@ -54776,21 +53536,17 @@ define zeroext i64 @test_vcmpoeqps_v16i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64
; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: .Lcfi1939:
+; NoVLX-NEXT: .Lcfi1629:
; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi1940:
+; NoVLX-NEXT: .Lcfi1630:
; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi1941:
+; NoVLX-NEXT: .Lcfi1631:
; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi1942:
+; NoVLX-NEXT: .Lcfi1632:
; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi1943:
+; NoVLX-NEXT: .Lcfi1633:
; NoVLX-NEXT: .cfi_offset %r15, -24
; NoVLX-NEXT: vcmpeqps (%rdi), %zmm0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
@@ -54833,11 +53589,11 @@ define zeroext i64 @test_vcmpoeqps_v16i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64
; NoVLX-NEXT: kshiftlw $2, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $1, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k1, %r8d
; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
@@ -54849,11 +53605,15 @@ define zeroext i64 @test_vcmpoeqps_v16i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64
; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpinsrb $14, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -54891,12 +53651,12 @@ define zeroext i64 @test_vcmpoeqps_v16i1_v64i1_mask_mem_b(<8 x i64> %__a, float*
; NoVLX-LABEL: test_vcmpoeqps_v16i1_v64i1_mask_mem_b:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1944:
+; NoVLX-NEXT: .Lcfi1634:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1945:
+; NoVLX-NEXT: .Lcfi1635:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1946:
+; NoVLX-NEXT: .Lcfi1636:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: pushq %r15
; NoVLX-NEXT: pushq %r14
@@ -54905,21 +53665,17 @@ define zeroext i64 @test_vcmpoeqps_v16i1_v64i1_mask_mem_b(<8 x i64> %__a, float*
; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: .Lcfi1947:
+; NoVLX-NEXT: .Lcfi1637:
; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi1948:
+; NoVLX-NEXT: .Lcfi1638:
; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi1949:
+; NoVLX-NEXT: .Lcfi1639:
; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi1950:
+; NoVLX-NEXT: .Lcfi1640:
; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi1951:
+; NoVLX-NEXT: .Lcfi1641:
; NoVLX-NEXT: .cfi_offset %r15, -24
; NoVLX-NEXT: vcmpeqps (%rdi){1to16}, %zmm0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
@@ -54962,11 +53718,11 @@ define zeroext i64 @test_vcmpoeqps_v16i1_v64i1_mask_mem_b(<8 x i64> %__a, float*
; NoVLX-NEXT: kshiftlw $2, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $1, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k1, %r8d
; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
@@ -54978,11 +53734,15 @@ define zeroext i64 @test_vcmpoeqps_v16i1_v64i1_mask_mem_b(<8 x i64> %__a, float*
; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpinsrb $14, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -55022,12 +53782,12 @@ define zeroext i64 @test_masked_vcmpoeqps_v16i1_v64i1_mask(i16 zeroext %__u, <8
; NoVLX-LABEL: test_masked_vcmpoeqps_v16i1_v64i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1952:
+; NoVLX-NEXT: .Lcfi1642:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1953:
+; NoVLX-NEXT: .Lcfi1643:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1954:
+; NoVLX-NEXT: .Lcfi1644:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: pushq %r15
; NoVLX-NEXT: pushq %r14
@@ -55036,22 +53796,18 @@ define zeroext i64 @test_masked_vcmpoeqps_v16i1_v64i1_mask(i16 zeroext %__u, <8
; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: .Lcfi1955:
+; NoVLX-NEXT: .Lcfi1645:
; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi1956:
+; NoVLX-NEXT: .Lcfi1646:
; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi1957:
+; NoVLX-NEXT: .Lcfi1647:
; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi1958:
+; NoVLX-NEXT: .Lcfi1648:
; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi1959:
+; NoVLX-NEXT: .Lcfi1649:
; NoVLX-NEXT: .cfi_offset %r15, -24
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
@@ -55094,11 +53850,11 @@ define zeroext i64 @test_masked_vcmpoeqps_v16i1_v64i1_mask(i16 zeroext %__u, <8
; NoVLX-NEXT: kshiftlw $2, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $1, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k1, %r8d
; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
@@ -55110,11 +53866,15 @@ define zeroext i64 @test_masked_vcmpoeqps_v16i1_v64i1_mask(i16 zeroext %__u, <8
; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpinsrb $14, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -55154,12 +53914,12 @@ define zeroext i64 @test_masked_vcmpoeqps_v16i1_v64i1_mask_mem(i16 zeroext %__u,
; NoVLX-LABEL: test_masked_vcmpoeqps_v16i1_v64i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1960:
+; NoVLX-NEXT: .Lcfi1650:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1961:
+; NoVLX-NEXT: .Lcfi1651:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1962:
+; NoVLX-NEXT: .Lcfi1652:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: pushq %r15
; NoVLX-NEXT: pushq %r14
@@ -55168,22 +53928,18 @@ define zeroext i64 @test_masked_vcmpoeqps_v16i1_v64i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: .Lcfi1963:
+; NoVLX-NEXT: .Lcfi1653:
; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi1964:
+; NoVLX-NEXT: .Lcfi1654:
; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi1965:
+; NoVLX-NEXT: .Lcfi1655:
; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi1966:
+; NoVLX-NEXT: .Lcfi1656:
; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi1967:
+; NoVLX-NEXT: .Lcfi1657:
; NoVLX-NEXT: .cfi_offset %r15, -24
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vcmpeqps (%rsi), %zmm0, %k0 {%k1}
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
@@ -55226,11 +53982,11 @@ define zeroext i64 @test_masked_vcmpoeqps_v16i1_v64i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: kshiftlw $2, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $1, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k1, %r8d
; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
@@ -55242,11 +53998,15 @@ define zeroext i64 @test_masked_vcmpoeqps_v16i1_v64i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpinsrb $14, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -55287,12 +54047,12 @@ define zeroext i64 @test_masked_vcmpoeqps_v16i1_v64i1_mask_mem_b(i16 zeroext %__
; NoVLX-LABEL: test_masked_vcmpoeqps_v16i1_v64i1_mask_mem_b:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1968:
+; NoVLX-NEXT: .Lcfi1658:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1969:
+; NoVLX-NEXT: .Lcfi1659:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1970:
+; NoVLX-NEXT: .Lcfi1660:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: pushq %r15
; NoVLX-NEXT: pushq %r14
@@ -55301,22 +54061,18 @@ define zeroext i64 @test_masked_vcmpoeqps_v16i1_v64i1_mask_mem_b(i16 zeroext %__
; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: .Lcfi1971:
+; NoVLX-NEXT: .Lcfi1661:
; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi1972:
+; NoVLX-NEXT: .Lcfi1662:
; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi1973:
+; NoVLX-NEXT: .Lcfi1663:
; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi1974:
+; NoVLX-NEXT: .Lcfi1664:
; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi1975:
+; NoVLX-NEXT: .Lcfi1665:
; NoVLX-NEXT: .cfi_offset %r15, -24
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vcmpeqps (%rsi){1to16}, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
@@ -55359,11 +54115,11 @@ define zeroext i64 @test_masked_vcmpoeqps_v16i1_v64i1_mask_mem_b(i16 zeroext %__
; NoVLX-NEXT: kshiftlw $2, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $1, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k1, %r8d
; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
@@ -55375,11 +54131,15 @@ define zeroext i64 @test_masked_vcmpoeqps_v16i1_v64i1_mask_mem_b(i16 zeroext %__
; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpinsrb $14, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -55598,8 +54358,8 @@ define zeroext i4 @test_masked_vcmpoeqpd_v2i1_v4i1_mask_mem(i2 zeroext %__u, <2
; NoVLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: vcmpeqpd (%rsi), %xmm0, %xmm0
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: vandpd %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
; NoVLX-NEXT: vpslld $31, %ymm0, %ymm0
@@ -56207,12 +54967,12 @@ define zeroext i32 @test_vcmpoeqpd_v2i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__
; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v32i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1976:
+; NoVLX-NEXT: .Lcfi1666:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1977:
+; NoVLX-NEXT: .Lcfi1667:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1978:
+; NoVLX-NEXT: .Lcfi1668:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -56250,12 +55010,12 @@ define zeroext i32 @test_vcmpoeqpd_v2i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>
; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v32i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1979:
+; NoVLX-NEXT: .Lcfi1669:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1980:
+; NoVLX-NEXT: .Lcfi1670:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1981:
+; NoVLX-NEXT: .Lcfi1671:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -56294,12 +55054,12 @@ define zeroext i32 @test_vcmpoeqpd_v2i1_v32i1_mask_mem_b(<2 x i64> %__a, double*
; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v32i1_mask_mem_b:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1982:
+; NoVLX-NEXT: .Lcfi1672:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1983:
+; NoVLX-NEXT: .Lcfi1673:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1984:
+; NoVLX-NEXT: .Lcfi1674:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -56342,12 +55102,12 @@ define zeroext i32 @test_masked_vcmpoeqpd_v2i1_v32i1_mask(i2 zeroext %__u, <2 x
; NoVLX-LABEL: test_masked_vcmpoeqpd_v2i1_v32i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1985:
+; NoVLX-NEXT: .Lcfi1675:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1986:
+; NoVLX-NEXT: .Lcfi1676:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1987:
+; NoVLX-NEXT: .Lcfi1677:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
@@ -56394,20 +55154,20 @@ define zeroext i32 @test_masked_vcmpoeqpd_v2i1_v32i1_mask_mem(i2 zeroext %__u, <
; NoVLX-LABEL: test_masked_vcmpoeqpd_v2i1_v32i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1988:
+; NoVLX-NEXT: .Lcfi1678:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1989:
+; NoVLX-NEXT: .Lcfi1679:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1990:
+; NoVLX-NEXT: .Lcfi1680:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: movb %dil, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: vcmpeqpd (%rsi), %xmm0, %xmm0
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: vandpd %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
@@ -56447,12 +55207,12 @@ define zeroext i32 @test_masked_vcmpoeqpd_v2i1_v32i1_mask_mem_b(i2 zeroext %__u,
; NoVLX-LABEL: test_masked_vcmpoeqpd_v2i1_v32i1_mask_mem_b:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1991:
+; NoVLX-NEXT: .Lcfi1681:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1992:
+; NoVLX-NEXT: .Lcfi1682:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1993:
+; NoVLX-NEXT: .Lcfi1683:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
@@ -56502,20 +55262,20 @@ define zeroext i64 @test_vcmpoeqpd_v2i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__
; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v64i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1994:
+; NoVLX-NEXT: .Lcfi1684:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1995:
+; NoVLX-NEXT: .Lcfi1685:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1996:
+; NoVLX-NEXT: .Lcfi1686:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: vcmpeqpd %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
@@ -56551,20 +55311,20 @@ define zeroext i64 @test_vcmpoeqpd_v2i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>
; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v64i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1997:
+; NoVLX-NEXT: .Lcfi1687:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1998:
+; NoVLX-NEXT: .Lcfi1688:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1999:
+; NoVLX-NEXT: .Lcfi1689:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: vcmpeqpd (%rdi), %xmm0, %xmm0
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
@@ -56601,12 +55361,12 @@ define zeroext i64 @test_vcmpoeqpd_v2i1_v64i1_mask_mem_b(<2 x i64> %__a, double*
; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v64i1_mask_mem_b:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi2000:
+; NoVLX-NEXT: .Lcfi1690:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi2001:
+; NoVLX-NEXT: .Lcfi1691:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi2002:
+; NoVLX-NEXT: .Lcfi1692:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
@@ -56614,8 +55374,8 @@ define zeroext i64 @test_vcmpoeqpd_v2i1_v64i1_mask_mem_b(<2 x i64> %__a, double*
; NoVLX-NEXT: vcmpeqpd %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
@@ -56655,12 +55415,12 @@ define zeroext i64 @test_masked_vcmpoeqpd_v2i1_v64i1_mask(i2 zeroext %__u, <2 x
; NoVLX-LABEL: test_masked_vcmpoeqpd_v2i1_v64i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi2003:
+; NoVLX-NEXT: .Lcfi1693:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi2004:
+; NoVLX-NEXT: .Lcfi1694:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi2005:
+; NoVLX-NEXT: .Lcfi1695:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $96, %rsp
@@ -56672,8 +55432,8 @@ define zeroext i64 @test_masked_vcmpoeqpd_v2i1_v64i1_mask(i2 zeroext %__u, <2 x
; NoVLX-NEXT: vandpd %xmm2, %xmm0, %xmm0
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
@@ -56713,12 +55473,12 @@ define zeroext i64 @test_masked_vcmpoeqpd_v2i1_v64i1_mask_mem(i2 zeroext %__u, <
; NoVLX-LABEL: test_masked_vcmpoeqpd_v2i1_v64i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi2006:
+; NoVLX-NEXT: .Lcfi1696:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi2007:
+; NoVLX-NEXT: .Lcfi1697:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi2008:
+; NoVLX-NEXT: .Lcfi1698:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $96, %rsp
@@ -56730,8 +55490,8 @@ define zeroext i64 @test_masked_vcmpoeqpd_v2i1_v64i1_mask_mem(i2 zeroext %__u, <
; NoVLX-NEXT: vandpd %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
@@ -56772,12 +55532,12 @@ define zeroext i64 @test_masked_vcmpoeqpd_v2i1_v64i1_mask_mem_b(i2 zeroext %__u,
; NoVLX-LABEL: test_masked_vcmpoeqpd_v2i1_v64i1_mask_mem_b:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi2009:
+; NoVLX-NEXT: .Lcfi1699:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi2010:
+; NoVLX-NEXT: .Lcfi1700:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi2011:
+; NoVLX-NEXT: .Lcfi1701:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $96, %rsp
@@ -56790,8 +55550,8 @@ define zeroext i64 @test_masked_vcmpoeqpd_v2i1_v64i1_mask_mem_b(i2 zeroext %__u,
; NoVLX-NEXT: vandpd %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
@@ -56852,8 +55612,8 @@ define zeroext i8 @test_vcmpoeqpd_v4i1_v8i1_mask(<4 x i64> %__a, <4 x i64> %__b)
; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: kmovw %eax, %k1
; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
@@ -56909,8 +55669,8 @@ define zeroext i8 @test_vcmpoeqpd_v4i1_v8i1_mask_mem(<4 x i64> %__a, <4 x i64>*
; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: kmovw %eax, %k1
; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
@@ -56968,8 +55728,8 @@ define zeroext i8 @test_vcmpoeqpd_v4i1_v8i1_mask_mem_b(<4 x i64> %__a, double* %
; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: kmovw %eax, %k1
; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
@@ -57035,8 +55795,8 @@ define zeroext i8 @test_masked_vcmpoeqpd_v4i1_v8i1_mask(i4 zeroext %__u, <4 x i6
; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: kmovw %eax, %k1
; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
@@ -57102,8 +55862,8 @@ define zeroext i8 @test_masked_vcmpoeqpd_v4i1_v8i1_mask_mem(i4 zeroext %__u, <4
; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: kmovw %eax, %k1
; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
@@ -57171,8 +55931,8 @@ define zeroext i8 @test_masked_vcmpoeqpd_v4i1_v8i1_mask_mem_b(i4 zeroext %__u, <
; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: kmovw %eax, %k1
; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
@@ -57233,8 +55993,8 @@ define zeroext i16 @test_vcmpoeqpd_v4i1_v16i1_mask(<4 x i64> %__a, <4 x i64> %__
; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: kmovw %eax, %k1
; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
@@ -57289,8 +56049,8 @@ define zeroext i16 @test_vcmpoeqpd_v4i1_v16i1_mask_mem(<4 x i64> %__a, <4 x i64>
; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: kmovw %eax, %k1
; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
@@ -57347,8 +56107,8 @@ define zeroext i16 @test_vcmpoeqpd_v4i1_v16i1_mask_mem_b(<4 x i64> %__a, double*
; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: kmovw %eax, %k1
; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
@@ -57413,8 +56173,8 @@ define zeroext i16 @test_masked_vcmpoeqpd_v4i1_v16i1_mask(i4 zeroext %__u, <4 x
; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: kmovw %eax, %k1
; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
@@ -57479,8 +56239,8 @@ define zeroext i16 @test_masked_vcmpoeqpd_v4i1_v16i1_mask_mem(i4 zeroext %__u, <
; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: kmovw %eax, %k1
; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
@@ -57547,8 +56307,8 @@ define zeroext i16 @test_masked_vcmpoeqpd_v4i1_v16i1_mask_mem_b(i4 zeroext %__u,
; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NoVLX-NEXT: kmovw %eax, %k1
; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
@@ -57592,12 +56352,12 @@ define zeroext i32 @test_vcmpoeqpd_v4i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__
; NoVLX-LABEL: test_vcmpoeqpd_v4i1_v32i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi2012:
+; NoVLX-NEXT: .Lcfi1702:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi2013:
+; NoVLX-NEXT: .Lcfi1703:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi2014:
+; NoVLX-NEXT: .Lcfi1704:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -57637,12 +56397,12 @@ define zeroext i32 @test_vcmpoeqpd_v4i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64>
; NoVLX-LABEL: test_vcmpoeqpd_v4i1_v32i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi2015:
+; NoVLX-NEXT: .Lcfi1705:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi2016:
+; NoVLX-NEXT: .Lcfi1706:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi2017:
+; NoVLX-NEXT: .Lcfi1707:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -57683,12 +56443,12 @@ define zeroext i32 @test_vcmpoeqpd_v4i1_v32i1_mask_mem_b(<4 x i64> %__a, double*
; NoVLX-LABEL: test_vcmpoeqpd_v4i1_v32i1_mask_mem_b:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi2018:
+; NoVLX-NEXT: .Lcfi1708:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi2019:
+; NoVLX-NEXT: .Lcfi1709:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi2020:
+; NoVLX-NEXT: .Lcfi1710:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -57733,12 +56493,12 @@ define zeroext i32 @test_masked_vcmpoeqpd_v4i1_v32i1_mask(i4 zeroext %__u, <4 x
; NoVLX-LABEL: test_masked_vcmpoeqpd_v4i1_v32i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi2021:
+; NoVLX-NEXT: .Lcfi1711:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi2022:
+; NoVLX-NEXT: .Lcfi1712:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi2023:
+; NoVLX-NEXT: .Lcfi1713:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
@@ -57788,12 +56548,12 @@ define zeroext i32 @test_masked_vcmpoeqpd_v4i1_v32i1_mask_mem(i4 zeroext %__u, <
; NoVLX-LABEL: test_masked_vcmpoeqpd_v4i1_v32i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi2024:
+; NoVLX-NEXT: .Lcfi1714:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi2025:
+; NoVLX-NEXT: .Lcfi1715:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi2026:
+; NoVLX-NEXT: .Lcfi1716:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
@@ -57844,12 +56604,12 @@ define zeroext i32 @test_masked_vcmpoeqpd_v4i1_v32i1_mask_mem_b(i4 zeroext %__u,
; NoVLX-LABEL: test_masked_vcmpoeqpd_v4i1_v32i1_mask_mem_b:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi2027:
+; NoVLX-NEXT: .Lcfi1717:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi2028:
+; NoVLX-NEXT: .Lcfi1718:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi2029:
+; NoVLX-NEXT: .Lcfi1719:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
@@ -57902,12 +56662,12 @@ define zeroext i64 @test_vcmpoeqpd_v4i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__
; NoVLX-LABEL: test_vcmpoeqpd_v4i1_v64i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi2030:
+; NoVLX-NEXT: .Lcfi1720:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi2031:
+; NoVLX-NEXT: .Lcfi1721:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi2032:
+; NoVLX-NEXT: .Lcfi1722:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
@@ -57915,8 +56675,8 @@ define zeroext i64 @test_vcmpoeqpd_v4i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__
; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
@@ -57953,12 +56713,12 @@ define zeroext i64 @test_vcmpoeqpd_v4i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>
; NoVLX-LABEL: test_vcmpoeqpd_v4i1_v64i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi2033:
+; NoVLX-NEXT: .Lcfi1723:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi2034:
+; NoVLX-NEXT: .Lcfi1724:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi2035:
+; NoVLX-NEXT: .Lcfi1725:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
@@ -57966,8 +56726,8 @@ define zeroext i64 @test_vcmpoeqpd_v4i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>
; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
@@ -58005,12 +56765,12 @@ define zeroext i64 @test_vcmpoeqpd_v4i1_v64i1_mask_mem_b(<4 x i64> %__a, double*
; NoVLX-LABEL: test_vcmpoeqpd_v4i1_v64i1_mask_mem_b:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi2036:
+; NoVLX-NEXT: .Lcfi1726:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi2037:
+; NoVLX-NEXT: .Lcfi1727:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi2038:
+; NoVLX-NEXT: .Lcfi1728:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
@@ -58019,8 +56779,8 @@ define zeroext i64 @test_vcmpoeqpd_v4i1_v64i1_mask_mem_b(<4 x i64> %__a, double*
; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
@@ -58061,12 +56821,12 @@ define zeroext i64 @test_masked_vcmpoeqpd_v4i1_v64i1_mask(i4 zeroext %__u, <4 x
; NoVLX-LABEL: test_masked_vcmpoeqpd_v4i1_v64i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi2039:
+; NoVLX-NEXT: .Lcfi1729:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi2040:
+; NoVLX-NEXT: .Lcfi1730:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi2041:
+; NoVLX-NEXT: .Lcfi1731:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $96, %rsp
@@ -58080,8 +56840,8 @@ define zeroext i64 @test_masked_vcmpoeqpd_v4i1_v64i1_mask(i4 zeroext %__u, <4 x
; NoVLX-NEXT: vpand %xmm2, %xmm0, %xmm0
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
@@ -58122,12 +56882,12 @@ define zeroext i64 @test_masked_vcmpoeqpd_v4i1_v64i1_mask_mem(i4 zeroext %__u, <
; NoVLX-LABEL: test_masked_vcmpoeqpd_v4i1_v64i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi2042:
+; NoVLX-NEXT: .Lcfi1732:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi2043:
+; NoVLX-NEXT: .Lcfi1733:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi2044:
+; NoVLX-NEXT: .Lcfi1734:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $96, %rsp
@@ -58141,8 +56901,8 @@ define zeroext i64 @test_masked_vcmpoeqpd_v4i1_v64i1_mask_mem(i4 zeroext %__u, <
; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
@@ -58184,12 +56944,12 @@ define zeroext i64 @test_masked_vcmpoeqpd_v4i1_v64i1_mask_mem_b(i4 zeroext %__u,
; NoVLX-LABEL: test_masked_vcmpoeqpd_v4i1_v64i1_mask_mem_b:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi2045:
+; NoVLX-NEXT: .Lcfi1735:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi2046:
+; NoVLX-NEXT: .Lcfi1736:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi2047:
+; NoVLX-NEXT: .Lcfi1737:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $96, %rsp
@@ -58204,8 +56964,8 @@ define zeroext i64 @test_masked_vcmpoeqpd_v4i1_v64i1_mask_mem_b(i4 zeroext %__u,
; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
@@ -58466,12 +57226,12 @@ define zeroext i32 @test_vcmpoeqpd_v8i1_v32i1_mask(<8 x i64> %__a, <8 x i64> %__
; NoVLX-LABEL: test_vcmpoeqpd_v8i1_v32i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi2048:
+; NoVLX-NEXT: .Lcfi1738:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi2049:
+; NoVLX-NEXT: .Lcfi1739:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi2050:
+; NoVLX-NEXT: .Lcfi1740:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -58539,12 +57299,12 @@ define zeroext i32 @test_vcmpoeqpd_v8i1_v32i1_mask_mem(<8 x i64> %__a, <8 x i64>
; NoVLX-LABEL: test_vcmpoeqpd_v8i1_v32i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi2051:
+; NoVLX-NEXT: .Lcfi1741:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi2052:
+; NoVLX-NEXT: .Lcfi1742:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi2053:
+; NoVLX-NEXT: .Lcfi1743:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -58613,12 +57373,12 @@ define zeroext i32 @test_vcmpoeqpd_v8i1_v32i1_mask_mem_b(<8 x i64> %__a, double*
; NoVLX-LABEL: test_vcmpoeqpd_v8i1_v32i1_mask_mem_b:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi2054:
+; NoVLX-NEXT: .Lcfi1744:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi2055:
+; NoVLX-NEXT: .Lcfi1745:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi2056:
+; NoVLX-NEXT: .Lcfi1746:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -58689,12 +57449,12 @@ define zeroext i32 @test_masked_vcmpoeqpd_v8i1_v32i1_mask(i8 zeroext %__u, <8 x
; NoVLX-LABEL: test_masked_vcmpoeqpd_v8i1_v32i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi2057:
+; NoVLX-NEXT: .Lcfi1747:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi2058:
+; NoVLX-NEXT: .Lcfi1748:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi2059:
+; NoVLX-NEXT: .Lcfi1749:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -58766,12 +57526,12 @@ define zeroext i32 @test_masked_vcmpoeqpd_v8i1_v32i1_mask_mem(i8 zeroext %__u, <
; NoVLX-LABEL: test_masked_vcmpoeqpd_v8i1_v32i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi2060:
+; NoVLX-NEXT: .Lcfi1750:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi2061:
+; NoVLX-NEXT: .Lcfi1751:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi2062:
+; NoVLX-NEXT: .Lcfi1752:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -58844,12 +57604,12 @@ define zeroext i32 @test_masked_vcmpoeqpd_v8i1_v32i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-LABEL: test_masked_vcmpoeqpd_v8i1_v32i1_mask_mem_b:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi2063:
+; NoVLX-NEXT: .Lcfi1753:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi2064:
+; NoVLX-NEXT: .Lcfi1754:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi2065:
+; NoVLX-NEXT: .Lcfi1755:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -58972,53 +57732,53 @@ define zeroext i64 @test_vcmpoeqpd_v8i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %__
; NoVLX-LABEL: test_vcmpoeqpd_v8i1_v64i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi2066:
+; NoVLX-NEXT: .Lcfi1756:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi2067:
+; NoVLX-NEXT: .Lcfi1757:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi2068:
+; NoVLX-NEXT: .Lcfi1758:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: vcmpeqpd %zmm1, %zmm0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $15, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: vxorpd %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vxorpd %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %ecx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -59050,53 +57810,53 @@ define zeroext i64 @test_vcmpoeqpd_v8i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64>
; NoVLX-LABEL: test_vcmpoeqpd_v8i1_v64i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi2069:
+; NoVLX-NEXT: .Lcfi1759:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi2070:
+; NoVLX-NEXT: .Lcfi1760:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi2071:
+; NoVLX-NEXT: .Lcfi1761:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: vcmpeqpd (%rdi), %zmm0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $15, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: vxorpd %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vxorpd %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %ecx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -59129,53 +57889,53 @@ define zeroext i64 @test_vcmpoeqpd_v8i1_v64i1_mask_mem_b(<8 x i64> %__a, double*
; NoVLX-LABEL: test_vcmpoeqpd_v8i1_v64i1_mask_mem_b:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi2072:
+; NoVLX-NEXT: .Lcfi1762:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi2073:
+; NoVLX-NEXT: .Lcfi1763:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi2074:
+; NoVLX-NEXT: .Lcfi1764:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: vcmpeqpd (%rdi){1to8}, %zmm0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $15, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: vxorpd %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vxorpd %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %ecx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -59210,54 +57970,54 @@ define zeroext i64 @test_masked_vcmpoeqpd_v8i1_v64i1_mask(i8 zeroext %__u, <8 x
; NoVLX-LABEL: test_masked_vcmpoeqpd_v8i1_v64i1_mask:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi2075:
+; NoVLX-NEXT: .Lcfi1765:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi2076:
+; NoVLX-NEXT: .Lcfi1766:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi2077:
+; NoVLX-NEXT: .Lcfi1767:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vcmpeqpd %zmm1, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $15, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: vxorpd %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vxorpd %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %ecx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -59292,54 +58052,54 @@ define zeroext i64 @test_masked_vcmpoeqpd_v8i1_v64i1_mask_mem(i8 zeroext %__u, <
; NoVLX-LABEL: test_masked_vcmpoeqpd_v8i1_v64i1_mask_mem:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi2078:
+; NoVLX-NEXT: .Lcfi1768:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi2079:
+; NoVLX-NEXT: .Lcfi1769:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi2080:
+; NoVLX-NEXT: .Lcfi1770:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vcmpeqpd (%rsi), %zmm0, %k0 {%k1}
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $15, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: vxorpd %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vxorpd %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %ecx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -59375,54 +58135,54 @@ define zeroext i64 @test_masked_vcmpoeqpd_v8i1_v64i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-LABEL: test_masked_vcmpoeqpd_v8i1_v64i1_mask_mem_b:
; NoVLX: # BB#0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi2081:
+; NoVLX-NEXT: .Lcfi1771:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi2082:
+; NoVLX-NEXT: .Lcfi1772:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi2083:
+; NoVLX-NEXT: .Lcfi1773:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vcmpeqpd (%rsi){1to8}, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kshiftlw $15, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $14, %k0, %k1
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: vxorpd %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vxorpd %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k1, %eax
; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %ecx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
diff --git a/llvm/test/CodeGen/X86/bmi-schedule.ll b/llvm/test/CodeGen/X86/bmi-schedule.ll
index 10412718a37..79f906ffcf7 100644
--- a/llvm/test/CodeGen/X86/bmi-schedule.ll
+++ b/llvm/test/CodeGen/X86/bmi-schedule.ll
@@ -20,10 +20,10 @@ define i16 @test_andn_i16(i16 zeroext %a0, i16 zeroext %a1, i16 *%a2) {
; HASWELL: # BB#0:
; HASWELL-NEXT: andnl %esi, %edi, %eax # sched: [1:0.50]
; HASWELL-NEXT: notl %edi # sched: [1:0.25]
-; HASWELL-NEXT: andw (%rdx), %di # sched: [5:0.50]
+; HASWELL-NEXT: andw (%rdx), %di # sched: [1:0.50]
; HASWELL-NEXT: addl %edi, %eax # sched: [1:0.25]
; HASWELL-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_andn_i16:
; BTVER2: # BB#0:
@@ -61,9 +61,9 @@ define i32 @test_andn_i32(i32 %a0, i32 %a1, i32 *%a2) {
; HASWELL-LABEL: test_andn_i32:
; HASWELL: # BB#0:
; HASWELL-NEXT: andnl %esi, %edi, %ecx # sched: [1:0.50]
-; HASWELL-NEXT: andnl (%rdx), %edi, %eax # sched: [4:0.50]
+; HASWELL-NEXT: andnl (%rdx), %edi, %eax # sched: [1:0.50]
; HASWELL-NEXT: addl %ecx, %eax # sched: [1:0.25]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_andn_i32:
; BTVER2: # BB#0:
@@ -97,9 +97,9 @@ define i64 @test_andn_i64(i64 %a0, i64 %a1, i64 *%a2) {
; HASWELL-LABEL: test_andn_i64:
; HASWELL: # BB#0:
; HASWELL-NEXT: andnq %rsi, %rdi, %rcx # sched: [1:0.50]
-; HASWELL-NEXT: andnq (%rdx), %rdi, %rax # sched: [4:0.50]
+; HASWELL-NEXT: andnq (%rdx), %rdi, %rax # sched: [1:0.50]
; HASWELL-NEXT: addq %rcx, %rax # sched: [1:0.25]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_andn_i64:
; BTVER2: # BB#0:
@@ -132,10 +132,10 @@ define i32 @test_bextr_i32(i32 %a0, i32 %a1, i32 *%a2) {
;
; HASWELL-LABEL: test_bextr_i32:
; HASWELL: # BB#0:
-; HASWELL-NEXT: bextrl %edi, (%rdx), %ecx # sched: [6:0.50]
+; HASWELL-NEXT: bextrl %edi, (%rdx), %ecx # sched: [2:0.50]
; HASWELL-NEXT: bextrl %edi, %esi, %eax # sched: [2:0.50]
; HASWELL-NEXT: addl %ecx, %eax # sched: [1:0.25]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_bextr_i32:
; BTVER2: # BB#0:
@@ -168,10 +168,10 @@ define i64 @test_bextr_i64(i64 %a0, i64 %a1, i64 *%a2) {
;
; HASWELL-LABEL: test_bextr_i64:
; HASWELL: # BB#0:
-; HASWELL-NEXT: bextrq %rdi, (%rdx), %rcx # sched: [6:0.50]
+; HASWELL-NEXT: bextrq %rdi, (%rdx), %rcx # sched: [2:0.50]
; HASWELL-NEXT: bextrq %rdi, %rsi, %rax # sched: [2:0.50]
; HASWELL-NEXT: addq %rcx, %rax # sched: [1:0.25]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_bextr_i64:
; BTVER2: # BB#0:
@@ -204,10 +204,10 @@ define i32 @test_blsi_i32(i32 %a0, i32 *%a1) {
;
; HASWELL-LABEL: test_blsi_i32:
; HASWELL: # BB#0:
-; HASWELL-NEXT: blsil (%rsi), %ecx # sched: [4:0.50]
+; HASWELL-NEXT: blsil (%rsi), %ecx # sched: [1:0.50]
; HASWELL-NEXT: blsil %edi, %eax # sched: [1:0.50]
; HASWELL-NEXT: addl %ecx, %eax # sched: [1:0.25]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_blsi_i32:
; BTVER2: # BB#0:
@@ -241,10 +241,10 @@ define i64 @test_blsi_i64(i64 %a0, i64 *%a1) {
;
; HASWELL-LABEL: test_blsi_i64:
; HASWELL: # BB#0:
-; HASWELL-NEXT: blsiq (%rsi), %rcx # sched: [4:0.50]
+; HASWELL-NEXT: blsiq (%rsi), %rcx # sched: [1:0.50]
; HASWELL-NEXT: blsiq %rdi, %rax # sched: [1:0.50]
; HASWELL-NEXT: addq %rcx, %rax # sched: [1:0.25]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_blsi_i64:
; BTVER2: # BB#0:
@@ -278,10 +278,10 @@ define i32 @test_blsmsk_i32(i32 %a0, i32 *%a1) {
;
; HASWELL-LABEL: test_blsmsk_i32:
; HASWELL: # BB#0:
-; HASWELL-NEXT: blsmskl (%rsi), %ecx # sched: [4:0.50]
+; HASWELL-NEXT: blsmskl (%rsi), %ecx # sched: [1:0.50]
; HASWELL-NEXT: blsmskl %edi, %eax # sched: [1:0.50]
; HASWELL-NEXT: addl %ecx, %eax # sched: [1:0.25]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_blsmsk_i32:
; BTVER2: # BB#0:
@@ -315,10 +315,10 @@ define i64 @test_blsmsk_i64(i64 %a0, i64 *%a1) {
;
; HASWELL-LABEL: test_blsmsk_i64:
; HASWELL: # BB#0:
-; HASWELL-NEXT: blsmskq (%rsi), %rcx # sched: [4:0.50]
+; HASWELL-NEXT: blsmskq (%rsi), %rcx # sched: [1:0.50]
; HASWELL-NEXT: blsmskq %rdi, %rax # sched: [1:0.50]
; HASWELL-NEXT: addq %rcx, %rax # sched: [1:0.25]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_blsmsk_i64:
; BTVER2: # BB#0:
@@ -352,10 +352,10 @@ define i32 @test_blsr_i32(i32 %a0, i32 *%a1) {
;
; HASWELL-LABEL: test_blsr_i32:
; HASWELL: # BB#0:
-; HASWELL-NEXT: blsrl (%rsi), %ecx # sched: [4:0.50]
+; HASWELL-NEXT: blsrl (%rsi), %ecx # sched: [1:0.50]
; HASWELL-NEXT: blsrl %edi, %eax # sched: [1:0.50]
; HASWELL-NEXT: addl %ecx, %eax # sched: [1:0.25]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_blsr_i32:
; BTVER2: # BB#0:
@@ -389,10 +389,10 @@ define i64 @test_blsr_i64(i64 %a0, i64 *%a1) {
;
; HASWELL-LABEL: test_blsr_i64:
; HASWELL: # BB#0:
-; HASWELL-NEXT: blsrq (%rsi), %rcx # sched: [4:0.50]
+; HASWELL-NEXT: blsrq (%rsi), %rcx # sched: [1:0.50]
; HASWELL-NEXT: blsrq %rdi, %rax # sched: [1:0.50]
; HASWELL-NEXT: addq %rcx, %rax # sched: [1:0.25]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_blsr_i64:
; BTVER2: # BB#0:
@@ -427,11 +427,11 @@ define i16 @test_cttz_i16(i16 zeroext %a0, i16 *%a1) {
;
; HASWELL-LABEL: test_cttz_i16:
; HASWELL: # BB#0:
-; HASWELL-NEXT: tzcntw (%rsi), %cx # sched: [7:1.00]
+; HASWELL-NEXT: tzcntw (%rsi), %cx # sched: [3:1.00]
; HASWELL-NEXT: tzcntw %di, %ax # sched: [3:1.00]
; HASWELL-NEXT: orl %ecx, %eax # sched: [1:0.25]
; HASWELL-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_cttz_i16:
; BTVER2: # BB#0:
@@ -466,10 +466,10 @@ define i32 @test_cttz_i32(i32 %a0, i32 *%a1) {
;
; HASWELL-LABEL: test_cttz_i32:
; HASWELL: # BB#0:
-; HASWELL-NEXT: tzcntl (%rsi), %ecx # sched: [7:1.00]
+; HASWELL-NEXT: tzcntl (%rsi), %ecx # sched: [3:1.00]
; HASWELL-NEXT: tzcntl %edi, %eax # sched: [3:1.00]
; HASWELL-NEXT: orl %ecx, %eax # sched: [1:0.25]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_cttz_i32:
; BTVER2: # BB#0:
@@ -502,10 +502,10 @@ define i64 @test_cttz_i64(i64 %a0, i64 *%a1) {
;
; HASWELL-LABEL: test_cttz_i64:
; HASWELL: # BB#0:
-; HASWELL-NEXT: tzcntq (%rsi), %rcx # sched: [7:1.00]
+; HASWELL-NEXT: tzcntq (%rsi), %rcx # sched: [3:1.00]
; HASWELL-NEXT: tzcntq %rdi, %rax # sched: [3:1.00]
; HASWELL-NEXT: orq %rcx, %rax # sched: [1:0.25]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_cttz_i64:
; BTVER2: # BB#0:
diff --git a/llvm/test/CodeGen/X86/bmi2-schedule.ll b/llvm/test/CodeGen/X86/bmi2-schedule.ll
index ec9820f5513..eda296089df 100644
--- a/llvm/test/CodeGen/X86/bmi2-schedule.ll
+++ b/llvm/test/CodeGen/X86/bmi2-schedule.ll
@@ -15,10 +15,10 @@ define i32 @test_bzhi_i32(i32 %a0, i32 %a1, i32 *%a2) {
;
; HASWELL-LABEL: test_bzhi_i32:
; HASWELL: # BB#0:
-; HASWELL-NEXT: bzhil %edi, (%rdx), %ecx # sched: [4:0.50]
+; HASWELL-NEXT: bzhil %edi, (%rdx), %ecx # sched: [1:0.50]
; HASWELL-NEXT: bzhil %edi, %esi, %eax # sched: [1:0.50]
; HASWELL-NEXT: addl %ecx, %eax # sched: [1:0.25]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; ZNVER1-LABEL: test_bzhi_i32:
; ZNVER1: # BB#0:
@@ -44,10 +44,10 @@ define i64 @test_bzhi_i64(i64 %a0, i64 %a1, i64 *%a2) {
;
; HASWELL-LABEL: test_bzhi_i64:
; HASWELL: # BB#0:
-; HASWELL-NEXT: bzhiq %rdi, (%rdx), %rcx # sched: [4:0.50]
+; HASWELL-NEXT: bzhiq %rdi, (%rdx), %rcx # sched: [1:0.50]
; HASWELL-NEXT: bzhiq %rdi, %rsi, %rax # sched: [1:0.50]
; HASWELL-NEXT: addq %rcx, %rax # sched: [1:0.25]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; ZNVER1-LABEL: test_bzhi_i64:
; ZNVER1: # BB#0:
@@ -80,9 +80,9 @@ define i64 @test_mulx_i64(i64 %a0, i64 %a1, i64 *%a2) {
; HASWELL-NEXT: movq %rdx, %rax # sched: [1:0.25]
; HASWELL-NEXT: movq %rdi, %rdx # sched: [1:0.25]
; HASWELL-NEXT: mulxq %rsi, %rsi, %rcx # sched: [4:1.00]
-; HASWELL-NEXT: mulxq (%rax), %rdx, %rax # sched: [8:1.00]
+; HASWELL-NEXT: mulxq (%rax), %rdx, %rax # sched: [4:1.00]
; HASWELL-NEXT: orq %rcx, %rax # sched: [1:0.25]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; ZNVER1-LABEL: test_mulx_i64:
; ZNVER1: # BB#0:
@@ -116,10 +116,10 @@ define i32 @test_pdep_i32(i32 %a0, i32 %a1, i32 *%a2) {
;
; HASWELL-LABEL: test_pdep_i32:
; HASWELL: # BB#0:
-; HASWELL-NEXT: pdepl (%rdx), %edi, %ecx # sched: [7:1.00]
+; HASWELL-NEXT: pdepl (%rdx), %edi, %ecx # sched: [3:1.00]
; HASWELL-NEXT: pdepl %esi, %edi, %eax # sched: [3:1.00]
; HASWELL-NEXT: addl %ecx, %eax # sched: [1:0.25]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; ZNVER1-LABEL: test_pdep_i32:
; ZNVER1: # BB#0:
@@ -145,10 +145,10 @@ define i64 @test_pdep_i64(i64 %a0, i64 %a1, i64 *%a2) {
;
; HASWELL-LABEL: test_pdep_i64:
; HASWELL: # BB#0:
-; HASWELL-NEXT: pdepq (%rdx), %rdi, %rcx # sched: [7:1.00]
+; HASWELL-NEXT: pdepq (%rdx), %rdi, %rcx # sched: [3:1.00]
; HASWELL-NEXT: pdepq %rsi, %rdi, %rax # sched: [3:1.00]
; HASWELL-NEXT: addq %rcx, %rax # sched: [1:0.25]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; ZNVER1-LABEL: test_pdep_i64:
; ZNVER1: # BB#0:
@@ -174,10 +174,10 @@ define i32 @test_pext_i32(i32 %a0, i32 %a1, i32 *%a2) {
;
; HASWELL-LABEL: test_pext_i32:
; HASWELL: # BB#0:
-; HASWELL-NEXT: pextl (%rdx), %edi, %ecx # sched: [7:1.00]
+; HASWELL-NEXT: pextl (%rdx), %edi, %ecx # sched: [3:1.00]
; HASWELL-NEXT: pextl %esi, %edi, %eax # sched: [3:1.00]
; HASWELL-NEXT: addl %ecx, %eax # sched: [1:0.25]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; ZNVER1-LABEL: test_pext_i32:
; ZNVER1: # BB#0:
@@ -203,10 +203,10 @@ define i64 @test_pext_i64(i64 %a0, i64 %a1, i64 *%a2) {
;
; HASWELL-LABEL: test_pext_i64:
; HASWELL: # BB#0:
-; HASWELL-NEXT: pextq (%rdx), %rdi, %rcx # sched: [7:1.00]
+; HASWELL-NEXT: pextq (%rdx), %rdi, %rcx # sched: [3:1.00]
; HASWELL-NEXT: pextq %rsi, %rdi, %rax # sched: [3:1.00]
; HASWELL-NEXT: addq %rcx, %rax # sched: [1:0.25]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; ZNVER1-LABEL: test_pext_i64:
; ZNVER1: # BB#0:
@@ -233,9 +233,9 @@ define i32 @test_rorx_i32(i32 %a0, i32 %a1, i32 *%a2) {
; HASWELL-LABEL: test_rorx_i32:
; HASWELL: # BB#0:
; HASWELL-NEXT: rorxl $5, %edi, %ecx # sched: [1:0.50]
-; HASWELL-NEXT: rorxl $5, (%rdx), %eax # sched: [5:0.50]
+; HASWELL-NEXT: rorxl $5, (%rdx), %eax # sched: [1:0.50]
; HASWELL-NEXT: addl %ecx, %eax # sched: [1:0.25]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; ZNVER1-LABEL: test_rorx_i32:
; ZNVER1: # BB#0:
@@ -265,9 +265,9 @@ define i64 @test_rorx_i64(i64 %a0, i64 %a1, i64 *%a2) {
; HASWELL-LABEL: test_rorx_i64:
; HASWELL: # BB#0:
; HASWELL-NEXT: rorxq $5, %rdi, %rcx # sched: [1:0.50]
-; HASWELL-NEXT: rorxq $5, (%rdx), %rax # sched: [5:0.50]
+; HASWELL-NEXT: rorxq $5, (%rdx), %rax # sched: [1:0.50]
; HASWELL-NEXT: addq %rcx, %rax # sched: [1:0.25]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; ZNVER1-LABEL: test_rorx_i64:
; ZNVER1: # BB#0:
@@ -297,9 +297,9 @@ define i32 @test_sarx_i32(i32 %a0, i32 %a1, i32 *%a2) {
; HASWELL-LABEL: test_sarx_i32:
; HASWELL: # BB#0:
; HASWELL-NEXT: sarxl %esi, %edi, %ecx # sched: [1:0.50]
-; HASWELL-NEXT: sarxl %esi, (%rdx), %eax # sched: [5:0.50]
+; HASWELL-NEXT: sarxl %esi, (%rdx), %eax # sched: [1:0.50]
; HASWELL-NEXT: addl %ecx, %eax # sched: [1:0.25]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; ZNVER1-LABEL: test_sarx_i32:
; ZNVER1: # BB#0:
@@ -325,9 +325,9 @@ define i64 @test_sarx_i64(i64 %a0, i64 %a1, i64 *%a2) {
; HASWELL-LABEL: test_sarx_i64:
; HASWELL: # BB#0:
; HASWELL-NEXT: sarxq %rsi, %rdi, %rcx # sched: [1:0.50]
-; HASWELL-NEXT: sarxq %rsi, (%rdx), %rax # sched: [5:0.50]
+; HASWELL-NEXT: sarxq %rsi, (%rdx), %rax # sched: [1:0.50]
; HASWELL-NEXT: addq %rcx, %rax # sched: [1:0.25]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; ZNVER1-LABEL: test_sarx_i64:
; ZNVER1: # BB#0:
@@ -353,9 +353,9 @@ define i32 @test_shlx_i32(i32 %a0, i32 %a1, i32 *%a2) {
; HASWELL-LABEL: test_shlx_i32:
; HASWELL: # BB#0:
; HASWELL-NEXT: shlxl %esi, %edi, %ecx # sched: [1:0.50]
-; HASWELL-NEXT: shlxl %esi, (%rdx), %eax # sched: [5:0.50]
+; HASWELL-NEXT: shlxl %esi, (%rdx), %eax # sched: [1:0.50]
; HASWELL-NEXT: addl %ecx, %eax # sched: [1:0.25]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; ZNVER1-LABEL: test_shlx_i32:
; ZNVER1: # BB#0:
@@ -381,9 +381,9 @@ define i64 @test_shlx_i64(i64 %a0, i64 %a1, i64 *%a2) {
; HASWELL-LABEL: test_shlx_i64:
; HASWELL: # BB#0:
; HASWELL-NEXT: shlxq %rsi, %rdi, %rcx # sched: [1:0.50]
-; HASWELL-NEXT: shlxq %rsi, (%rdx), %rax # sched: [5:0.50]
+; HASWELL-NEXT: shlxq %rsi, (%rdx), %rax # sched: [1:0.50]
; HASWELL-NEXT: addq %rcx, %rax # sched: [1:0.25]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; ZNVER1-LABEL: test_shlx_i64:
; ZNVER1: # BB#0:
@@ -409,9 +409,9 @@ define i32 @test_shrx_i32(i32 %a0, i32 %a1, i32 *%a2) {
; HASWELL-LABEL: test_shrx_i32:
; HASWELL: # BB#0:
; HASWELL-NEXT: shrxl %esi, %edi, %ecx # sched: [1:0.50]
-; HASWELL-NEXT: shrxl %esi, (%rdx), %eax # sched: [5:0.50]
+; HASWELL-NEXT: shrxl %esi, (%rdx), %eax # sched: [1:0.50]
; HASWELL-NEXT: addl %ecx, %eax # sched: [1:0.25]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; ZNVER1-LABEL: test_shrx_i32:
; ZNVER1: # BB#0:
@@ -437,9 +437,9 @@ define i64 @test_shrx_i64(i64 %a0, i64 %a1, i64 *%a2) {
; HASWELL-LABEL: test_shrx_i64:
; HASWELL: # BB#0:
; HASWELL-NEXT: shrxq %rsi, %rdi, %rcx # sched: [1:0.50]
-; HASWELL-NEXT: shrxq %rsi, (%rdx), %rax # sched: [5:0.50]
+; HASWELL-NEXT: shrxq %rsi, (%rdx), %rax # sched: [1:0.50]
; HASWELL-NEXT: addq %rcx, %rax # sched: [1:0.25]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; ZNVER1-LABEL: test_shrx_i64:
; ZNVER1: # BB#0:
diff --git a/llvm/test/CodeGen/X86/f16c-schedule.ll b/llvm/test/CodeGen/X86/f16c-schedule.ll
index 1a17b8bdbca..24b7da2a229 100644
--- a/llvm/test/CodeGen/X86/f16c-schedule.ll
+++ b/llvm/test/CodeGen/X86/f16c-schedule.ll
@@ -23,10 +23,10 @@ define <4 x float> @test_vcvtph2ps_128(<8 x i16> %a0, <8 x i16> *%a1) {
;
; HASWELL-LABEL: test_vcvtph2ps_128:
; HASWELL: # BB#0:
-; HASWELL-NEXT: vcvtph2ps (%rdi), %xmm1 # sched: [7:1.00]
-; HASWELL-NEXT: vcvtph2ps %xmm0, %xmm0 # sched: [4:1.00]
+; HASWELL-NEXT: vcvtph2ps (%rdi), %xmm1 # sched: [1:1.00]
+; HASWELL-NEXT: vcvtph2ps %xmm0, %xmm0 # sched: [2:1.00]
; HASWELL-NEXT: vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_vcvtph2ps_128:
; BTVER2: # BB#0:
@@ -66,10 +66,10 @@ define <8 x float> @test_vcvtph2ps_256(<8 x i16> %a0, <8 x i16> *%a1) {
;
; HASWELL-LABEL: test_vcvtph2ps_256:
; HASWELL: # BB#0:
-; HASWELL-NEXT: vcvtph2ps (%rdi), %ymm1 # sched: [7:1.00]
-; HASWELL-NEXT: vcvtph2ps %xmm0, %ymm0 # sched: [4:1.00]
+; HASWELL-NEXT: vcvtph2ps (%rdi), %ymm1 # sched: [1:1.00]
+; HASWELL-NEXT: vcvtph2ps %xmm0, %ymm0 # sched: [2:1.00]
; HASWELL-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_vcvtph2ps_256:
; BTVER2: # BB#0:
@@ -108,8 +108,8 @@ define <8 x i16> @test_vcvtps2ph_128(<4 x float> %a0, <4 x float> %a1, <4 x i16>
; HASWELL-LABEL: test_vcvtps2ph_128:
; HASWELL: # BB#0:
; HASWELL-NEXT: vcvtps2ph $0, %xmm0, %xmm0 # sched: [4:1.00]
-; HASWELL-NEXT: vcvtps2ph $0, %xmm1, (%rdi) # sched: [8:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vcvtps2ph $0, %xmm1, (%rdi) # sched: [4:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_vcvtps2ph_128:
; BTVER2: # BB#0:
@@ -147,10 +147,10 @@ define <8 x i16> @test_vcvtps2ph_256(<8 x float> %a0, <8 x float> %a1, <8 x i16>
;
; HASWELL-LABEL: test_vcvtps2ph_256:
; HASWELL: # BB#0:
-; HASWELL-NEXT: vcvtps2ph $0, %ymm0, %xmm0 # sched: [4:1.00]
-; HASWELL-NEXT: vcvtps2ph $0, %ymm1, (%rdi) # sched: [8:1.00]
-; HASWELL-NEXT: vzeroupper # sched: [1:?]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vcvtps2ph $0, %ymm0, %xmm0 # sched: [6:1.00]
+; HASWELL-NEXT: vcvtps2ph $0, %ymm1, (%rdi) # sched: [6:1.00]
+; HASWELL-NEXT: vzeroupper # sched: [4:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_vcvtps2ph_256:
; BTVER2: # BB#0:
diff --git a/llvm/test/CodeGen/X86/lea32-schedule.ll b/llvm/test/CodeGen/X86/lea32-schedule.ll
index eb4e3009491..32b892badb0 100644
--- a/llvm/test/CodeGen/X86/lea32-schedule.ll
+++ b/llvm/test/CodeGen/X86/lea32-schedule.ll
@@ -45,7 +45,7 @@ define i32 @test_lea_offset(i32) {
; HASWELL: # BB#0:
; HASWELL-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
; HASWELL-NEXT: leal -24(%rdi), %eax # sched: [1:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_lea_offset:
; BTVER2: # BB#0:
@@ -97,7 +97,7 @@ define i32 @test_lea_offset_big(i32) {
; HASWELL: # BB#0:
; HASWELL-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
; HASWELL-NEXT: leal 1024(%rdi), %eax # sched: [1:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_lea_offset_big:
; BTVER2: # BB#0:
@@ -155,7 +155,7 @@ define i32 @test_lea_add(i32, i32) {
; HASWELL-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def>
; HASWELL-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
; HASWELL-NEXT: leal (%rdi,%rsi), %eax # sched: [1:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_lea_add:
; BTVER2: # BB#0:
@@ -217,7 +217,7 @@ define i32 @test_lea_add_offset(i32, i32) {
; HASWELL-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
; HASWELL-NEXT: leal (%rdi,%rsi), %eax # sched: [1:0.50]
; HASWELL-NEXT: addl $16, %eax # sched: [1:0.25]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_lea_add_offset:
; BTVER2: # BB#0:
@@ -283,7 +283,7 @@ define i32 @test_lea_add_offset_big(i32, i32) {
; HASWELL-NEXT: leal (%rdi,%rsi), %eax # sched: [1:0.50]
; HASWELL-NEXT: addl $-4096, %eax # imm = 0xF000
; HASWELL-NEXT: # sched: [1:0.25]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_lea_add_offset_big:
; BTVER2: # BB#0:
@@ -338,7 +338,7 @@ define i32 @test_lea_mul(i32) {
; HASWELL: # BB#0:
; HASWELL-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
; HASWELL-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_lea_mul:
; BTVER2: # BB#0:
@@ -393,7 +393,7 @@ define i32 @test_lea_mul_offset(i32) {
; HASWELL-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
; HASWELL-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:0.50]
; HASWELL-NEXT: addl $-32, %eax # sched: [1:0.25]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_lea_mul_offset:
; BTVER2: # BB#0:
@@ -452,7 +452,7 @@ define i32 @test_lea_mul_offset_big(i32) {
; HASWELL-NEXT: leal (%rdi,%rdi,8), %eax # sched: [1:0.50]
; HASWELL-NEXT: addl $10000, %eax # imm = 0x2710
; HASWELL-NEXT: # sched: [1:0.25]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_lea_mul_offset_big:
; BTVER2: # BB#0:
@@ -510,7 +510,7 @@ define i32 @test_lea_add_scale(i32, i32) {
; HASWELL-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def>
; HASWELL-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
; HASWELL-NEXT: leal (%rdi,%rsi,2), %eax # sched: [1:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_lea_add_scale:
; BTVER2: # BB#0:
@@ -573,7 +573,7 @@ define i32 @test_lea_add_scale_offset(i32, i32) {
; HASWELL-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
; HASWELL-NEXT: leal (%rdi,%rsi,4), %eax # sched: [1:0.50]
; HASWELL-NEXT: addl $96, %eax # sched: [1:0.25]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_lea_add_scale_offset:
; BTVER2: # BB#0:
@@ -640,7 +640,7 @@ define i32 @test_lea_add_scale_offset_big(i32, i32) {
; HASWELL-NEXT: leal (%rdi,%rsi,8), %eax # sched: [1:0.50]
; HASWELL-NEXT: addl $-1200, %eax # imm = 0xFB50
; HASWELL-NEXT: # sched: [1:0.25]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_lea_add_scale_offset_big:
; BTVER2: # BB#0:
diff --git a/llvm/test/CodeGen/X86/lea64-schedule.ll b/llvm/test/CodeGen/X86/lea64-schedule.ll
index dd629c33ed0..e7b29601045 100644
--- a/llvm/test/CodeGen/X86/lea64-schedule.ll
+++ b/llvm/test/CodeGen/X86/lea64-schedule.ll
@@ -40,7 +40,7 @@ define i64 @test_lea_offset(i64) {
; HASWELL-LABEL: test_lea_offset:
; HASWELL: # BB#0:
; HASWELL-NEXT: leaq -24(%rdi), %rax # sched: [1:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_lea_offset:
; BTVER2: # BB#0:
@@ -85,7 +85,7 @@ define i64 @test_lea_offset_big(i64) {
; HASWELL-LABEL: test_lea_offset_big:
; HASWELL: # BB#0:
; HASWELL-NEXT: leaq 1024(%rdi), %rax # sched: [1:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_lea_offset_big:
; BTVER2: # BB#0:
@@ -131,7 +131,7 @@ define i64 @test_lea_add(i64, i64) {
; HASWELL-LABEL: test_lea_add:
; HASWELL: # BB#0:
; HASWELL-NEXT: leaq (%rdi,%rsi), %rax # sched: [1:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_lea_add:
; BTVER2: # BB#0:
@@ -179,7 +179,7 @@ define i64 @test_lea_add_offset(i64, i64) {
; HASWELL: # BB#0:
; HASWELL-NEXT: leaq (%rdi,%rsi), %rax # sched: [1:0.50]
; HASWELL-NEXT: addq $16, %rax # sched: [1:0.25]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_lea_add_offset:
; BTVER2: # BB#0:
@@ -231,7 +231,7 @@ define i64 @test_lea_add_offset_big(i64, i64) {
; HASWELL-NEXT: leaq (%rdi,%rsi), %rax # sched: [1:0.50]
; HASWELL-NEXT: addq $-4096, %rax # imm = 0xF000
; HASWELL-NEXT: # sched: [1:0.25]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_lea_add_offset_big:
; BTVER2: # BB#0:
@@ -277,7 +277,7 @@ define i64 @test_lea_mul(i64) {
; HASWELL-LABEL: test_lea_mul:
; HASWELL: # BB#0:
; HASWELL-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_lea_mul:
; BTVER2: # BB#0:
@@ -325,7 +325,7 @@ define i64 @test_lea_mul_offset(i64) {
; HASWELL: # BB#0:
; HASWELL-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:0.50]
; HASWELL-NEXT: addq $-32, %rax # sched: [1:0.25]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_lea_mul_offset:
; BTVER2: # BB#0:
@@ -377,7 +377,7 @@ define i64 @test_lea_mul_offset_big(i64) {
; HASWELL-NEXT: leaq (%rdi,%rdi,8), %rax # sched: [1:0.50]
; HASWELL-NEXT: addq $10000, %rax # imm = 0x2710
; HASWELL-NEXT: # sched: [1:0.25]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_lea_mul_offset_big:
; BTVER2: # BB#0:
@@ -423,7 +423,7 @@ define i64 @test_lea_add_scale(i64, i64) {
; HASWELL-LABEL: test_lea_add_scale:
; HASWELL: # BB#0:
; HASWELL-NEXT: leaq (%rdi,%rsi,2), %rax # sched: [1:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_lea_add_scale:
; BTVER2: # BB#0:
@@ -472,7 +472,7 @@ define i64 @test_lea_add_scale_offset(i64, i64) {
; HASWELL: # BB#0:
; HASWELL-NEXT: leaq (%rdi,%rsi,4), %rax # sched: [1:0.50]
; HASWELL-NEXT: addq $96, %rax # sched: [1:0.25]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_lea_add_scale_offset:
; BTVER2: # BB#0:
@@ -525,7 +525,7 @@ define i64 @test_lea_add_scale_offset_big(i64, i64) {
; HASWELL-NEXT: leaq (%rdi,%rsi,8), %rax # sched: [1:0.50]
; HASWELL-NEXT: addq $-1200, %rax # imm = 0xFB50
; HASWELL-NEXT: # sched: [1:0.25]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_lea_add_scale_offset_big:
; BTVER2: # BB#0:
diff --git a/llvm/test/CodeGen/X86/lzcnt-schedule.ll b/llvm/test/CodeGen/X86/lzcnt-schedule.ll
index 64441c7e895..d321ffbab7a 100644
--- a/llvm/test/CodeGen/X86/lzcnt-schedule.ll
+++ b/llvm/test/CodeGen/X86/lzcnt-schedule.ll
@@ -17,11 +17,11 @@ define i16 @test_ctlz_i16(i16 zeroext %a0, i16 *%a1) {
;
; HASWELL-LABEL: test_ctlz_i16:
; HASWELL: # BB#0:
-; HASWELL-NEXT: lzcntw (%rsi), %cx
-; HASWELL-NEXT: lzcntw %di, %ax
+; HASWELL-NEXT: lzcntw (%rsi), %cx # sched: [3:1.00]
+; HASWELL-NEXT: lzcntw %di, %ax # sched: [3:1.00]
; HASWELL-NEXT: orl %ecx, %eax # sched: [1:0.25]
; HASWELL-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_ctlz_i16:
; BTVER2: # BB#0:
@@ -56,10 +56,10 @@ define i32 @test_ctlz_i32(i32 %a0, i32 *%a1) {
;
; HASWELL-LABEL: test_ctlz_i32:
; HASWELL: # BB#0:
-; HASWELL-NEXT: lzcntl (%rsi), %ecx
-; HASWELL-NEXT: lzcntl %edi, %eax
+; HASWELL-NEXT: lzcntl (%rsi), %ecx # sched: [3:1.00]
+; HASWELL-NEXT: lzcntl %edi, %eax # sched: [3:1.00]
; HASWELL-NEXT: orl %ecx, %eax # sched: [1:0.25]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_ctlz_i32:
; BTVER2: # BB#0:
@@ -92,10 +92,10 @@ define i64 @test_ctlz_i64(i64 %a0, i64 *%a1) {
;
; HASWELL-LABEL: test_ctlz_i64:
; HASWELL: # BB#0:
-; HASWELL-NEXT: lzcntq (%rsi), %rcx
-; HASWELL-NEXT: lzcntq %rdi, %rax
+; HASWELL-NEXT: lzcntq (%rsi), %rcx # sched: [3:1.00]
+; HASWELL-NEXT: lzcntq %rdi, %rax # sched: [3:1.00]
; HASWELL-NEXT: orq %rcx, %rax # sched: [1:0.25]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_ctlz_i64:
; BTVER2: # BB#0:
diff --git a/llvm/test/CodeGen/X86/movbe-schedule.ll b/llvm/test/CodeGen/X86/movbe-schedule.ll
index 7091183c23f..9b17f0dc763 100644
--- a/llvm/test/CodeGen/X86/movbe-schedule.ll
+++ b/llvm/test/CodeGen/X86/movbe-schedule.ll
@@ -33,9 +33,9 @@ define i16 @test_ctlz_i16(i16 *%a0, i16 %a1, i16 *%a2) {
;
; HASWELL-LABEL: test_ctlz_i16:
; HASWELL: # BB#0:
-; HASWELL-NEXT: movbew (%rdi), %ax # sched: [6:0.50]
+; HASWELL-NEXT: movbew (%rdi), %ax # sched: [1:0.50]
; HASWELL-NEXT: movbew %si, (%rdx) # sched: [1:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_ctlz_i16:
; BTVER2: # BB#0:
@@ -83,7 +83,7 @@ define i32 @test_ctlz_i32(i32 *%a0, i32 %a1, i32 *%a2) {
; HASWELL: # BB#0:
; HASWELL-NEXT: movbel (%rdi), %eax # sched: [1:0.50]
; HASWELL-NEXT: movbel %esi, (%rdx) # sched: [1:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_ctlz_i32:
; BTVER2: # BB#0:
@@ -129,9 +129,9 @@ define i64 @test_ctlz_i64(i64 *%a0, i64 %a1, i64 *%a2) {
;
; HASWELL-LABEL: test_ctlz_i64:
; HASWELL: # BB#0:
-; HASWELL-NEXT: movbeq (%rdi), %rax # sched: [6:0.50]
+; HASWELL-NEXT: movbeq (%rdi), %rax # sched: [1:0.50]
; HASWELL-NEXT: movbeq %rsi, (%rdx) # sched: [1:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_ctlz_i64:
; BTVER2: # BB#0:
diff --git a/llvm/test/CodeGen/X86/mul-constant-i32.ll b/llvm/test/CodeGen/X86/mul-constant-i32.ll
index d545b477e10..38599f6fa19 100644
--- a/llvm/test/CodeGen/X86/mul-constant-i32.ll
+++ b/llvm/test/CodeGen/X86/mul-constant-i32.ll
@@ -17,7 +17,7 @@ define i32 @test_mul_by_1(i32 %x) {
; X64-HSW-LABEL: test_mul_by_1:
; X64-HSW: # BB#0:
; X64-HSW-NEXT: movl %edi, %eax # sched: [1:0.25]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [2:1.00]
;
; X64-JAG-LABEL: test_mul_by_1:
; X64-JAG: # BB#0:
@@ -32,7 +32,7 @@ define i32 @test_mul_by_1(i32 %x) {
; HSW-NOOPT-LABEL: test_mul_by_1:
; HSW-NOOPT: # BB#0:
; HSW-NOOPT-NEXT: movl %edi, %eax # sched: [1:0.25]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [2:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_1:
; JAG-NOOPT: # BB#0:
@@ -63,7 +63,7 @@ define i32 @test_mul_by_2(i32 %x) {
; X64-HSW: # BB#0:
; X64-HSW-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
; X64-HSW-NEXT: leal (%rdi,%rdi), %eax # sched: [1:0.50]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [2:1.00]
;
; X64-JAG-LABEL: test_mul_by_2:
; X64-JAG: # BB#0:
@@ -81,7 +81,7 @@ define i32 @test_mul_by_2(i32 %x) {
; HSW-NOOPT: # BB#0:
; HSW-NOOPT-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
; HSW-NOOPT-NEXT: leal (%rdi,%rdi), %eax # sched: [1:0.50]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [2:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_2:
; JAG-NOOPT: # BB#0:
@@ -114,7 +114,7 @@ define i32 @test_mul_by_3(i32 %x) {
; X64-HSW: # BB#0:
; X64-HSW-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
; X64-HSW-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:0.50]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [2:1.00]
;
; X64-JAG-LABEL: test_mul_by_3:
; X64-JAG: # BB#0:
@@ -131,7 +131,7 @@ define i32 @test_mul_by_3(i32 %x) {
; HSW-NOOPT: # BB#0:
; HSW-NOOPT-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
; HSW-NOOPT-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:0.50]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [2:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_3:
; JAG-NOOPT: # BB#0:
@@ -165,7 +165,7 @@ define i32 @test_mul_by_4(i32 %x) {
; X64-HSW: # BB#0:
; X64-HSW-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
; X64-HSW-NEXT: leal (,%rdi,4), %eax # sched: [1:0.50]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [2:1.00]
;
; X64-JAG-LABEL: test_mul_by_4:
; X64-JAG: # BB#0:
@@ -183,7 +183,7 @@ define i32 @test_mul_by_4(i32 %x) {
; HSW-NOOPT: # BB#0:
; HSW-NOOPT-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
; HSW-NOOPT-NEXT: leal (,%rdi,4), %eax # sched: [1:0.50]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [2:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_4:
; JAG-NOOPT: # BB#0:
@@ -216,7 +216,7 @@ define i32 @test_mul_by_5(i32 %x) {
; X64-HSW: # BB#0:
; X64-HSW-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
; X64-HSW-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:0.50]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [2:1.00]
;
; X64-JAG-LABEL: test_mul_by_5:
; X64-JAG: # BB#0:
@@ -233,7 +233,7 @@ define i32 @test_mul_by_5(i32 %x) {
; HSW-NOOPT: # BB#0:
; HSW-NOOPT-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
; HSW-NOOPT-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:0.50]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [2:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_5:
; JAG-NOOPT: # BB#0:
@@ -269,7 +269,7 @@ define i32 @test_mul_by_6(i32 %x) {
; X64-HSW-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
; X64-HSW-NEXT: addl %edi, %edi # sched: [1:0.25]
; X64-HSW-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:0.50]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [2:1.00]
;
; X64-JAG-LABEL: test_mul_by_6:
; X64-JAG: # BB#0:
@@ -285,8 +285,8 @@ define i32 @test_mul_by_6(i32 %x) {
;
; HSW-NOOPT-LABEL: test_mul_by_6:
; HSW-NOOPT: # BB#0:
-; HSW-NOOPT-NEXT: imull $6, %edi, %eax # sched: [4:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT: imull $6, %edi, %eax # sched: [3:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [2:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_6:
; JAG-NOOPT: # BB#0:
@@ -321,7 +321,7 @@ define i32 @test_mul_by_7(i32 %x) {
; X64-HSW-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
; X64-HSW-NEXT: leal (,%rdi,8), %eax # sched: [1:0.50]
; X64-HSW-NEXT: subl %edi, %eax # sched: [1:0.25]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [2:1.00]
;
; X64-JAG-LABEL: test_mul_by_7:
; X64-JAG: # BB#0:
@@ -337,8 +337,8 @@ define i32 @test_mul_by_7(i32 %x) {
;
; HSW-NOOPT-LABEL: test_mul_by_7:
; HSW-NOOPT: # BB#0:
-; HSW-NOOPT-NEXT: imull $7, %edi, %eax # sched: [4:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT: imull $7, %edi, %eax # sched: [3:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [2:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_7:
; JAG-NOOPT: # BB#0:
@@ -371,7 +371,7 @@ define i32 @test_mul_by_8(i32 %x) {
; X64-HSW: # BB#0:
; X64-HSW-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
; X64-HSW-NEXT: leal (,%rdi,8), %eax # sched: [1:0.50]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [2:1.00]
;
; X64-JAG-LABEL: test_mul_by_8:
; X64-JAG: # BB#0:
@@ -389,7 +389,7 @@ define i32 @test_mul_by_8(i32 %x) {
; HSW-NOOPT: # BB#0:
; HSW-NOOPT-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
; HSW-NOOPT-NEXT: leal (,%rdi,8), %eax # sched: [1:0.50]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [2:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_8:
; JAG-NOOPT: # BB#0:
@@ -422,7 +422,7 @@ define i32 @test_mul_by_9(i32 %x) {
; X64-HSW: # BB#0:
; X64-HSW-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
; X64-HSW-NEXT: leal (%rdi,%rdi,8), %eax # sched: [1:0.50]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [2:1.00]
;
; X64-JAG-LABEL: test_mul_by_9:
; X64-JAG: # BB#0:
@@ -439,7 +439,7 @@ define i32 @test_mul_by_9(i32 %x) {
; HSW-NOOPT: # BB#0:
; HSW-NOOPT-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
; HSW-NOOPT-NEXT: leal (%rdi,%rdi,8), %eax # sched: [1:0.50]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [2:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_9:
; JAG-NOOPT: # BB#0:
@@ -475,7 +475,7 @@ define i32 @test_mul_by_10(i32 %x) {
; X64-HSW-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
; X64-HSW-NEXT: addl %edi, %edi # sched: [1:0.25]
; X64-HSW-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:0.50]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [2:1.00]
;
; X64-JAG-LABEL: test_mul_by_10:
; X64-JAG: # BB#0:
@@ -491,8 +491,8 @@ define i32 @test_mul_by_10(i32 %x) {
;
; HSW-NOOPT-LABEL: test_mul_by_10:
; HSW-NOOPT: # BB#0:
-; HSW-NOOPT-NEXT: imull $10, %edi, %eax # sched: [4:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT: imull $10, %edi, %eax # sched: [3:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [2:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_10:
; JAG-NOOPT: # BB#0:
@@ -527,7 +527,7 @@ define i32 @test_mul_by_11(i32 %x) {
; X64-HSW-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
; X64-HSW-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:0.50]
; X64-HSW-NEXT: leal (%rdi,%rax,2), %eax # sched: [1:0.50]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [2:1.00]
;
; X64-JAG-LABEL: test_mul_by_11:
; X64-JAG: # BB#0:
@@ -543,8 +543,8 @@ define i32 @test_mul_by_11(i32 %x) {
;
; HSW-NOOPT-LABEL: test_mul_by_11:
; HSW-NOOPT: # BB#0:
-; HSW-NOOPT-NEXT: imull $11, %edi, %eax # sched: [4:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT: imull $11, %edi, %eax # sched: [3:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [2:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_11:
; JAG-NOOPT: # BB#0:
@@ -577,7 +577,7 @@ define i32 @test_mul_by_12(i32 %x) {
; X64-HSW-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
; X64-HSW-NEXT: shll $2, %edi # sched: [1:0.50]
; X64-HSW-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:0.50]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [2:1.00]
;
; X64-JAG-LABEL: test_mul_by_12:
; X64-JAG: # BB#0:
@@ -593,8 +593,8 @@ define i32 @test_mul_by_12(i32 %x) {
;
; HSW-NOOPT-LABEL: test_mul_by_12:
; HSW-NOOPT: # BB#0:
-; HSW-NOOPT-NEXT: imull $12, %edi, %eax # sched: [4:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT: imull $12, %edi, %eax # sched: [3:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [2:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_12:
; JAG-NOOPT: # BB#0:
@@ -629,7 +629,7 @@ define i32 @test_mul_by_13(i32 %x) {
; X64-HSW-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
; X64-HSW-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:0.50]
; X64-HSW-NEXT: leal (%rdi,%rax,4), %eax # sched: [1:0.50]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [2:1.00]
;
; X64-JAG-LABEL: test_mul_by_13:
; X64-JAG: # BB#0:
@@ -645,8 +645,8 @@ define i32 @test_mul_by_13(i32 %x) {
;
; HSW-NOOPT-LABEL: test_mul_by_13:
; HSW-NOOPT: # BB#0:
-; HSW-NOOPT-NEXT: imull $13, %edi, %eax # sched: [4:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT: imull $13, %edi, %eax # sched: [3:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [2:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_13:
; JAG-NOOPT: # BB#0:
@@ -681,7 +681,7 @@ define i32 @test_mul_by_14(i32 %x) {
; X64-HSW-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:0.50]
; X64-HSW-NEXT: leal (%rdi,%rax,4), %eax # sched: [1:0.50]
; X64-HSW-NEXT: addl %edi, %eax # sched: [1:0.25]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [2:1.00]
;
; X64-JAG-LABEL: test_mul_by_14:
; X64-JAG: # BB#0:
@@ -698,8 +698,8 @@ define i32 @test_mul_by_14(i32 %x) {
;
; HSW-NOOPT-LABEL: test_mul_by_14:
; HSW-NOOPT: # BB#0:
-; HSW-NOOPT-NEXT: imull $14, %edi, %eax # sched: [4:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT: imull $14, %edi, %eax # sched: [3:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [2:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_14:
; JAG-NOOPT: # BB#0:
@@ -732,7 +732,7 @@ define i32 @test_mul_by_15(i32 %x) {
; X64-HSW-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
; X64-HSW-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:0.50]
; X64-HSW-NEXT: leal (%rax,%rax,2), %eax # sched: [1:0.50]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [2:1.00]
;
; X64-JAG-LABEL: test_mul_by_15:
; X64-JAG: # BB#0:
@@ -748,8 +748,8 @@ define i32 @test_mul_by_15(i32 %x) {
;
; HSW-NOOPT-LABEL: test_mul_by_15:
; HSW-NOOPT: # BB#0:
-; HSW-NOOPT-NEXT: imull $15, %edi, %eax # sched: [4:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT: imull $15, %edi, %eax # sched: [3:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [2:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_15:
; JAG-NOOPT: # BB#0:
@@ -782,7 +782,7 @@ define i32 @test_mul_by_16(i32 %x) {
; X64-HSW: # BB#0:
; X64-HSW-NEXT: shll $4, %edi # sched: [1:0.50]
; X64-HSW-NEXT: movl %edi, %eax # sched: [1:0.25]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [2:1.00]
;
; X64-JAG-LABEL: test_mul_by_16:
; X64-JAG: # BB#0:
@@ -800,7 +800,7 @@ define i32 @test_mul_by_16(i32 %x) {
; HSW-NOOPT: # BB#0:
; HSW-NOOPT-NEXT: shll $4, %edi # sched: [1:0.50]
; HSW-NOOPT-NEXT: movl %edi, %eax # sched: [1:0.25]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [2:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_16:
; JAG-NOOPT: # BB#0:
@@ -838,7 +838,7 @@ define i32 @test_mul_by_17(i32 %x) {
; X64-HSW-NEXT: movl %edi, %eax # sched: [1:0.25]
; X64-HSW-NEXT: shll $4, %eax # sched: [1:0.50]
; X64-HSW-NEXT: leal (%rax,%rdi), %eax # sched: [1:0.50]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [2:1.00]
;
; X64-JAG-LABEL: test_mul_by_17:
; X64-JAG: # BB#0:
@@ -855,8 +855,8 @@ define i32 @test_mul_by_17(i32 %x) {
;
; HSW-NOOPT-LABEL: test_mul_by_17:
; HSW-NOOPT: # BB#0:
-; HSW-NOOPT-NEXT: imull $17, %edi, %eax # sched: [4:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT: imull $17, %edi, %eax # sched: [3:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [2:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_17:
; JAG-NOOPT: # BB#0:
@@ -892,7 +892,7 @@ define i32 @test_mul_by_18(i32 %x) {
; X64-HSW-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
; X64-HSW-NEXT: addl %edi, %edi # sched: [1:0.25]
; X64-HSW-NEXT: leal (%rdi,%rdi,8), %eax # sched: [1:0.50]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [2:1.00]
;
; X64-JAG-LABEL: test_mul_by_18:
; X64-JAG: # BB#0:
@@ -908,8 +908,8 @@ define i32 @test_mul_by_18(i32 %x) {
;
; HSW-NOOPT-LABEL: test_mul_by_18:
; HSW-NOOPT: # BB#0:
-; HSW-NOOPT-NEXT: imull $18, %edi, %eax # sched: [4:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT: imull $18, %edi, %eax # sched: [3:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [2:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_18:
; JAG-NOOPT: # BB#0:
@@ -946,7 +946,7 @@ define i32 @test_mul_by_19(i32 %x) {
; X64-HSW-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:0.50]
; X64-HSW-NEXT: shll $2, %eax # sched: [1:0.50]
; X64-HSW-NEXT: subl %edi, %eax # sched: [1:0.25]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [2:1.00]
;
; X64-JAG-LABEL: test_mul_by_19:
; X64-JAG: # BB#0:
@@ -963,8 +963,8 @@ define i32 @test_mul_by_19(i32 %x) {
;
; HSW-NOOPT-LABEL: test_mul_by_19:
; HSW-NOOPT: # BB#0:
-; HSW-NOOPT-NEXT: imull $19, %edi, %eax # sched: [4:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT: imull $19, %edi, %eax # sched: [3:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [2:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_19:
; JAG-NOOPT: # BB#0:
@@ -997,7 +997,7 @@ define i32 @test_mul_by_20(i32 %x) {
; X64-HSW-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
; X64-HSW-NEXT: shll $2, %edi # sched: [1:0.50]
; X64-HSW-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:0.50]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [2:1.00]
;
; X64-JAG-LABEL: test_mul_by_20:
; X64-JAG: # BB#0:
@@ -1013,8 +1013,8 @@ define i32 @test_mul_by_20(i32 %x) {
;
; HSW-NOOPT-LABEL: test_mul_by_20:
; HSW-NOOPT: # BB#0:
-; HSW-NOOPT-NEXT: imull $20, %edi, %eax # sched: [4:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT: imull $20, %edi, %eax # sched: [3:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [2:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_20:
; JAG-NOOPT: # BB#0:
@@ -1049,7 +1049,7 @@ define i32 @test_mul_by_21(i32 %x) {
; X64-HSW-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
; X64-HSW-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:0.50]
; X64-HSW-NEXT: leal (%rdi,%rax,4), %eax # sched: [1:0.50]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [2:1.00]
;
; X64-JAG-LABEL: test_mul_by_21:
; X64-JAG: # BB#0:
@@ -1065,8 +1065,8 @@ define i32 @test_mul_by_21(i32 %x) {
;
; HSW-NOOPT-LABEL: test_mul_by_21:
; HSW-NOOPT: # BB#0:
-; HSW-NOOPT-NEXT: imull $21, %edi, %eax # sched: [4:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT: imull $21, %edi, %eax # sched: [3:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [2:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_21:
; JAG-NOOPT: # BB#0:
@@ -1101,7 +1101,7 @@ define i32 @test_mul_by_22(i32 %x) {
; X64-HSW-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:0.50]
; X64-HSW-NEXT: leal (%rdi,%rax,4), %eax # sched: [1:0.50]
; X64-HSW-NEXT: addl %edi, %eax # sched: [1:0.25]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [2:1.00]
;
; X64-JAG-LABEL: test_mul_by_22:
; X64-JAG: # BB#0:
@@ -1118,8 +1118,8 @@ define i32 @test_mul_by_22(i32 %x) {
;
; HSW-NOOPT-LABEL: test_mul_by_22:
; HSW-NOOPT: # BB#0:
-; HSW-NOOPT-NEXT: imull $22, %edi, %eax # sched: [4:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT: imull $22, %edi, %eax # sched: [3:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [2:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_22:
; JAG-NOOPT: # BB#0:
@@ -1154,7 +1154,7 @@ define i32 @test_mul_by_23(i32 %x) {
; X64-HSW-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:0.50]
; X64-HSW-NEXT: shll $3, %eax # sched: [1:0.50]
; X64-HSW-NEXT: subl %edi, %eax # sched: [1:0.25]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [2:1.00]
;
; X64-JAG-LABEL: test_mul_by_23:
; X64-JAG: # BB#0:
@@ -1171,8 +1171,8 @@ define i32 @test_mul_by_23(i32 %x) {
;
; HSW-NOOPT-LABEL: test_mul_by_23:
; HSW-NOOPT: # BB#0:
-; HSW-NOOPT-NEXT: imull $23, %edi, %eax # sched: [4:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT: imull $23, %edi, %eax # sched: [3:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [2:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_23:
; JAG-NOOPT: # BB#0:
@@ -1205,7 +1205,7 @@ define i32 @test_mul_by_24(i32 %x) {
; X64-HSW-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
; X64-HSW-NEXT: shll $3, %edi # sched: [1:0.50]
; X64-HSW-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:0.50]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [2:1.00]
;
; X64-JAG-LABEL: test_mul_by_24:
; X64-JAG: # BB#0:
@@ -1221,8 +1221,8 @@ define i32 @test_mul_by_24(i32 %x) {
;
; HSW-NOOPT-LABEL: test_mul_by_24:
; HSW-NOOPT: # BB#0:
-; HSW-NOOPT-NEXT: imull $24, %edi, %eax # sched: [4:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT: imull $24, %edi, %eax # sched: [3:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [2:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_24:
; JAG-NOOPT: # BB#0:
@@ -1257,7 +1257,7 @@ define i32 @test_mul_by_25(i32 %x) {
; X64-HSW-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
; X64-HSW-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:0.50]
; X64-HSW-NEXT: leal (%rax,%rax,4), %eax # sched: [1:0.50]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [2:1.00]
;
; X64-JAG-LABEL: test_mul_by_25:
; X64-JAG: # BB#0:
@@ -1273,8 +1273,8 @@ define i32 @test_mul_by_25(i32 %x) {
;
; HSW-NOOPT-LABEL: test_mul_by_25:
; HSW-NOOPT: # BB#0:
-; HSW-NOOPT-NEXT: imull $25, %edi, %eax # sched: [4:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT: imull $25, %edi, %eax # sched: [3:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [2:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_25:
; JAG-NOOPT: # BB#0:
@@ -1311,7 +1311,7 @@ define i32 @test_mul_by_26(i32 %x) {
; X64-HSW-NEXT: leal (%rdi,%rdi,8), %eax # sched: [1:0.50]
; X64-HSW-NEXT: leal (%rax,%rax,2), %eax # sched: [1:0.50]
; X64-HSW-NEXT: subl %edi, %eax # sched: [1:0.25]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [2:1.00]
;
; X64-JAG-LABEL: test_mul_by_26:
; X64-JAG: # BB#0:
@@ -1328,8 +1328,8 @@ define i32 @test_mul_by_26(i32 %x) {
;
; HSW-NOOPT-LABEL: test_mul_by_26:
; HSW-NOOPT: # BB#0:
-; HSW-NOOPT-NEXT: imull $26, %edi, %eax # sched: [4:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT: imull $26, %edi, %eax # sched: [3:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [2:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_26:
; JAG-NOOPT: # BB#0:
@@ -1362,7 +1362,7 @@ define i32 @test_mul_by_27(i32 %x) {
; X64-HSW-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
; X64-HSW-NEXT: leal (%rdi,%rdi,8), %eax # sched: [1:0.50]
; X64-HSW-NEXT: leal (%rax,%rax,2), %eax # sched: [1:0.50]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [2:1.00]
;
; X64-JAG-LABEL: test_mul_by_27:
; X64-JAG: # BB#0:
@@ -1378,8 +1378,8 @@ define i32 @test_mul_by_27(i32 %x) {
;
; HSW-NOOPT-LABEL: test_mul_by_27:
; HSW-NOOPT: # BB#0:
-; HSW-NOOPT-NEXT: imull $27, %edi, %eax # sched: [4:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT: imull $27, %edi, %eax # sched: [3:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [2:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_27:
; JAG-NOOPT: # BB#0:
@@ -1416,7 +1416,7 @@ define i32 @test_mul_by_28(i32 %x) {
; X64-HSW-NEXT: leal (%rdi,%rdi,8), %eax # sched: [1:0.50]
; X64-HSW-NEXT: leal (%rax,%rax,2), %eax # sched: [1:0.50]
; X64-HSW-NEXT: addl %edi, %eax # sched: [1:0.25]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [2:1.00]
;
; X64-JAG-LABEL: test_mul_by_28:
; X64-JAG: # BB#0:
@@ -1433,8 +1433,8 @@ define i32 @test_mul_by_28(i32 %x) {
;
; HSW-NOOPT-LABEL: test_mul_by_28:
; HSW-NOOPT: # BB#0:
-; HSW-NOOPT-NEXT: imull $28, %edi, %eax # sched: [4:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT: imull $28, %edi, %eax # sched: [3:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [2:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_28:
; JAG-NOOPT: # BB#0:
@@ -1471,7 +1471,7 @@ define i32 @test_mul_by_29(i32 %x) {
; X64-HSW-NEXT: leal (%rax,%rax,2), %eax # sched: [1:0.50]
; X64-HSW-NEXT: addl %edi, %eax # sched: [1:0.25]
; X64-HSW-NEXT: addl %edi, %eax # sched: [1:0.25]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [2:1.00]
;
; X64-JAG-LABEL: test_mul_by_29:
; X64-JAG: # BB#0:
@@ -1489,8 +1489,8 @@ define i32 @test_mul_by_29(i32 %x) {
;
; HSW-NOOPT-LABEL: test_mul_by_29:
; HSW-NOOPT: # BB#0:
-; HSW-NOOPT-NEXT: imull $29, %edi, %eax # sched: [4:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT: imull $29, %edi, %eax # sched: [3:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [2:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_29:
; JAG-NOOPT: # BB#0:
@@ -1526,7 +1526,7 @@ define i32 @test_mul_by_30(i32 %x) {
; X64-HSW-NEXT: shll $5, %eax # sched: [1:0.50]
; X64-HSW-NEXT: subl %edi, %eax # sched: [1:0.25]
; X64-HSW-NEXT: subl %edi, %eax # sched: [1:0.25]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [2:1.00]
;
; X64-JAG-LABEL: test_mul_by_30:
; X64-JAG: # BB#0:
@@ -1543,8 +1543,8 @@ define i32 @test_mul_by_30(i32 %x) {
;
; HSW-NOOPT-LABEL: test_mul_by_30:
; HSW-NOOPT: # BB#0:
-; HSW-NOOPT-NEXT: imull $30, %edi, %eax # sched: [4:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT: imull $30, %edi, %eax # sched: [3:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [2:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_30:
; JAG-NOOPT: # BB#0:
@@ -1578,7 +1578,7 @@ define i32 @test_mul_by_31(i32 %x) {
; X64-HSW-NEXT: movl %edi, %eax # sched: [1:0.25]
; X64-HSW-NEXT: shll $5, %eax # sched: [1:0.50]
; X64-HSW-NEXT: subl %edi, %eax # sched: [1:0.25]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [2:1.00]
;
; X64-JAG-LABEL: test_mul_by_31:
; X64-JAG: # BB#0:
@@ -1594,8 +1594,8 @@ define i32 @test_mul_by_31(i32 %x) {
;
; HSW-NOOPT-LABEL: test_mul_by_31:
; HSW-NOOPT: # BB#0:
-; HSW-NOOPT-NEXT: imull $31, %edi, %eax # sched: [4:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT: imull $31, %edi, %eax # sched: [3:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [2:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_31:
; JAG-NOOPT: # BB#0:
@@ -1628,7 +1628,7 @@ define i32 @test_mul_by_32(i32 %x) {
; X64-HSW: # BB#0:
; X64-HSW-NEXT: shll $5, %edi # sched: [1:0.50]
; X64-HSW-NEXT: movl %edi, %eax # sched: [1:0.25]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [2:1.00]
;
; X64-JAG-LABEL: test_mul_by_32:
; X64-JAG: # BB#0:
@@ -1646,7 +1646,7 @@ define i32 @test_mul_by_32(i32 %x) {
; HSW-NOOPT: # BB#0:
; HSW-NOOPT-NEXT: shll $5, %edi # sched: [1:0.50]
; HSW-NOOPT-NEXT: movl %edi, %eax # sched: [1:0.25]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [2:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_32:
; JAG-NOOPT: # BB#0:
@@ -1687,7 +1687,7 @@ define i32 @test_mul_spec(i32 %x) nounwind {
; X64-HSW-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:0.50]
; X64-HSW-NEXT: addl $2, %eax # sched: [1:0.25]
; X64-HSW-NEXT: imull %ecx, %eax # sched: [4:1.00]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [2:1.00]
;
; X64-JAG-LABEL: test_mul_spec:
; X64-JAG: # BB#0:
@@ -1713,7 +1713,7 @@ define i32 @test_mul_spec(i32 %x) nounwind {
; HSW-NOOPT-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:0.50]
; HSW-NOOPT-NEXT: addl $2, %eax # sched: [1:0.25]
; HSW-NOOPT-NEXT: imull %ecx, %eax # sched: [4:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [2:1.00]
;
; JAG-NOOPT-LABEL: test_mul_spec:
; JAG-NOOPT: # BB#0:
diff --git a/llvm/test/CodeGen/X86/mul-constant-i64.ll b/llvm/test/CodeGen/X86/mul-constant-i64.ll
index ea841c761c7..98568a6fc8e 100644
--- a/llvm/test/CodeGen/X86/mul-constant-i64.ll
+++ b/llvm/test/CodeGen/X86/mul-constant-i64.ll
@@ -18,7 +18,7 @@ define i64 @test_mul_by_1(i64 %x) nounwind {
; X64-HSW-LABEL: test_mul_by_1:
; X64-HSW: # BB#0:
; X64-HSW-NEXT: movq %rdi, %rax # sched: [1:0.25]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [2:1.00]
;
; X64-JAG-LABEL: test_mul_by_1:
; X64-JAG: # BB#0:
@@ -34,7 +34,7 @@ define i64 @test_mul_by_1(i64 %x) nounwind {
; HSW-NOOPT-LABEL: test_mul_by_1:
; HSW-NOOPT: # BB#0:
; HSW-NOOPT-NEXT: movq %rdi, %rax # sched: [1:0.25]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [2:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_1:
; JAG-NOOPT: # BB#0:
@@ -66,7 +66,7 @@ define i64 @test_mul_by_2(i64 %x) {
; X64-HSW-LABEL: test_mul_by_2:
; X64-HSW: # BB#0:
; X64-HSW-NEXT: leaq (%rdi,%rdi), %rax # sched: [1:0.50]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [2:1.00]
;
; X64-JAG-LABEL: test_mul_by_2:
; X64-JAG: # BB#0:
@@ -84,7 +84,7 @@ define i64 @test_mul_by_2(i64 %x) {
; HSW-NOOPT-LABEL: test_mul_by_2:
; HSW-NOOPT: # BB#0:
; HSW-NOOPT-NEXT: leaq (%rdi,%rdi), %rax # sched: [1:0.50]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [2:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_2:
; JAG-NOOPT: # BB#0:
@@ -116,7 +116,7 @@ define i64 @test_mul_by_3(i64 %x) {
; X64-HSW-LABEL: test_mul_by_3:
; X64-HSW: # BB#0:
; X64-HSW-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:0.50]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [2:1.00]
;
; X64-JAG-LABEL: test_mul_by_3:
; X64-JAG: # BB#0:
@@ -134,7 +134,7 @@ define i64 @test_mul_by_3(i64 %x) {
; HSW-NOOPT-LABEL: test_mul_by_3:
; HSW-NOOPT: # BB#0:
; HSW-NOOPT-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:0.50]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [2:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_3:
; JAG-NOOPT: # BB#0:
@@ -166,7 +166,7 @@ define i64 @test_mul_by_4(i64 %x) {
; X64-HSW-LABEL: test_mul_by_4:
; X64-HSW: # BB#0:
; X64-HSW-NEXT: leaq (,%rdi,4), %rax # sched: [1:0.50]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [2:1.00]
;
; X64-JAG-LABEL: test_mul_by_4:
; X64-JAG: # BB#0:
@@ -184,7 +184,7 @@ define i64 @test_mul_by_4(i64 %x) {
; HSW-NOOPT-LABEL: test_mul_by_4:
; HSW-NOOPT: # BB#0:
; HSW-NOOPT-NEXT: leaq (,%rdi,4), %rax # sched: [1:0.50]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [2:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_4:
; JAG-NOOPT: # BB#0:
@@ -216,7 +216,7 @@ define i64 @test_mul_by_5(i64 %x) {
; X64-HSW-LABEL: test_mul_by_5:
; X64-HSW: # BB#0:
; X64-HSW-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:0.50]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [2:1.00]
;
; X64-JAG-LABEL: test_mul_by_5:
; X64-JAG: # BB#0:
@@ -234,7 +234,7 @@ define i64 @test_mul_by_5(i64 %x) {
; HSW-NOOPT-LABEL: test_mul_by_5:
; HSW-NOOPT: # BB#0:
; HSW-NOOPT-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:0.50]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [2:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_5:
; JAG-NOOPT: # BB#0:
@@ -268,7 +268,7 @@ define i64 @test_mul_by_6(i64 %x) {
; X64-HSW: # BB#0:
; X64-HSW-NEXT: addq %rdi, %rdi # sched: [1:0.25]
; X64-HSW-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:0.50]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [2:1.00]
;
; X64-JAG-LABEL: test_mul_by_6:
; X64-JAG: # BB#0:
@@ -287,7 +287,7 @@ define i64 @test_mul_by_6(i64 %x) {
; HSW-NOOPT-LABEL: test_mul_by_6:
; HSW-NOOPT: # BB#0:
; HSW-NOOPT-NEXT: imulq $6, %rdi, %rax # sched: [3:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [2:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_6:
; JAG-NOOPT: # BB#0:
@@ -323,7 +323,7 @@ define i64 @test_mul_by_7(i64 %x) {
; X64-HSW: # BB#0:
; X64-HSW-NEXT: leaq (,%rdi,8), %rax # sched: [1:0.50]
; X64-HSW-NEXT: subq %rdi, %rax # sched: [1:0.25]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [2:1.00]
;
; X64-JAG-LABEL: test_mul_by_7:
; X64-JAG: # BB#0:
@@ -342,7 +342,7 @@ define i64 @test_mul_by_7(i64 %x) {
; HSW-NOOPT-LABEL: test_mul_by_7:
; HSW-NOOPT: # BB#0:
; HSW-NOOPT-NEXT: imulq $7, %rdi, %rax # sched: [3:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [2:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_7:
; JAG-NOOPT: # BB#0:
@@ -375,7 +375,7 @@ define i64 @test_mul_by_8(i64 %x) {
; X64-HSW-LABEL: test_mul_by_8:
; X64-HSW: # BB#0:
; X64-HSW-NEXT: leaq (,%rdi,8), %rax # sched: [1:0.50]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [2:1.00]
;
; X64-JAG-LABEL: test_mul_by_8:
; X64-JAG: # BB#0:
@@ -393,7 +393,7 @@ define i64 @test_mul_by_8(i64 %x) {
; HSW-NOOPT-LABEL: test_mul_by_8:
; HSW-NOOPT: # BB#0:
; HSW-NOOPT-NEXT: leaq (,%rdi,8), %rax # sched: [1:0.50]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [2:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_8:
; JAG-NOOPT: # BB#0:
@@ -425,7 +425,7 @@ define i64 @test_mul_by_9(i64 %x) {
; X64-HSW-LABEL: test_mul_by_9:
; X64-HSW: # BB#0:
; X64-HSW-NEXT: leaq (%rdi,%rdi,8), %rax # sched: [1:0.50]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [2:1.00]
;
; X64-JAG-LABEL: test_mul_by_9:
; X64-JAG: # BB#0:
@@ -443,7 +443,7 @@ define i64 @test_mul_by_9(i64 %x) {
; HSW-NOOPT-LABEL: test_mul_by_9:
; HSW-NOOPT: # BB#0:
; HSW-NOOPT-NEXT: leaq (%rdi,%rdi,8), %rax # sched: [1:0.50]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [2:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_9:
; JAG-NOOPT: # BB#0:
@@ -477,7 +477,7 @@ define i64 @test_mul_by_10(i64 %x) {
; X64-HSW: # BB#0:
; X64-HSW-NEXT: addq %rdi, %rdi # sched: [1:0.25]
; X64-HSW-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:0.50]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [2:1.00]
;
; X64-JAG-LABEL: test_mul_by_10:
; X64-JAG: # BB#0:
@@ -496,7 +496,7 @@ define i64 @test_mul_by_10(i64 %x) {
; HSW-NOOPT-LABEL: test_mul_by_10:
; HSW-NOOPT: # BB#0:
; HSW-NOOPT-NEXT: imulq $10, %rdi, %rax # sched: [3:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [2:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_10:
; JAG-NOOPT: # BB#0:
@@ -532,7 +532,7 @@ define i64 @test_mul_by_11(i64 %x) {
; X64-HSW: # BB#0:
; X64-HSW-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:0.50]
; X64-HSW-NEXT: leaq (%rdi,%rax,2), %rax # sched: [1:0.50]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [2:1.00]
;
; X64-JAG-LABEL: test_mul_by_11:
; X64-JAG: # BB#0:
@@ -551,7 +551,7 @@ define i64 @test_mul_by_11(i64 %x) {
; HSW-NOOPT-LABEL: test_mul_by_11:
; HSW-NOOPT: # BB#0:
; HSW-NOOPT-NEXT: imulq $11, %rdi, %rax # sched: [3:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [2:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_11:
; JAG-NOOPT: # BB#0:
@@ -585,7 +585,7 @@ define i64 @test_mul_by_12(i64 %x) {
; X64-HSW: # BB#0:
; X64-HSW-NEXT: shlq $2, %rdi # sched: [1:0.50]
; X64-HSW-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:0.50]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [2:1.00]
;
; X64-JAG-LABEL: test_mul_by_12:
; X64-JAG: # BB#0:
@@ -604,7 +604,7 @@ define i64 @test_mul_by_12(i64 %x) {
; HSW-NOOPT-LABEL: test_mul_by_12:
; HSW-NOOPT: # BB#0:
; HSW-NOOPT-NEXT: imulq $12, %rdi, %rax # sched: [3:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [2:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_12:
; JAG-NOOPT: # BB#0:
@@ -640,7 +640,7 @@ define i64 @test_mul_by_13(i64 %x) {
; X64-HSW: # BB#0:
; X64-HSW-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:0.50]
; X64-HSW-NEXT: leaq (%rdi,%rax,4), %rax # sched: [1:0.50]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [2:1.00]
;
; X64-JAG-LABEL: test_mul_by_13:
; X64-JAG: # BB#0:
@@ -659,7 +659,7 @@ define i64 @test_mul_by_13(i64 %x) {
; HSW-NOOPT-LABEL: test_mul_by_13:
; HSW-NOOPT: # BB#0:
; HSW-NOOPT-NEXT: imulq $13, %rdi, %rax # sched: [3:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [2:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_13:
; JAG-NOOPT: # BB#0:
@@ -696,7 +696,7 @@ define i64 @test_mul_by_14(i64 %x) {
; X64-HSW-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:0.50]
; X64-HSW-NEXT: leaq (%rdi,%rax,4), %rax # sched: [1:0.50]
; X64-HSW-NEXT: addq %rdi, %rax # sched: [1:0.25]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [2:1.00]
;
; X64-JAG-LABEL: test_mul_by_14:
; X64-JAG: # BB#0:
@@ -716,7 +716,7 @@ define i64 @test_mul_by_14(i64 %x) {
; HSW-NOOPT-LABEL: test_mul_by_14:
; HSW-NOOPT: # BB#0:
; HSW-NOOPT-NEXT: imulq $14, %rdi, %rax # sched: [3:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [2:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_14:
; JAG-NOOPT: # BB#0:
@@ -751,7 +751,7 @@ define i64 @test_mul_by_15(i64 %x) {
; X64-HSW: # BB#0:
; X64-HSW-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:0.50]
; X64-HSW-NEXT: leaq (%rax,%rax,2), %rax # sched: [1:0.50]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [2:1.00]
;
; X64-JAG-LABEL: test_mul_by_15:
; X64-JAG: # BB#0:
@@ -770,7 +770,7 @@ define i64 @test_mul_by_15(i64 %x) {
; HSW-NOOPT-LABEL: test_mul_by_15:
; HSW-NOOPT: # BB#0:
; HSW-NOOPT-NEXT: imulq $15, %rdi, %rax # sched: [3:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [2:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_15:
; JAG-NOOPT: # BB#0:
@@ -804,7 +804,7 @@ define i64 @test_mul_by_16(i64 %x) {
; X64-HSW: # BB#0:
; X64-HSW-NEXT: shlq $4, %rdi # sched: [1:0.50]
; X64-HSW-NEXT: movq %rdi, %rax # sched: [1:0.25]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [2:1.00]
;
; X64-JAG-LABEL: test_mul_by_16:
; X64-JAG: # BB#0:
@@ -824,7 +824,7 @@ define i64 @test_mul_by_16(i64 %x) {
; HSW-NOOPT: # BB#0:
; HSW-NOOPT-NEXT: shlq $4, %rdi # sched: [1:0.50]
; HSW-NOOPT-NEXT: movq %rdi, %rax # sched: [1:0.25]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [2:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_16:
; JAG-NOOPT: # BB#0:
@@ -864,7 +864,7 @@ define i64 @test_mul_by_17(i64 %x) {
; X64-HSW-NEXT: movq %rdi, %rax # sched: [1:0.25]
; X64-HSW-NEXT: shlq $4, %rax # sched: [1:0.50]
; X64-HSW-NEXT: leaq (%rax,%rdi), %rax # sched: [1:0.50]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [2:1.00]
;
; X64-JAG-LABEL: test_mul_by_17:
; X64-JAG: # BB#0:
@@ -884,7 +884,7 @@ define i64 @test_mul_by_17(i64 %x) {
; HSW-NOOPT-LABEL: test_mul_by_17:
; HSW-NOOPT: # BB#0:
; HSW-NOOPT-NEXT: imulq $17, %rdi, %rax # sched: [3:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [2:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_17:
; JAG-NOOPT: # BB#0:
@@ -920,7 +920,7 @@ define i64 @test_mul_by_18(i64 %x) {
; X64-HSW: # BB#0:
; X64-HSW-NEXT: addq %rdi, %rdi # sched: [1:0.25]
; X64-HSW-NEXT: leaq (%rdi,%rdi,8), %rax # sched: [1:0.50]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [2:1.00]
;
; X64-JAG-LABEL: test_mul_by_18:
; X64-JAG: # BB#0:
@@ -939,7 +939,7 @@ define i64 @test_mul_by_18(i64 %x) {
; HSW-NOOPT-LABEL: test_mul_by_18:
; HSW-NOOPT: # BB#0:
; HSW-NOOPT-NEXT: imulq $18, %rdi, %rax # sched: [3:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [2:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_18:
; JAG-NOOPT: # BB#0:
@@ -977,7 +977,7 @@ define i64 @test_mul_by_19(i64 %x) {
; X64-HSW-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:0.50]
; X64-HSW-NEXT: shlq $2, %rax # sched: [1:0.50]
; X64-HSW-NEXT: subq %rdi, %rax # sched: [1:0.25]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [2:1.00]
;
; X64-JAG-LABEL: test_mul_by_19:
; X64-JAG: # BB#0:
@@ -997,7 +997,7 @@ define i64 @test_mul_by_19(i64 %x) {
; HSW-NOOPT-LABEL: test_mul_by_19:
; HSW-NOOPT: # BB#0:
; HSW-NOOPT-NEXT: imulq $19, %rdi, %rax # sched: [3:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [2:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_19:
; JAG-NOOPT: # BB#0:
@@ -1031,7 +1031,7 @@ define i64 @test_mul_by_20(i64 %x) {
; X64-HSW: # BB#0:
; X64-HSW-NEXT: shlq $2, %rdi # sched: [1:0.50]
; X64-HSW-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:0.50]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [2:1.00]
;
; X64-JAG-LABEL: test_mul_by_20:
; X64-JAG: # BB#0:
@@ -1050,7 +1050,7 @@ define i64 @test_mul_by_20(i64 %x) {
; HSW-NOOPT-LABEL: test_mul_by_20:
; HSW-NOOPT: # BB#0:
; HSW-NOOPT-NEXT: imulq $20, %rdi, %rax # sched: [3:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [2:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_20:
; JAG-NOOPT: # BB#0:
@@ -1086,7 +1086,7 @@ define i64 @test_mul_by_21(i64 %x) {
; X64-HSW: # BB#0:
; X64-HSW-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:0.50]
; X64-HSW-NEXT: leaq (%rdi,%rax,4), %rax # sched: [1:0.50]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [2:1.00]
;
; X64-JAG-LABEL: test_mul_by_21:
; X64-JAG: # BB#0:
@@ -1105,7 +1105,7 @@ define i64 @test_mul_by_21(i64 %x) {
; HSW-NOOPT-LABEL: test_mul_by_21:
; HSW-NOOPT: # BB#0:
; HSW-NOOPT-NEXT: imulq $21, %rdi, %rax # sched: [3:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [2:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_21:
; JAG-NOOPT: # BB#0:
@@ -1142,7 +1142,7 @@ define i64 @test_mul_by_22(i64 %x) {
; X64-HSW-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:0.50]
; X64-HSW-NEXT: leaq (%rdi,%rax,4), %rax # sched: [1:0.50]
; X64-HSW-NEXT: addq %rdi, %rax # sched: [1:0.25]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [2:1.00]
;
; X64-JAG-LABEL: test_mul_by_22:
; X64-JAG: # BB#0:
@@ -1162,7 +1162,7 @@ define i64 @test_mul_by_22(i64 %x) {
; HSW-NOOPT-LABEL: test_mul_by_22:
; HSW-NOOPT: # BB#0:
; HSW-NOOPT-NEXT: imulq $22, %rdi, %rax # sched: [3:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [2:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_22:
; JAG-NOOPT: # BB#0:
@@ -1199,7 +1199,7 @@ define i64 @test_mul_by_23(i64 %x) {
; X64-HSW-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:0.50]
; X64-HSW-NEXT: shlq $3, %rax # sched: [1:0.50]
; X64-HSW-NEXT: subq %rdi, %rax # sched: [1:0.25]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [2:1.00]
;
; X64-JAG-LABEL: test_mul_by_23:
; X64-JAG: # BB#0:
@@ -1219,7 +1219,7 @@ define i64 @test_mul_by_23(i64 %x) {
; HSW-NOOPT-LABEL: test_mul_by_23:
; HSW-NOOPT: # BB#0:
; HSW-NOOPT-NEXT: imulq $23, %rdi, %rax # sched: [3:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [2:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_23:
; JAG-NOOPT: # BB#0:
@@ -1253,7 +1253,7 @@ define i64 @test_mul_by_24(i64 %x) {
; X64-HSW: # BB#0:
; X64-HSW-NEXT: shlq $3, %rdi # sched: [1:0.50]
; X64-HSW-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:0.50]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [2:1.00]
;
; X64-JAG-LABEL: test_mul_by_24:
; X64-JAG: # BB#0:
@@ -1272,7 +1272,7 @@ define i64 @test_mul_by_24(i64 %x) {
; HSW-NOOPT-LABEL: test_mul_by_24:
; HSW-NOOPT: # BB#0:
; HSW-NOOPT-NEXT: imulq $24, %rdi, %rax # sched: [3:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [2:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_24:
; JAG-NOOPT: # BB#0:
@@ -1308,7 +1308,7 @@ define i64 @test_mul_by_25(i64 %x) {
; X64-HSW: # BB#0:
; X64-HSW-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:0.50]
; X64-HSW-NEXT: leaq (%rax,%rax,4), %rax # sched: [1:0.50]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [2:1.00]
;
; X64-JAG-LABEL: test_mul_by_25:
; X64-JAG: # BB#0:
@@ -1327,7 +1327,7 @@ define i64 @test_mul_by_25(i64 %x) {
; HSW-NOOPT-LABEL: test_mul_by_25:
; HSW-NOOPT: # BB#0:
; HSW-NOOPT-NEXT: imulq $25, %rdi, %rax # sched: [3:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [2:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_25:
; JAG-NOOPT: # BB#0:
@@ -1365,7 +1365,7 @@ define i64 @test_mul_by_26(i64 %x) {
; X64-HSW-NEXT: leaq (%rdi,%rdi,8), %rax # sched: [1:0.50]
; X64-HSW-NEXT: leaq (%rax,%rax,2), %rax # sched: [1:0.50]
; X64-HSW-NEXT: subq %rdi, %rax # sched: [1:0.25]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [2:1.00]
;
; X64-JAG-LABEL: test_mul_by_26:
; X64-JAG: # BB#0:
@@ -1385,7 +1385,7 @@ define i64 @test_mul_by_26(i64 %x) {
; HSW-NOOPT-LABEL: test_mul_by_26:
; HSW-NOOPT: # BB#0:
; HSW-NOOPT-NEXT: imulq $26, %rdi, %rax # sched: [3:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [2:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_26:
; JAG-NOOPT: # BB#0:
@@ -1420,7 +1420,7 @@ define i64 @test_mul_by_27(i64 %x) {
; X64-HSW: # BB#0:
; X64-HSW-NEXT: leaq (%rdi,%rdi,8), %rax # sched: [1:0.50]
; X64-HSW-NEXT: leaq (%rax,%rax,2), %rax # sched: [1:0.50]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [2:1.00]
;
; X64-JAG-LABEL: test_mul_by_27:
; X64-JAG: # BB#0:
@@ -1439,7 +1439,7 @@ define i64 @test_mul_by_27(i64 %x) {
; HSW-NOOPT-LABEL: test_mul_by_27:
; HSW-NOOPT: # BB#0:
; HSW-NOOPT-NEXT: imulq $27, %rdi, %rax # sched: [3:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [2:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_27:
; JAG-NOOPT: # BB#0:
@@ -1477,7 +1477,7 @@ define i64 @test_mul_by_28(i64 %x) {
; X64-HSW-NEXT: leaq (%rdi,%rdi,8), %rax # sched: [1:0.50]
; X64-HSW-NEXT: leaq (%rax,%rax,2), %rax # sched: [1:0.50]
; X64-HSW-NEXT: addq %rdi, %rax # sched: [1:0.25]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [2:1.00]
;
; X64-JAG-LABEL: test_mul_by_28:
; X64-JAG: # BB#0:
@@ -1497,7 +1497,7 @@ define i64 @test_mul_by_28(i64 %x) {
; HSW-NOOPT-LABEL: test_mul_by_28:
; HSW-NOOPT: # BB#0:
; HSW-NOOPT-NEXT: imulq $28, %rdi, %rax # sched: [3:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [2:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_28:
; JAG-NOOPT: # BB#0:
@@ -1536,7 +1536,7 @@ define i64 @test_mul_by_29(i64 %x) {
; X64-HSW-NEXT: leaq (%rax,%rax,2), %rax # sched: [1:0.50]
; X64-HSW-NEXT: addq %rdi, %rax # sched: [1:0.25]
; X64-HSW-NEXT: addq %rdi, %rax # sched: [1:0.25]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [2:1.00]
;
; X64-JAG-LABEL: test_mul_by_29:
; X64-JAG: # BB#0:
@@ -1557,7 +1557,7 @@ define i64 @test_mul_by_29(i64 %x) {
; HSW-NOOPT-LABEL: test_mul_by_29:
; HSW-NOOPT: # BB#0:
; HSW-NOOPT-NEXT: imulq $29, %rdi, %rax # sched: [3:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [2:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_29:
; JAG-NOOPT: # BB#0:
@@ -1596,7 +1596,7 @@ define i64 @test_mul_by_30(i64 %x) {
; X64-HSW-NEXT: shlq $5, %rax # sched: [1:0.50]
; X64-HSW-NEXT: subq %rdi, %rax # sched: [1:0.25]
; X64-HSW-NEXT: subq %rdi, %rax # sched: [1:0.25]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [2:1.00]
;
; X64-JAG-LABEL: test_mul_by_30:
; X64-JAG: # BB#0:
@@ -1617,7 +1617,7 @@ define i64 @test_mul_by_30(i64 %x) {
; HSW-NOOPT-LABEL: test_mul_by_30:
; HSW-NOOPT: # BB#0:
; HSW-NOOPT-NEXT: imulq $30, %rdi, %rax # sched: [3:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [2:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_30:
; JAG-NOOPT: # BB#0:
@@ -1654,7 +1654,7 @@ define i64 @test_mul_by_31(i64 %x) {
; X64-HSW-NEXT: movq %rdi, %rax # sched: [1:0.25]
; X64-HSW-NEXT: shlq $5, %rax # sched: [1:0.50]
; X64-HSW-NEXT: subq %rdi, %rax # sched: [1:0.25]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [2:1.00]
;
; X64-JAG-LABEL: test_mul_by_31:
; X64-JAG: # BB#0:
@@ -1674,7 +1674,7 @@ define i64 @test_mul_by_31(i64 %x) {
; HSW-NOOPT-LABEL: test_mul_by_31:
; HSW-NOOPT: # BB#0:
; HSW-NOOPT-NEXT: imulq $31, %rdi, %rax # sched: [3:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [2:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_31:
; JAG-NOOPT: # BB#0:
@@ -1709,7 +1709,7 @@ define i64 @test_mul_by_32(i64 %x) {
; X64-HSW: # BB#0:
; X64-HSW-NEXT: shlq $5, %rdi # sched: [1:0.50]
; X64-HSW-NEXT: movq %rdi, %rax # sched: [1:0.25]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [2:1.00]
;
; X64-JAG-LABEL: test_mul_by_32:
; X64-JAG: # BB#0:
@@ -1729,7 +1729,7 @@ define i64 @test_mul_by_32(i64 %x) {
; HSW-NOOPT: # BB#0:
; HSW-NOOPT-NEXT: shlq $5, %rdi # sched: [1:0.50]
; HSW-NOOPT-NEXT: movq %rdi, %rax # sched: [1:0.25]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [2:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_32:
; JAG-NOOPT: # BB#0:
@@ -1792,8 +1792,8 @@ define i64 @test_mul_spec(i64 %x) nounwind {
; X64-HSW-NEXT: addq $42, %rcx # sched: [1:0.25]
; X64-HSW-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:0.50]
; X64-HSW-NEXT: addq $2, %rax # sched: [1:0.25]
-; X64-HSW-NEXT: imulq %rcx, %rax # sched: [3:1.00]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: imulq %rcx, %rax # sched: [4:1.00]
+; X64-HSW-NEXT: retq # sched: [2:1.00]
;
; X64-JAG-LABEL: test_mul_spec:
; X64-JAG: # BB#0:
@@ -1840,8 +1840,8 @@ define i64 @test_mul_spec(i64 %x) nounwind {
; HSW-NOOPT-NEXT: addq $42, %rcx # sched: [1:0.25]
; HSW-NOOPT-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:0.50]
; HSW-NOOPT-NEXT: addq $2, %rax # sched: [1:0.25]
-; HSW-NOOPT-NEXT: imulq %rcx, %rax # sched: [3:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT: imulq %rcx, %rax # sched: [4:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [2:1.00]
;
; JAG-NOOPT-LABEL: test_mul_spec:
; JAG-NOOPT: # BB#0:
diff --git a/llvm/test/CodeGen/X86/popcnt-schedule.ll b/llvm/test/CodeGen/X86/popcnt-schedule.ll
index e6257484184..f393e6257b7 100644
--- a/llvm/test/CodeGen/X86/popcnt-schedule.ll
+++ b/llvm/test/CodeGen/X86/popcnt-schedule.ll
@@ -37,11 +37,11 @@ define i16 @test_ctpop_i16(i16 zeroext %a0, i16 *%a1) {
;
; HASWELL-LABEL: test_ctpop_i16:
; HASWELL: # BB#0:
-; HASWELL-NEXT: popcntw (%rsi), %cx # sched: [7:1.00]
+; HASWELL-NEXT: popcntw (%rsi), %cx # sched: [3:1.00]
; HASWELL-NEXT: popcntw %di, %ax # sched: [3:1.00]
; HASWELL-NEXT: orl %ecx, %eax # sched: [1:0.25]
; HASWELL-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_ctpop_i16:
; BTVER2: # BB#0:
@@ -90,10 +90,10 @@ define i32 @test_ctpop_i32(i32 %a0, i32 *%a1) {
;
; HASWELL-LABEL: test_ctpop_i32:
; HASWELL: # BB#0:
-; HASWELL-NEXT: popcntl (%rsi), %ecx # sched: [7:1.00]
+; HASWELL-NEXT: popcntl (%rsi), %ecx # sched: [3:1.00]
; HASWELL-NEXT: popcntl %edi, %eax # sched: [3:1.00]
; HASWELL-NEXT: orl %ecx, %eax # sched: [1:0.25]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_ctpop_i32:
; BTVER2: # BB#0:
@@ -140,10 +140,10 @@ define i64 @test_ctpop_i64(i64 %a0, i64 *%a1) {
;
; HASWELL-LABEL: test_ctpop_i64:
; HASWELL: # BB#0:
-; HASWELL-NEXT: popcntq (%rsi), %rcx # sched: [7:1.00]
+; HASWELL-NEXT: popcntq (%rsi), %rcx # sched: [3:1.00]
; HASWELL-NEXT: popcntq %rdi, %rax # sched: [3:1.00]
; HASWELL-NEXT: orq %rcx, %rax # sched: [1:0.25]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_ctpop_i64:
; BTVER2: # BB#0:
diff --git a/llvm/test/CodeGen/X86/pr32329.ll b/llvm/test/CodeGen/X86/pr32329.ll
index e07340bf301..7983bc968a4 100644
--- a/llvm/test/CodeGen/X86/pr32329.ll
+++ b/llvm/test/CodeGen/X86/pr32329.ll
@@ -36,33 +36,33 @@ define void @foo() local_unnamed_addr {
; X86-NEXT: .cfi_offset %ebx, -12
; X86-NEXT: .Lcfi7:
; X86-NEXT: .cfi_offset %ebp, -8
-; X86-NEXT: movl obj, %edx
; X86-NEXT: movsbl var_27, %eax
-; X86-NEXT: movzwl var_2, %esi
; X86-NEXT: movl var_310, %ecx
; X86-NEXT: imull %eax, %ecx
+; X86-NEXT: movl obj, %esi
; X86-NEXT: addl var_24, %ecx
-; X86-NEXT: andl $4194303, %edx # imm = 0x3FFFFF
-; X86-NEXT: leal (%edx,%edx), %ebx
-; X86-NEXT: subl %eax, %ebx
-; X86-NEXT: movl %ebx, %edi
-; X86-NEXT: subl %esi, %edi
-; X86-NEXT: imull %edi, %ecx
+; X86-NEXT: movzwl var_2, %edi
+; X86-NEXT: andl $4194303, %esi # imm = 0x3FFFFF
+; X86-NEXT: leal (%esi,%esi), %edx
+; X86-NEXT: subl %eax, %edx
+; X86-NEXT: movl %edx, %ebx
+; X86-NEXT: subl %edi, %ebx
+; X86-NEXT: imull %ebx, %ecx
; X86-NEXT: addl $-1437483407, %ecx # imm = 0xAA51BE71
-; X86-NEXT: movl $9, %esi
+; X86-NEXT: movl $9, %edi
; X86-NEXT: xorl %ebp, %ebp
-; X86-NEXT: shldl %cl, %esi, %ebp
-; X86-NEXT: shlxl %ecx, %esi, %esi
+; X86-NEXT: shldl %cl, %edi, %ebp
+; X86-NEXT: shlxl %ecx, %edi, %edi
; X86-NEXT: testb $32, %cl
-; X86-NEXT: cmovnel %esi, %ebp
+; X86-NEXT: cmovnel %edi, %ebp
; X86-NEXT: movl $0, %ecx
-; X86-NEXT: cmovnel %ecx, %esi
-; X86-NEXT: cmpl %edx, %edi
+; X86-NEXT: cmovnel %ecx, %edi
; X86-NEXT: movl %ebp, var_50+4
-; X86-NEXT: movl %esi, var_50
+; X86-NEXT: cmpl %esi, %ebx
; X86-NEXT: setge var_205
-; X86-NEXT: imull %eax, %ebx
-; X86-NEXT: movb %bl, var_218
+; X86-NEXT: imull %eax, %edx
+; X86-NEXT: movl %edi, var_50
+; X86-NEXT: movb %dl, var_218
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
; X86-NEXT: popl %ebx
@@ -71,25 +71,25 @@ define void @foo() local_unnamed_addr {
;
; X64-LABEL: foo:
; X64: # BB#0: # %entry
-; X64-NEXT: movl {{.*}}(%rip), %eax
-; X64-NEXT: movsbl {{.*}}(%rip), %r9d
+; X64-NEXT: movsbl {{.*}}(%rip), %eax
+; X64-NEXT: movl {{.*}}(%rip), %ecx
+; X64-NEXT: imull %eax, %ecx
+; X64-NEXT: movl {{.*}}(%rip), %edx
+; X64-NEXT: addl {{.*}}(%rip), %ecx
; X64-NEXT: movzwl {{.*}}(%rip), %r8d
-; X64-NEXT: movl {{.*}}(%rip), %esi
-; X64-NEXT: imull %r9d, %esi
-; X64-NEXT: addl {{.*}}(%rip), %esi
-; X64-NEXT: andl $4194303, %eax # imm = 0x3FFFFF
-; X64-NEXT: leal (%rax,%rax), %edi
-; X64-NEXT: subl %r9d, %edi
-; X64-NEXT: movl %edi, %edx
-; X64-NEXT: subl %r8d, %edx
-; X64-NEXT: imull %edx, %esi
-; X64-NEXT: addl $-1437483407, %esi # imm = 0xAA51BE71
-; X64-NEXT: movl $9, %ecx
-; X64-NEXT: shlxq %rsi, %rcx, %rcx
-; X64-NEXT: movq %rcx, {{.*}}(%rip)
-; X64-NEXT: cmpl %eax, %edx
+; X64-NEXT: andl $4194303, %edx # imm = 0x3FFFFF
+; X64-NEXT: leal (%rdx,%rdx), %edi
+; X64-NEXT: subl %eax, %edi
+; X64-NEXT: movl %edi, %esi
+; X64-NEXT: subl %r8d, %esi
+; X64-NEXT: imull %esi, %ecx
+; X64-NEXT: addl $-1437483407, %ecx # imm = 0xAA51BE71
+; X64-NEXT: movl $9, %r8d
+; X64-NEXT: cmpl %edx, %esi
; X64-NEXT: setge {{.*}}(%rip)
-; X64-NEXT: imull %r9d, %edi
+; X64-NEXT: shlxq %rcx, %r8, %rcx
+; X64-NEXT: imull %eax, %edi
+; X64-NEXT: movq %rcx, {{.*}}(%rip)
; X64-NEXT: movb %dil, {{.*}}(%rip)
; X64-NEXT: retq
entry:
diff --git a/llvm/test/CodeGen/X86/recip-fastmath.ll b/llvm/test/CodeGen/X86/recip-fastmath.ll
index 437ac3c4f2c..e33c688b0a0 100644
--- a/llvm/test/CodeGen/X86/recip-fastmath.ll
+++ b/llvm/test/CodeGen/X86/recip-fastmath.ll
@@ -51,9 +51,9 @@ define float @f32_no_estimate(float %x) #0 {
;
; HASWELL-LABEL: f32_no_estimate:
; HASWELL: # BB#0:
-; HASWELL-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [4:0.50]
-; HASWELL-NEXT: vdivss %xmm0, %xmm1, %xmm0 # sched: [12:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [1:0.50]
+; HASWELL-NEXT: vdivss %xmm0, %xmm1, %xmm0 # sched: [13:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; HASWELL-NO-FMA-LABEL: f32_no_estimate:
; HASWELL-NO-FMA: # BB#0:
@@ -63,9 +63,9 @@ define float @f32_no_estimate(float %x) #0 {
;
; AVX512-LABEL: f32_no_estimate:
; AVX512: # BB#0:
-; AVX512-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [4:0.50]
-; AVX512-NEXT: vdivss %xmm0, %xmm1, %xmm0 # sched: [12:1.00]
-; AVX512-NEXT: retq # sched: [1:1.00]
+; AVX512-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [1:0.50]
+; AVX512-NEXT: vdivss %xmm0, %xmm1, %xmm0 # sched: [13:1.00]
+; AVX512-NEXT: retq # sched: [2:1.00]
%div = fdiv fast float 1.0, %x
ret float %div
}
@@ -122,9 +122,9 @@ define float @f32_one_step(float %x) #1 {
; HASWELL-LABEL: f32_one_step:
; HASWELL: # BB#0:
; HASWELL-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
-; HASWELL-NEXT: vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0
-; HASWELL-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm0
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; HASWELL-NO-FMA-LABEL: f32_one_step:
; HASWELL-NO-FMA: # BB#0:
@@ -139,9 +139,9 @@ define float @f32_one_step(float %x) #1 {
; AVX512-LABEL: f32_one_step:
; AVX512: # BB#0:
; AVX512-NEXT: vrcp14ss %xmm0, %xmm0, %xmm1
-; AVX512-NEXT: vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0
-; AVX512-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm0
-; AVX512-NEXT: retq # sched: [1:1.00]
+; AVX512-NEXT: vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0 # sched: [5:0.50]
+; AVX512-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm0 # sched: [5:0.50]
+; AVX512-NEXT: retq # sched: [2:1.00]
%div = fdiv fast float 1.0, %x
ret float %div
}
@@ -220,13 +220,13 @@ define float @f32_two_step(float %x) #2 {
; HASWELL-LABEL: f32_two_step:
; HASWELL: # BB#0:
; HASWELL-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
-; HASWELL-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [4:0.50]
+; HASWELL-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [1:0.50]
; HASWELL-NEXT: vmovaps %xmm1, %xmm3 # sched: [1:1.00]
-; HASWELL-NEXT: vfnmadd213ss %xmm2, %xmm0, %xmm3
-; HASWELL-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm3
-; HASWELL-NEXT: vfnmadd213ss %xmm2, %xmm3, %xmm0
-; HASWELL-NEXT: vfmadd132ss %xmm3, %xmm3, %xmm0
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vfnmadd213ss %xmm2, %xmm0, %xmm3 # sched: [5:0.50]
+; HASWELL-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm3 # sched: [5:0.50]
+; HASWELL-NEXT: vfnmadd213ss %xmm2, %xmm3, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT: vfmadd132ss %xmm3, %xmm3, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; HASWELL-NO-FMA-LABEL: f32_two_step:
; HASWELL-NO-FMA: # BB#0:
@@ -245,13 +245,13 @@ define float @f32_two_step(float %x) #2 {
; AVX512-LABEL: f32_two_step:
; AVX512: # BB#0:
; AVX512-NEXT: vrcp14ss %xmm0, %xmm0, %xmm1
-; AVX512-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [4:0.50]
+; AVX512-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [1:0.50]
; AVX512-NEXT: vmovaps %xmm1, %xmm3 # sched: [1:1.00]
-; AVX512-NEXT: vfnmadd213ss %xmm2, %xmm0, %xmm3
-; AVX512-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm3
-; AVX512-NEXT: vfnmadd213ss %xmm2, %xmm3, %xmm0
-; AVX512-NEXT: vfmadd132ss %xmm3, %xmm3, %xmm0
-; AVX512-NEXT: retq # sched: [1:1.00]
+; AVX512-NEXT: vfnmadd213ss %xmm2, %xmm0, %xmm3 # sched: [5:0.50]
+; AVX512-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm3 # sched: [5:0.50]
+; AVX512-NEXT: vfnmadd213ss %xmm2, %xmm3, %xmm0 # sched: [5:0.50]
+; AVX512-NEXT: vfmadd132ss %xmm3, %xmm3, %xmm0 # sched: [5:0.50]
+; AVX512-NEXT: retq # sched: [2:1.00]
%div = fdiv fast float 1.0, %x
ret float %div
}
@@ -290,9 +290,9 @@ define <4 x float> @v4f32_no_estimate(<4 x float> %x) #0 {
;
; HASWELL-LABEL: v4f32_no_estimate:
; HASWELL: # BB#0:
-; HASWELL-NEXT: vbroadcastss {{.*#+}} xmm1 = [1,1,1,1] sched: [4:0.50]
-; HASWELL-NEXT: vdivps %xmm0, %xmm1, %xmm0 # sched: [12:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vbroadcastss {{.*#+}} xmm1 = [1,1,1,1] sched: [1:0.50]
+; HASWELL-NEXT: vdivps %xmm0, %xmm1, %xmm0 # sched: [13:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; HASWELL-NO-FMA-LABEL: v4f32_no_estimate:
; HASWELL-NO-FMA: # BB#0:
@@ -302,9 +302,9 @@ define <4 x float> @v4f32_no_estimate(<4 x float> %x) #0 {
;
; AVX512-LABEL: v4f32_no_estimate:
; AVX512: # BB#0:
-; AVX512-NEXT: vbroadcastss {{.*#+}} xmm1 = [1,1,1,1] sched: [4:0.50]
-; AVX512-NEXT: vdivps %xmm0, %xmm1, %xmm0 # sched: [12:1.00]
-; AVX512-NEXT: retq # sched: [1:1.00]
+; AVX512-NEXT: vbroadcastss {{.*#+}} xmm1 = [1,1,1,1] sched: [1:0.50]
+; AVX512-NEXT: vdivps %xmm0, %xmm1, %xmm0 # sched: [13:1.00]
+; AVX512-NEXT: retq # sched: [2:1.00]
%div = fdiv fast <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, %x
ret <4 x float> %div
}
@@ -361,10 +361,10 @@ define <4 x float> @v4f32_one_step(<4 x float> %x) #1 {
; HASWELL-LABEL: v4f32_one_step:
; HASWELL: # BB#0:
; HASWELL-NEXT: vrcpps %xmm0, %xmm1 # sched: [5:1.00]
-; HASWELL-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [4:0.50]
-; HASWELL-NEXT: vfnmadd213ps %xmm2, %xmm1, %xmm0
-; HASWELL-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm0
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [1:0.50]
+; HASWELL-NEXT: vfnmadd213ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; HASWELL-NO-FMA-LABEL: v4f32_one_step:
; HASWELL-NO-FMA: # BB#0:
@@ -379,17 +379,17 @@ define <4 x float> @v4f32_one_step(<4 x float> %x) #1 {
; KNL-LABEL: v4f32_one_step:
; KNL: # BB#0:
; KNL-NEXT: vrcpps %xmm0, %xmm1 # sched: [5:1.00]
-; KNL-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [4:0.50]
-; KNL-NEXT: vfnmadd213ps %xmm2, %xmm1, %xmm0
-; KNL-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm0
-; KNL-NEXT: retq # sched: [1:1.00]
+; KNL-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [1:0.50]
+; KNL-NEXT: vfnmadd213ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; KNL-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm0 # sched: [5:0.50]
+; KNL-NEXT: retq # sched: [2:1.00]
;
; SKX-LABEL: v4f32_one_step:
; SKX: # BB#0:
; SKX-NEXT: vrcp14ps %xmm0, %xmm1
; SKX-NEXT: vfnmadd213ps {{.*}}(%rip){1to4}, %xmm1, %xmm0
-; SKX-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm0
-; SKX-NEXT: retq # sched: [1:1.00]
+; SKX-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm0 # sched: [5:0.50]
+; SKX-NEXT: retq # sched: [2:1.00]
%div = fdiv fast <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, %x
ret <4 x float> %div
}
@@ -468,13 +468,13 @@ define <4 x float> @v4f32_two_step(<4 x float> %x) #2 {
; HASWELL-LABEL: v4f32_two_step:
; HASWELL: # BB#0:
; HASWELL-NEXT: vrcpps %xmm0, %xmm1 # sched: [5:1.00]
-; HASWELL-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [4:0.50]
+; HASWELL-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [1:0.50]
; HASWELL-NEXT: vmovaps %xmm1, %xmm3 # sched: [1:1.00]
-; HASWELL-NEXT: vfnmadd213ps %xmm2, %xmm0, %xmm3
-; HASWELL-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm3
-; HASWELL-NEXT: vfnmadd213ps %xmm2, %xmm3, %xmm0
-; HASWELL-NEXT: vfmadd132ps %xmm3, %xmm3, %xmm0
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vfnmadd213ps %xmm2, %xmm0, %xmm3 # sched: [5:0.50]
+; HASWELL-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm3 # sched: [5:0.50]
+; HASWELL-NEXT: vfnmadd213ps %xmm2, %xmm3, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT: vfmadd132ps %xmm3, %xmm3, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; HASWELL-NO-FMA-LABEL: v4f32_two_step:
; HASWELL-NO-FMA: # BB#0:
@@ -493,24 +493,24 @@ define <4 x float> @v4f32_two_step(<4 x float> %x) #2 {
; KNL-LABEL: v4f32_two_step:
; KNL: # BB#0:
; KNL-NEXT: vrcpps %xmm0, %xmm1 # sched: [5:1.00]
-; KNL-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [4:0.50]
+; KNL-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [1:0.50]
; KNL-NEXT: vmovaps %xmm1, %xmm3 # sched: [1:1.00]
-; KNL-NEXT: vfnmadd213ps %xmm2, %xmm0, %xmm3
-; KNL-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm3
-; KNL-NEXT: vfnmadd213ps %xmm2, %xmm3, %xmm0
-; KNL-NEXT: vfmadd132ps %xmm3, %xmm3, %xmm0
-; KNL-NEXT: retq # sched: [1:1.00]
+; KNL-NEXT: vfnmadd213ps %xmm2, %xmm0, %xmm3 # sched: [5:0.50]
+; KNL-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm3 # sched: [5:0.50]
+; KNL-NEXT: vfnmadd213ps %xmm2, %xmm3, %xmm0 # sched: [5:0.50]
+; KNL-NEXT: vfmadd132ps %xmm3, %xmm3, %xmm0 # sched: [5:0.50]
+; KNL-NEXT: retq # sched: [2:1.00]
;
; SKX-LABEL: v4f32_two_step:
; SKX: # BB#0:
; SKX-NEXT: vrcp14ps %xmm0, %xmm1
-; SKX-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [4:0.50]
+; SKX-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [1:0.50]
; SKX-NEXT: vmovaps %xmm1, %xmm3 # sched: [1:1.00]
-; SKX-NEXT: vfnmadd213ps %xmm2, %xmm0, %xmm3
-; SKX-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm3
-; SKX-NEXT: vfnmadd213ps %xmm2, %xmm3, %xmm0
-; SKX-NEXT: vfmadd132ps %xmm3, %xmm3, %xmm0
-; SKX-NEXT: retq # sched: [1:1.00]
+; SKX-NEXT: vfnmadd213ps %xmm2, %xmm0, %xmm3 # sched: [5:0.50]
+; SKX-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm3 # sched: [5:0.50]
+; SKX-NEXT: vfnmadd213ps %xmm2, %xmm3, %xmm0 # sched: [5:0.50]
+; SKX-NEXT: vfmadd132ps %xmm3, %xmm3, %xmm0 # sched: [5:0.50]
+; SKX-NEXT: retq # sched: [2:1.00]
%div = fdiv fast <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, %x
ret <4 x float> %div
}
@@ -552,9 +552,9 @@ define <8 x float> @v8f32_no_estimate(<8 x float> %x) #0 {
;
; HASWELL-LABEL: v8f32_no_estimate:
; HASWELL: # BB#0:
-; HASWELL-NEXT: vbroadcastss {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1] sched: [5:1.00]
-; HASWELL-NEXT: vdivps %ymm0, %ymm1, %ymm0 # sched: [19:2.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vbroadcastss {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1] sched: [1:0.50]
+; HASWELL-NEXT: vdivps %ymm0, %ymm1, %ymm0 # sched: [21:2.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; HASWELL-NO-FMA-LABEL: v8f32_no_estimate:
; HASWELL-NO-FMA: # BB#0:
@@ -564,9 +564,9 @@ define <8 x float> @v8f32_no_estimate(<8 x float> %x) #0 {
;
; AVX512-LABEL: v8f32_no_estimate:
; AVX512: # BB#0:
-; AVX512-NEXT: vbroadcastss {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1] sched: [5:1.00]
-; AVX512-NEXT: vdivps %ymm0, %ymm1, %ymm0 # sched: [19:2.00]
-; AVX512-NEXT: retq # sched: [1:1.00]
+; AVX512-NEXT: vbroadcastss {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1] sched: [1:0.50]
+; AVX512-NEXT: vdivps %ymm0, %ymm1, %ymm0 # sched: [21:2.00]
+; AVX512-NEXT: retq # sched: [2:1.00]
%div = fdiv fast <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %x
ret <8 x float> %div
}
@@ -629,11 +629,11 @@ define <8 x float> @v8f32_one_step(<8 x float> %x) #1 {
;
; HASWELL-LABEL: v8f32_one_step:
; HASWELL: # BB#0:
-; HASWELL-NEXT: vrcpps %ymm0, %ymm1 # sched: [7:2.00]
-; HASWELL-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [5:1.00]
-; HASWELL-NEXT: vfnmadd213ps %ymm2, %ymm1, %ymm0
-; HASWELL-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm0
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vrcpps %ymm0, %ymm1 # sched: [11:2.00]
+; HASWELL-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [1:0.50]
+; HASWELL-NEXT: vfnmadd213ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; HASWELL-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm0 # sched: [5:0.50]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; HASWELL-NO-FMA-LABEL: v8f32_one_step:
; HASWELL-NO-FMA: # BB#0:
@@ -647,18 +647,18 @@ define <8 x float> @v8f32_one_step(<8 x float> %x) #1 {
;
; KNL-LABEL: v8f32_one_step:
; KNL: # BB#0:
-; KNL-NEXT: vrcpps %ymm0, %ymm1 # sched: [7:2.00]
-; KNL-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [5:1.00]
-; KNL-NEXT: vfnmadd213ps %ymm2, %ymm1, %ymm0
-; KNL-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm0
-; KNL-NEXT: retq # sched: [1:1.00]
+; KNL-NEXT: vrcpps %ymm0, %ymm1 # sched: [11:2.00]
+; KNL-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [1:0.50]
+; KNL-NEXT: vfnmadd213ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; KNL-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm0 # sched: [5:0.50]
+; KNL-NEXT: retq # sched: [2:1.00]
;
; SKX-LABEL: v8f32_one_step:
; SKX: # BB#0:
; SKX-NEXT: vrcp14ps %ymm0, %ymm1
; SKX-NEXT: vfnmadd213ps {{.*}}(%rip){1to8}, %ymm1, %ymm0
-; SKX-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm0
-; SKX-NEXT: retq # sched: [1:1.00]
+; SKX-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm0 # sched: [5:0.50]
+; SKX-NEXT: retq # sched: [2:1.00]
%div = fdiv fast <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %x
ret <8 x float> %div
}
@@ -749,14 +749,14 @@ define <8 x float> @v8f32_two_step(<8 x float> %x) #2 {
;
; HASWELL-LABEL: v8f32_two_step:
; HASWELL: # BB#0:
-; HASWELL-NEXT: vrcpps %ymm0, %ymm1 # sched: [7:2.00]
-; HASWELL-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [5:1.00]
+; HASWELL-NEXT: vrcpps %ymm0, %ymm1 # sched: [11:2.00]
+; HASWELL-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [1:0.50]
; HASWELL-NEXT: vmovaps %ymm1, %ymm3 # sched: [1:1.00]
-; HASWELL-NEXT: vfnmadd213ps %ymm2, %ymm0, %ymm3
-; HASWELL-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm3
-; HASWELL-NEXT: vfnmadd213ps %ymm2, %ymm3, %ymm0
-; HASWELL-NEXT: vfmadd132ps %ymm3, %ymm3, %ymm0
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vfnmadd213ps %ymm2, %ymm0, %ymm3 # sched: [5:0.50]
+; HASWELL-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm3 # sched: [5:0.50]
+; HASWELL-NEXT: vfnmadd213ps %ymm2, %ymm3, %ymm0 # sched: [5:0.50]
+; HASWELL-NEXT: vfmadd132ps %ymm3, %ymm3, %ymm0 # sched: [5:0.50]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; HASWELL-NO-FMA-LABEL: v8f32_two_step:
; HASWELL-NO-FMA: # BB#0:
@@ -774,25 +774,25 @@ define <8 x float> @v8f32_two_step(<8 x float> %x) #2 {
;
; KNL-LABEL: v8f32_two_step:
; KNL: # BB#0:
-; KNL-NEXT: vrcpps %ymm0, %ymm1 # sched: [7:2.00]
-; KNL-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [5:1.00]
+; KNL-NEXT: vrcpps %ymm0, %ymm1 # sched: [11:2.00]
+; KNL-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [1:0.50]
; KNL-NEXT: vmovaps %ymm1, %ymm3 # sched: [1:1.00]
-; KNL-NEXT: vfnmadd213ps %ymm2, %ymm0, %ymm3
-; KNL-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm3
-; KNL-NEXT: vfnmadd213ps %ymm2, %ymm3, %ymm0
-; KNL-NEXT: vfmadd132ps %ymm3, %ymm3, %ymm0
-; KNL-NEXT: retq # sched: [1:1.00]
+; KNL-NEXT: vfnmadd213ps %ymm2, %ymm0, %ymm3 # sched: [5:0.50]
+; KNL-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm3 # sched: [5:0.50]
+; KNL-NEXT: vfnmadd213ps %ymm2, %ymm3, %ymm0 # sched: [5:0.50]
+; KNL-NEXT: vfmadd132ps %ymm3, %ymm3, %ymm0 # sched: [5:0.50]
+; KNL-NEXT: retq # sched: [2:1.00]
;
; SKX-LABEL: v8f32_two_step:
; SKX: # BB#0:
; SKX-NEXT: vrcp14ps %ymm0, %ymm1
-; SKX-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [5:1.00]
+; SKX-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [1:0.50]
; SKX-NEXT: vmovaps %ymm1, %ymm3 # sched: [1:1.00]
-; SKX-NEXT: vfnmadd213ps %ymm2, %ymm0, %ymm3
-; SKX-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm3
-; SKX-NEXT: vfnmadd213ps %ymm2, %ymm3, %ymm0
-; SKX-NEXT: vfmadd132ps %ymm3, %ymm3, %ymm0
-; SKX-NEXT: retq # sched: [1:1.00]
+; SKX-NEXT: vfnmadd213ps %ymm2, %ymm0, %ymm3 # sched: [5:0.50]
+; SKX-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm3 # sched: [5:0.50]
+; SKX-NEXT: vfnmadd213ps %ymm2, %ymm3, %ymm0 # sched: [5:0.50]
+; SKX-NEXT: vfmadd132ps %ymm3, %ymm3, %ymm0 # sched: [5:0.50]
+; SKX-NEXT: retq # sched: [2:1.00]
%div = fdiv fast <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %x
ret <8 x float> %div
}
diff --git a/llvm/test/CodeGen/X86/recip-fastmath2.ll b/llvm/test/CodeGen/X86/recip-fastmath2.ll
index 7e159c09f96..ac63974d3ad 100644
--- a/llvm/test/CodeGen/X86/recip-fastmath2.ll
+++ b/llvm/test/CodeGen/X86/recip-fastmath2.ll
@@ -45,20 +45,20 @@ define float @f32_no_step_2(float %x) #3 {
; HASWELL-LABEL: f32_no_step_2:
; HASWELL: # BB#0:
; HASWELL-NEXT: vrcpss %xmm0, %xmm0, %xmm0 # sched: [5:1.00]
-; HASWELL-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [9:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; HASWELL-NO-FMA-LABEL: f32_no_step_2:
; HASWELL-NO-FMA: # BB#0:
; HASWELL-NO-FMA-NEXT: vrcpss %xmm0, %xmm0, %xmm0 # sched: [5:1.00]
-; HASWELL-NO-FMA-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [9:0.50]
-; HASWELL-NO-FMA-NEXT: retq # sched: [1:1.00]
+; HASWELL-NO-FMA-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [5:0.50]
+; HASWELL-NO-FMA-NEXT: retq # sched: [2:1.00]
;
; AVX512-LABEL: f32_no_step_2:
; AVX512: # BB#0:
; AVX512-NEXT: vrcp14ss %xmm0, %xmm0, %xmm0
-; AVX512-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [9:0.50]
-; AVX512-NEXT: retq # sched: [1:1.00]
+; AVX512-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [5:0.50]
+; AVX512-NEXT: retq # sched: [2:1.00]
%div = fdiv fast float 1234.0, %x
ret float %div
}
@@ -120,29 +120,29 @@ define float @f32_one_step_2(float %x) #1 {
; HASWELL-LABEL: f32_one_step_2:
; HASWELL: # BB#0:
; HASWELL-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
-; HASWELL-NEXT: vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0
-; HASWELL-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm0
-; HASWELL-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [9:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; HASWELL-NO-FMA-LABEL: f32_one_step_2:
; HASWELL-NO-FMA: # BB#0:
; HASWELL-NO-FMA-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
; HASWELL-NO-FMA-NEXT: vmulss %xmm1, %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NO-FMA-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [4:0.50]
+; HASWELL-NO-FMA-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [1:0.50]
; HASWELL-NO-FMA-NEXT: vsubss %xmm0, %xmm2, %xmm0 # sched: [3:1.00]
; HASWELL-NO-FMA-NEXT: vmulss %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
; HASWELL-NO-FMA-NEXT: vaddss %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; HASWELL-NO-FMA-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [9:0.50]
-; HASWELL-NO-FMA-NEXT: retq # sched: [1:1.00]
+; HASWELL-NO-FMA-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [5:0.50]
+; HASWELL-NO-FMA-NEXT: retq # sched: [2:1.00]
;
; AVX512-LABEL: f32_one_step_2:
; AVX512: # BB#0:
; AVX512-NEXT: vrcp14ss %xmm0, %xmm0, %xmm1
-; AVX512-NEXT: vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0
-; AVX512-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm0
-; AVX512-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [9:0.50]
-; AVX512-NEXT: retq # sched: [1:1.00]
+; AVX512-NEXT: vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0 # sched: [5:0.50]
+; AVX512-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm0 # sched: [5:0.50]
+; AVX512-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [5:0.50]
+; AVX512-NEXT: retq # sched: [2:1.00]
%div = fdiv fast float 3456.0, %x
ret float %div
}
@@ -209,32 +209,32 @@ define float @f32_one_step_2_divs(float %x) #1 {
; HASWELL-LABEL: f32_one_step_2_divs:
; HASWELL: # BB#0:
; HASWELL-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
-; HASWELL-NEXT: vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0
-; HASWELL-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm0
-; HASWELL-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm1 # sched: [9:0.50]
+; HASWELL-NEXT: vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm1 # sched: [5:0.50]
; HASWELL-NEXT: vmulss %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; HASWELL-NO-FMA-LABEL: f32_one_step_2_divs:
; HASWELL-NO-FMA: # BB#0:
; HASWELL-NO-FMA-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
; HASWELL-NO-FMA-NEXT: vmulss %xmm1, %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NO-FMA-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [4:0.50]
+; HASWELL-NO-FMA-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [1:0.50]
; HASWELL-NO-FMA-NEXT: vsubss %xmm0, %xmm2, %xmm0 # sched: [3:1.00]
; HASWELL-NO-FMA-NEXT: vmulss %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
; HASWELL-NO-FMA-NEXT: vaddss %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; HASWELL-NO-FMA-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm1 # sched: [9:0.50]
+; HASWELL-NO-FMA-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm1 # sched: [5:0.50]
; HASWELL-NO-FMA-NEXT: vmulss %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
-; HASWELL-NO-FMA-NEXT: retq # sched: [1:1.00]
+; HASWELL-NO-FMA-NEXT: retq # sched: [2:1.00]
;
; AVX512-LABEL: f32_one_step_2_divs:
; AVX512: # BB#0:
; AVX512-NEXT: vrcp14ss %xmm0, %xmm0, %xmm1
-; AVX512-NEXT: vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0
-; AVX512-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm0
-; AVX512-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm1 # sched: [9:0.50]
+; AVX512-NEXT: vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0 # sched: [5:0.50]
+; AVX512-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm0 # sched: [5:0.50]
+; AVX512-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm1 # sched: [5:0.50]
; AVX512-NEXT: vmulss %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
-; AVX512-NEXT: retq # sched: [1:1.00]
+; AVX512-NEXT: retq # sched: [2:1.00]
%div = fdiv fast float 3456.0, %x
%div2 = fdiv fast float %div, %x
ret float %div2
@@ -319,20 +319,20 @@ define float @f32_two_step_2(float %x) #2 {
; HASWELL-LABEL: f32_two_step_2:
; HASWELL: # BB#0:
; HASWELL-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
-; HASWELL-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [4:0.50]
+; HASWELL-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [1:0.50]
; HASWELL-NEXT: vmovaps %xmm1, %xmm3 # sched: [1:1.00]
-; HASWELL-NEXT: vfnmadd213ss %xmm2, %xmm0, %xmm3
-; HASWELL-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm3
-; HASWELL-NEXT: vfnmadd213ss %xmm2, %xmm3, %xmm0
-; HASWELL-NEXT: vfmadd132ss %xmm3, %xmm3, %xmm0
-; HASWELL-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [9:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vfnmadd213ss %xmm2, %xmm0, %xmm3 # sched: [5:0.50]
+; HASWELL-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm3 # sched: [5:0.50]
+; HASWELL-NEXT: vfnmadd213ss %xmm2, %xmm3, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT: vfmadd132ss %xmm3, %xmm3, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; HASWELL-NO-FMA-LABEL: f32_two_step_2:
; HASWELL-NO-FMA: # BB#0:
; HASWELL-NO-FMA-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
; HASWELL-NO-FMA-NEXT: vmulss %xmm1, %xmm0, %xmm2 # sched: [5:0.50]
-; HASWELL-NO-FMA-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero sched: [4:0.50]
+; HASWELL-NO-FMA-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero sched: [1:0.50]
; HASWELL-NO-FMA-NEXT: vsubss %xmm2, %xmm3, %xmm2 # sched: [3:1.00]
; HASWELL-NO-FMA-NEXT: vmulss %xmm2, %xmm1, %xmm2 # sched: [5:0.50]
; HASWELL-NO-FMA-NEXT: vaddss %xmm2, %xmm1, %xmm1 # sched: [3:1.00]
@@ -340,20 +340,20 @@ define float @f32_two_step_2(float %x) #2 {
; HASWELL-NO-FMA-NEXT: vsubss %xmm0, %xmm3, %xmm0 # sched: [3:1.00]
; HASWELL-NO-FMA-NEXT: vmulss %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
; HASWELL-NO-FMA-NEXT: vaddss %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; HASWELL-NO-FMA-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [9:0.50]
-; HASWELL-NO-FMA-NEXT: retq # sched: [1:1.00]
+; HASWELL-NO-FMA-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [5:0.50]
+; HASWELL-NO-FMA-NEXT: retq # sched: [2:1.00]
;
; AVX512-LABEL: f32_two_step_2:
; AVX512: # BB#0:
; AVX512-NEXT: vrcp14ss %xmm0, %xmm0, %xmm1
-; AVX512-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [4:0.50]
+; AVX512-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [1:0.50]
; AVX512-NEXT: vmovaps %xmm1, %xmm3 # sched: [1:1.00]
-; AVX512-NEXT: vfnmadd213ss %xmm2, %xmm0, %xmm3
-; AVX512-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm3
-; AVX512-NEXT: vfnmadd213ss %xmm2, %xmm3, %xmm0
-; AVX512-NEXT: vfmadd132ss %xmm3, %xmm3, %xmm0
-; AVX512-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [9:0.50]
-; AVX512-NEXT: retq # sched: [1:1.00]
+; AVX512-NEXT: vfnmadd213ss %xmm2, %xmm0, %xmm3 # sched: [5:0.50]
+; AVX512-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm3 # sched: [5:0.50]
+; AVX512-NEXT: vfnmadd213ss %xmm2, %xmm3, %xmm0 # sched: [5:0.50]
+; AVX512-NEXT: vfmadd132ss %xmm3, %xmm3, %xmm0 # sched: [5:0.50]
+; AVX512-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [5:0.50]
+; AVX512-NEXT: retq # sched: [2:1.00]
%div = fdiv fast float 6789.0, %x
ret float %div
}
@@ -415,39 +415,39 @@ define <4 x float> @v4f32_one_step2(<4 x float> %x) #1 {
; HASWELL-LABEL: v4f32_one_step2:
; HASWELL: # BB#0:
; HASWELL-NEXT: vrcpps %xmm0, %xmm1 # sched: [5:1.00]
-; HASWELL-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [4:0.50]
-; HASWELL-NEXT: vfnmadd213ps %xmm2, %xmm1, %xmm0
-; HASWELL-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm0
-; HASWELL-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [9:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [1:0.50]
+; HASWELL-NEXT: vfnmadd213ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; HASWELL-NO-FMA-LABEL: v4f32_one_step2:
; HASWELL-NO-FMA: # BB#0:
; HASWELL-NO-FMA-NEXT: vrcpps %xmm0, %xmm1 # sched: [5:1.00]
; HASWELL-NO-FMA-NEXT: vmulps %xmm1, %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [4:0.50]
+; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [1:0.50]
; HASWELL-NO-FMA-NEXT: vsubps %xmm0, %xmm2, %xmm0 # sched: [3:1.00]
; HASWELL-NO-FMA-NEXT: vmulps %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
; HASWELL-NO-FMA-NEXT: vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; HASWELL-NO-FMA-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [9:0.50]
-; HASWELL-NO-FMA-NEXT: retq # sched: [1:1.00]
+; HASWELL-NO-FMA-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [5:0.50]
+; HASWELL-NO-FMA-NEXT: retq # sched: [2:1.00]
;
; KNL-LABEL: v4f32_one_step2:
; KNL: # BB#0:
; KNL-NEXT: vrcpps %xmm0, %xmm1 # sched: [5:1.00]
-; KNL-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [4:0.50]
-; KNL-NEXT: vfnmadd213ps %xmm2, %xmm1, %xmm0
-; KNL-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm0
-; KNL-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [9:0.50]
-; KNL-NEXT: retq # sched: [1:1.00]
+; KNL-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [1:0.50]
+; KNL-NEXT: vfnmadd213ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; KNL-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm0 # sched: [5:0.50]
+; KNL-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [5:0.50]
+; KNL-NEXT: retq # sched: [2:1.00]
;
; SKX-LABEL: v4f32_one_step2:
; SKX: # BB#0:
; SKX-NEXT: vrcp14ps %xmm0, %xmm1
; SKX-NEXT: vfnmadd213ps {{.*}}(%rip){1to4}, %xmm1, %xmm0
-; SKX-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm0
-; SKX-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [9:0.50]
-; SKX-NEXT: retq # sched: [1:1.00]
+; SKX-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm0 # sched: [5:0.50]
+; SKX-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [5:0.50]
+; SKX-NEXT: retq # sched: [2:1.00]
%div = fdiv fast <4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, %x
ret <4 x float> %div
}
@@ -514,43 +514,43 @@ define <4 x float> @v4f32_one_step_2_divs(<4 x float> %x) #1 {
; HASWELL-LABEL: v4f32_one_step_2_divs:
; HASWELL: # BB#0:
; HASWELL-NEXT: vrcpps %xmm0, %xmm1 # sched: [5:1.00]
-; HASWELL-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [4:0.50]
-; HASWELL-NEXT: vfnmadd213ps %xmm2, %xmm1, %xmm0
-; HASWELL-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm0
-; HASWELL-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm1 # sched: [9:0.50]
+; HASWELL-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [1:0.50]
+; HASWELL-NEXT: vfnmadd213ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm1 # sched: [5:0.50]
; HASWELL-NEXT: vmulps %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; HASWELL-NO-FMA-LABEL: v4f32_one_step_2_divs:
; HASWELL-NO-FMA: # BB#0:
; HASWELL-NO-FMA-NEXT: vrcpps %xmm0, %xmm1 # sched: [5:1.00]
; HASWELL-NO-FMA-NEXT: vmulps %xmm1, %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [4:0.50]
+; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [1:0.50]
; HASWELL-NO-FMA-NEXT: vsubps %xmm0, %xmm2, %xmm0 # sched: [3:1.00]
; HASWELL-NO-FMA-NEXT: vmulps %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
; HASWELL-NO-FMA-NEXT: vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; HASWELL-NO-FMA-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm1 # sched: [9:0.50]
+; HASWELL-NO-FMA-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm1 # sched: [5:0.50]
; HASWELL-NO-FMA-NEXT: vmulps %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
-; HASWELL-NO-FMA-NEXT: retq # sched: [1:1.00]
+; HASWELL-NO-FMA-NEXT: retq # sched: [2:1.00]
;
; KNL-LABEL: v4f32_one_step_2_divs:
; KNL: # BB#0:
; KNL-NEXT: vrcpps %xmm0, %xmm1 # sched: [5:1.00]
-; KNL-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [4:0.50]
-; KNL-NEXT: vfnmadd213ps %xmm2, %xmm1, %xmm0
-; KNL-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm0
-; KNL-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm1 # sched: [9:0.50]
+; KNL-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [1:0.50]
+; KNL-NEXT: vfnmadd213ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; KNL-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm0 # sched: [5:0.50]
+; KNL-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm1 # sched: [5:0.50]
; KNL-NEXT: vmulps %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
-; KNL-NEXT: retq # sched: [1:1.00]
+; KNL-NEXT: retq # sched: [2:1.00]
;
; SKX-LABEL: v4f32_one_step_2_divs:
; SKX: # BB#0:
; SKX-NEXT: vrcp14ps %xmm0, %xmm1
; SKX-NEXT: vfnmadd213ps {{.*}}(%rip){1to4}, %xmm1, %xmm0
-; SKX-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm0
-; SKX-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm1 # sched: [9:0.50]
+; SKX-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm0 # sched: [5:0.50]
+; SKX-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm1 # sched: [5:0.50]
; SKX-NEXT: vmulps %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
-; SKX-NEXT: retq # sched: [1:1.00]
+; SKX-NEXT: retq # sched: [2:1.00]
%div = fdiv fast <4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, %x
%div2 = fdiv fast <4 x float> %div, %x
ret <4 x float> %div2
@@ -635,20 +635,20 @@ define <4 x float> @v4f32_two_step2(<4 x float> %x) #2 {
; HASWELL-LABEL: v4f32_two_step2:
; HASWELL: # BB#0:
; HASWELL-NEXT: vrcpps %xmm0, %xmm1 # sched: [5:1.00]
-; HASWELL-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [4:0.50]
+; HASWELL-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [1:0.50]
; HASWELL-NEXT: vmovaps %xmm1, %xmm3 # sched: [1:1.00]
-; HASWELL-NEXT: vfnmadd213ps %xmm2, %xmm0, %xmm3
-; HASWELL-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm3
-; HASWELL-NEXT: vfnmadd213ps %xmm2, %xmm3, %xmm0
-; HASWELL-NEXT: vfmadd132ps %xmm3, %xmm3, %xmm0
-; HASWELL-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [9:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vfnmadd213ps %xmm2, %xmm0, %xmm3 # sched: [5:0.50]
+; HASWELL-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm3 # sched: [5:0.50]
+; HASWELL-NEXT: vfnmadd213ps %xmm2, %xmm3, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT: vfmadd132ps %xmm3, %xmm3, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; HASWELL-NO-FMA-LABEL: v4f32_two_step2:
; HASWELL-NO-FMA: # BB#0:
; HASWELL-NO-FMA-NEXT: vrcpps %xmm0, %xmm1 # sched: [5:1.00]
; HASWELL-NO-FMA-NEXT: vmulps %xmm1, %xmm0, %xmm2 # sched: [5:0.50]
-; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*#+}} xmm3 = [1,1,1,1] sched: [4:0.50]
+; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*#+}} xmm3 = [1,1,1,1] sched: [1:0.50]
; HASWELL-NO-FMA-NEXT: vsubps %xmm2, %xmm3, %xmm2 # sched: [3:1.00]
; HASWELL-NO-FMA-NEXT: vmulps %xmm2, %xmm1, %xmm2 # sched: [5:0.50]
; HASWELL-NO-FMA-NEXT: vaddps %xmm2, %xmm1, %xmm1 # sched: [3:1.00]
@@ -656,32 +656,32 @@ define <4 x float> @v4f32_two_step2(<4 x float> %x) #2 {
; HASWELL-NO-FMA-NEXT: vsubps %xmm0, %xmm3, %xmm0 # sched: [3:1.00]
; HASWELL-NO-FMA-NEXT: vmulps %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
; HASWELL-NO-FMA-NEXT: vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; HASWELL-NO-FMA-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [9:0.50]
-; HASWELL-NO-FMA-NEXT: retq # sched: [1:1.00]
+; HASWELL-NO-FMA-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [5:0.50]
+; HASWELL-NO-FMA-NEXT: retq # sched: [2:1.00]
;
; KNL-LABEL: v4f32_two_step2:
; KNL: # BB#0:
; KNL-NEXT: vrcpps %xmm0, %xmm1 # sched: [5:1.00]
-; KNL-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [4:0.50]
+; KNL-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [1:0.50]
; KNL-NEXT: vmovaps %xmm1, %xmm3 # sched: [1:1.00]
-; KNL-NEXT: vfnmadd213ps %xmm2, %xmm0, %xmm3
-; KNL-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm3
-; KNL-NEXT: vfnmadd213ps %xmm2, %xmm3, %xmm0
-; KNL-NEXT: vfmadd132ps %xmm3, %xmm3, %xmm0
-; KNL-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [9:0.50]
-; KNL-NEXT: retq # sched: [1:1.00]
+; KNL-NEXT: vfnmadd213ps %xmm2, %xmm0, %xmm3 # sched: [5:0.50]
+; KNL-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm3 # sched: [5:0.50]
+; KNL-NEXT: vfnmadd213ps %xmm2, %xmm3, %xmm0 # sched: [5:0.50]
+; KNL-NEXT: vfmadd132ps %xmm3, %xmm3, %xmm0 # sched: [5:0.50]
+; KNL-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [5:0.50]
+; KNL-NEXT: retq # sched: [2:1.00]
;
; SKX-LABEL: v4f32_two_step2:
; SKX: # BB#0:
; SKX-NEXT: vrcp14ps %xmm0, %xmm1
-; SKX-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [4:0.50]
+; SKX-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [1:0.50]
; SKX-NEXT: vmovaps %xmm1, %xmm3 # sched: [1:1.00]
-; SKX-NEXT: vfnmadd213ps %xmm2, %xmm0, %xmm3
-; SKX-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm3
-; SKX-NEXT: vfnmadd213ps %xmm2, %xmm3, %xmm0
-; SKX-NEXT: vfmadd132ps %xmm3, %xmm3, %xmm0
-; SKX-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [9:0.50]
-; SKX-NEXT: retq # sched: [1:1.00]
+; SKX-NEXT: vfnmadd213ps %xmm2, %xmm0, %xmm3 # sched: [5:0.50]
+; SKX-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm3 # sched: [5:0.50]
+; SKX-NEXT: vfnmadd213ps %xmm2, %xmm3, %xmm0 # sched: [5:0.50]
+; SKX-NEXT: vfmadd132ps %xmm3, %xmm3, %xmm0 # sched: [5:0.50]
+; SKX-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [5:0.50]
+; SKX-NEXT: retq # sched: [2:1.00]
%div = fdiv fast <4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, %x
ret <4 x float> %div
}
@@ -750,40 +750,40 @@ define <8 x float> @v8f32_one_step2(<8 x float> %x) #1 {
;
; HASWELL-LABEL: v8f32_one_step2:
; HASWELL: # BB#0:
-; HASWELL-NEXT: vrcpps %ymm0, %ymm1 # sched: [7:2.00]
-; HASWELL-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [5:1.00]
-; HASWELL-NEXT: vfnmadd213ps %ymm2, %ymm1, %ymm0
-; HASWELL-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm0
-; HASWELL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [9:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vrcpps %ymm0, %ymm1 # sched: [11:2.00]
+; HASWELL-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [1:0.50]
+; HASWELL-NEXT: vfnmadd213ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; HASWELL-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm0 # sched: [5:0.50]
+; HASWELL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [5:0.50]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; HASWELL-NO-FMA-LABEL: v8f32_one_step2:
; HASWELL-NO-FMA: # BB#0:
-; HASWELL-NO-FMA-NEXT: vrcpps %ymm0, %ymm1 # sched: [7:2.00]
-; HASWELL-NO-FMA-NEXT: vmulps %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
-; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [5:1.00]
+; HASWELL-NO-FMA-NEXT: vrcpps %ymm0, %ymm1 # sched: [11:2.00]
+; HASWELL-NO-FMA-NEXT: vmulps %ymm1, %ymm0, %ymm0 # sched: [5:0.50]
+; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [1:0.50]
; HASWELL-NO-FMA-NEXT: vsubps %ymm0, %ymm2, %ymm0 # sched: [3:1.00]
-; HASWELL-NO-FMA-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [5:1.00]
+; HASWELL-NO-FMA-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [5:0.50]
; HASWELL-NO-FMA-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
-; HASWELL-NO-FMA-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [9:1.00]
-; HASWELL-NO-FMA-NEXT: retq # sched: [1:1.00]
+; HASWELL-NO-FMA-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [5:0.50]
+; HASWELL-NO-FMA-NEXT: retq # sched: [2:1.00]
;
; KNL-LABEL: v8f32_one_step2:
; KNL: # BB#0:
-; KNL-NEXT: vrcpps %ymm0, %ymm1 # sched: [7:2.00]
-; KNL-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [5:1.00]
-; KNL-NEXT: vfnmadd213ps %ymm2, %ymm1, %ymm0
-; KNL-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm0
-; KNL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [9:1.00]
-; KNL-NEXT: retq # sched: [1:1.00]
+; KNL-NEXT: vrcpps %ymm0, %ymm1 # sched: [11:2.00]
+; KNL-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [1:0.50]
+; KNL-NEXT: vfnmadd213ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; KNL-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm0 # sched: [5:0.50]
+; KNL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [5:0.50]
+; KNL-NEXT: retq # sched: [2:1.00]
;
; SKX-LABEL: v8f32_one_step2:
; SKX: # BB#0:
; SKX-NEXT: vrcp14ps %ymm0, %ymm1
; SKX-NEXT: vfnmadd213ps {{.*}}(%rip){1to8}, %ymm1, %ymm0
-; SKX-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm0
-; SKX-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [9:1.00]
-; SKX-NEXT: retq # sched: [1:1.00]
+; SKX-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm0 # sched: [5:0.50]
+; SKX-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [5:0.50]
+; SKX-NEXT: retq # sched: [2:1.00]
%div = fdiv fast <8 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0>, %x
ret <8 x float> %div
}
@@ -858,44 +858,44 @@ define <8 x float> @v8f32_one_step_2_divs(<8 x float> %x) #1 {
;
; HASWELL-LABEL: v8f32_one_step_2_divs:
; HASWELL: # BB#0:
-; HASWELL-NEXT: vrcpps %ymm0, %ymm1 # sched: [7:2.00]
-; HASWELL-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [5:1.00]
-; HASWELL-NEXT: vfnmadd213ps %ymm2, %ymm1, %ymm0
-; HASWELL-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm0
-; HASWELL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm1 # sched: [9:1.00]
-; HASWELL-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [5:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vrcpps %ymm0, %ymm1 # sched: [11:2.00]
+; HASWELL-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [1:0.50]
+; HASWELL-NEXT: vfnmadd213ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; HASWELL-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm0 # sched: [5:0.50]
+; HASWELL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm1 # sched: [5:0.50]
+; HASWELL-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [5:0.50]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; HASWELL-NO-FMA-LABEL: v8f32_one_step_2_divs:
; HASWELL-NO-FMA: # BB#0:
-; HASWELL-NO-FMA-NEXT: vrcpps %ymm0, %ymm1 # sched: [7:2.00]
-; HASWELL-NO-FMA-NEXT: vmulps %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
-; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [5:1.00]
+; HASWELL-NO-FMA-NEXT: vrcpps %ymm0, %ymm1 # sched: [11:2.00]
+; HASWELL-NO-FMA-NEXT: vmulps %ymm1, %ymm0, %ymm0 # sched: [5:0.50]
+; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [1:0.50]
; HASWELL-NO-FMA-NEXT: vsubps %ymm0, %ymm2, %ymm0 # sched: [3:1.00]
-; HASWELL-NO-FMA-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [5:1.00]
+; HASWELL-NO-FMA-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [5:0.50]
; HASWELL-NO-FMA-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
-; HASWELL-NO-FMA-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm1 # sched: [9:1.00]
-; HASWELL-NO-FMA-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [5:1.00]
-; HASWELL-NO-FMA-NEXT: retq # sched: [1:1.00]
+; HASWELL-NO-FMA-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm1 # sched: [5:0.50]
+; HASWELL-NO-FMA-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [5:0.50]
+; HASWELL-NO-FMA-NEXT: retq # sched: [2:1.00]
;
; KNL-LABEL: v8f32_one_step_2_divs:
; KNL: # BB#0:
-; KNL-NEXT: vrcpps %ymm0, %ymm1 # sched: [7:2.00]
-; KNL-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [5:1.00]
-; KNL-NEXT: vfnmadd213ps %ymm2, %ymm1, %ymm0
-; KNL-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm0
-; KNL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm1 # sched: [9:1.00]
-; KNL-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [5:1.00]
-; KNL-NEXT: retq # sched: [1:1.00]
+; KNL-NEXT: vrcpps %ymm0, %ymm1 # sched: [11:2.00]
+; KNL-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [1:0.50]
+; KNL-NEXT: vfnmadd213ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; KNL-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm0 # sched: [5:0.50]
+; KNL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm1 # sched: [5:0.50]
+; KNL-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [5:0.50]
+; KNL-NEXT: retq # sched: [2:1.00]
;
; SKX-LABEL: v8f32_one_step_2_divs:
; SKX: # BB#0:
; SKX-NEXT: vrcp14ps %ymm0, %ymm1
; SKX-NEXT: vfnmadd213ps {{.*}}(%rip){1to8}, %ymm1, %ymm0
-; SKX-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm0
-; SKX-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm1 # sched: [9:1.00]
-; SKX-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [5:1.00]
-; SKX-NEXT: retq # sched: [1:1.00]
+; SKX-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm0 # sched: [5:0.50]
+; SKX-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm1 # sched: [5:0.50]
+; SKX-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [5:0.50]
+; SKX-NEXT: retq # sched: [2:1.00]
%div = fdiv fast <8 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0>, %x
%div2 = fdiv fast <8 x float> %div, %x
ret <8 x float> %div2
@@ -993,54 +993,54 @@ define <8 x float> @v8f32_two_step2(<8 x float> %x) #2 {
;
; HASWELL-LABEL: v8f32_two_step2:
; HASWELL: # BB#0:
-; HASWELL-NEXT: vrcpps %ymm0, %ymm1 # sched: [7:2.00]
-; HASWELL-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [5:1.00]
+; HASWELL-NEXT: vrcpps %ymm0, %ymm1 # sched: [11:2.00]
+; HASWELL-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [1:0.50]
; HASWELL-NEXT: vmovaps %ymm1, %ymm3 # sched: [1:1.00]
-; HASWELL-NEXT: vfnmadd213ps %ymm2, %ymm0, %ymm3
-; HASWELL-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm3
-; HASWELL-NEXT: vfnmadd213ps %ymm2, %ymm3, %ymm0
-; HASWELL-NEXT: vfmadd132ps %ymm3, %ymm3, %ymm0
-; HASWELL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [9:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vfnmadd213ps %ymm2, %ymm0, %ymm3 # sched: [5:0.50]
+; HASWELL-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm3 # sched: [5:0.50]
+; HASWELL-NEXT: vfnmadd213ps %ymm2, %ymm3, %ymm0 # sched: [5:0.50]
+; HASWELL-NEXT: vfmadd132ps %ymm3, %ymm3, %ymm0 # sched: [5:0.50]
+; HASWELL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [5:0.50]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; HASWELL-NO-FMA-LABEL: v8f32_two_step2:
; HASWELL-NO-FMA: # BB#0:
-; HASWELL-NO-FMA-NEXT: vrcpps %ymm0, %ymm1 # sched: [7:2.00]
-; HASWELL-NO-FMA-NEXT: vmulps %ymm1, %ymm0, %ymm2 # sched: [5:1.00]
-; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1] sched: [5:1.00]
+; HASWELL-NO-FMA-NEXT: vrcpps %ymm0, %ymm1 # sched: [11:2.00]
+; HASWELL-NO-FMA-NEXT: vmulps %ymm1, %ymm0, %ymm2 # sched: [5:0.50]
+; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1] sched: [1:0.50]
; HASWELL-NO-FMA-NEXT: vsubps %ymm2, %ymm3, %ymm2 # sched: [3:1.00]
-; HASWELL-NO-FMA-NEXT: vmulps %ymm2, %ymm1, %ymm2 # sched: [5:1.00]
+; HASWELL-NO-FMA-NEXT: vmulps %ymm2, %ymm1, %ymm2 # sched: [5:0.50]
; HASWELL-NO-FMA-NEXT: vaddps %ymm2, %ymm1, %ymm1 # sched: [3:1.00]
-; HASWELL-NO-FMA-NEXT: vmulps %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
+; HASWELL-NO-FMA-NEXT: vmulps %ymm1, %ymm0, %ymm0 # sched: [5:0.50]
; HASWELL-NO-FMA-NEXT: vsubps %ymm0, %ymm3, %ymm0 # sched: [3:1.00]
-; HASWELL-NO-FMA-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [5:1.00]
+; HASWELL-NO-FMA-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [5:0.50]
; HASWELL-NO-FMA-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
-; HASWELL-NO-FMA-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [9:1.00]
-; HASWELL-NO-FMA-NEXT: retq # sched: [1:1.00]
+; HASWELL-NO-FMA-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [5:0.50]
+; HASWELL-NO-FMA-NEXT: retq # sched: [2:1.00]
;
; KNL-LABEL: v8f32_two_step2:
; KNL: # BB#0:
-; KNL-NEXT: vrcpps %ymm0, %ymm1 # sched: [7:2.00]
-; KNL-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [5:1.00]
+; KNL-NEXT: vrcpps %ymm0, %ymm1 # sched: [11:2.00]
+; KNL-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [1:0.50]
; KNL-NEXT: vmovaps %ymm1, %ymm3 # sched: [1:1.00]
-; KNL-NEXT: vfnmadd213ps %ymm2, %ymm0, %ymm3
-; KNL-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm3
-; KNL-NEXT: vfnmadd213ps %ymm2, %ymm3, %ymm0
-; KNL-NEXT: vfmadd132ps %ymm3, %ymm3, %ymm0
-; KNL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [9:1.00]
-; KNL-NEXT: retq # sched: [1:1.00]
+; KNL-NEXT: vfnmadd213ps %ymm2, %ymm0, %ymm3 # sched: [5:0.50]
+; KNL-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm3 # sched: [5:0.50]
+; KNL-NEXT: vfnmadd213ps %ymm2, %ymm3, %ymm0 # sched: [5:0.50]
+; KNL-NEXT: vfmadd132ps %ymm3, %ymm3, %ymm0 # sched: [5:0.50]
+; KNL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [5:0.50]
+; KNL-NEXT: retq # sched: [2:1.00]
;
; SKX-LABEL: v8f32_two_step2:
; SKX: # BB#0:
; SKX-NEXT: vrcp14ps %ymm0, %ymm1
-; SKX-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [5:1.00]
+; SKX-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [1:0.50]
; SKX-NEXT: vmovaps %ymm1, %ymm3 # sched: [1:1.00]
-; SKX-NEXT: vfnmadd213ps %ymm2, %ymm0, %ymm3
-; SKX-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm3
-; SKX-NEXT: vfnmadd213ps %ymm2, %ymm3, %ymm0
-; SKX-NEXT: vfmadd132ps %ymm3, %ymm3, %ymm0
-; SKX-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [9:1.00]
-; SKX-NEXT: retq # sched: [1:1.00]
+; SKX-NEXT: vfnmadd213ps %ymm2, %ymm0, %ymm3 # sched: [5:0.50]
+; SKX-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm3 # sched: [5:0.50]
+; SKX-NEXT: vfnmadd213ps %ymm2, %ymm3, %ymm0 # sched: [5:0.50]
+; SKX-NEXT: vfmadd132ps %ymm3, %ymm3, %ymm0 # sched: [5:0.50]
+; SKX-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [5:0.50]
+; SKX-NEXT: retq # sched: [2:1.00]
%div = fdiv fast <8 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0>, %x
ret <8 x float> %div
}
@@ -1074,23 +1074,23 @@ define <8 x float> @v8f32_no_step(<8 x float> %x) #3 {
;
; HASWELL-LABEL: v8f32_no_step:
; HASWELL: # BB#0:
-; HASWELL-NEXT: vrcpps %ymm0, %ymm0 # sched: [7:2.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vrcpps %ymm0, %ymm0 # sched: [11:2.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; HASWELL-NO-FMA-LABEL: v8f32_no_step:
; HASWELL-NO-FMA: # BB#0:
-; HASWELL-NO-FMA-NEXT: vrcpps %ymm0, %ymm0 # sched: [7:2.00]
-; HASWELL-NO-FMA-NEXT: retq # sched: [1:1.00]
+; HASWELL-NO-FMA-NEXT: vrcpps %ymm0, %ymm0 # sched: [11:2.00]
+; HASWELL-NO-FMA-NEXT: retq # sched: [2:1.00]
;
; KNL-LABEL: v8f32_no_step:
; KNL: # BB#0:
-; KNL-NEXT: vrcpps %ymm0, %ymm0 # sched: [7:2.00]
-; KNL-NEXT: retq # sched: [1:1.00]
+; KNL-NEXT: vrcpps %ymm0, %ymm0 # sched: [11:2.00]
+; KNL-NEXT: retq # sched: [2:1.00]
;
; SKX-LABEL: v8f32_no_step:
; SKX: # BB#0:
; SKX-NEXT: vrcp14ps %ymm0, %ymm0
-; SKX-NEXT: retq # sched: [1:1.00]
+; SKX-NEXT: retq # sched: [2:1.00]
%div = fdiv fast <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %x
ret <8 x float> %div
}
@@ -1130,27 +1130,27 @@ define <8 x float> @v8f32_no_step2(<8 x float> %x) #3 {
;
; HASWELL-LABEL: v8f32_no_step2:
; HASWELL: # BB#0:
-; HASWELL-NEXT: vrcpps %ymm0, %ymm0 # sched: [7:2.00]
-; HASWELL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [9:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vrcpps %ymm0, %ymm0 # sched: [11:2.00]
+; HASWELL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [5:0.50]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; HASWELL-NO-FMA-LABEL: v8f32_no_step2:
; HASWELL-NO-FMA: # BB#0:
-; HASWELL-NO-FMA-NEXT: vrcpps %ymm0, %ymm0 # sched: [7:2.00]
-; HASWELL-NO-FMA-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [9:1.00]
-; HASWELL-NO-FMA-NEXT: retq # sched: [1:1.00]
+; HASWELL-NO-FMA-NEXT: vrcpps %ymm0, %ymm0 # sched: [11:2.00]
+; HASWELL-NO-FMA-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [5:0.50]
+; HASWELL-NO-FMA-NEXT: retq # sched: [2:1.00]
;
; KNL-LABEL: v8f32_no_step2:
; KNL: # BB#0:
-; KNL-NEXT: vrcpps %ymm0, %ymm0 # sched: [7:2.00]
-; KNL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [9:1.00]
-; KNL-NEXT: retq # sched: [1:1.00]
+; KNL-NEXT: vrcpps %ymm0, %ymm0 # sched: [11:2.00]
+; KNL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [5:0.50]
+; KNL-NEXT: retq # sched: [2:1.00]
;
; SKX-LABEL: v8f32_no_step2:
; SKX: # BB#0:
; SKX-NEXT: vrcp14ps %ymm0, %ymm0
-; SKX-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [9:1.00]
-; SKX-NEXT: retq # sched: [1:1.00]
+; SKX-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [5:0.50]
+; SKX-NEXT: retq # sched: [2:1.00]
%div = fdiv fast <8 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0>, %x
ret <8 x float> %div
}
diff --git a/llvm/test/CodeGen/X86/sha-schedule.ll b/llvm/test/CodeGen/X86/sha-schedule.ll
index bf82ba8b2d9..bd9be7ecb46 100644
--- a/llvm/test/CodeGen/X86/sha-schedule.ll
+++ b/llvm/test/CodeGen/X86/sha-schedule.ll
@@ -25,7 +25,7 @@ define <4 x i32> @test_sha1msg1(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
; CANNONLAKE: # BB#0:
; CANNONLAKE-NEXT: sha1msg1 %xmm1, %xmm0
; CANNONLAKE-NEXT: sha1msg1 (%rdi), %xmm0
-; CANNONLAKE-NEXT: retq # sched: [1:1.00]
+; CANNONLAKE-NEXT: retq # sched: [2:1.00]
;
; ZNVER1-LABEL: test_sha1msg1:
; ZNVER1: # BB#0:
@@ -56,7 +56,7 @@ define <4 x i32> @test_sha1msg2(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
; CANNONLAKE: # BB#0:
; CANNONLAKE-NEXT: sha1msg2 %xmm1, %xmm0
; CANNONLAKE-NEXT: sha1msg2 (%rdi), %xmm0
-; CANNONLAKE-NEXT: retq # sched: [1:1.00]
+; CANNONLAKE-NEXT: retq # sched: [2:1.00]
;
; ZNVER1-LABEL: test_sha1msg2:
; ZNVER1: # BB#0:
@@ -87,7 +87,7 @@ define <4 x i32> @test_sha1nexte(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
; CANNONLAKE: # BB#0:
; CANNONLAKE-NEXT: sha1nexte %xmm1, %xmm0
; CANNONLAKE-NEXT: sha1nexte (%rdi), %xmm0
-; CANNONLAKE-NEXT: retq # sched: [1:1.00]
+; CANNONLAKE-NEXT: retq # sched: [2:1.00]
;
; ZNVER1-LABEL: test_sha1nexte:
; ZNVER1: # BB#0:
@@ -118,7 +118,7 @@ define <4 x i32> @test_sha1rnds4(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
; CANNONLAKE: # BB#0:
; CANNONLAKE-NEXT: sha1rnds4 $3, %xmm1, %xmm0
; CANNONLAKE-NEXT: sha1rnds4 $3, (%rdi), %xmm0
-; CANNONLAKE-NEXT: retq # sched: [1:1.00]
+; CANNONLAKE-NEXT: retq # sched: [2:1.00]
;
; ZNVER1-LABEL: test_sha1rnds4:
; ZNVER1: # BB#0:
@@ -153,7 +153,7 @@ define <4 x i32> @test_sha256msg1(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2)
; CANNONLAKE: # BB#0:
; CANNONLAKE-NEXT: sha256msg1 %xmm1, %xmm0
; CANNONLAKE-NEXT: sha256msg1 (%rdi), %xmm0
-; CANNONLAKE-NEXT: retq # sched: [1:1.00]
+; CANNONLAKE-NEXT: retq # sched: [2:1.00]
;
; ZNVER1-LABEL: test_sha256msg1:
; ZNVER1: # BB#0:
@@ -184,7 +184,7 @@ define <4 x i32> @test_sha256msg2(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2)
; CANNONLAKE: # BB#0:
; CANNONLAKE-NEXT: sha256msg2 %xmm1, %xmm0
; CANNONLAKE-NEXT: sha256msg2 (%rdi), %xmm0
-; CANNONLAKE-NEXT: retq # sched: [1:1.00]
+; CANNONLAKE-NEXT: retq # sched: [2:1.00]
;
; ZNVER1-LABEL: test_sha256msg2:
; ZNVER1: # BB#0:
@@ -224,7 +224,7 @@ define <4 x i32> @test_sha256rnds2(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2,
; CANNONLAKE-NEXT: sha256rnds2 %xmm0, %xmm1, %xmm3
; CANNONLAKE-NEXT: sha256rnds2 %xmm0, (%rdi), %xmm3
; CANNONLAKE-NEXT: vmovaps %xmm3, %xmm0 # sched: [1:1.00]
-; CANNONLAKE-NEXT: retq # sched: [1:1.00]
+; CANNONLAKE-NEXT: retq # sched: [2:1.00]
;
; ZNVER1-LABEL: test_sha256rnds2:
; ZNVER1: # BB#0:
diff --git a/llvm/test/CodeGen/X86/sse-schedule.ll b/llvm/test/CodeGen/X86/sse-schedule.ll
index 2ddefa16e1d..8eb7b3f3554 100644
--- a/llvm/test/CodeGen/X86/sse-schedule.ll
+++ b/llvm/test/CodeGen/X86/sse-schedule.ll
@@ -37,8 +37,8 @@ define <4 x float> @test_addps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a
; HASWELL-LABEL: test_addps:
; HASWELL: # BB#0:
; HASWELL-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT: vaddps (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vaddps (%rdi), %xmm0, %xmm0 # sched: [3:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_addps:
; BTVER2: # BB#0:
@@ -85,8 +85,8 @@ define float @test_addss(float %a0, float %a1, float *%a2) {
; HASWELL-LABEL: test_addss:
; HASWELL: # BB#0:
; HASWELL-NEXT: vaddss %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT: vaddss (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vaddss (%rdi), %xmm0, %xmm0 # sched: [3:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_addss:
; BTVER2: # BB#0:
@@ -137,8 +137,8 @@ define <4 x float> @test_andps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a
; HASWELL-LABEL: test_andps:
; HASWELL: # BB#0:
; HASWELL-NEXT: vandps %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
-; HASWELL-NEXT: vandps (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vandps (%rdi), %xmm0, %xmm0 # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_andps:
; BTVER2: # BB#0:
@@ -193,8 +193,8 @@ define <4 x float> @test_andnotps(<4 x float> %a0, <4 x float> %a1, <4 x float>
; HASWELL-LABEL: test_andnotps:
; HASWELL: # BB#0:
; HASWELL-NEXT: vandnps %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
-; HASWELL-NEXT: vandnps (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vandnps (%rdi), %xmm0, %xmm0 # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_andnotps:
; BTVER2: # BB#0:
@@ -251,9 +251,9 @@ define <4 x float> @test_cmpps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a
; HASWELL-LABEL: test_cmpps:
; HASWELL: # BB#0:
; HASWELL-NEXT: vcmpeqps %xmm1, %xmm0, %xmm1 # sched: [3:1.00]
-; HASWELL-NEXT: vcmpeqps (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
+; HASWELL-NEXT: vcmpeqps (%rdi), %xmm0, %xmm0 # sched: [3:1.00]
; HASWELL-NEXT: vorps %xmm0, %xmm1, %xmm0 # sched: [1:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_cmpps:
; BTVER2: # BB#0:
@@ -306,7 +306,7 @@ define float @test_cmpss(float %a0, float %a1, float *%a2) {
; HASWELL: # BB#0:
; HASWELL-NEXT: vcmpeqss %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; HASWELL-NEXT: vcmpeqss (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_cmpss:
; BTVER2: # BB#0:
@@ -399,7 +399,7 @@ define i32 @test_comiss(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a2) {
; HASWELL-NEXT: andb %al, %dl # sched: [1:0.25]
; HASWELL-NEXT: orb %cl, %dl # sched: [1:0.25]
; HASWELL-NEXT: movzbl %dl, %eax # sched: [1:0.25]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_comiss:
; BTVER2: # BB#0:
@@ -470,7 +470,7 @@ define float @test_cvtsi2ss(i32 %a0, i32 *%a1) {
; HASWELL-NEXT: vcvtsi2ssl %edi, %xmm0, %xmm0 # sched: [4:1.00]
; HASWELL-NEXT: vcvtsi2ssl (%rsi), %xmm1, %xmm1 # sched: [8:1.00]
; HASWELL-NEXT: vaddss %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_cvtsi2ss:
; BTVER2: # BB#0:
@@ -523,10 +523,10 @@ define float @test_cvtsi2ssq(i64 %a0, i64 *%a1) {
;
; HASWELL-LABEL: test_cvtsi2ssq:
; HASWELL: # BB#0:
-; HASWELL-NEXT: vcvtsi2ssq %rdi, %xmm0, %xmm0 # sched: [4:1.00]
+; HASWELL-NEXT: vcvtsi2ssq %rdi, %xmm0, %xmm0 # sched: [5:2.00]
; HASWELL-NEXT: vcvtsi2ssq (%rsi), %xmm1, %xmm1 # sched: [8:1.00]
; HASWELL-NEXT: vaddss %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_cvtsi2ssq:
; BTVER2: # BB#0:
@@ -580,9 +580,9 @@ define i32 @test_cvtss2si(float %a0, float *%a1) {
; HASWELL-LABEL: test_cvtss2si:
; HASWELL: # BB#0:
; HASWELL-NEXT: vcvtss2si %xmm0, %ecx # sched: [4:1.00]
-; HASWELL-NEXT: vcvtss2si (%rdi), %eax # sched: [8:1.00]
+; HASWELL-NEXT: vcvtss2si (%rdi), %eax # sched: [4:1.00]
; HASWELL-NEXT: addl %ecx, %eax # sched: [1:0.25]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_cvtss2si:
; BTVER2: # BB#0:
@@ -639,9 +639,9 @@ define i64 @test_cvtss2siq(float %a0, float *%a1) {
; HASWELL-LABEL: test_cvtss2siq:
; HASWELL: # BB#0:
; HASWELL-NEXT: vcvtss2si %xmm0, %rcx # sched: [4:1.00]
-; HASWELL-NEXT: vcvtss2si (%rdi), %rax # sched: [8:1.00]
+; HASWELL-NEXT: vcvtss2si (%rdi), %rax # sched: [4:1.00]
; HASWELL-NEXT: addq %rcx, %rax # sched: [1:0.25]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_cvtss2siq:
; BTVER2: # BB#0:
@@ -698,9 +698,9 @@ define i32 @test_cvttss2si(float %a0, float *%a1) {
; HASWELL-LABEL: test_cvttss2si:
; HASWELL: # BB#0:
; HASWELL-NEXT: vcvttss2si %xmm0, %ecx # sched: [4:1.00]
-; HASWELL-NEXT: vcvttss2si (%rdi), %eax # sched: [8:1.00]
+; HASWELL-NEXT: vcvttss2si (%rdi), %eax # sched: [4:1.00]
; HASWELL-NEXT: addl %ecx, %eax # sched: [1:0.25]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_cvttss2si:
; BTVER2: # BB#0:
@@ -754,9 +754,9 @@ define i64 @test_cvttss2siq(float %a0, float *%a1) {
; HASWELL-LABEL: test_cvttss2siq:
; HASWELL: # BB#0:
; HASWELL-NEXT: vcvttss2si %xmm0, %rcx # sched: [4:1.00]
-; HASWELL-NEXT: vcvttss2si (%rdi), %rax # sched: [8:1.00]
+; HASWELL-NEXT: vcvttss2si (%rdi), %rax # sched: [4:1.00]
; HASWELL-NEXT: addq %rcx, %rax # sched: [1:0.25]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_cvttss2siq:
; BTVER2: # BB#0:
@@ -805,9 +805,9 @@ define <4 x float> @test_divps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a
;
; HASWELL-LABEL: test_divps:
; HASWELL: # BB#0:
-; HASWELL-NEXT: vdivps %xmm1, %xmm0, %xmm0 # sched: [12:1.00]
-; HASWELL-NEXT: vdivps (%rdi), %xmm0, %xmm0 # sched: [16:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vdivps %xmm1, %xmm0, %xmm0 # sched: [13:1.00]
+; HASWELL-NEXT: vdivps (%rdi), %xmm0, %xmm0 # sched: [13:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_divps:
; BTVER2: # BB#0:
@@ -853,9 +853,9 @@ define float @test_divss(float %a0, float %a1, float *%a2) {
;
; HASWELL-LABEL: test_divss:
; HASWELL: # BB#0:
-; HASWELL-NEXT: vdivss %xmm1, %xmm0, %xmm0 # sched: [12:1.00]
-; HASWELL-NEXT: vdivss (%rdi), %xmm0, %xmm0 # sched: [16:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vdivss %xmm1, %xmm0, %xmm0 # sched: [13:1.00]
+; HASWELL-NEXT: vdivss (%rdi), %xmm0, %xmm0 # sched: [13:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_divss:
; BTVER2: # BB#0:
@@ -902,8 +902,8 @@ define void @test_ldmxcsr(i32 %a0) {
; HASWELL-LABEL: test_ldmxcsr:
; HASWELL: # BB#0:
; HASWELL-NEXT: movl %edi, -{{[0-9]+}}(%rsp) # sched: [1:1.00]
-; HASWELL-NEXT: vldmxcsr -{{[0-9]+}}(%rsp) # sched: [6:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vldmxcsr -{{[0-9]+}}(%rsp) # sched: [2:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_ldmxcsr:
; BTVER2: # BB#0:
@@ -952,8 +952,8 @@ define <4 x float> @test_maxps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a
; HASWELL-LABEL: test_maxps:
; HASWELL: # BB#0:
; HASWELL-NEXT: vmaxps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT: vmaxps (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vmaxps (%rdi), %xmm0, %xmm0 # sched: [3:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_maxps:
; BTVER2: # BB#0:
@@ -1001,8 +1001,8 @@ define <4 x float> @test_maxss(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a
; HASWELL-LABEL: test_maxss:
; HASWELL: # BB#0:
; HASWELL-NEXT: vmaxss %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT: vmaxss (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vmaxss (%rdi), %xmm0, %xmm0 # sched: [3:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_maxss:
; BTVER2: # BB#0:
@@ -1050,8 +1050,8 @@ define <4 x float> @test_minps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a
; HASWELL-LABEL: test_minps:
; HASWELL: # BB#0:
; HASWELL-NEXT: vminps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT: vminps (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vminps (%rdi), %xmm0, %xmm0 # sched: [3:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_minps:
; BTVER2: # BB#0:
@@ -1099,8 +1099,8 @@ define <4 x float> @test_minss(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a
; HASWELL-LABEL: test_minss:
; HASWELL: # BB#0:
; HASWELL-NEXT: vminss %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT: vminss (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vminss (%rdi), %xmm0, %xmm0 # sched: [3:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_minss:
; BTVER2: # BB#0:
@@ -1151,10 +1151,10 @@ define void @test_movaps(<4 x float> *%a0, <4 x float> *%a1) {
;
; HASWELL-LABEL: test_movaps:
; HASWELL: # BB#0:
-; HASWELL-NEXT: vmovaps (%rdi), %xmm0 # sched: [4:0.50]
+; HASWELL-NEXT: vmovaps (%rdi), %xmm0 # sched: [1:0.50]
; HASWELL-NEXT: vaddps %xmm0, %xmm0, %xmm0 # sched: [3:1.00]
; HASWELL-NEXT: vmovaps %xmm0, (%rsi) # sched: [1:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_movaps:
; BTVER2: # BB#0:
@@ -1207,7 +1207,7 @@ define <4 x float> @test_movhlps(<4 x float> %a0, <4 x float> %a1) {
; HASWELL-LABEL: test_movhlps:
; HASWELL: # BB#0:
; HASWELL-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] sched: [1:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_movhlps:
; BTVER2: # BB#0:
@@ -1257,10 +1257,10 @@ define void @test_movhps(<4 x float> %a0, <4 x float> %a1, x86_mmx *%a2) {
;
; HASWELL-LABEL: test_movhps:
; HASWELL: # BB#0:
-; HASWELL-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [5:1.00]
+; HASWELL-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [1:1.00]
; HASWELL-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT: vpextrq $1, %xmm0, (%rdi) # sched: [5:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpextrq $1, %xmm0, (%rdi) # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_movhps:
; BTVER2: # BB#0:
@@ -1316,7 +1316,7 @@ define <4 x float> @test_movlhps(<4 x float> %a0, <4 x float> %a1) {
; HASWELL: # BB#0:
; HASWELL-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] sched: [1:1.00]
; HASWELL-NEXT: vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_movlhps:
; BTVER2: # BB#0:
@@ -1365,10 +1365,10 @@ define void @test_movlps(<4 x float> %a0, <4 x float> %a1, x86_mmx *%a2) {
;
; HASWELL-LABEL: test_movlps:
; HASWELL: # BB#0:
-; HASWELL-NEXT: vmovlpd {{.*#+}} xmm1 = mem[0],xmm1[1] sched: [5:1.00]
+; HASWELL-NEXT: vmovlpd {{.*#+}} xmm1 = mem[0],xmm1[1] sched: [1:1.00]
; HASWELL-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; HASWELL-NEXT: vmovlps %xmm0, (%rdi) # sched: [1:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_movlps:
; BTVER2: # BB#0:
@@ -1419,7 +1419,7 @@ define i32 @test_movmskps(<4 x float> %a0) {
; HASWELL-LABEL: test_movmskps:
; HASWELL: # BB#0:
; HASWELL-NEXT: vmovmskps %xmm0, %eax # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_movmskps:
; BTVER2: # BB#0:
@@ -1465,7 +1465,7 @@ define void @test_movntps(<4 x float> %a0, <4 x float> *%a1) {
; HASWELL-LABEL: test_movntps:
; HASWELL: # BB#0:
; HASWELL-NEXT: vmovntps %xmm0, (%rdi) # sched: [1:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_movntps:
; BTVER2: # BB#0:
@@ -1511,10 +1511,10 @@ define void @test_movss_mem(float* %a0, float* %a1) {
;
; HASWELL-LABEL: test_movss_mem:
; HASWELL: # BB#0:
-; HASWELL-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero sched: [4:0.50]
+; HASWELL-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero sched: [1:0.50]
; HASWELL-NEXT: vaddss %xmm0, %xmm0, %xmm0 # sched: [3:1.00]
; HASWELL-NEXT: vmovss %xmm0, (%rsi) # sched: [1:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_movss_mem:
; BTVER2: # BB#0:
@@ -1565,7 +1565,7 @@ define <4 x float> @test_movss_reg(<4 x float> %a0, <4 x float> %a1) {
; HASWELL-LABEL: test_movss_reg:
; HASWELL: # BB#0:
; HASWELL-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] sched: [1:0.33]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_movss_reg:
; BTVER2: # BB#0:
@@ -1611,10 +1611,10 @@ define void @test_movups(<4 x float> *%a0, <4 x float> *%a1) {
;
; HASWELL-LABEL: test_movups:
; HASWELL: # BB#0:
-; HASWELL-NEXT: vmovups (%rdi), %xmm0 # sched: [4:0.50]
+; HASWELL-NEXT: vmovups (%rdi), %xmm0 # sched: [1:0.50]
; HASWELL-NEXT: vaddps %xmm0, %xmm0, %xmm0 # sched: [3:1.00]
; HASWELL-NEXT: vmovups %xmm0, (%rsi) # sched: [1:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_movups:
; BTVER2: # BB#0:
@@ -1663,8 +1663,8 @@ define <4 x float> @test_mulps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a
; HASWELL-LABEL: test_mulps:
; HASWELL: # BB#0:
; HASWELL-NEXT: vmulps %xmm1, %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NEXT: vmulps (%rdi), %xmm0, %xmm0 # sched: [9:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vmulps (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_mulps:
; BTVER2: # BB#0:
@@ -1711,8 +1711,8 @@ define float @test_mulss(float %a0, float %a1, float *%a2) {
; HASWELL-LABEL: test_mulss:
; HASWELL: # BB#0:
; HASWELL-NEXT: vmulss %xmm1, %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NEXT: vmulss (%rdi), %xmm0, %xmm0 # sched: [9:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vmulss (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_mulss:
; BTVER2: # BB#0:
@@ -1763,8 +1763,8 @@ define <4 x float> @test_orps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a2
; HASWELL-LABEL: test_orps:
; HASWELL: # BB#0:
; HASWELL-NEXT: vorps %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
-; HASWELL-NEXT: vorps (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vorps (%rdi), %xmm0, %xmm0 # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_orps:
; BTVER2: # BB#0:
@@ -1816,8 +1816,8 @@ define void @test_prefetchnta(i8* %a0) {
;
; HASWELL-LABEL: test_prefetchnta:
; HASWELL: # BB#0:
-; HASWELL-NEXT: prefetchnta (%rdi) # sched: [4:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: prefetchnta (%rdi) # sched: [1:0.50]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_prefetchnta:
; BTVER2: # BB#0:
@@ -1867,9 +1867,9 @@ define <4 x float> @test_rcpps(<4 x float> %a0, <4 x float> *%a1) {
; HASWELL-LABEL: test_rcpps:
; HASWELL: # BB#0:
; HASWELL-NEXT: vrcpps %xmm0, %xmm0 # sched: [5:1.00]
-; HASWELL-NEXT: vrcpps (%rdi), %xmm1 # sched: [9:1.00]
+; HASWELL-NEXT: vrcpps (%rdi), %xmm1 # sched: [5:1.00]
; HASWELL-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_rcpps:
; BTVER2: # BB#0:
@@ -1929,11 +1929,11 @@ define <4 x float> @test_rcpss(float %a0, float *%a1) {
;
; HASWELL-LABEL: test_rcpss:
; HASWELL: # BB#0:
-; HASWELL-NEXT: vrcpss %xmm0, %xmm0, %xmm0 # sched: [9:1.00]
-; HASWELL-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [4:0.50]
-; HASWELL-NEXT: vrcpss %xmm1, %xmm1, %xmm1 # sched: [9:1.00]
+; HASWELL-NEXT: vrcpss %xmm0, %xmm0, %xmm0 # sched: [5:1.00]
+; HASWELL-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [1:0.50]
+; HASWELL-NEXT: vrcpss %xmm1, %xmm1, %xmm1 # sched: [5:1.00]
; HASWELL-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_rcpss:
; BTVER2: # BB#0:
@@ -1994,9 +1994,9 @@ define <4 x float> @test_rsqrtps(<4 x float> %a0, <4 x float> *%a1) {
; HASWELL-LABEL: test_rsqrtps:
; HASWELL: # BB#0:
; HASWELL-NEXT: vrsqrtps %xmm0, %xmm0 # sched: [5:1.00]
-; HASWELL-NEXT: vrsqrtps (%rdi), %xmm1 # sched: [9:1.00]
+; HASWELL-NEXT: vrsqrtps (%rdi), %xmm1 # sched: [5:1.00]
; HASWELL-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_rsqrtps:
; BTVER2: # BB#0:
@@ -2057,10 +2057,10 @@ define <4 x float> @test_rsqrtss(float %a0, float *%a1) {
; HASWELL-LABEL: test_rsqrtss:
; HASWELL: # BB#0:
; HASWELL-NEXT: vrsqrtss %xmm0, %xmm0, %xmm0 # sched: [5:1.00]
-; HASWELL-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [4:0.50]
+; HASWELL-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [1:0.50]
; HASWELL-NEXT: vrsqrtss %xmm1, %xmm1, %xmm1 # sched: [5:1.00]
; HASWELL-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_rsqrtss:
; BTVER2: # BB#0:
@@ -2116,8 +2116,8 @@ define void @test_sfence() {
;
; HASWELL-LABEL: test_sfence:
; HASWELL: # BB#0:
-; HASWELL-NEXT: sfence # sched: [1:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: sfence # sched: [1:0.33]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_sfence:
; BTVER2: # BB#0:
@@ -2165,8 +2165,8 @@ define <4 x float> @test_shufps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%
; HASWELL-LABEL: test_shufps:
; HASWELL: # BB#0:
; HASWELL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0] sched: [1:1.00]
-; HASWELL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,3],mem[0,0] sched: [5:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,3],mem[0,0] sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_shufps:
; BTVER2: # BB#0:
@@ -2217,10 +2217,10 @@ define <4 x float> @test_sqrtps(<4 x float> %a0, <4 x float> *%a1) {
;
; HASWELL-LABEL: test_sqrtps:
; HASWELL: # BB#0:
-; HASWELL-NEXT: vsqrtps %xmm0, %xmm0 # sched: [15:1.00]
-; HASWELL-NEXT: vsqrtps (%rdi), %xmm1 # sched: [19:1.00]
+; HASWELL-NEXT: vsqrtps %xmm0, %xmm0 # sched: [14:1.00]
+; HASWELL-NEXT: vsqrtps (%rdi), %xmm1 # sched: [14:1.00]
; HASWELL-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_sqrtps:
; BTVER2: # BB#0:
@@ -2280,11 +2280,11 @@ define <4 x float> @test_sqrtss(<4 x float> %a0, <4 x float> *%a1) {
;
; HASWELL-LABEL: test_sqrtss:
; HASWELL: # BB#0:
-; HASWELL-NEXT: vsqrtss %xmm0, %xmm0, %xmm0 # sched: [19:1.00]
-; HASWELL-NEXT: vmovaps (%rdi), %xmm1 # sched: [4:0.50]
-; HASWELL-NEXT: vsqrtss %xmm1, %xmm1, %xmm1 # sched: [19:1.00]
+; HASWELL-NEXT: vsqrtss %xmm0, %xmm0, %xmm0 # sched: [14:1.00]
+; HASWELL-NEXT: vmovaps (%rdi), %xmm1 # sched: [1:0.50]
+; HASWELL-NEXT: vsqrtss %xmm1, %xmm1, %xmm1 # sched: [14:1.00]
; HASWELL-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_sqrtss:
; BTVER2: # BB#0:
@@ -2336,9 +2336,9 @@ define i32 @test_stmxcsr() {
;
; HASWELL-LABEL: test_stmxcsr:
; HASWELL: # BB#0:
-; HASWELL-NEXT: vstmxcsr -{{[0-9]+}}(%rsp) # sched: [7:1.00]
-; HASWELL-NEXT: movl -{{[0-9]+}}(%rsp), %eax # sched: [4:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vstmxcsr -{{[0-9]+}}(%rsp) # sched: [1:1.00]
+; HASWELL-NEXT: movl -{{[0-9]+}}(%rsp), %eax # sched: [1:0.50]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_stmxcsr:
; BTVER2: # BB#0:
@@ -2387,8 +2387,8 @@ define <4 x float> @test_subps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a
; HASWELL-LABEL: test_subps:
; HASWELL: # BB#0:
; HASWELL-NEXT: vsubps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT: vsubps (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vsubps (%rdi), %xmm0, %xmm0 # sched: [3:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_subps:
; BTVER2: # BB#0:
@@ -2435,8 +2435,8 @@ define float @test_subss(float %a0, float %a1, float *%a2) {
; HASWELL-LABEL: test_subss:
; HASWELL: # BB#0:
; HASWELL-NEXT: vsubss %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT: vsubss (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vsubss (%rdi), %xmm0, %xmm0 # sched: [3:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_subss:
; BTVER2: # BB#0:
@@ -2524,7 +2524,7 @@ define i32 @test_ucomiss(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a2) {
; HASWELL-NEXT: andb %al, %dl # sched: [1:0.25]
; HASWELL-NEXT: orb %cl, %dl # sched: [1:0.25]
; HASWELL-NEXT: movzbl %dl, %eax # sched: [1:0.25]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_ucomiss:
; BTVER2: # BB#0:
@@ -2593,8 +2593,8 @@ define <4 x float> @test_unpckhps(<4 x float> %a0, <4 x float> %a1, <4 x float>
; HASWELL-LABEL: test_unpckhps:
; HASWELL: # BB#0:
; HASWELL-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00]
-; HASWELL-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] sched: [5:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_unpckhps:
; BTVER2: # BB#0:
@@ -2645,8 +2645,8 @@ define <4 x float> @test_unpcklps(<4 x float> %a0, <4 x float> %a1, <4 x float>
; HASWELL-LABEL: test_unpcklps:
; HASWELL: # BB#0:
; HASWELL-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00]
-; HASWELL-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] sched: [5:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_unpcklps:
; BTVER2: # BB#0:
@@ -2697,8 +2697,8 @@ define <4 x float> @test_xorps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a
; HASWELL-LABEL: test_xorps:
; HASWELL: # BB#0:
; HASWELL-NEXT: vxorps %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
-; HASWELL-NEXT: vxorps (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vxorps (%rdi), %xmm0, %xmm0 # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_xorps:
; BTVER2: # BB#0:
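For readers of this diff, each "sched: [N:M]" comment printed by llc encodes the scheduling model's estimate for that instruction: N is the latency in cycles and M the reciprocal throughput (cycles per issue when independent copies run back to back). The dominant pattern in these hunks is that folded-load forms such as "vaddpd (%rdi), %xmm0, %xmm0" are now printed with the ALU latency alone rather than ALU-plus-load. Below is a minimal TableGen sketch of the kind of per-port entry X86SchedHaswell.td uses to produce such numbers; the identifiers are illustrative, not copied from the patch:

// Sketch only: ports 0, 1 and 5 can each execute a vector logic op.
def HWPort015 : ProcResGroup<[HWPort0, HWPort1, HWPort5]>;

// One micro-op taking one cycle on any of the three ports; llc prints
// this as "sched: [1:0.33]", matching the vpxor/vpand register forms
// in the surrounding hunks.
def : WriteRes<WriteVecLogic, [HWPort015]> {
  let Latency = 1;
  let NumMicroOps = 1;
  let ResourceCycles = [1];
}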
diff --git a/llvm/test/CodeGen/X86/sse2-schedule.ll b/llvm/test/CodeGen/X86/sse2-schedule.ll
index 28d8b1888b4..23533a7bf53 100644
--- a/llvm/test/CodeGen/X86/sse2-schedule.ll
+++ b/llvm/test/CodeGen/X86/sse2-schedule.ll
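The hunks below update the same kind of autogenerated FileCheck expectations. Every test in these schedule files follows the shape sketched here (function name and body illustrative, not taken from the patch): one IR operation per instruction under test, with per-CPU -NEXT check lines carrying the sched: comments.

define <2 x double> @sketch_subpd(<2 x double> %a0, <2 x double> %a1) {
; HASWELL-LABEL: sketch_subpd:
; HASWELL:       # BB#0:
; HASWELL-NEXT:    vsubpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; HASWELL-NEXT:    retq # sched: [2:1.00]
  %1 = fsub <2 x double> %a0, %a1
  ret <2 x double> %1
}

Rather than hand-editing, such expectations are normally regenerated; a typical invocation, with paths assumed, is:
llvm/utils/update_llc_test_checks.py --llc-binary=build/bin/llc llvm/test/CodeGen/X86/sse2-schedule.ll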
@@ -37,8 +37,8 @@ define <2 x double> @test_addpd(<2 x double> %a0, <2 x double> %a1, <2 x double>
; HASWELL-LABEL: test_addpd:
; HASWELL: # BB#0:
; HASWELL-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT: vaddpd (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vaddpd (%rdi), %xmm0, %xmm0 # sched: [3:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_addpd:
; BTVER2: # BB#0:
@@ -85,8 +85,8 @@ define double @test_addsd(double %a0, double %a1, double *%a2) {
; HASWELL-LABEL: test_addsd:
; HASWELL: # BB#0:
; HASWELL-NEXT: vaddsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT: vaddsd (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vaddsd (%rdi), %xmm0, %xmm0 # sched: [3:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_addsd:
; BTVER2: # BB#0:
@@ -137,9 +137,9 @@ define <2 x double> @test_andpd(<2 x double> %a0, <2 x double> %a1, <2 x double>
; HASWELL-LABEL: test_andpd:
; HASWELL: # BB#0:
; HASWELL-NEXT: vandpd %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
-; HASWELL-NEXT: vandpd (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
+; HASWELL-NEXT: vandpd (%rdi), %xmm0, %xmm0 # sched: [1:1.00]
; HASWELL-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_andpd:
; BTVER2: # BB#0:
@@ -197,9 +197,9 @@ define <2 x double> @test_andnotpd(<2 x double> %a0, <2 x double> %a1, <2 x doub
; HASWELL-LABEL: test_andnotpd:
; HASWELL: # BB#0:
; HASWELL-NEXT: vandnpd %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
-; HASWELL-NEXT: vandnpd (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
+; HASWELL-NEXT: vandnpd (%rdi), %xmm0, %xmm0 # sched: [1:1.00]
; HASWELL-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_andnotpd:
; BTVER2: # BB#0:
@@ -259,9 +259,9 @@ define <2 x double> @test_cmppd(<2 x double> %a0, <2 x double> %a1, <2 x double>
; HASWELL-LABEL: test_cmppd:
; HASWELL: # BB#0:
; HASWELL-NEXT: vcmpeqpd %xmm1, %xmm0, %xmm1 # sched: [3:1.00]
-; HASWELL-NEXT: vcmpeqpd (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
+; HASWELL-NEXT: vcmpeqpd (%rdi), %xmm0, %xmm0 # sched: [3:1.00]
; HASWELL-NEXT: vorpd %xmm0, %xmm1, %xmm0 # sched: [1:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_cmppd:
; BTVER2: # BB#0:
@@ -314,7 +314,7 @@ define double @test_cmpsd(double %a0, double %a1, double *%a2) {
; HASWELL: # BB#0:
; HASWELL-NEXT: vcmpeqsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; HASWELL-NEXT: vcmpeqsd (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_cmpsd:
; BTVER2: # BB#0:
@@ -407,7 +407,7 @@ define i32 @test_comisd(<2 x double> %a0, <2 x double> %a1, <2 x double> *%a2) {
; HASWELL-NEXT: andb %al, %dl # sched: [1:0.25]
; HASWELL-NEXT: orb %cl, %dl # sched: [1:0.25]
; HASWELL-NEXT: movzbl %dl, %eax # sched: [1:0.25]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_comisd:
; BTVER2: # BB#0:
@@ -476,9 +476,9 @@ define <2 x double> @test_cvtdq2pd(<4 x i32> %a0, <4 x i32> *%a1) {
; HASWELL-LABEL: test_cvtdq2pd:
; HASWELL: # BB#0:
; HASWELL-NEXT: vcvtdq2pd %xmm0, %xmm0 # sched: [4:1.00]
-; HASWELL-NEXT: vcvtdq2pd (%rdi), %xmm1 # sched: [8:1.00]
+; HASWELL-NEXT: vcvtdq2pd (%rdi), %xmm1 # sched: [4:1.00]
; HASWELL-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_cvtdq2pd:
; BTVER2: # BB#0:
@@ -534,10 +534,10 @@ define <4 x float> @test_cvtdq2ps(<4 x i32> %a0, <4 x i32> *%a1) {
;
; HASWELL-LABEL: test_cvtdq2ps:
; HASWELL: # BB#0:
-; HASWELL-NEXT: vcvtdq2ps %xmm0, %xmm0 # sched: [4:1.00]
-; HASWELL-NEXT: vcvtdq2ps (%rdi), %xmm1 # sched: [8:1.00]
+; HASWELL-NEXT: vcvtdq2ps %xmm0, %xmm0 # sched: [3:1.00]
+; HASWELL-NEXT: vcvtdq2ps (%rdi), %xmm1 # sched: [3:1.00]
; HASWELL-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_cvtdq2ps:
; BTVER2: # BB#0:
@@ -592,9 +592,9 @@ define <4 x i32> @test_cvtpd2dq(<2 x double> %a0, <2 x double> *%a1) {
; HASWELL-LABEL: test_cvtpd2dq:
; HASWELL: # BB#0:
; HASWELL-NEXT: vcvtpd2dq %xmm0, %xmm0 # sched: [4:1.00]
-; HASWELL-NEXT: vcvtpd2dqx (%rdi), %xmm1 # sched: [8:1.00]
+; HASWELL-NEXT: vcvtpd2dqx (%rdi), %xmm1 # sched: [7:1.00]
; HASWELL-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_cvtpd2dq:
; BTVER2: # BB#0:
@@ -650,9 +650,9 @@ define <4 x float> @test_cvtpd2ps(<2 x double> %a0, <2 x double> *%a1) {
; HASWELL-LABEL: test_cvtpd2ps:
; HASWELL: # BB#0:
; HASWELL-NEXT: vcvtpd2ps %xmm0, %xmm0 # sched: [4:1.00]
-; HASWELL-NEXT: vcvtpd2psx (%rdi), %xmm1 # sched: [8:1.00]
+; HASWELL-NEXT: vcvtpd2psx (%rdi), %xmm1 # sched: [7:1.00]
; HASWELL-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_cvtpd2ps:
; BTVER2: # BB#0:
@@ -708,9 +708,9 @@ define <4 x i32> @test_cvtps2dq(<4 x float> %a0, <4 x float> *%a1) {
; HASWELL-LABEL: test_cvtps2dq:
; HASWELL: # BB#0:
; HASWELL-NEXT: vcvtps2dq %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT: vcvtps2dq (%rdi), %xmm1 # sched: [7:1.00]
+; HASWELL-NEXT: vcvtps2dq (%rdi), %xmm1 # sched: [3:1.00]
; HASWELL-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_cvtps2dq:
; BTVER2: # BB#0:
@@ -766,9 +766,9 @@ define <2 x double> @test_cvtps2pd(<4 x float> %a0, <4 x float> *%a1) {
; HASWELL-LABEL: test_cvtps2pd:
; HASWELL: # BB#0:
; HASWELL-NEXT: vcvtps2pd %xmm0, %xmm0 # sched: [2:1.00]
-; HASWELL-NEXT: vcvtps2pd (%rdi), %xmm1 # sched: [5:1.00]
+; HASWELL-NEXT: vcvtps2pd (%rdi), %xmm1 # sched: [1:1.00]
; HASWELL-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_cvtps2pd:
; BTVER2: # BB#0:
@@ -824,9 +824,9 @@ define i32 @test_cvtsd2si(double %a0, double *%a1) {
; HASWELL-LABEL: test_cvtsd2si:
; HASWELL: # BB#0:
; HASWELL-NEXT: vcvtsd2si %xmm0, %ecx # sched: [4:1.00]
-; HASWELL-NEXT: vcvtsd2si (%rdi), %eax # sched: [8:1.00]
+; HASWELL-NEXT: vcvtsd2si (%rdi), %eax # sched: [4:1.00]
; HASWELL-NEXT: addl %ecx, %eax # sched: [1:0.25]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_cvtsd2si:
; BTVER2: # BB#0:
@@ -883,9 +883,9 @@ define i64 @test_cvtsd2siq(double %a0, double *%a1) {
; HASWELL-LABEL: test_cvtsd2siq:
; HASWELL: # BB#0:
; HASWELL-NEXT: vcvtsd2si %xmm0, %rcx # sched: [4:1.00]
-; HASWELL-NEXT: vcvtsd2si (%rdi), %rax # sched: [8:1.00]
+; HASWELL-NEXT: vcvtsd2si (%rdi), %rax # sched: [4:1.00]
; HASWELL-NEXT: addq %rcx, %rax # sched: [1:0.25]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_cvtsd2siq:
; BTVER2: # BB#0:
@@ -947,10 +947,10 @@ define float @test_cvtsd2ss(double %a0, double *%a1) {
; HASWELL-LABEL: test_cvtsd2ss:
; HASWELL: # BB#0:
; HASWELL-NEXT: vcvtsd2ss %xmm0, %xmm0, %xmm0 # sched: [4:1.00]
-; HASWELL-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero sched: [4:0.50]
+; HASWELL-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero sched: [1:0.50]
; HASWELL-NEXT: vcvtsd2ss %xmm1, %xmm1, %xmm1 # sched: [4:1.00]
; HASWELL-NEXT: vaddss %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_cvtsd2ss:
; BTVER2: # BB#0:
@@ -1008,7 +1008,7 @@ define double @test_cvtsi2sd(i32 %a0, i32 *%a1) {
; HASWELL-NEXT: vcvtsi2sdl %edi, %xmm0, %xmm0 # sched: [4:1.00]
; HASWELL-NEXT: vcvtsi2sdl (%rsi), %xmm1, %xmm1 # sched: [8:1.00]
; HASWELL-NEXT: vaddsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_cvtsi2sd:
; BTVER2: # BB#0:
@@ -1064,7 +1064,7 @@ define double @test_cvtsi2sdq(i64 %a0, i64 *%a1) {
; HASWELL-NEXT: vcvtsi2sdq %rdi, %xmm0, %xmm0 # sched: [4:1.00]
; HASWELL-NEXT: vcvtsi2sdq (%rsi), %xmm1, %xmm1 # sched: [8:1.00]
; HASWELL-NEXT: vaddsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_cvtsi2sdq:
; BTVER2: # BB#0:
@@ -1125,10 +1125,10 @@ define double @test_cvtss2sd(float %a0, float *%a1) {
; HASWELL-LABEL: test_cvtss2sd:
; HASWELL: # BB#0:
; HASWELL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 # sched: [2:1.00]
-; HASWELL-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [4:0.50]
+; HASWELL-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [1:0.50]
; HASWELL-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 # sched: [2:1.00]
; HASWELL-NEXT: vaddsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_cvtss2sd:
; BTVER2: # BB#0:
@@ -1185,9 +1185,9 @@ define <4 x i32> @test_cvttpd2dq(<2 x double> %a0, <2 x double> *%a1) {
; HASWELL-LABEL: test_cvttpd2dq:
; HASWELL: # BB#0:
; HASWELL-NEXT: vcvttpd2dq %xmm0, %xmm0 # sched: [4:1.00]
-; HASWELL-NEXT: vcvttpd2dqx (%rdi), %xmm1 # sched: [8:1.00]
+; HASWELL-NEXT: vcvttpd2dqx (%rdi), %xmm1 # sched: [7:1.00]
; HASWELL-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_cvttpd2dq:
; BTVER2: # BB#0:
@@ -1244,9 +1244,9 @@ define <4 x i32> @test_cvttps2dq(<4 x float> %a0, <4 x float> *%a1) {
; HASWELL-LABEL: test_cvttps2dq:
; HASWELL: # BB#0:
; HASWELL-NEXT: vcvttps2dq %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT: vcvttps2dq (%rdi), %xmm1 # sched: [7:1.00]
+; HASWELL-NEXT: vcvttps2dq (%rdi), %xmm1 # sched: [3:1.00]
; HASWELL-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_cvttps2dq:
; BTVER2: # BB#0:
@@ -1300,9 +1300,9 @@ define i32 @test_cvttsd2si(double %a0, double *%a1) {
; HASWELL-LABEL: test_cvttsd2si:
; HASWELL: # BB#0:
; HASWELL-NEXT: vcvttsd2si %xmm0, %ecx # sched: [4:1.00]
-; HASWELL-NEXT: vcvttsd2si (%rdi), %eax # sched: [8:1.00]
+; HASWELL-NEXT: vcvttsd2si (%rdi), %eax # sched: [4:1.00]
; HASWELL-NEXT: addl %ecx, %eax # sched: [1:0.25]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_cvttsd2si:
; BTVER2: # BB#0:
@@ -1356,9 +1356,9 @@ define i64 @test_cvttsd2siq(double %a0, double *%a1) {
; HASWELL-LABEL: test_cvttsd2siq:
; HASWELL: # BB#0:
; HASWELL-NEXT: vcvttsd2si %xmm0, %rcx # sched: [4:1.00]
-; HASWELL-NEXT: vcvttsd2si (%rdi), %rax # sched: [8:1.00]
+; HASWELL-NEXT: vcvttsd2si (%rdi), %rax # sched: [4:1.00]
; HASWELL-NEXT: addq %rcx, %rax # sched: [1:0.25]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_cvttsd2siq:
; BTVER2: # BB#0:
@@ -1407,9 +1407,9 @@ define <2 x double> @test_divpd(<2 x double> %a0, <2 x double> %a1, <2 x double>
;
; HASWELL-LABEL: test_divpd:
; HASWELL: # BB#0:
-; HASWELL-NEXT: vdivpd %xmm1, %xmm0, %xmm0 # sched: [12:1.00]
-; HASWELL-NEXT: vdivpd (%rdi), %xmm0, %xmm0 # sched: [16:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vdivpd %xmm1, %xmm0, %xmm0 # sched: [20:1.00]
+; HASWELL-NEXT: vdivpd (%rdi), %xmm0, %xmm0 # sched: [20:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_divpd:
; BTVER2: # BB#0:
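As a worked example of the notation: after this hunk vdivpd carries sched: [20:1.00], so a chain of k data-dependent divides is modeled at roughly 20k cycles, while k independent divides are still modeled to sustain one issue per cycle (the throughput field stays at 1.00; only the latency changed).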
@@ -1455,9 +1455,9 @@ define double @test_divsd(double %a0, double %a1, double *%a2) {
;
; HASWELL-LABEL: test_divsd:
; HASWELL: # BB#0:
-; HASWELL-NEXT: vdivsd %xmm1, %xmm0, %xmm0 # sched: [12:1.00]
-; HASWELL-NEXT: vdivsd (%rdi), %xmm0, %xmm0 # sched: [16:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vdivsd %xmm1, %xmm0, %xmm0 # sched: [20:1.00]
+; HASWELL-NEXT: vdivsd (%rdi), %xmm0, %xmm0 # sched: [20:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_divsd:
; BTVER2: # BB#0:
@@ -1505,8 +1505,8 @@ define void @test_lfence() {
;
; HASWELL-LABEL: test_lfence:
; HASWELL: # BB#0:
-; HASWELL-NEXT: lfence # sched: [1:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: lfence # sched: [2:0.50]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_lfence:
; BTVER2: # BB#0:
@@ -1551,8 +1551,8 @@ define void @test_mfence() {
;
; HASWELL-LABEL: test_mfence:
; HASWELL: # BB#0:
-; HASWELL-NEXT: mfence # sched: [1:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: mfence # sched: [2:0.50]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_mfence:
; BTVER2: # BB#0:
@@ -1595,8 +1595,8 @@ define void @test_maskmovdqu(<16 x i8> %a0, <16 x i8> %a1, i8* %a2) {
;
; HASWELL-LABEL: test_maskmovdqu:
; HASWELL: # BB#0:
-; HASWELL-NEXT: vmaskmovdqu %xmm1, %xmm0 # sched: [14:2.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vmaskmovdqu %xmm1, %xmm0 # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_maskmovdqu:
; BTVER2: # BB#0:
@@ -1640,8 +1640,8 @@ define <2 x double> @test_maxpd(<2 x double> %a0, <2 x double> %a1, <2 x double>
; HASWELL-LABEL: test_maxpd:
; HASWELL: # BB#0:
; HASWELL-NEXT: vmaxpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT: vmaxpd (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vmaxpd (%rdi), %xmm0, %xmm0 # sched: [3:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_maxpd:
; BTVER2: # BB#0:
@@ -1689,8 +1689,8 @@ define <2 x double> @test_maxsd(<2 x double> %a0, <2 x double> %a1, <2 x double>
; HASWELL-LABEL: test_maxsd:
; HASWELL: # BB#0:
; HASWELL-NEXT: vmaxsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT: vmaxsd (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vmaxsd (%rdi), %xmm0, %xmm0 # sched: [3:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_maxsd:
; BTVER2: # BB#0:
@@ -1738,8 +1738,8 @@ define <2 x double> @test_minpd(<2 x double> %a0, <2 x double> %a1, <2 x double>
; HASWELL-LABEL: test_minpd:
; HASWELL: # BB#0:
; HASWELL-NEXT: vminpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT: vminpd (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vminpd (%rdi), %xmm0, %xmm0 # sched: [3:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_minpd:
; BTVER2: # BB#0:
@@ -1787,8 +1787,8 @@ define <2 x double> @test_minsd(<2 x double> %a0, <2 x double> %a1, <2 x double>
; HASWELL-LABEL: test_minsd:
; HASWELL: # BB#0:
; HASWELL-NEXT: vminsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT: vminsd (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vminsd (%rdi), %xmm0, %xmm0 # sched: [3:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_minsd:
; BTVER2: # BB#0:
@@ -1839,10 +1839,10 @@ define void @test_movapd(<2 x double> *%a0, <2 x double> *%a1) {
;
; HASWELL-LABEL: test_movapd:
; HASWELL: # BB#0:
-; HASWELL-NEXT: vmovapd (%rdi), %xmm0 # sched: [4:0.50]
+; HASWELL-NEXT: vmovapd (%rdi), %xmm0 # sched: [1:0.50]
; HASWELL-NEXT: vaddpd %xmm0, %xmm0, %xmm0 # sched: [3:1.00]
; HASWELL-NEXT: vmovapd %xmm0, (%rsi) # sched: [1:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_movapd:
; BTVER2: # BB#0:
@@ -1894,10 +1894,10 @@ define void @test_movdqa(<2 x i64> *%a0, <2 x i64> *%a1) {
;
; HASWELL-LABEL: test_movdqa:
; HASWELL: # BB#0:
-; HASWELL-NEXT: vmovdqa (%rdi), %xmm0 # sched: [4:0.50]
+; HASWELL-NEXT: vmovdqa (%rdi), %xmm0 # sched: [1:0.50]
; HASWELL-NEXT: vpaddq %xmm0, %xmm0, %xmm0 # sched: [1:0.50]
; HASWELL-NEXT: vmovdqa %xmm0, (%rsi) # sched: [1:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_movdqa:
; BTVER2: # BB#0:
@@ -1949,10 +1949,10 @@ define void @test_movdqu(<2 x i64> *%a0, <2 x i64> *%a1) {
;
; HASWELL-LABEL: test_movdqu:
; HASWELL: # BB#0:
-; HASWELL-NEXT: vmovdqu (%rdi), %xmm0 # sched: [4:0.50]
+; HASWELL-NEXT: vmovdqu (%rdi), %xmm0 # sched: [1:0.50]
; HASWELL-NEXT: vpaddq %xmm0, %xmm0, %xmm0 # sched: [1:0.50]
; HASWELL-NEXT: vmovdqu %xmm0, (%rsi) # sched: [1:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_movdqu:
; BTVER2: # BB#0:
@@ -2017,12 +2017,12 @@ define i32 @test_movd(<4 x i32> %a0, i32 %a1, i32 *%a2) {
; HASWELL-LABEL: test_movd:
; HASWELL: # BB#0:
; HASWELL-NEXT: vmovd %edi, %xmm1 # sched: [1:1.00]
-; HASWELL-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [4:0.50]
+; HASWELL-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [1:0.50]
; HASWELL-NEXT: vpaddd %xmm1, %xmm0, %xmm1 # sched: [1:0.50]
; HASWELL-NEXT: vpaddd %xmm2, %xmm0, %xmm0 # sched: [1:0.50]
; HASWELL-NEXT: vmovd %xmm0, %eax # sched: [1:1.00]
; HASWELL-NEXT: vmovd %xmm1, (%rsi) # sched: [1:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_movd:
; BTVER2: # BB#0:
@@ -2098,12 +2098,12 @@ define i64 @test_movd_64(<2 x i64> %a0, i64 %a1, i64 *%a2) {
; HASWELL-LABEL: test_movd_64:
; HASWELL: # BB#0:
; HASWELL-NEXT: vmovq %rdi, %xmm1 # sched: [1:1.00]
-; HASWELL-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero sched: [4:0.50]
+; HASWELL-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero sched: [1:0.50]
; HASWELL-NEXT: vpaddq %xmm1, %xmm0, %xmm1 # sched: [1:0.50]
; HASWELL-NEXT: vpaddq %xmm2, %xmm0, %xmm0 # sched: [1:0.50]
; HASWELL-NEXT: vmovq %xmm0, %rax # sched: [1:1.00]
; HASWELL-NEXT: vmovq %xmm1, (%rsi) # sched: [1:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_movd_64:
; BTVER2: # BB#0:
@@ -2166,10 +2166,10 @@ define void @test_movhpd(<2 x double> %a0, <2 x double> %a1, x86_mmx *%a2) {
;
; HASWELL-LABEL: test_movhpd:
; HASWELL: # BB#0:
-; HASWELL-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [5:1.00]
+; HASWELL-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [1:1.00]
; HASWELL-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; HASWELL-NEXT: vmovhpd %xmm0, (%rdi) # sched: [1:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_movhpd:
; BTVER2: # BB#0:
@@ -2224,10 +2224,10 @@ define void @test_movlpd(<2 x double> %a0, <2 x double> %a1, x86_mmx *%a2) {
;
; HASWELL-LABEL: test_movlpd:
; HASWELL: # BB#0:
-; HASWELL-NEXT: vmovlpd {{.*#+}} xmm1 = mem[0],xmm1[1] sched: [5:1.00]
+; HASWELL-NEXT: vmovlpd {{.*#+}} xmm1 = mem[0],xmm1[1] sched: [1:1.00]
; HASWELL-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; HASWELL-NEXT: vmovlpd %xmm0, (%rdi) # sched: [1:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_movlpd:
; BTVER2: # BB#0:
@@ -2277,7 +2277,7 @@ define i32 @test_movmskpd(<2 x double> %a0) {
; HASWELL-LABEL: test_movmskpd:
; HASWELL: # BB#0:
; HASWELL-NEXT: vmovmskpd %xmm0, %eax # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_movmskpd:
; BTVER2: # BB#0:
@@ -2324,7 +2324,7 @@ define void @test_movntdqa(<2 x i64> %a0, <2 x i64> *%a1) {
; HASWELL: # BB#0:
; HASWELL-NEXT: vpaddq %xmm0, %xmm0, %xmm0 # sched: [1:0.50]
; HASWELL-NEXT: vmovntdq %xmm0, (%rdi) # sched: [1:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_movntdqa:
; BTVER2: # BB#0:
@@ -2371,7 +2371,7 @@ define void @test_movntpd(<2 x double> %a0, <2 x double> *%a1) {
; HASWELL: # BB#0:
; HASWELL-NEXT: vaddpd %xmm0, %xmm0, %xmm0 # sched: [3:1.00]
; HASWELL-NEXT: vmovntpd %xmm0, (%rdi) # sched: [1:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_movntpd:
; BTVER2: # BB#0:
@@ -2420,10 +2420,10 @@ define <2 x i64> @test_movq_mem(<2 x i64> %a0, i64 *%a1) {
;
; HASWELL-LABEL: test_movq_mem:
; HASWELL: # BB#0:
-; HASWELL-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero sched: [4:0.50]
+; HASWELL-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero sched: [1:0.50]
; HASWELL-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; HASWELL-NEXT: vmovq %xmm0, (%rdi) # sched: [1:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_movq_mem:
; BTVER2: # BB#0:
@@ -2477,7 +2477,7 @@ define <2 x i64> @test_movq_reg(<2 x i64> %a0, <2 x i64> %a1) {
; HASWELL: # BB#0:
; HASWELL-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero sched: [1:0.33]
; HASWELL-NEXT: vpaddq %xmm0, %xmm1, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_movq_reg:
; BTVER2: # BB#0:
@@ -2526,10 +2526,10 @@ define void @test_movsd_mem(double* %a0, double* %a1) {
;
; HASWELL-LABEL: test_movsd_mem:
; HASWELL: # BB#0:
-; HASWELL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero sched: [4:0.50]
+; HASWELL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero sched: [1:0.50]
; HASWELL-NEXT: vaddsd %xmm0, %xmm0, %xmm0 # sched: [3:1.00]
; HASWELL-NEXT: vmovsd %xmm0, (%rsi) # sched: [1:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_movsd_mem:
; BTVER2: # BB#0:
@@ -2581,7 +2581,7 @@ define <2 x double> @test_movsd_reg(<2 x double> %a0, <2 x double> %a1) {
; HASWELL-LABEL: test_movsd_reg:
; HASWELL: # BB#0:
; HASWELL-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0] sched: [1:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_movsd_reg:
; BTVER2: # BB#0:
@@ -2627,10 +2627,10 @@ define void @test_movupd(<2 x double> *%a0, <2 x double> *%a1) {
;
; HASWELL-LABEL: test_movupd:
; HASWELL: # BB#0:
-; HASWELL-NEXT: vmovupd (%rdi), %xmm0 # sched: [4:0.50]
+; HASWELL-NEXT: vmovupd (%rdi), %xmm0 # sched: [1:0.50]
; HASWELL-NEXT: vaddpd %xmm0, %xmm0, %xmm0 # sched: [3:1.00]
; HASWELL-NEXT: vmovupd %xmm0, (%rsi) # sched: [1:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_movupd:
; BTVER2: # BB#0:
@@ -2679,8 +2679,8 @@ define <2 x double> @test_mulpd(<2 x double> %a0, <2 x double> %a1, <2 x double>
; HASWELL-LABEL: test_mulpd:
; HASWELL: # BB#0:
; HASWELL-NEXT: vmulpd %xmm1, %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NEXT: vmulpd (%rdi), %xmm0, %xmm0 # sched: [9:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vmulpd (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_mulpd:
; BTVER2: # BB#0:
@@ -2727,8 +2727,8 @@ define double @test_mulsd(double %a0, double %a1, double *%a2) {
; HASWELL-LABEL: test_mulsd:
; HASWELL: # BB#0:
; HASWELL-NEXT: vmulsd %xmm1, %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NEXT: vmulsd (%rdi), %xmm0, %xmm0 # sched: [9:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vmulsd (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_mulsd:
; BTVER2: # BB#0:
@@ -2779,9 +2779,9 @@ define <2 x double> @test_orpd(<2 x double> %a0, <2 x double> %a1, <2 x double>
; HASWELL-LABEL: test_orpd:
; HASWELL: # BB#0:
; HASWELL-NEXT: vorpd %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
-; HASWELL-NEXT: vorpd (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
+; HASWELL-NEXT: vorpd (%rdi), %xmm0, %xmm0 # sched: [1:1.00]
; HASWELL-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_orpd:
; BTVER2: # BB#0:
@@ -2839,8 +2839,8 @@ define <8 x i16> @test_packssdw(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
; HASWELL-LABEL: test_packssdw:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
-; HASWELL-NEXT: vpackssdw (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpackssdw (%rdi), %xmm0, %xmm0 # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_packssdw:
; BTVER2: # BB#0:
@@ -2893,8 +2893,8 @@ define <16 x i8> @test_packsswb(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
; HASWELL-LABEL: test_packsswb:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
-; HASWELL-NEXT: vpacksswb (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpacksswb (%rdi), %xmm0, %xmm0 # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_packsswb:
; BTVER2: # BB#0:
@@ -2947,8 +2947,8 @@ define <16 x i8> @test_packuswb(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
; HASWELL-LABEL: test_packuswb:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
-; HASWELL-NEXT: vpackuswb (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpackuswb (%rdi), %xmm0, %xmm0 # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_packuswb:
; BTVER2: # BB#0:
@@ -3001,8 +3001,8 @@ define <16 x i8> @test_paddb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
; HASWELL-LABEL: test_paddb:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpaddb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: vpaddb (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpaddb (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_paddb:
; BTVER2: # BB#0:
@@ -3053,8 +3053,8 @@ define <4 x i32> @test_paddd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
; HASWELL-LABEL: test_paddd:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: vpaddd (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpaddd (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_paddd:
; BTVER2: # BB#0:
@@ -3101,8 +3101,8 @@ define <2 x i64> @test_paddq(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) {
; HASWELL-LABEL: test_paddq:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: vpaddq (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpaddq (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_paddq:
; BTVER2: # BB#0:
@@ -3153,8 +3153,8 @@ define <16 x i8> @test_paddsb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
; HASWELL-LABEL: test_paddsb:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpaddsb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: vpaddsb (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpaddsb (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_paddsb:
; BTVER2: # BB#0:
@@ -3206,8 +3206,8 @@ define <8 x i16> @test_paddsw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
; HASWELL-LABEL: test_paddsw:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpaddsw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: vpaddsw (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpaddsw (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_paddsw:
; BTVER2: # BB#0:
@@ -3259,8 +3259,8 @@ define <16 x i8> @test_paddusb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
; HASWELL-LABEL: test_paddusb:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpaddusb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: vpaddusb (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpaddusb (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_paddusb:
; BTVER2: # BB#0:
@@ -3312,8 +3312,8 @@ define <8 x i16> @test_paddusw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
; HASWELL-LABEL: test_paddusw:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpaddusw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: vpaddusw (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpaddusw (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_paddusw:
; BTVER2: # BB#0:
@@ -3365,8 +3365,8 @@ define <8 x i16> @test_paddw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
; HASWELL-LABEL: test_paddw:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: vpaddw (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpaddw (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_paddw:
; BTVER2: # BB#0:
@@ -3417,9 +3417,9 @@ define <2 x i64> @test_pand(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) {
; HASWELL-LABEL: test_pand:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpand %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
-; HASWELL-NEXT: vpand (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT: vpand (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
; HASWELL-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_pand:
; BTVER2: # BB#0:
@@ -3479,9 +3479,9 @@ define <2 x i64> @test_pandn(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) {
; HASWELL-LABEL: test_pandn:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpandn %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
-; HASWELL-NEXT: vpandn (%rdi), %xmm0, %xmm1 # sched: [5:0.50]
+; HASWELL-NEXT: vpandn (%rdi), %xmm0, %xmm1 # sched: [1:0.50]
; HASWELL-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_pandn:
; BTVER2: # BB#0:
@@ -3537,8 +3537,8 @@ define <16 x i8> @test_pavgb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
; HASWELL-LABEL: test_pavgb:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpavgb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: vpavgb (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpavgb (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_pavgb:
; BTVER2: # BB#0:
@@ -3590,8 +3590,8 @@ define <8 x i16> @test_pavgw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
; HASWELL-LABEL: test_pavgw:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpavgw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: vpavgw (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpavgw (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_pavgw:
; BTVER2: # BB#0:
@@ -3645,9 +3645,9 @@ define <16 x i8> @test_pcmpeqb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
; HASWELL-LABEL: test_pcmpeqb:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm1 # sched: [1:0.50]
-; HASWELL-NEXT: vpcmpeqb (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT: vpcmpeqb (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
; HASWELL-NEXT: vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.33]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_pcmpeqb:
; BTVER2: # BB#0:
@@ -3704,9 +3704,9 @@ define <4 x i32> @test_pcmpeqd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
; HASWELL-LABEL: test_pcmpeqd:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm1 # sched: [1:0.50]
-; HASWELL-NEXT: vpcmpeqd (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT: vpcmpeqd (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
; HASWELL-NEXT: vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.33]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_pcmpeqd:
; BTVER2: # BB#0:
@@ -3763,9 +3763,9 @@ define <8 x i16> @test_pcmpeqw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
; HASWELL-LABEL: test_pcmpeqw:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm1 # sched: [1:0.50]
-; HASWELL-NEXT: vpcmpeqw (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT: vpcmpeqw (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
; HASWELL-NEXT: vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.33]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_pcmpeqw:
; BTVER2: # BB#0:
@@ -3823,9 +3823,9 @@ define <16 x i8> @test_pcmpgtb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
; HASWELL-LABEL: test_pcmpgtb:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm1 # sched: [1:0.50]
-; HASWELL-NEXT: vpcmpgtb (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT: vpcmpgtb (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
; HASWELL-NEXT: vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.33]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_pcmpgtb:
; BTVER2: # BB#0:
@@ -3883,9 +3883,9 @@ define <4 x i32> @test_pcmpgtd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
; HASWELL-LABEL: test_pcmpgtd:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm1 # sched: [1:0.50]
-; HASWELL-NEXT: vpcmpeqd (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT: vpcmpeqd (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
; HASWELL-NEXT: vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.33]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_pcmpgtd:
; BTVER2: # BB#0:
@@ -3943,9 +3943,9 @@ define <8 x i16> @test_pcmpgtw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
; HASWELL-LABEL: test_pcmpgtw:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm1 # sched: [1:0.50]
-; HASWELL-NEXT: vpcmpgtw (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT: vpcmpgtw (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
; HASWELL-NEXT: vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.33]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_pcmpgtw:
; BTVER2: # BB#0:
@@ -3995,9 +3995,9 @@ define i16 @test_pextrw(<8 x i16> %a0) {
;
; HASWELL-LABEL: test_pextrw:
; HASWELL: # BB#0:
-; HASWELL-NEXT: vpextrw $6, %xmm0, %eax # sched: [1:1.00]
+; HASWELL-NEXT: vpextrw $6, %xmm0, %eax # sched: [2:1.00]
; HASWELL-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_pextrw:
; BTVER2: # BB#0:
@@ -4045,9 +4045,9 @@ define <8 x i16> @test_pinsrw(<8 x i16> %a0, i16 %a1, i16 *%a2) {
;
; HASWELL-LABEL: test_pinsrw:
; HASWELL: # BB#0:
-; HASWELL-NEXT: vpinsrw $1, %edi, %xmm0, %xmm0 # sched: [1:1.00]
-; HASWELL-NEXT: vpinsrw $3, (%rsi), %xmm0, %xmm0 # sched: [5:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpinsrw $1, %edi, %xmm0, %xmm0 # sched: [2:2.00]
+; HASWELL-NEXT: vpinsrw $3, (%rsi), %xmm0, %xmm0 # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_pinsrw:
; BTVER2: # BB#0:
@@ -4102,8 +4102,8 @@ define <4 x i32> @test_pmaddwd(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
; HASWELL-LABEL: test_pmaddwd:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpmaddwd %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
-; HASWELL-NEXT: vpmaddwd (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpmaddwd (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_pmaddwd:
; BTVER2: # BB#0:
@@ -4156,8 +4156,8 @@ define <8 x i16> @test_pmaxsw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
; HASWELL-LABEL: test_pmaxsw:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: vpmaxsw (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpmaxsw (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_pmaxsw:
; BTVER2: # BB#0:
@@ -4209,8 +4209,8 @@ define <16 x i8> @test_pmaxub(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
; HASWELL-LABEL: test_pmaxub:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: vpmaxub (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpmaxub (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_pmaxub:
; BTVER2: # BB#0:
@@ -4262,8 +4262,8 @@ define <8 x i16> @test_pminsw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
; HASWELL-LABEL: test_pminsw:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpminsw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: vpminsw (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpminsw (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_pminsw:
; BTVER2: # BB#0:
@@ -4315,8 +4315,8 @@ define <16 x i8> @test_pminub(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
; HASWELL-LABEL: test_pminub:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpminub %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: vpminub (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpminub (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_pminub:
; BTVER2: # BB#0:
@@ -4362,7 +4362,7 @@ define i32 @test_pmovmskb(<16 x i8> %a0) {
; HASWELL-LABEL: test_pmovmskb:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpmovmskb %xmm0, %eax # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_pmovmskb:
; BTVER2: # BB#0:
@@ -4406,8 +4406,8 @@ define <8 x i16> @test_pmulhuw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
; HASWELL-LABEL: test_pmulhuw:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpmulhuw %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
-; HASWELL-NEXT: vpmulhuw (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpmulhuw (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_pmulhuw:
; BTVER2: # BB#0:
@@ -4455,8 +4455,8 @@ define <8 x i16> @test_pmulhw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
; HASWELL-LABEL: test_pmulhw:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpmulhw %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
-; HASWELL-NEXT: vpmulhw (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpmulhw (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_pmulhw:
; BTVER2: # BB#0:
@@ -4504,8 +4504,8 @@ define <8 x i16> @test_pmullw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
; HASWELL-LABEL: test_pmullw:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
-; HASWELL-NEXT: vpmullw (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpmullw (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_pmullw:
; BTVER2: # BB#0:
@@ -4560,8 +4560,8 @@ define <2 x i64> @test_pmuludq(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
; HASWELL-LABEL: test_pmuludq:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
-; HASWELL-NEXT: vpmuludq (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpmuludq (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_pmuludq:
; BTVER2: # BB#0:
@@ -4614,9 +4614,9 @@ define <2 x i64> @test_por(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) {
; HASWELL-LABEL: test_por:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpor %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
-; HASWELL-NEXT: vpor (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT: vpor (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
; HASWELL-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_por:
; BTVER2: # BB#0:
@@ -4674,8 +4674,8 @@ define <2 x i64> @test_psadbw(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
; HASWELL-LABEL: test_psadbw:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
-; HASWELL-NEXT: vpsadbw (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpsadbw (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_psadbw:
; BTVER2: # BB#0:
@@ -4730,9 +4730,9 @@ define <4 x i32> @test_pshufd(<4 x i32> %a0, <4 x i32> *%a1) {
; HASWELL-LABEL: test_pshufd:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,3,2] sched: [1:1.00]
-; HASWELL-NEXT: vpshufd {{.*#+}} xmm1 = mem[3,2,1,0] sched: [5:1.00]
+; HASWELL-NEXT: vpshufd {{.*#+}} xmm1 = mem[3,2,1,0] sched: [1:1.00]
; HASWELL-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_pshufd:
; BTVER2: # BB#0:
@@ -4788,9 +4788,9 @@ define <8 x i16> @test_pshufhw(<8 x i16> %a0, <8 x i16> *%a1) {
; HASWELL-LABEL: test_pshufhw:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,6] sched: [1:1.00]
-; HASWELL-NEXT: vpshufhw {{.*#+}} xmm1 = mem[0,1,2,3,7,6,5,4] sched: [5:1.00]
+; HASWELL-NEXT: vpshufhw {{.*#+}} xmm1 = mem[0,1,2,3,7,6,5,4] sched: [1:1.00]
; HASWELL-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_pshufhw:
; BTVER2: # BB#0:
@@ -4846,9 +4846,9 @@ define <8 x i16> @test_pshuflw(<8 x i16> %a0, <8 x i16> *%a1) {
; HASWELL-LABEL: test_pshuflw:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7] sched: [1:1.00]
-; HASWELL-NEXT: vpshuflw {{.*#+}} xmm1 = mem[3,2,1,0,4,5,6,7] sched: [5:1.00]
+; HASWELL-NEXT: vpshuflw {{.*#+}} xmm1 = mem[3,2,1,0,4,5,6,7] sched: [1:1.00]
; HASWELL-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_pshuflw:
; BTVER2: # BB#0:
@@ -4902,9 +4902,9 @@ define <4 x i32> @test_pslld(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
; HASWELL-LABEL: test_pslld:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpslld %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
-; HASWELL-NEXT: vpslld (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
+; HASWELL-NEXT: vpslld (%rdi), %xmm0, %xmm0 # sched: [2:1.00]
; HASWELL-NEXT: vpslld $2, %xmm0, %xmm0 # sched: [1:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_pslld:
; BTVER2: # BB#0:
@@ -4958,7 +4958,7 @@ define <4 x i32> @test_pslldq(<4 x i32> %a0) {
; HASWELL-LABEL: test_pslldq:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11] sched: [1:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_pslldq:
; BTVER2: # BB#0:
@@ -5005,9 +5005,9 @@ define <2 x i64> @test_psllq(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) {
; HASWELL-LABEL: test_psllq:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpsllq %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
-; HASWELL-NEXT: vpsllq (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
+; HASWELL-NEXT: vpsllq (%rdi), %xmm0, %xmm0 # sched: [2:1.00]
; HASWELL-NEXT: vpsllq $2, %xmm0, %xmm0 # sched: [1:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_psllq:
; BTVER2: # BB#0:
@@ -5063,9 +5063,9 @@ define <8 x i16> @test_psllw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
; HASWELL-LABEL: test_psllw:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpsllw %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
-; HASWELL-NEXT: vpsllw (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
+; HASWELL-NEXT: vpsllw (%rdi), %xmm0, %xmm0 # sched: [2:1.00]
; HASWELL-NEXT: vpsllw $2, %xmm0, %xmm0 # sched: [1:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_psllw:
; BTVER2: # BB#0:
@@ -5121,9 +5121,9 @@ define <4 x i32> @test_psrad(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
; HASWELL-LABEL: test_psrad:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpsrad %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
-; HASWELL-NEXT: vpsrad (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
+; HASWELL-NEXT: vpsrad (%rdi), %xmm0, %xmm0 # sched: [2:1.00]
; HASWELL-NEXT: vpsrad $2, %xmm0, %xmm0 # sched: [1:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_psrad:
; BTVER2: # BB#0:
@@ -5179,9 +5179,9 @@ define <8 x i16> @test_psraw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
; HASWELL-LABEL: test_psraw:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpsraw %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
-; HASWELL-NEXT: vpsraw (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
+; HASWELL-NEXT: vpsraw (%rdi), %xmm0, %xmm0 # sched: [2:1.00]
; HASWELL-NEXT: vpsraw $2, %xmm0, %xmm0 # sched: [1:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_psraw:
; BTVER2: # BB#0:
@@ -5237,9 +5237,9 @@ define <4 x i32> @test_psrld(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
; HASWELL-LABEL: test_psrld:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpsrld %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
-; HASWELL-NEXT: vpsrld (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
+; HASWELL-NEXT: vpsrld (%rdi), %xmm0, %xmm0 # sched: [2:1.00]
; HASWELL-NEXT: vpsrld $2, %xmm0, %xmm0 # sched: [1:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_psrld:
; BTVER2: # BB#0:
@@ -5293,7 +5293,7 @@ define <4 x i32> @test_psrldq(<4 x i32> %a0) {
; HASWELL-LABEL: test_psrldq:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero sched: [1:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_psrldq:
; BTVER2: # BB#0:
@@ -5340,9 +5340,9 @@ define <2 x i64> @test_psrlq(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) {
; HASWELL-LABEL: test_psrlq:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpsrlq %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
-; HASWELL-NEXT: vpsrlq (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
+; HASWELL-NEXT: vpsrlq (%rdi), %xmm0, %xmm0 # sched: [2:1.00]
; HASWELL-NEXT: vpsrlq $2, %xmm0, %xmm0 # sched: [1:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_psrlq:
; BTVER2: # BB#0:
@@ -5398,9 +5398,9 @@ define <8 x i16> @test_psrlw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
; HASWELL-LABEL: test_psrlw:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
-; HASWELL-NEXT: vpsrlw (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
+; HASWELL-NEXT: vpsrlw (%rdi), %xmm0, %xmm0 # sched: [2:1.00]
; HASWELL-NEXT: vpsrlw $2, %xmm0, %xmm0 # sched: [1:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_psrlw:
; BTVER2: # BB#0:
@@ -5456,8 +5456,8 @@ define <16 x i8> @test_psubb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
; HASWELL-LABEL: test_psubb:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpsubb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: vpsubb (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpsubb (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_psubb:
; BTVER2: # BB#0:
@@ -5508,8 +5508,8 @@ define <4 x i32> @test_psubd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
; HASWELL-LABEL: test_psubd:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: vpsubd (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpsubd (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_psubd:
; BTVER2: # BB#0:
@@ -5556,8 +5556,8 @@ define <2 x i64> @test_psubq(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) {
; HASWELL-LABEL: test_psubq:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpsubq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: vpsubq (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpsubq (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_psubq:
; BTVER2: # BB#0:
@@ -5608,8 +5608,8 @@ define <16 x i8> @test_psubsb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
; HASWELL-LABEL: test_psubsb:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpsubsb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: vpsubsb (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpsubsb (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_psubsb:
; BTVER2: # BB#0:
@@ -5661,8 +5661,8 @@ define <8 x i16> @test_psubsw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
; HASWELL-LABEL: test_psubsw:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpsubsw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: vpsubsw (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpsubsw (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_psubsw:
; BTVER2: # BB#0:
@@ -5714,8 +5714,8 @@ define <16 x i8> @test_psubusb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
; HASWELL-LABEL: test_psubusb:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpsubusb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: vpsubusb (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpsubusb (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_psubusb:
; BTVER2: # BB#0:
@@ -5767,8 +5767,8 @@ define <8 x i16> @test_psubusw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
; HASWELL-LABEL: test_psubusw:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: vpsubusw (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpsubusw (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_psubusw:
; BTVER2: # BB#0:
@@ -5820,8 +5820,8 @@ define <8 x i16> @test_psubw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
; HASWELL-LABEL: test_psubw:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpsubw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: vpsubw (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpsubw (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_psubw:
; BTVER2: # BB#0:
@@ -5872,8 +5872,8 @@ define <16 x i8> @test_punpckhbw(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
; HASWELL-LABEL: test_punpckhbw:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] sched: [1:1.00]
-; HASWELL-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],mem[8],xmm0[9],mem[9],xmm0[10],mem[10],xmm0[11],mem[11],xmm0[12],mem[12],xmm0[13],mem[13],xmm0[14],mem[14],xmm0[15],mem[15] sched: [5:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],mem[8],xmm0[9],mem[9],xmm0[10],mem[10],xmm0[11],mem[11],xmm0[12],mem[12],xmm0[13],mem[13],xmm0[14],mem[14],xmm0[15],mem[15] sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_punpckhbw:
; BTVER2: # BB#0:
@@ -5926,9 +5926,9 @@ define <4 x i32> @test_punpckhdq(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
; HASWELL-LABEL: test_punpckhdq:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00]
-; HASWELL-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] sched: [5:1.00]
+; HASWELL-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] sched: [1:1.00]
; HASWELL-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_punpckhdq:
; BTVER2: # BB#0:
@@ -5982,9 +5982,9 @@ define <2 x i64> @test_punpckhqdq(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2)
; HASWELL-LABEL: test_punpckhqdq:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] sched: [1:1.00]
-; HASWELL-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm1[1],mem[1] sched: [5:1.00]
+; HASWELL-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm1[1],mem[1] sched: [1:1.00]
; HASWELL-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_punpckhqdq:
; BTVER2: # BB#0:
@@ -6038,8 +6038,8 @@ define <8 x i16> @test_punpckhwd(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
; HASWELL-LABEL: test_punpckhwd:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] sched: [1:1.00]
-; HASWELL-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] sched: [5:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_punpckhwd:
; BTVER2: # BB#0:
@@ -6090,8 +6090,8 @@ define <16 x i8> @test_punpcklbw(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
; HASWELL-LABEL: test_punpcklbw:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] sched: [1:1.00]
-; HASWELL-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] sched: [5:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_punpcklbw:
; BTVER2: # BB#0:
@@ -6144,9 +6144,9 @@ define <4 x i32> @test_punpckldq(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
; HASWELL-LABEL: test_punpckldq:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00]
-; HASWELL-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] sched: [5:1.00]
+; HASWELL-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] sched: [1:1.00]
; HASWELL-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_punpckldq:
; BTVER2: # BB#0:
@@ -6200,9 +6200,9 @@ define <2 x i64> @test_punpcklqdq(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2)
; HASWELL-LABEL: test_punpcklqdq:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] sched: [1:1.00]
-; HASWELL-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [5:1.00]
+; HASWELL-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [1:1.00]
; HASWELL-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_punpcklqdq:
; BTVER2: # BB#0:
@@ -6256,8 +6256,8 @@ define <8 x i16> @test_punpcklwd(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
; HASWELL-LABEL: test_punpcklwd:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00]
-; HASWELL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] sched: [5:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_punpcklwd:
; BTVER2: # BB#0:
@@ -6308,9 +6308,9 @@ define <2 x i64> @test_pxor(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) {
; HASWELL-LABEL: test_pxor:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpxor %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
-; HASWELL-NEXT: vpxor (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT: vpxor (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
; HASWELL-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_pxor:
; BTVER2: # BB#0:
@@ -6364,9 +6364,9 @@ define <2 x double> @test_shufpd(<2 x double> %a0, <2 x double> %a1, <2 x double
; HASWELL-LABEL: test_shufpd:
; HASWELL: # BB#0:
; HASWELL-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1],xmm1[0] sched: [1:1.00]
-; HASWELL-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1],mem[0] sched: [5:1.00]
+; HASWELL-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1],mem[0] sched: [1:1.00]
; HASWELL-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_shufpd:
; BTVER2: # BB#0:
@@ -6420,10 +6420,10 @@ define <2 x double> @test_sqrtpd(<2 x double> %a0, <2 x double> *%a1) {
;
; HASWELL-LABEL: test_sqrtpd:
; HASWELL: # BB#0:
-; HASWELL-NEXT: vsqrtpd %xmm0, %xmm0 # sched: [15:1.00]
-; HASWELL-NEXT: vsqrtpd (%rdi), %xmm1 # sched: [19:1.00]
+; HASWELL-NEXT: vsqrtpd %xmm0, %xmm0 # sched: [21:1.00]
+; HASWELL-NEXT: vsqrtpd (%rdi), %xmm1 # sched: [21:1.00]
; HASWELL-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_sqrtpd:
; BTVER2: # BB#0:
@@ -6483,11 +6483,11 @@ define <2 x double> @test_sqrtsd(<2 x double> %a0, <2 x double> *%a1) {
;
; HASWELL-LABEL: test_sqrtsd:
; HASWELL: # BB#0:
-; HASWELL-NEXT: vsqrtsd %xmm0, %xmm0, %xmm0 # sched: [19:1.00]
-; HASWELL-NEXT: vmovapd (%rdi), %xmm1 # sched: [4:0.50]
-; HASWELL-NEXT: vsqrtsd %xmm1, %xmm1, %xmm1 # sched: [19:1.00]
+; HASWELL-NEXT: vsqrtsd %xmm0, %xmm0, %xmm0 # sched: [21:1.00]
+; HASWELL-NEXT: vmovapd (%rdi), %xmm1 # sched: [1:0.50]
+; HASWELL-NEXT: vsqrtsd %xmm1, %xmm1, %xmm1 # sched: [21:1.00]
; HASWELL-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_sqrtsd:
; BTVER2: # BB#0:
@@ -6540,8 +6540,8 @@ define <2 x double> @test_subpd(<2 x double> %a0, <2 x double> %a1, <2 x double>
; HASWELL-LABEL: test_subpd:
; HASWELL: # BB#0:
; HASWELL-NEXT: vsubpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT: vsubpd (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vsubpd (%rdi), %xmm0, %xmm0 # sched: [3:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_subpd:
; BTVER2: # BB#0:
@@ -6588,8 +6588,8 @@ define double @test_subsd(double %a0, double %a1, double *%a2) {
; HASWELL-LABEL: test_subsd:
; HASWELL: # BB#0:
; HASWELL-NEXT: vsubsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT: vsubsd (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vsubsd (%rdi), %xmm0, %xmm0 # sched: [3:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_subsd:
; BTVER2: # BB#0:
@@ -6677,7 +6677,7 @@ define i32 @test_ucomisd(<2 x double> %a0, <2 x double> %a1, <2 x double> *%a2)
; HASWELL-NEXT: andb %al, %dl # sched: [1:0.25]
; HASWELL-NEXT: orb %cl, %dl # sched: [1:0.25]
; HASWELL-NEXT: movzbl %dl, %eax # sched: [1:0.25]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_ucomisd:
; BTVER2: # BB#0:
@@ -6746,9 +6746,9 @@ define <2 x double> @test_unpckhpd(<2 x double> %a0, <2 x double> %a1, <2 x doub
; HASWELL-LABEL: test_unpckhpd:
; HASWELL: # BB#0:
; HASWELL-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] sched: [1:1.00]
-; HASWELL-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1] sched: [5:1.00]
+; HASWELL-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1] sched: [1:1.00]
; HASWELL-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_unpckhpd:
; BTVER2: # BB#0:
@@ -6808,9 +6808,9 @@ define <2 x double> @test_unpcklpd(<2 x double> %a0, <2 x double> %a1, <2 x doub
; HASWELL-LABEL: test_unpcklpd:
; HASWELL: # BB#0:
; HASWELL-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] sched: [1:1.00]
-; HASWELL-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],mem[0] sched: [5:1.00]
+; HASWELL-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],mem[0] sched: [1:1.00]
; HASWELL-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_unpcklpd:
; BTVER2: # BB#0:
@@ -6864,9 +6864,9 @@ define <2 x double> @test_xorpd(<2 x double> %a0, <2 x double> %a1, <2 x double>
; HASWELL-LABEL: test_xorpd:
; HASWELL: # BB#0:
; HASWELL-NEXT: vxorpd %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
-; HASWELL-NEXT: vxorpd (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
+; HASWELL-NEXT: vxorpd (%rdi), %xmm0, %xmm0 # sched: [1:1.00]
; HASWELL-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_xorpd:
; BTVER2: # BB#0:
diff --git a/llvm/test/CodeGen/X86/sse3-schedule.ll b/llvm/test/CodeGen/X86/sse3-schedule.ll
index f346d4fa15a..ee15f0e7df6 100644
--- a/llvm/test/CodeGen/X86/sse3-schedule.ll
+++ b/llvm/test/CodeGen/X86/sse3-schedule.ll
@@ -37,8 +37,8 @@ define <2 x double> @test_addsubpd(<2 x double> %a0, <2 x double> %a1, <2 x doub
; HASWELL-LABEL: test_addsubpd:
; HASWELL: # BB#0:
; HASWELL-NEXT: vaddsubpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT: vaddsubpd (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vaddsubpd (%rdi), %xmm0, %xmm0 # sched: [3:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_addsubpd:
; BTVER2: # BB#0:
@@ -86,8 +86,8 @@ define <4 x float> @test_addsubps(<4 x float> %a0, <4 x float> %a1, <4 x float>
; HASWELL-LABEL: test_addsubps:
; HASWELL: # BB#0:
; HASWELL-NEXT: vaddsubps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT: vaddsubps (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vaddsubps (%rdi), %xmm0, %xmm0 # sched: [3:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_addsubps:
; BTVER2: # BB#0:
@@ -135,8 +135,8 @@ define <2 x double> @test_haddpd(<2 x double> %a0, <2 x double> %a1, <2 x double
; HASWELL-LABEL: test_haddpd:
; HASWELL: # BB#0:
; HASWELL-NEXT: vhaddpd %xmm1, %xmm0, %xmm0 # sched: [5:2.00]
-; HASWELL-NEXT: vhaddpd (%rdi), %xmm0, %xmm0 # sched: [9:2.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vhaddpd (%rdi), %xmm0, %xmm0 # sched: [5:2.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_haddpd:
; BTVER2: # BB#0:
@@ -184,8 +184,8 @@ define <4 x float> @test_haddps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%
; HASWELL-LABEL: test_haddps:
; HASWELL: # BB#0:
; HASWELL-NEXT: vhaddps %xmm1, %xmm0, %xmm0 # sched: [5:2.00]
-; HASWELL-NEXT: vhaddps (%rdi), %xmm0, %xmm0 # sched: [9:2.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vhaddps (%rdi), %xmm0, %xmm0 # sched: [5:2.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_haddps:
; BTVER2: # BB#0:
@@ -233,8 +233,8 @@ define <2 x double> @test_hsubpd(<2 x double> %a0, <2 x double> %a1, <2 x double
; HASWELL-LABEL: test_hsubpd:
; HASWELL: # BB#0:
; HASWELL-NEXT: vhsubpd %xmm1, %xmm0, %xmm0 # sched: [5:2.00]
-; HASWELL-NEXT: vhsubpd (%rdi), %xmm0, %xmm0 # sched: [9:2.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vhsubpd (%rdi), %xmm0, %xmm0 # sched: [5:2.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_hsubpd:
; BTVER2: # BB#0:
@@ -282,8 +282,8 @@ define <4 x float> @test_hsubps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%
; HASWELL-LABEL: test_hsubps:
; HASWELL: # BB#0:
; HASWELL-NEXT: vhsubps %xmm1, %xmm0, %xmm0 # sched: [5:2.00]
-; HASWELL-NEXT: vhsubps (%rdi), %xmm0, %xmm0 # sched: [9:2.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vhsubps (%rdi), %xmm0, %xmm0 # sched: [5:2.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_hsubps:
; BTVER2: # BB#0:
@@ -328,8 +328,8 @@ define <16 x i8> @test_lddqu(i8* %a0) {
;
; HASWELL-LABEL: test_lddqu:
; HASWELL: # BB#0:
-; HASWELL-NEXT: vlddqu (%rdi), %xmm0 # sched: [4:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vlddqu (%rdi), %xmm0 # sched: [1:0.50]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_lddqu:
; BTVER2: # BB#0:
@@ -379,7 +379,7 @@ define void @test_monitor(i8* %a0, i32 %a1, i32 %a2) {
; HASWELL-NEXT: leaq (%rdi), %rax # sched: [1:0.50]
; HASWELL-NEXT: movl %esi, %ecx # sched: [1:0.25]
; HASWELL-NEXT: monitor # sched: [100:0.25]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_monitor:
; BTVER2: # BB#0:
@@ -432,9 +432,9 @@ define <2 x double> @test_movddup(<2 x double> %a0, <2 x double> *%a1) {
; HASWELL-LABEL: test_movddup:
; HASWELL: # BB#0:
; HASWELL-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] sched: [1:1.00]
-; HASWELL-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] sched: [4:0.50]
+; HASWELL-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] sched: [1:0.50]
; HASWELL-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_movddup:
; BTVER2: # BB#0:
@@ -489,9 +489,9 @@ define <4 x float> @test_movshdup(<4 x float> %a0, <4 x float> *%a1) {
; HASWELL-LABEL: test_movshdup:
; HASWELL: # BB#0:
; HASWELL-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] sched: [1:1.00]
-; HASWELL-NEXT: vmovshdup {{.*#+}} xmm1 = mem[1,1,3,3] sched: [4:0.50]
+; HASWELL-NEXT: vmovshdup {{.*#+}} xmm1 = mem[1,1,3,3] sched: [1:0.50]
; HASWELL-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_movshdup:
; BTVER2: # BB#0:
@@ -546,9 +546,9 @@ define <4 x float> @test_movsldup(<4 x float> %a0, <4 x float> *%a1) {
; HASWELL-LABEL: test_movsldup:
; HASWELL: # BB#0:
; HASWELL-NEXT: vmovsldup {{.*#+}} xmm0 = xmm0[0,0,2,2] sched: [1:1.00]
-; HASWELL-NEXT: vmovsldup {{.*#+}} xmm1 = mem[0,0,2,2] sched: [4:0.50]
+; HASWELL-NEXT: vmovsldup {{.*#+}} xmm1 = mem[0,0,2,2] sched: [1:0.50]
; HASWELL-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_movsldup:
; BTVER2: # BB#0:
@@ -603,8 +603,8 @@ define void @test_mwait(i32 %a0, i32 %a1) {
; HASWELL: # BB#0:
; HASWELL-NEXT: movl %edi, %ecx # sched: [1:0.25]
; HASWELL-NEXT: movl %esi, %eax # sched: [1:0.25]
-; HASWELL-NEXT: mwait # sched: [100:0.25]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: mwait # sched: [20:2.50]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_mwait:
; BTVER2: # BB#0:
diff --git a/llvm/test/CodeGen/X86/sse41-schedule.ll b/llvm/test/CodeGen/X86/sse41-schedule.ll
index 31bf4dd8567..117fae1937f 100644
--- a/llvm/test/CodeGen/X86/sse41-schedule.ll
+++ b/llvm/test/CodeGen/X86/sse41-schedule.ll
@@ -34,8 +34,8 @@ define <2 x double> @test_blendpd(<2 x double> %a0, <2 x double> %a1, <2 x doubl
; HASWELL: # BB#0:
; HASWELL-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1] sched: [1:0.33]
; HASWELL-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],mem[1] sched: [5:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],mem[1] sched: [1:0.50]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_blendpd:
; BTVER2: # BB#0:
@@ -79,8 +79,8 @@ define <4 x float> @test_blendps(<4 x float> %a0, <4 x float> %a1, <4 x float> *
; HASWELL-LABEL: test_blendps:
; HASWELL: # BB#0:
; HASWELL-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3] sched: [1:0.33]
-; HASWELL-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],mem[1],xmm0[2,3] sched: [5:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],mem[1],xmm0[2,3] sched: [1:0.50]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_blendps:
; BTVER2: # BB#0:
@@ -127,8 +127,8 @@ define <2 x double> @test_blendvpd(<2 x double> %a0, <2 x double> %a1, <2 x doub
; HASWELL-LABEL: test_blendvpd:
; HASWELL: # BB#0:
; HASWELL-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0 # sched: [2:2.00]
-; HASWELL-NEXT: vblendvpd %xmm2, (%rdi), %xmm0, %xmm0 # sched: [6:2.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vblendvpd %xmm2, (%rdi), %xmm0, %xmm0 # sched: [2:2.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_blendvpd:
; BTVER2: # BB#0:
@@ -176,8 +176,8 @@ define <4 x float> @test_blendvps(<4 x float> %a0, <4 x float> %a1, <4 x float>
; HASWELL-LABEL: test_blendvps:
; HASWELL: # BB#0:
; HASWELL-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0 # sched: [2:2.00]
-; HASWELL-NEXT: vblendvps %xmm2, (%rdi), %xmm0, %xmm0 # sched: [6:2.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vblendvps %xmm2, (%rdi), %xmm0, %xmm0 # sched: [2:2.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_blendvps:
; BTVER2: # BB#0:
@@ -219,8 +219,8 @@ define <2 x double> @test_dppd(<2 x double> %a0, <2 x double> %a1, <2 x double>
; HASWELL-LABEL: test_dppd:
; HASWELL: # BB#0:
; HASWELL-NEXT: vdppd $7, %xmm1, %xmm0, %xmm0 # sched: [9:1.00]
-; HASWELL-NEXT: vdppd $7, (%rdi), %xmm0, %xmm0 # sched: [13:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vdppd $7, (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_dppd:
; BTVER2: # BB#0:
@@ -262,8 +262,8 @@ define <4 x float> @test_dpps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a2
; HASWELL-LABEL: test_dpps:
; HASWELL: # BB#0:
; HASWELL-NEXT: vdpps $7, %xmm1, %xmm0, %xmm0 # sched: [14:2.00]
-; HASWELL-NEXT: vdpps $7, (%rdi), %xmm0, %xmm0 # sched: [18:2.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vdpps $7, (%rdi), %xmm0, %xmm0 # sched: [14:2.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_dpps:
; BTVER2: # BB#0:
@@ -305,8 +305,8 @@ define <4 x float> @test_insertps(<4 x float> %a0, <4 x float> %a1, float *%a2)
; HASWELL-LABEL: test_insertps:
; HASWELL: # BB#0:
; HASWELL-NEXT: vinsertps {{.*#+}} xmm0 = zero,xmm1[0],xmm0[2,3] sched: [1:1.00]
-; HASWELL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0] sched: [5:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0] sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_insertps:
; BTVER2: # BB#0:
@@ -344,8 +344,8 @@ define <2 x i64> @test_movntdqa(i8* %a0) {
;
; HASWELL-LABEL: test_movntdqa:
; HASWELL: # BB#0:
-; HASWELL-NEXT: vmovntdqa (%rdi), %xmm0 # sched: [4:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vmovntdqa (%rdi), %xmm0 # sched: [1:0.50]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_movntdqa:
; BTVER2: # BB#0:
@@ -382,9 +382,9 @@ define <8 x i16> @test_mpsadbw(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
;
; HASWELL-LABEL: test_mpsadbw:
; HASWELL: # BB#0:
-; HASWELL-NEXT: vmpsadbw $7, %xmm1, %xmm0, %xmm0 # sched: [6:2.00]
-; HASWELL-NEXT: vmpsadbw $7, (%rdi), %xmm0, %xmm0 # sched: [6:2.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vmpsadbw $7, %xmm1, %xmm0, %xmm0 # sched: [7:2.00]
+; HASWELL-NEXT: vmpsadbw $7, (%rdi), %xmm0, %xmm0 # sched: [7:2.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_mpsadbw:
; BTVER2: # BB#0:
@@ -427,8 +427,8 @@ define <8 x i16> @test_packusdw(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
; HASWELL-LABEL: test_packusdw:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
-; HASWELL-NEXT: vpackusdw (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpackusdw (%rdi), %xmm0, %xmm0 # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_packusdw:
; BTVER2: # BB#0:
@@ -477,8 +477,8 @@ define <16 x i8> @test_pblendvb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> %a2, <16
; HASWELL-LABEL: test_pblendvb:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 # sched: [2:2.00]
-; HASWELL-NEXT: vpblendvb %xmm2, (%rdi), %xmm0, %xmm0 # sched: [6:2.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpblendvb %xmm2, (%rdi), %xmm0, %xmm0 # sched: [2:2.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_pblendvb:
; BTVER2: # BB#0:
@@ -521,7 +521,7 @@ define <8 x i16> @test_pblendw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
; HASWELL: # BB#0:
; HASWELL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] sched: [1:1.00]
; HASWELL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],mem[2,3],xmm0[4,5,6],mem[7] sched: [4:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_pblendw:
; BTVER2: # BB#0:
@@ -562,8 +562,8 @@ define <2 x i64> @test_pcmpeqq(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) {
; HASWELL-LABEL: test_pcmpeqq:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: vpcmpeqq (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpcmpeqq (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_pcmpeqq:
; BTVER2: # BB#0:
@@ -605,9 +605,9 @@ define i32 @test_pextrb(<16 x i8> %a0, i8 *%a1) {
;
; HASWELL-LABEL: test_pextrb:
; HASWELL: # BB#0:
-; HASWELL-NEXT: vpextrb $3, %xmm0, %eax # sched: [1:1.00]
-; HASWELL-NEXT: vpextrb $1, %xmm0, (%rdi) # sched: [5:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpextrb $3, %xmm0, %eax # sched: [2:1.00]
+; HASWELL-NEXT: vpextrb $1, %xmm0, (%rdi) # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_pextrb:
; BTVER2: # BB#0:
@@ -648,9 +648,9 @@ define i32 @test_pextrd(<4 x i32> %a0, i32 *%a1) {
;
; HASWELL-LABEL: test_pextrd:
; HASWELL: # BB#0:
-; HASWELL-NEXT: vpextrd $3, %xmm0, %eax # sched: [1:1.00]
-; HASWELL-NEXT: vpextrd $1, %xmm0, (%rdi) # sched: [5:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpextrd $3, %xmm0, %eax # sched: [2:1.00]
+; HASWELL-NEXT: vpextrd $1, %xmm0, (%rdi) # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_pextrd:
; BTVER2: # BB#0:
@@ -690,9 +690,9 @@ define i64 @test_pextrq(<2 x i64> %a0, <2 x i64> %a1, i64 *%a2) {
;
; HASWELL-LABEL: test_pextrq:
; HASWELL: # BB#0:
-; HASWELL-NEXT: vpextrq $1, %xmm0, %rax # sched: [1:1.00]
-; HASWELL-NEXT: vpextrq $1, %xmm0, (%rdi) # sched: [5:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpextrq $1, %xmm0, %rax # sched: [2:1.00]
+; HASWELL-NEXT: vpextrq $1, %xmm0, (%rdi) # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_pextrq:
; BTVER2: # BB#0:
@@ -732,9 +732,9 @@ define i32 @test_pextrw(<8 x i16> %a0, i16 *%a1) {
;
; HASWELL-LABEL: test_pextrw:
; HASWELL: # BB#0:
-; HASWELL-NEXT: vpextrw $3, %xmm0, %eax # sched: [1:1.00]
-; HASWELL-NEXT: vpextrw $1, %xmm0, (%rdi) # sched: [5:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpextrw $3, %xmm0, %eax # sched: [2:1.00]
+; HASWELL-NEXT: vpextrw $1, %xmm0, (%rdi) # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_pextrw:
; BTVER2: # BB#0:
@@ -775,9 +775,9 @@ define <8 x i16> @test_phminposuw(<8 x i16> *%a0) {
;
; HASWELL-LABEL: test_phminposuw:
; HASWELL: # BB#0:
-; HASWELL-NEXT: vphminposuw (%rdi), %xmm0 # sched: [9:1.00]
+; HASWELL-NEXT: vphminposuw (%rdi), %xmm0 # sched: [5:1.00]
; HASWELL-NEXT: vphminposuw %xmm0, %xmm0 # sched: [5:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_phminposuw:
; BTVER2: # BB#0:
@@ -818,9 +818,9 @@ define <16 x i8> @test_pinsrb(<16 x i8> %a0, i8 %a1, i8 *%a2) {
;
; HASWELL-LABEL: test_pinsrb:
; HASWELL: # BB#0:
-; HASWELL-NEXT: vpinsrb $1, %edi, %xmm0, %xmm0 # sched: [1:1.00]
-; HASWELL-NEXT: vpinsrb $3, (%rsi), %xmm0, %xmm0 # sched: [5:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpinsrb $1, %edi, %xmm0, %xmm0 # sched: [2:2.00]
+; HASWELL-NEXT: vpinsrb $3, (%rsi), %xmm0, %xmm0 # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_pinsrb:
; BTVER2: # BB#0:
@@ -860,9 +860,9 @@ define <4 x i32> @test_pinsrd(<4 x i32> %a0, i32 %a1, i32 *%a2) {
;
; HASWELL-LABEL: test_pinsrd:
; HASWELL: # BB#0:
-; HASWELL-NEXT: vpinsrd $1, %edi, %xmm0, %xmm0 # sched: [1:1.00]
-; HASWELL-NEXT: vpinsrd $3, (%rsi), %xmm0, %xmm0 # sched: [5:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpinsrd $1, %edi, %xmm0, %xmm0 # sched: [2:2.00]
+; HASWELL-NEXT: vpinsrd $3, (%rsi), %xmm0, %xmm0 # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_pinsrd:
; BTVER2: # BB#0:
@@ -905,10 +905,10 @@ define <2 x i64> @test_pinsrq(<2 x i64> %a0, <2 x i64> %a1, i64 %a2, i64 *%a3) {
;
; HASWELL-LABEL: test_pinsrq:
; HASWELL: # BB#0:
-; HASWELL-NEXT: vpinsrq $1, %rdi, %xmm0, %xmm0 # sched: [1:1.00]
-; HASWELL-NEXT: vpinsrq $1, (%rsi), %xmm1, %xmm1 # sched: [5:1.00]
+; HASWELL-NEXT: vpinsrq $1, %rdi, %xmm0, %xmm0 # sched: [2:2.00]
+; HASWELL-NEXT: vpinsrq $1, (%rsi), %xmm1, %xmm1 # sched: [1:1.00]
; HASWELL-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_pinsrq:
; BTVER2: # BB#0:
@@ -952,8 +952,8 @@ define <16 x i8> @test_pmaxsb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
; HASWELL-LABEL: test_pmaxsb:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: vpmaxsb (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpmaxsb (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_pmaxsb:
; BTVER2: # BB#0:
@@ -995,8 +995,8 @@ define <4 x i32> @test_pmaxsd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
; HASWELL-LABEL: test_pmaxsd:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: vpmaxsd (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpmaxsd (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_pmaxsd:
; BTVER2: # BB#0:
@@ -1038,8 +1038,8 @@ define <4 x i32> @test_pmaxud(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
; HASWELL-LABEL: test_pmaxud:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: vpmaxud (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpmaxud (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_pmaxud:
; BTVER2: # BB#0:
@@ -1081,8 +1081,8 @@ define <8 x i16> @test_pmaxuw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
; HASWELL-LABEL: test_pmaxuw:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: vpmaxuw (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpmaxuw (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_pmaxuw:
; BTVER2: # BB#0:
@@ -1124,8 +1124,8 @@ define <16 x i8> @test_pminsb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
; HASWELL-LABEL: test_pminsb:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpminsb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: vpminsb (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpminsb (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_pminsb:
; BTVER2: # BB#0:
@@ -1167,8 +1167,8 @@ define <4 x i32> @test_pminsd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
; HASWELL-LABEL: test_pminsd:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpminsd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: vpminsd (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpminsd (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_pminsd:
; BTVER2: # BB#0:
@@ -1210,8 +1210,8 @@ define <4 x i32> @test_pminud(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
; HASWELL-LABEL: test_pminud:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpminud %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: vpminud (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpminud (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_pminud:
; BTVER2: # BB#0:
@@ -1253,8 +1253,8 @@ define <8 x i16> @test_pminuw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
; HASWELL-LABEL: test_pminuw:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpminuw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: vpminuw (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpminuw (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_pminuw:
; BTVER2: # BB#0:
@@ -1300,9 +1300,9 @@ define <8 x i16> @test_pmovsxbw(<16 x i8> %a0, <8 x i8> *%a1) {
; HASWELL-LABEL: test_pmovsxbw:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpmovsxbw %xmm0, %xmm0 # sched: [1:1.00]
-; HASWELL-NEXT: vpmovsxbw (%rdi), %xmm1 # sched: [5:1.00]
+; HASWELL-NEXT: vpmovsxbw (%rdi), %xmm1 # sched: [1:1.00]
; HASWELL-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_pmovsxbw:
; BTVER2: # BB#0:
@@ -1351,9 +1351,9 @@ define <4 x i32> @test_pmovsxbd(<16 x i8> %a0, <4 x i8> *%a1) {
; HASWELL-LABEL: test_pmovsxbd:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpmovsxbd %xmm0, %xmm0 # sched: [1:1.00]
-; HASWELL-NEXT: vpmovsxbd (%rdi), %xmm1 # sched: [5:1.00]
+; HASWELL-NEXT: vpmovsxbd (%rdi), %xmm1 # sched: [1:1.00]
; HASWELL-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_pmovsxbd:
; BTVER2: # BB#0:
@@ -1402,9 +1402,9 @@ define <2 x i64> @test_pmovsxbq(<16 x i8> %a0, <2 x i8> *%a1) {
; HASWELL-LABEL: test_pmovsxbq:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpmovsxbq %xmm0, %xmm0 # sched: [1:1.00]
-; HASWELL-NEXT: vpmovsxbq (%rdi), %xmm1 # sched: [5:1.00]
+; HASWELL-NEXT: vpmovsxbq (%rdi), %xmm1 # sched: [1:1.00]
; HASWELL-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_pmovsxbq:
; BTVER2: # BB#0:
@@ -1453,9 +1453,9 @@ define <2 x i64> @test_pmovsxdq(<4 x i32> %a0, <2 x i32> *%a1) {
; HASWELL-LABEL: test_pmovsxdq:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpmovsxdq %xmm0, %xmm0 # sched: [1:1.00]
-; HASWELL-NEXT: vpmovsxdq (%rdi), %xmm1 # sched: [5:1.00]
+; HASWELL-NEXT: vpmovsxdq (%rdi), %xmm1 # sched: [1:1.00]
; HASWELL-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_pmovsxdq:
; BTVER2: # BB#0:
@@ -1504,9 +1504,9 @@ define <4 x i32> @test_pmovsxwd(<8 x i16> %a0, <4 x i16> *%a1) {
; HASWELL-LABEL: test_pmovsxwd:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpmovsxwd %xmm0, %xmm0 # sched: [1:1.00]
-; HASWELL-NEXT: vpmovsxwd (%rdi), %xmm1 # sched: [5:1.00]
+; HASWELL-NEXT: vpmovsxwd (%rdi), %xmm1 # sched: [1:1.00]
; HASWELL-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_pmovsxwd:
; BTVER2: # BB#0:
@@ -1555,9 +1555,9 @@ define <2 x i64> @test_pmovsxwq(<8 x i16> %a0, <2 x i16> *%a1) {
; HASWELL-LABEL: test_pmovsxwq:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpmovsxwq %xmm0, %xmm0 # sched: [1:1.00]
-; HASWELL-NEXT: vpmovsxwq (%rdi), %xmm1 # sched: [5:1.00]
+; HASWELL-NEXT: vpmovsxwq (%rdi), %xmm1 # sched: [1:1.00]
; HASWELL-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_pmovsxwq:
; BTVER2: # BB#0:
@@ -1606,9 +1606,9 @@ define <8 x i16> @test_pmovzxbw(<16 x i8> %a0, <8 x i8> *%a1) {
; HASWELL-LABEL: test_pmovzxbw:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero sched: [1:1.00]
-; HASWELL-NEXT: vpmovzxbw {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero sched: [5:1.00]
+; HASWELL-NEXT: vpmovzxbw {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero sched: [1:1.00]
; HASWELL-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_pmovzxbw:
; BTVER2: # BB#0:
@@ -1657,9 +1657,9 @@ define <4 x i32> @test_pmovzxbd(<16 x i8> %a0, <4 x i8> *%a1) {
; HASWELL-LABEL: test_pmovzxbd:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero sched: [1:1.00]
-; HASWELL-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero sched: [5:1.00]
+; HASWELL-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero sched: [1:1.00]
; HASWELL-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_pmovzxbd:
; BTVER2: # BB#0:
@@ -1708,9 +1708,9 @@ define <2 x i64> @test_pmovzxbq(<16 x i8> %a0, <2 x i8> *%a1) {
; HASWELL-LABEL: test_pmovzxbq:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero sched: [1:1.00]
-; HASWELL-NEXT: vpmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero sched: [5:1.00]
+; HASWELL-NEXT: vpmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero sched: [1:1.00]
; HASWELL-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_pmovzxbq:
; BTVER2: # BB#0:
@@ -1759,9 +1759,9 @@ define <2 x i64> @test_pmovzxdq(<4 x i32> %a0, <2 x i32> *%a1) {
; HASWELL-LABEL: test_pmovzxdq:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero sched: [1:1.00]
-; HASWELL-NEXT: vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero sched: [5:1.00]
+; HASWELL-NEXT: vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero sched: [1:1.00]
; HASWELL-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_pmovzxdq:
; BTVER2: # BB#0:
@@ -1810,9 +1810,9 @@ define <4 x i32> @test_pmovzxwd(<8 x i16> %a0, <4 x i16> *%a1) {
; HASWELL-LABEL: test_pmovzxwd:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero sched: [1:1.00]
-; HASWELL-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero sched: [5:1.00]
+; HASWELL-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero sched: [1:1.00]
; HASWELL-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_pmovzxwd:
; BTVER2: # BB#0:
@@ -1861,9 +1861,9 @@ define <2 x i64> @test_pmovzxwq(<8 x i16> %a0, <2 x i16> *%a1) {
; HASWELL-LABEL: test_pmovzxwq:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero sched: [1:1.00]
-; HASWELL-NEXT: vpmovzxwq {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero sched: [5:1.00]
+; HASWELL-NEXT: vpmovzxwq {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero sched: [1:1.00]
; HASWELL-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_pmovzxwq:
; BTVER2: # BB#0:
@@ -1908,8 +1908,8 @@ define <2 x i64> @test_pmuldq(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
; HASWELL-LABEL: test_pmuldq:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpmuldq %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
-; HASWELL-NEXT: vpmuldq (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpmuldq (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_pmuldq:
; BTVER2: # BB#0:
@@ -1953,7 +1953,7 @@ define <4 x i32> @test_pmulld(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
; HASWELL: # BB#0:
; HASWELL-NEXT: vpmulld %xmm1, %xmm0, %xmm0 # sched: [10:2.00]
; HASWELL-NEXT: vpmulld (%rdi), %xmm0, %xmm0 # sched: [10:2.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_pmulld:
; BTVER2: # BB#0:
@@ -2011,7 +2011,7 @@ define i32 @test_ptest(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) {
; HASWELL-NEXT: setb %cl # sched: [1:0.50]
; HASWELL-NEXT: andb %al, %cl # sched: [1:0.25]
; HASWELL-NEXT: movzbl %cl, %eax # sched: [1:0.25]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_ptest:
; BTVER2: # BB#0:
@@ -2065,10 +2065,10 @@ define <2 x double> @test_roundpd(<2 x double> %a0, <2 x double> *%a1) {
;
; HASWELL-LABEL: test_roundpd:
; HASWELL: # BB#0:
-; HASWELL-NEXT: vroundpd $7, %xmm0, %xmm0 # sched: [6:2.00]
-; HASWELL-NEXT: vroundpd $7, (%rdi), %xmm1 # sched: [10:2.00]
+; HASWELL-NEXT: vroundpd $7, %xmm0, %xmm0 # sched: [5:1.25]
+; HASWELL-NEXT: vroundpd $7, (%rdi), %xmm1 # sched: [6:2.00]
; HASWELL-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_roundpd:
; BTVER2: # BB#0:
@@ -2116,10 +2116,10 @@ define <4 x float> @test_roundps(<4 x float> %a0, <4 x float> *%a1) {
;
; HASWELL-LABEL: test_roundps:
; HASWELL: # BB#0:
-; HASWELL-NEXT: vroundps $7, %xmm0, %xmm0 # sched: [6:2.00]
-; HASWELL-NEXT: vroundps $7, (%rdi), %xmm1 # sched: [10:2.00]
+; HASWELL-NEXT: vroundps $7, %xmm0, %xmm0 # sched: [5:1.25]
+; HASWELL-NEXT: vroundps $7, (%rdi), %xmm1 # sched: [6:2.00]
; HASWELL-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_roundps:
; BTVER2: # BB#0:
@@ -2168,10 +2168,10 @@ define <2 x double> @test_roundsd(<2 x double> %a0, <2 x double> %a1, <2 x doubl
;
; HASWELL-LABEL: test_roundsd:
; HASWELL: # BB#0:
-; HASWELL-NEXT: vroundsd $7, %xmm1, %xmm0, %xmm1 # sched: [6:2.00]
-; HASWELL-NEXT: vroundsd $7, (%rdi), %xmm0, %xmm0 # sched: [10:2.00]
+; HASWELL-NEXT: vroundsd $7, %xmm1, %xmm0, %xmm1 # sched: [5:1.25]
+; HASWELL-NEXT: vroundsd $7, (%rdi), %xmm0, %xmm0 # sched: [6:2.00]
; HASWELL-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_roundsd:
; BTVER2: # BB#0:
@@ -2220,10 +2220,10 @@ define <4 x float> @test_roundss(<4 x float> %a0, <4 x float> %a1, <4 x float> *
;
; HASWELL-LABEL: test_roundss:
; HASWELL: # BB#0:
-; HASWELL-NEXT: vroundss $7, %xmm1, %xmm0, %xmm1 # sched: [6:2.00]
-; HASWELL-NEXT: vroundss $7, (%rdi), %xmm0, %xmm0 # sched: [10:2.00]
+; HASWELL-NEXT: vroundss $7, %xmm1, %xmm0, %xmm1 # sched: [5:1.25]
+; HASWELL-NEXT: vroundss $7, (%rdi), %xmm0, %xmm0 # sched: [6:2.00]
; HASWELL-NEXT: vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_roundss:
; BTVER2: # BB#0:
diff --git a/llvm/test/CodeGen/X86/sse42-schedule.ll b/llvm/test/CodeGen/X86/sse42-schedule.ll
index d174ac3e534..d25be4b2abd 100644
--- a/llvm/test/CodeGen/X86/sse42-schedule.ll
+++ b/llvm/test/CodeGen/X86/sse42-schedule.ll
@@ -35,7 +35,7 @@ define i32 @crc32_32_8(i32 %a0, i8 %a1, i8 *%a2) {
; HASWELL-NEXT: crc32b %sil, %edi # sched: [3:1.00]
; HASWELL-NEXT: crc32b (%rdx), %edi # sched: [7:1.00]
; HASWELL-NEXT: movl %edi, %eax # sched: [1:0.25]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: crc32_32_8:
; BTVER2: # BB#0:
@@ -84,7 +84,7 @@ define i32 @crc32_32_16(i32 %a0, i16 %a1, i16 *%a2) {
; HASWELL-NEXT: crc32w %si, %edi # sched: [3:1.00]
; HASWELL-NEXT: crc32w (%rdx), %edi # sched: [7:1.00]
; HASWELL-NEXT: movl %edi, %eax # sched: [1:0.25]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: crc32_32_16:
; BTVER2: # BB#0:
@@ -133,7 +133,7 @@ define i32 @crc32_32_32(i32 %a0, i32 %a1, i32 *%a2) {
; HASWELL-NEXT: crc32l %esi, %edi # sched: [3:1.00]
; HASWELL-NEXT: crc32l (%rdx), %edi # sched: [7:1.00]
; HASWELL-NEXT: movl %edi, %eax # sched: [1:0.25]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: crc32_32_32:
; BTVER2: # BB#0:
@@ -182,7 +182,7 @@ define i64 @crc32_64_8(i64 %a0, i8 %a1, i8 *%a2) nounwind {
; HASWELL-NEXT: crc32b %sil, %edi # sched: [3:1.00]
; HASWELL-NEXT: crc32b (%rdx), %edi # sched: [7:1.00]
; HASWELL-NEXT: movq %rdi, %rax # sched: [1:0.25]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: crc32_64_8:
; BTVER2: # BB#0:
@@ -231,7 +231,7 @@ define i64 @crc32_64_64(i64 %a0, i64 %a1, i64 *%a2) {
; HASWELL-NEXT: crc32q %rsi, %rdi # sched: [3:1.00]
; HASWELL-NEXT: crc32q (%rdx), %rdi # sched: [7:1.00]
; HASWELL-NEXT: movq %rdi, %rax # sched: [1:0.25]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: crc32_64_64:
; BTVER2: # BB#0:
@@ -297,14 +297,14 @@ define i32 @test_pcmpestri(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
; HASWELL: # BB#0:
; HASWELL-NEXT: movl $7, %eax # sched: [1:0.25]
; HASWELL-NEXT: movl $7, %edx # sched: [1:0.25]
-; HASWELL-NEXT: vpcmpestri $7, %xmm1, %xmm0 # sched: [11:3.00]
+; HASWELL-NEXT: vpcmpestri $7, %xmm1, %xmm0 # sched: [18:4.00]
; HASWELL-NEXT: movl %ecx, %esi # sched: [1:0.25]
; HASWELL-NEXT: movl $7, %eax # sched: [1:0.25]
; HASWELL-NEXT: movl $7, %edx # sched: [1:0.25]
-; HASWELL-NEXT: vpcmpestri $7, (%rdi), %xmm0 # sched: [11:3.00]
+; HASWELL-NEXT: vpcmpestri $7, (%rdi), %xmm0 # sched: [18:4.00]
; HASWELL-NEXT: # kill: %ECX<def> %ECX<kill> %RCX<def>
; HASWELL-NEXT: leal (%rcx,%rsi), %eax # sched: [1:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_pcmpestri:
; BTVER2: # BB#0:
@@ -374,11 +374,11 @@ define <16 x i8> @test_pcmpestrm(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
; HASWELL: # BB#0:
; HASWELL-NEXT: movl $7, %eax # sched: [1:0.25]
; HASWELL-NEXT: movl $7, %edx # sched: [1:0.25]
-; HASWELL-NEXT: vpcmpestrm $7, %xmm1, %xmm0 # sched: [10:4.00]
+; HASWELL-NEXT: vpcmpestrm $7, %xmm1, %xmm0 # sched: [19:4.00]
; HASWELL-NEXT: movl $7, %eax # sched: [1:0.25]
; HASWELL-NEXT: movl $7, %edx # sched: [1:0.25]
-; HASWELL-NEXT: vpcmpestrm $7, (%rdi), %xmm0 # sched: [10:3.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpcmpestrm $7, (%rdi), %xmm0 # sched: [19:4.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_pcmpestrm:
; BTVER2: # BB#0:
@@ -441,7 +441,7 @@ define i32 @test_pcmpistri(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
; HASWELL-NEXT: vpcmpistri $7, (%rdi), %xmm0 # sched: [11:3.00]
; HASWELL-NEXT: # kill: %ECX<def> %ECX<kill> %RCX<def>
; HASWELL-NEXT: leal (%rcx,%rax), %eax # sched: [1:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_pcmpistri:
; BTVER2: # BB#0:
@@ -489,9 +489,9 @@ define <16 x i8> @test_pcmpistrm(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
;
; HASWELL-LABEL: test_pcmpistrm:
; HASWELL: # BB#0:
-; HASWELL-NEXT: vpcmpistrm $7, %xmm1, %xmm0 # sched: [10:3.00]
-; HASWELL-NEXT: vpcmpistrm $7, (%rdi), %xmm0 # sched: [10:3.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpcmpistrm $7, %xmm1, %xmm0 # sched: [11:3.00]
+; HASWELL-NEXT: vpcmpistrm $7, (%rdi), %xmm0 # sched: [11:3.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_pcmpistrm:
; BTVER2: # BB#0:
@@ -534,7 +534,7 @@ define <2 x i64> @test_pcmpgtq(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) {
; HASWELL: # BB#0:
; HASWELL-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
; HASWELL-NEXT: vpcmpgtq (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_pcmpgtq:
; BTVER2: # BB#0:
@@ -576,9 +576,9 @@ define <2 x i64> @test_pclmulqdq(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) {
;
; HASWELL-LABEL: test_pclmulqdq:
; HASWELL: # BB#0:
-; HASWELL-NEXT: vpclmulqdq $0, %xmm1, %xmm0, %xmm0 # sched: [7:2.00]
-; HASWELL-NEXT: vpclmulqdq $0, (%rdi), %xmm0, %xmm0 # sched: [7:2.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpclmulqdq $0, %xmm1, %xmm0, %xmm0 # sched: [11:2.00]
+; HASWELL-NEXT: vpclmulqdq $0, (%rdi), %xmm0, %xmm0 # sched: [11:2.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_pclmulqdq:
; BTVER2: # BB#0:
diff --git a/llvm/test/CodeGen/X86/ssse3-schedule.ll b/llvm/test/CodeGen/X86/ssse3-schedule.ll
index 86ce4c7b2d9..acbc2cf9917 100644
--- a/llvm/test/CodeGen/X86/ssse3-schedule.ll
+++ b/llvm/test/CodeGen/X86/ssse3-schedule.ll
@@ -42,9 +42,9 @@ define <16 x i8> @test_pabsb(<16 x i8> %a0, <16 x i8> *%a1) {
; HASWELL-LABEL: test_pabsb:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpabsb %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: vpabsb (%rdi), %xmm1 # sched: [5:0.50]
+; HASWELL-NEXT: vpabsb (%rdi), %xmm1 # sched: [1:0.50]
; HASWELL-NEXT: vpor %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_pabsb:
; BTVER2: # BB#0:
@@ -100,9 +100,9 @@ define <4 x i32> @test_pabsd(<4 x i32> %a0, <4 x i32> *%a1) {
; HASWELL-LABEL: test_pabsd:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpabsd %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: vpabsd (%rdi), %xmm1 # sched: [5:0.50]
+; HASWELL-NEXT: vpabsd (%rdi), %xmm1 # sched: [1:0.50]
; HASWELL-NEXT: vpor %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_pabsd:
; BTVER2: # BB#0:
@@ -158,9 +158,9 @@ define <8 x i16> @test_pabsw(<8 x i16> %a0, <8 x i16> *%a1) {
; HASWELL-LABEL: test_pabsw:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpabsw %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: vpabsw (%rdi), %xmm1 # sched: [5:0.50]
+; HASWELL-NEXT: vpabsw (%rdi), %xmm1 # sched: [1:0.50]
; HASWELL-NEXT: vpor %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_pabsw:
; BTVER2: # BB#0:
@@ -216,8 +216,8 @@ define <8 x i16> @test_palignr(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
; HASWELL-LABEL: test_palignr:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5] sched: [1:1.00]
-; HASWELL-NEXT: vpalignr {{.*#+}} xmm0 = mem[14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] sched: [5:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpalignr {{.*#+}} xmm0 = mem[14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_palignr:
; BTVER2: # BB#0:
@@ -264,8 +264,8 @@ define <4 x i32> @test_phaddd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
; HASWELL-LABEL: test_phaddd:
; HASWELL: # BB#0:
; HASWELL-NEXT: vphaddd %xmm1, %xmm0, %xmm0 # sched: [3:2.00]
-; HASWELL-NEXT: vphaddd (%rdi), %xmm0, %xmm0 # sched: [6:2.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vphaddd (%rdi), %xmm0, %xmm0 # sched: [3:2.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_phaddd:
; BTVER2: # BB#0:
@@ -313,8 +313,8 @@ define <8 x i16> @test_phaddsw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
; HASWELL-LABEL: test_phaddsw:
; HASWELL: # BB#0:
; HASWELL-NEXT: vphaddsw %xmm1, %xmm0, %xmm0 # sched: [3:2.00]
-; HASWELL-NEXT: vphaddsw (%rdi), %xmm0, %xmm0 # sched: [6:2.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vphaddsw (%rdi), %xmm0, %xmm0 # sched: [3:2.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_phaddsw:
; BTVER2: # BB#0:
@@ -362,8 +362,8 @@ define <8 x i16> @test_phaddw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
; HASWELL-LABEL: test_phaddw:
; HASWELL: # BB#0:
; HASWELL-NEXT: vphaddw %xmm1, %xmm0, %xmm0 # sched: [3:2.00]
-; HASWELL-NEXT: vphaddw (%rdi), %xmm0, %xmm0 # sched: [6:2.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vphaddw (%rdi), %xmm0, %xmm0 # sched: [3:2.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_phaddw:
; BTVER2: # BB#0:
@@ -411,8 +411,8 @@ define <4 x i32> @test_phsubd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
; HASWELL-LABEL: test_phsubd:
; HASWELL: # BB#0:
; HASWELL-NEXT: vphsubd %xmm1, %xmm0, %xmm0 # sched: [3:2.00]
-; HASWELL-NEXT: vphsubd (%rdi), %xmm0, %xmm0 # sched: [6:2.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vphsubd (%rdi), %xmm0, %xmm0 # sched: [3:2.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_phsubd:
; BTVER2: # BB#0:
@@ -460,8 +460,8 @@ define <8 x i16> @test_phsubsw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
; HASWELL-LABEL: test_phsubsw:
; HASWELL: # BB#0:
; HASWELL-NEXT: vphsubsw %xmm1, %xmm0, %xmm0 # sched: [3:2.00]
-; HASWELL-NEXT: vphsubsw (%rdi), %xmm0, %xmm0 # sched: [6:2.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vphsubsw (%rdi), %xmm0, %xmm0 # sched: [3:2.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_phsubsw:
; BTVER2: # BB#0:
@@ -509,8 +509,8 @@ define <8 x i16> @test_phsubw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
; HASWELL-LABEL: test_phsubw:
; HASWELL: # BB#0:
; HASWELL-NEXT: vphsubw %xmm1, %xmm0, %xmm0 # sched: [3:2.00]
-; HASWELL-NEXT: vphsubw (%rdi), %xmm0, %xmm0 # sched: [6:2.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vphsubw (%rdi), %xmm0, %xmm0 # sched: [3:2.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_phsubw:
; BTVER2: # BB#0:
@@ -558,8 +558,8 @@ define <8 x i16> @test_pmaddubsw(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
; HASWELL-LABEL: test_pmaddubsw:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpmaddubsw %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
-; HASWELL-NEXT: vpmaddubsw (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpmaddubsw (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_pmaddubsw:
; BTVER2: # BB#0:
@@ -608,8 +608,8 @@ define <8 x i16> @test_pmulhrsw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
; HASWELL-LABEL: test_pmulhrsw:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpmulhrsw %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
-; HASWELL-NEXT: vpmulhrsw (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpmulhrsw (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_pmulhrsw:
; BTVER2: # BB#0:
@@ -657,8 +657,8 @@ define <16 x i8> @test_pshufb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
; HASWELL-LABEL: test_pshufb:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpshufb %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
-; HASWELL-NEXT: vpshufb (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpshufb (%rdi), %xmm0, %xmm0 # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_pshufb:
; BTVER2: # BB#0:
@@ -710,8 +710,8 @@ define <16 x i8> @test_psignb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
; HASWELL-LABEL: test_psignb:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpsignb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: vpsignb (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpsignb (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_psignb:
; BTVER2: # BB#0:
@@ -763,8 +763,8 @@ define <4 x i32> @test_psignd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
; HASWELL-LABEL: test_psignd:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpsignd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: vpsignd (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpsignd (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_psignd:
; BTVER2: # BB#0:
@@ -816,8 +816,8 @@ define <8 x i16> @test_psignw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
; HASWELL-LABEL: test_psignw:
; HASWELL: # BB#0:
; HASWELL-NEXT: vpsignw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: vpsignw (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpsignw (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
+; HASWELL-NEXT: retq # sched: [2:1.00]
;
; BTVER2-LABEL: test_psignw:
; BTVER2: # BB#0:
diff --git a/llvm/test/CodeGen/X86/vector-shift-ashr-512.ll b/llvm/test/CodeGen/X86/vector-shift-ashr-512.ll
index b69715cba3d..edbf7dfd131 100644
--- a/llvm/test/CodeGen/X86/vector-shift-ashr-512.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-ashr-512.ll
@@ -201,14 +201,14 @@ define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
; AVX512DQ-NEXT: vpsraw $2, %ymm0, %ymm5
; AVX512DQ-NEXT: vpaddw %ymm2, %ymm2, %ymm8
; AVX512DQ-NEXT: vpblendvb %ymm8, %ymm5, %ymm0, %ymm0
+; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
+; AVX512DQ-NEXT: vpsraw $4, %ymm5, %ymm9
+; AVX512DQ-NEXT: vpblendvb %ymm3, %ymm9, %ymm5, %ymm3
; AVX512DQ-NEXT: vpsraw $1, %ymm0, %ymm5
; AVX512DQ-NEXT: vpaddw %ymm8, %ymm8, %ymm9
; AVX512DQ-NEXT: vpblendvb %ymm9, %ymm5, %ymm0, %ymm0
; AVX512DQ-NEXT: vpsrlw $8, %ymm0, %ymm0
; AVX512DQ-NEXT: vpackuswb %ymm4, %ymm0, %ymm0
-; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
-; AVX512DQ-NEXT: vpsraw $4, %ymm4, %ymm5
-; AVX512DQ-NEXT: vpblendvb %ymm3, %ymm5, %ymm4, %ymm3
; AVX512DQ-NEXT: vpsraw $2, %ymm3, %ymm4
; AVX512DQ-NEXT: vpblendvb %ymm6, %ymm4, %ymm3, %ymm3
; AVX512DQ-NEXT: vpsraw $1, %ymm3, %ymm4
@@ -328,14 +328,14 @@ define <64 x i8> @constant_shift_v64i8(<64 x i8> %a) nounwind {
; AVX512DQ-NEXT: vpsraw $2, %ymm0, %ymm5
; AVX512DQ-NEXT: vpaddw %ymm2, %ymm2, %ymm8
; AVX512DQ-NEXT: vpblendvb %ymm8, %ymm5, %ymm0, %ymm0
+; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
+; AVX512DQ-NEXT: vpsraw $4, %ymm5, %ymm9
+; AVX512DQ-NEXT: vpblendvb %ymm3, %ymm9, %ymm5, %ymm3
; AVX512DQ-NEXT: vpsraw $1, %ymm0, %ymm5
; AVX512DQ-NEXT: vpaddw %ymm8, %ymm8, %ymm9
; AVX512DQ-NEXT: vpblendvb %ymm9, %ymm5, %ymm0, %ymm0
; AVX512DQ-NEXT: vpsrlw $8, %ymm0, %ymm0
; AVX512DQ-NEXT: vpackuswb %ymm4, %ymm0, %ymm0
-; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
-; AVX512DQ-NEXT: vpsraw $4, %ymm4, %ymm5
-; AVX512DQ-NEXT: vpblendvb %ymm3, %ymm5, %ymm4, %ymm3
; AVX512DQ-NEXT: vpsraw $2, %ymm3, %ymm4
; AVX512DQ-NEXT: vpblendvb %ymm6, %ymm4, %ymm3, %ymm3
; AVX512DQ-NEXT: vpsraw $1, %ymm3, %ymm4
diff --git a/llvm/test/CodeGen/X86/vector-shift-lshr-256.ll b/llvm/test/CodeGen/X86/vector-shift-lshr-256.ll
index 72d1defec9f..9b3511e1ad8 100644
--- a/llvm/test/CodeGen/X86/vector-shift-lshr-256.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-lshr-256.ll
@@ -777,9 +777,9 @@ define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
;
; AVX512DQ-LABEL: splatvar_shift_v32i8:
; AVX512DQ: # BB#0:
-; AVX512DQ-NEXT: vpbroadcastb %xmm1, %ymm1
; AVX512DQ-NEXT: vpsrlw $4, %ymm0, %ymm2
; AVX512DQ-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512DQ-NEXT: vpbroadcastb %xmm1, %ymm1
; AVX512DQ-NEXT: vpsllw $5, %ymm1, %ymm1
; AVX512DQ-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; AVX512DQ-NEXT: vpsrlw $2, %ymm0, %ymm2
diff --git a/llvm/test/CodeGen/X86/vector-shift-shl-256.ll b/llvm/test/CodeGen/X86/vector-shift-shl-256.ll
index 02681b76bb4..b7d284d81f9 100644
--- a/llvm/test/CodeGen/X86/vector-shift-shl-256.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-shl-256.ll
@@ -713,9 +713,9 @@ define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
;
; AVX512DQ-LABEL: splatvar_shift_v32i8:
; AVX512DQ: # BB#0:
-; AVX512DQ-NEXT: vpbroadcastb %xmm1, %ymm1
; AVX512DQ-NEXT: vpsllw $4, %ymm0, %ymm2
; AVX512DQ-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512DQ-NEXT: vpbroadcastb %xmm1, %ymm1
; AVX512DQ-NEXT: vpsllw $5, %ymm1, %ymm1
; AVX512DQ-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; AVX512DQ-NEXT: vpsllw $2, %ymm0, %ymm2
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-512-v32.ll b/llvm/test/CodeGen/X86/vector-shuffle-512-v32.ll
index 5980d8be5ea..a668fd5b713 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-512-v32.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-512-v32.ll
@@ -68,13 +68,13 @@ define <32 x i16> @shuffle_v32i16_0f_1f_0e_16_0d_1d_04_1e_0b_1b_0a_1a_09_19_08_1
; KNL-NEXT: vpshufb {{.*#+}} xmm4 = xmm1[8,9,12,13,12,13,10,11,0,1,4,5,4,5,0,1]
; KNL-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,1,0,3]
; KNL-NEXT: vpshuflw {{.*#+}} xmm2 = xmm1[0,3,2,2,4,5,6,7]
-; KNL-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm1
-; KNL-NEXT: vextracti128 $1, %ymm0, %xmm5
-; KNL-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7]
+; KNL-NEXT: vextracti128 $1, %ymm0, %xmm1
+; KNL-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; KNL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12,13,10,11,8,9,14,15,4,5,2,3,2,3,6,7]
-; KNL-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[6,7,2,3,4,5,6,7,2,3,2,3,0,1,14,15]
-; KNL-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm0
-; KNL-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
+; KNL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,7,2,3,4,5,6,7,2,3,2,3,0,1,14,15]
+; KNL-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm5
+; KNL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; KNL-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm5[1],ymm0[2],ymm5[3],ymm0[4],ymm5[5],ymm0[6],ymm5[7],ymm0[8],ymm5[9],ymm0[10],ymm5[11],ymm0[12],ymm5[13],ymm0[14],ymm5[15]
; KNL-NEXT: vextracti128 $1, %ymm3, %xmm3
; KNL-NEXT: vpbroadcastw %xmm3, %ymm3
; KNL-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0]