author | Elena Demikhovsky <elena.demikhovsky@intel.com> | 2016-07-06 09:11:49 +0000
committer | Elena Demikhovsky <elena.demikhovsky@intel.com> | 2016-07-06 09:11:49 +0000
commit | 02ced295aa8213155517fa1e1c2d33f61d90bf6e (patch)
tree | 6d8e53f8fa8e670155f3023491c3eb9a30e6f391
parent | 932ec01328798f575e044e20394eccf562261caa (diff)
download | bcm5719-llvm-02ced295aa8213155517fa1e1c2d33f61d90bf6e.tar.gz, bcm5719-llvm-02ced295aa8213155517fa1e1c2d33f61d90bf6e.zip
Reverted 274613 due to compilation failure.
llvm-svn: 274615
-rw-r--r-- | llvm/lib/Target/X86/X86FastISel.cpp | 3
-rw-r--r-- | llvm/lib/Target/X86/X86ISelLowering.cpp | 62
-rw-r--r-- | llvm/lib/Target/X86/X86InstrAVX512.td | 66
-rw-r--r-- | llvm/test/CodeGen/X86/avx512-cmp.ll | 2
-rw-r--r-- | llvm/test/CodeGen/X86/avx512-ext.ll | 325
-rw-r--r-- | llvm/test/CodeGen/X86/avx512-insert-extract.ll | 1
-rw-r--r-- | llvm/test/CodeGen/X86/avx512-intrinsics.ll | 12
-rw-r--r-- | llvm/test/CodeGen/X86/avx512-mask-op.ll | 60
-rw-r--r-- | llvm/test/CodeGen/X86/avx512dq-intrinsics.ll | 4
-rw-r--r-- | llvm/test/CodeGen/X86/masked_gather_scatter.ll | 102
-rw-r--r-- | llvm/test/CodeGen/X86/pr27591.ll | 49
-rw-r--r-- | llvm/test/CodeGen/X86/pr28173.ll | 20
-rw-r--r-- | llvm/test/CodeGen/X86/xaluo.ll | 2
13 files changed, 329 insertions, 379 deletions
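For context, r274613 (reverted here) wrapped the i8 X86ISD::SETCC result in an AssertZext before the truncate to i1, so the DAG knew the high bits were already zero and could drop the explicit masking; with the revert, the `andl $1, %eax` masking returns in the AVX-512 codegen, as the test updates below show. A minimal reproducer of the affected pattern, adapted from the zext_test1 test in llvm/test/CodeGen/X86/avx512-mask-op.ll in this diff (the comments are illustrative, not part of the original test):

```llvm
; Adapted from zext_test1 in llvm/test/CodeGen/X86/avx512-mask-op.ll (part of this diff).
; After the revert, KNL/SKX again emit "kmovw %k0, %eax" followed by an explicit
; "andl $1, %eax", because the AssertZext that marked the value as already
; zero-extended is no longer generated.
define i32 @zext_test1(<16 x i32> %a, <16 x i32> %b) {
  %cmp_res = icmp ugt <16 x i32> %a, %b                   ; <16 x i1> mask, held in a k-register
  %cmp_res.i1 = extractelement <16 x i1> %cmp_res, i32 5  ; pull one scalar i1 out of the mask
  %res = zext i1 %cmp_res.i1 to i32                       ; i1 -> i32 zext exercises the reverted lowering
  ret i32 %res
}
```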
diff --git a/llvm/lib/Target/X86/X86FastISel.cpp b/llvm/lib/Target/X86/X86FastISel.cpp index 1457553e55f..e9aabeba236 100644 --- a/llvm/lib/Target/X86/X86FastISel.cpp +++ b/llvm/lib/Target/X86/X86FastISel.cpp @@ -1404,9 +1404,6 @@ bool X86FastISel::X86SelectCmp(const Instruction *I) { if (!isTypeLegal(I->getOperand(0)->getType(), VT)) return false; - if (I->getType()->isIntegerTy(1) && Subtarget->hasAVX512()) - return false; - // Try to optimize or fold the cmp. CmpInst::Predicate Predicate = optimizeCmpPredicate(CI); unsigned ResultReg = 0; diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index ddfb8fa96bb..90cbc479cc4 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -15551,11 +15551,8 @@ SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { isNullConstant(Op1) && (CC == ISD::SETEQ || CC == ISD::SETNE)) { if (SDValue NewSetCC = LowerToBT(Op0, CC, dl, DAG)) { - if (VT == MVT::i1) { - NewSetCC = DAG.getNode(ISD::AssertZext, dl, MVT::i8, NewSetCC, - DAG.getValueType(MVT::i1)); + if (VT == MVT::i1) return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, NewSetCC); - } return NewSetCC; } } @@ -15577,11 +15574,8 @@ SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, DAG.getConstant(CCode, dl, MVT::i8), Op0.getOperand(1)); - if (VT == MVT::i1) { - SetCC = DAG.getNode(ISD::AssertZext, dl, MVT::i8, SetCC, - DAG.getValueType(MVT::i1)); + if (VT == MVT::i1) return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, SetCC); - } return SetCC; } } @@ -15605,11 +15599,8 @@ SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { EFLAGS = ConvertCmpIfNecessary(EFLAGS, DAG); SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, DAG.getConstant(X86CC, dl, MVT::i8), EFLAGS); - if (VT == MVT::i1) { - SetCC = DAG.getNode(ISD::AssertZext, dl, MVT::i8, SetCC, - DAG.getValueType(MVT::i1)); + if (VT == MVT::i1) return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, SetCC); - } return SetCC; } @@ -15628,11 +15619,8 @@ SDValue X86TargetLowering::LowerSETCCE(SDValue Op, SelectionDAG &DAG) const { SDValue Cmp = DAG.getNode(X86ISD::SBB, DL, VTs, LHS, RHS, Carry); SDValue SetCC = DAG.getNode(X86ISD::SETCC, DL, MVT::i8, DAG.getConstant(CC, DL, MVT::i8), Cmp.getValue(1)); - if (Op.getSimpleValueType() == MVT::i1) { - SetCC = DAG.getNode(ISD::AssertZext, DL, MVT::i8, SetCC, - DAG.getValueType(MVT::i1)); - return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC); - } + if (Op.getSimpleValueType() == MVT::i1) + return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC); return SetCC; } @@ -15662,23 +15650,14 @@ static bool isX86LogicalCmp(SDValue Op) { return false; } -/// Returns the "condition" node, that may be wrapped with "truncate". -/// Like this: (i1 (trunc (i8 X86ISD::SETCC))). 
-static SDValue getCondAfterTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) { +static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) { if (V.getOpcode() != ISD::TRUNCATE) - return V; + return false; SDValue VOp0 = V.getOperand(0); - if (VOp0.getOpcode() == ISD::AssertZext && - V.getValueSizeInBits() == - cast<VTSDNode>(VOp0.getOperand(1))->getVT().getSizeInBits()) - return VOp0.getOperand(0); - unsigned InBits = VOp0.getValueSizeInBits(); unsigned Bits = V.getValueSizeInBits(); - if (DAG.MaskedValueIsZero(VOp0, APInt::getHighBitsSet(InBits,InBits-Bits))) - return V.getOperand(0); - return V; + return DAG.MaskedValueIsZero(VOp0, APInt::getHighBitsSet(InBits,InBits-Bits)); } SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { @@ -15901,7 +15880,8 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { if (addTest) { // Look past the truncate if the high bits are known zero. - Cond = getCondAfterTruncWithZeroHighBitsInput(Cond, DAG); + if (isTruncWithZeroHighBitsInput(Cond, DAG)) + Cond = Cond.getOperand(0); // We know the result of AND is compared against zero. Try to match // it to BT. @@ -16739,7 +16719,8 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { if (addTest) { // Look pass the truncate if the high bits are known zero. - Cond = getCondAfterTruncWithZeroHighBitsInput(Cond, DAG); + if (isTruncWithZeroHighBitsInput(Cond, DAG)) + Cond = Cond.getOperand(0); // We know the result of AND is compared against zero. Try to match // it to BT. @@ -17999,7 +17980,7 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget SDValue RHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(2)); SDValue CC = DAG.getConstant(X86CC, dl, MVT::i8); SDValue Test = DAG.getNode(X86ISD::KORTEST, dl, MVT::i32, LHS, RHS); - SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, CC, Test); + SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i1, CC, Test); return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); } @@ -20513,15 +20494,10 @@ static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) { SDValue Sum = DAG.getNode(X86ISD::UMUL, DL, VTs, LHS, RHS); SDValue SetCC = - DAG.getNode(X86ISD::SETCC, DL, MVT::i8, + DAG.getNode(X86ISD::SETCC, DL, N->getValueType(1), DAG.getConstant(X86::COND_O, DL, MVT::i32), SDValue(Sum.getNode(), 2)); - if (N->getValueType(1) == MVT::i1) { - SetCC = DAG.getNode(ISD::AssertZext, DL, MVT::i8, SetCC, - DAG.getValueType(MVT::i1)); - SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC); - } return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC); } } @@ -20531,15 +20507,10 @@ static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) { SDValue Sum = DAG.getNode(BaseOp, DL, VTs, LHS, RHS); SDValue SetCC = - DAG.getNode(X86ISD::SETCC, DL, MVT::i8, + DAG.getNode(X86ISD::SETCC, DL, N->getValueType(1), DAG.getConstant(Cond, DL, MVT::i32), SDValue(Sum.getNode(), 1)); - - if (N->getValueType(1) == MVT::i1) { - SetCC = DAG.getNode(ISD::AssertZext, DL, MVT::i8, SetCC, - DAG.getValueType(MVT::i1)); - SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC); - } + return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC); } @@ -26899,7 +26870,6 @@ static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) { // Skip (zext $x), (trunc $x), or (and $x, 1) node. 
while (SetCC.getOpcode() == ISD::ZERO_EXTEND || SetCC.getOpcode() == ISD::TRUNCATE || - SetCC.getOpcode() == ISD::AssertZext || SetCC.getOpcode() == ISD::AND) { if (SetCC.getOpcode() == ISD::AND) { int OpIdx = -1; diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td index 9d040867a38..859f5837df7 100644 --- a/llvm/lib/Target/X86/X86InstrAVX512.td +++ b/llvm/lib/Target/X86/X86InstrAVX512.td @@ -2083,65 +2083,51 @@ let Predicates = [HasBWI] in { (KMOVQkm addr:$src)>; } -def assertzext_i1 : PatFrag<(ops node:$src), (assertzext node:$src), [{ - return cast<VTSDNode>(N->getOperand(1))->getVT() == MVT::i1; -}]>; - let Predicates = [HasAVX512] in { def : Pat<(i1 (trunc (i64 GR64:$src))), - (COPY_TO_REGCLASS (i16 (EXTRACT_SUBREG (AND64ri8 $src, (i64 1)), - sub_16bit)), VK1)>; - - def : Pat<(i1 (trunc (i64 (assertzext_i1 GR64:$src)))), - (COPY_TO_REGCLASS (i16 (EXTRACT_SUBREG $src, sub_16bit)), VK1)>; + (COPY_TO_REGCLASS (KMOVWkr (AND32ri8 (EXTRACT_SUBREG $src, sub_32bit), + (i32 1))), VK1)>; def : Pat<(i1 (trunc (i32 GR32:$src))), - (COPY_TO_REGCLASS (i16 (EXTRACT_SUBREG (AND32ri8 $src, (i32 1)), - sub_16bit)), VK1)>; - - def : Pat<(i1 (trunc (i32 (assertzext_i1 GR32:$src)))), - (COPY_TO_REGCLASS (i16 (EXTRACT_SUBREG $src, sub_16bit)), VK1)>; + (COPY_TO_REGCLASS (KMOVWkr (AND32ri8 $src, (i32 1))), VK1)>; def : Pat<(i1 (trunc (i8 GR8:$src))), - (COPY_TO_REGCLASS (i16 (SUBREG_TO_REG (i64 0), (AND8ri8 $src, (i8 1)), - sub_8bit)), VK1)>; - - def : Pat<(i1 (trunc (i8 (assertzext_i1 GR8:$src)))), - (COPY_TO_REGCLASS (i16 (SUBREG_TO_REG (i64 0), $src, sub_8bit)), VK1)>; - + (COPY_TO_REGCLASS + (KMOVWkr (AND32ri8 (SUBREG_TO_REG (i32 0), GR8:$src, sub_8bit), (i32 1))), + VK1)>; def : Pat<(i1 (trunc (i16 GR16:$src))), - (COPY_TO_REGCLASS (i16 (AND16ri8 $src, (i16 1))), VK1)>; - - def : Pat<(i1 (trunc (i16 (assertzext_i1 GR16:$src)))), - (COPY_TO_REGCLASS $src, VK1)>; + (COPY_TO_REGCLASS + (KMOVWkr (AND32ri8 (SUBREG_TO_REG (i32 0), $src, sub_16bit), (i32 1))), + VK1)>; def : Pat<(i32 (zext VK1:$src)), - (i32 (SUBREG_TO_REG (i64 0), (i16 (COPY_TO_REGCLASS $src, GR16)), - sub_16bit))>; - + (AND32ri8 (KMOVWrk (COPY_TO_REGCLASS VK1:$src, VK16)), (i32 1))>; def : Pat<(i32 (anyext VK1:$src)), - (i32 (SUBREG_TO_REG (i64 0), (i16 (COPY_TO_REGCLASS $src, GR16)), - sub_16bit))>; + (KMOVWrk (COPY_TO_REGCLASS VK1:$src, VK16))>; def : Pat<(i8 (zext VK1:$src)), - (i8 (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS VK1:$src, GR16)), sub_8bit))>; - + (EXTRACT_SUBREG + (AND32ri8 (KMOVWrk + (COPY_TO_REGCLASS VK1:$src, VK16)), (i32 1)), sub_8bit)>; def : Pat<(i8 (anyext VK1:$src)), - (i8 (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS $src, GR16)), sub_8bit))>; + (EXTRACT_SUBREG + (KMOVWrk (COPY_TO_REGCLASS VK1:$src, VK16)), sub_8bit)>; def : Pat<(i64 (zext VK1:$src)), - (i64 (SUBREG_TO_REG (i64 0), (i16 (COPY_TO_REGCLASS $src, GR16)), - sub_16bit))>; - + (AND64ri8 (SUBREG_TO_REG (i64 0), + (KMOVWrk (COPY_TO_REGCLASS VK1:$src, VK16)), sub_32bit), (i64 1))>; def : Pat<(i64 (anyext VK1:$src)), - (i64 (SUBREG_TO_REG (i64 0), (i16 (COPY_TO_REGCLASS $src, GR16)), - sub_16bit))>; + (SUBREG_TO_REG (i64 0), + (KMOVWrk (COPY_TO_REGCLASS VK1:$src, VK16)), sub_32bit)>; def : Pat<(i16 (zext VK1:$src)), - (COPY_TO_REGCLASS $src, GR16)>; - + (EXTRACT_SUBREG + (AND32ri8 (KMOVWrk (COPY_TO_REGCLASS VK1:$src, VK16)), (i32 1)), + sub_16bit)>; def : Pat<(i16 (anyext VK1:$src)), - (i16 (COPY_TO_REGCLASS $src, GR16))>; + (EXTRACT_SUBREG + (KMOVWrk (COPY_TO_REGCLASS VK1:$src, VK16)), + sub_16bit)>; } def : Pat<(v16i1 
(scalar_to_vector VK1:$src)), (COPY_TO_REGCLASS VK1:$src, VK16)>; diff --git a/llvm/test/CodeGen/X86/avx512-cmp.ll b/llvm/test/CodeGen/X86/avx512-cmp.ll index 52caa0ed5d6..2c0c0a5b8c7 100644 --- a/llvm/test/CodeGen/X86/avx512-cmp.ll +++ b/llvm/test/CodeGen/X86/avx512-cmp.ll @@ -163,10 +163,12 @@ define i32 @test10(i64 %b, i64 %c, i1 %d) { ; ALL-NEXT: kmovw %edx, %k0 ; ALL-NEXT: cmpq %rsi, %rdi ; ALL-NEXT: sete %al +; ALL-NEXT: andl $1, %eax ; ALL-NEXT: kmovw %eax, %k1 ; ALL-NEXT: korw %k1, %k0, %k1 ; ALL-NEXT: kxorw %k1, %k0, %k0 ; ALL-NEXT: kmovw %k0, %eax +; ALL-NEXT: andl $1, %eax ; ALL-NEXT: testb %al, %al ; ALL-NEXT: je LBB8_1 ; ALL-NEXT: ## BB#2: ## %if.end.i diff --git a/llvm/test/CodeGen/X86/avx512-ext.ll b/llvm/test/CodeGen/X86/avx512-ext.ll index cb74c598a1a..f5631af34d2 100644 --- a/llvm/test/CodeGen/X86/avx512-ext.ll +++ b/llvm/test/CodeGen/X86/avx512-ext.ll @@ -1513,264 +1513,265 @@ define <64 x i16> @test21(<64 x i16> %x , <64 x i1> %mask) nounwind readnone { ; KNL-NEXT: vptestmd %zmm4, %zmm4, %k0 ; KNL-NEXT: kshiftlw $14, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 -; KNL-NEXT: kmovw %k1, %ecx +; KNL-NEXT: kmovw %k1, %edx ; KNL-NEXT: kshiftlw $15, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 -; KNL-NEXT: kmovw %k1, %r15d +; KNL-NEXT: kmovw %k1, %eax ; KNL-NEXT: kshiftlw $13, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 -; KNL-NEXT: kmovw %k1, %r12d +; KNL-NEXT: kmovw %k1, %ecx ; KNL-NEXT: kshiftlw $12, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 -; KNL-NEXT: kmovw %k1, %edx +; KNL-NEXT: kmovw %k1, %edi ; KNL-NEXT: kshiftlw $11, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 -; KNL-NEXT: kmovw %k1, %r13d +; KNL-NEXT: kmovw %k1, %esi ; KNL-NEXT: kshiftlw $10, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 -; KNL-NEXT: kmovw %k1, %eax -; KNL-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill +; KNL-NEXT: kmovw %k1, %r13d ; KNL-NEXT: kshiftlw $9, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 -; KNL-NEXT: kmovw %k1, %esi +; KNL-NEXT: kmovw %k1, %r8d ; KNL-NEXT: kshiftlw $8, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 -; KNL-NEXT: kmovw %k1, %edi +; KNL-NEXT: kmovw %k1, %r10d ; KNL-NEXT: kshiftlw $7, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 -; KNL-NEXT: kmovw %k1, %r8d +; KNL-NEXT: kmovw %k1, %r11d ; KNL-NEXT: kshiftlw $6, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 -; KNL-NEXT: kmovw %k1, %r9d +; KNL-NEXT: kmovw %k1, %ebx ; KNL-NEXT: kshiftlw $5, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 -; KNL-NEXT: kmovw %k1, %r10d +; KNL-NEXT: kmovw %k1, %ebp ; KNL-NEXT: kshiftlw $4, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 -; KNL-NEXT: kmovw %k1, %r11d +; KNL-NEXT: kmovw %k1, %r14d ; KNL-NEXT: kshiftlw $3, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 -; KNL-NEXT: kmovw %k1, %ebx +; KNL-NEXT: kmovw %k1, %r15d ; KNL-NEXT: kshiftlw $2, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 -; KNL-NEXT: kmovw %k1, %ebp +; KNL-NEXT: kmovw %k1, %r9d ; KNL-NEXT: kshiftlw $1, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 -; KNL-NEXT: kmovw %k1, %r14d -; KNL-NEXT: vptestmd %zmm5, %zmm5, %k2 +; KNL-NEXT: kmovw %k1, %r12d +; KNL-NEXT: vptestmd %zmm5, %zmm5, %k1 ; KNL-NEXT: kshiftlw $0, %k0, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: vmovd %r15d, %xmm4 -; KNL-NEXT: kmovw %k0, %r15d -; KNL-NEXT: kshiftlw $14, %k2, %k0 -; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: vpinsrb $1, %ecx, %xmm4, %xmm4 -; KNL-NEXT: kmovw %k0, %ecx -; KNL-NEXT: kshiftlw $15, %k2, %k0 -; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: vpinsrb $2, %r12d, %xmm4, %xmm4 +; KNL-NEXT: vmovd %eax, %xmm4 ; KNL-NEXT: kmovw %k0, %eax -; KNL-NEXT: 
kshiftlw $13, %k2, %k0 -; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: vpinsrb $3, %edx, %xmm4, %xmm4 -; KNL-NEXT: kmovw %k0, %r12d -; KNL-NEXT: kshiftlw $12, %k2, %k0 -; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: vpinsrb $4, %r13d, %xmm4, %xmm4 -; KNL-NEXT: kmovw %k0, %edx -; KNL-NEXT: kshiftlw $11, %k2, %k0 -; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: vpinsrb $5, -{{[0-9]+}}(%rsp), %xmm4, %xmm4 ## 4-byte Folded Reload -; KNL-NEXT: kmovw %k0, %r13d -; KNL-NEXT: kshiftlw $10, %k2, %k0 -; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: vpinsrb $6, %esi, %xmm4, %xmm4 -; KNL-NEXT: kmovw %k0, %esi -; KNL-NEXT: movl %esi, -{{[0-9]+}}(%rsp) ## 4-byte Spill -; KNL-NEXT: kshiftlw $9, %k2, %k0 -; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: vpinsrb $7, %edi, %xmm4, %xmm4 -; KNL-NEXT: kmovw %k0, %esi -; KNL-NEXT: kshiftlw $8, %k2, %k0 -; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: vpinsrb $8, %r8d, %xmm4, %xmm4 -; KNL-NEXT: kmovw %k0, %edi -; KNL-NEXT: kshiftlw $7, %k2, %k0 -; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: vpinsrb $9, %r9d, %xmm4, %xmm4 -; KNL-NEXT: kmovw %k0, %r8d -; KNL-NEXT: kshiftlw $6, %k2, %k0 -; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: vpinsrb $10, %r10d, %xmm4, %xmm4 -; KNL-NEXT: kmovw %k0, %r9d -; KNL-NEXT: kshiftlw $5, %k2, %k0 -; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: vpinsrb $11, %r11d, %xmm4, %xmm4 -; KNL-NEXT: kmovw %k0, %r10d -; KNL-NEXT: kshiftlw $4, %k2, %k0 -; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: vpinsrb $12, %ebx, %xmm4, %xmm4 -; KNL-NEXT: kmovw %k0, %ebx -; KNL-NEXT: kshiftlw $3, %k2, %k0 -; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: vpinsrb $13, %ebp, %xmm4, %xmm4 -; KNL-NEXT: kmovw %k0, %ebp -; KNL-NEXT: kshiftlw $2, %k2, %k0 -; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: vpinsrb $14, %r14d, %xmm4, %xmm4 -; KNL-NEXT: kmovw %k0, %r11d -; KNL-NEXT: kshiftlw $1, %k2, %k0 -; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: vpinsrb $15, %r15d, %xmm4, %xmm4 -; KNL-NEXT: kmovw %k0, %r14d -; KNL-NEXT: vptestmd %zmm6, %zmm6, %k1 -; KNL-NEXT: kshiftlw $0, %k2, %k0 -; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: vmovd %eax, %xmm5 -; KNL-NEXT: kmovw %k0, %r15d ; KNL-NEXT: kshiftlw $14, %k1, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: vpinsrb $1, %ecx, %xmm5, %xmm5 -; KNL-NEXT: kmovw %k0, %ecx +; KNL-NEXT: vpinsrb $1, %edx, %xmm4, %xmm4 +; KNL-NEXT: kmovw %k0, %edx +; KNL-NEXT: movl %edx, -{{[0-9]+}}(%rsp) ## 4-byte Spill ; KNL-NEXT: kshiftlw $15, %k1, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: vpinsrb $2, %r12d, %xmm5, %xmm5 -; KNL-NEXT: kmovw %k0, %eax +; KNL-NEXT: vpinsrb $2, %ecx, %xmm4, %xmm4 +; KNL-NEXT: kmovw %k0, %ecx ; KNL-NEXT: kshiftlw $13, %k1, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: vpinsrb $3, %edx, %xmm5, %xmm5 -; KNL-NEXT: kmovw %k0, %r12d +; KNL-NEXT: vpinsrb $3, %edi, %xmm4, %xmm4 +; KNL-NEXT: kmovw %k0, %edi ; KNL-NEXT: kshiftlw $12, %k1, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: vpinsrb $4, %r13d, %xmm5, %xmm5 -; KNL-NEXT: kmovw %k0, %edx +; KNL-NEXT: vpinsrb $4, %esi, %xmm4, %xmm4 +; KNL-NEXT: kmovw %k0, %esi ; KNL-NEXT: kshiftlw $11, %k1, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: vpinsrb $5, -{{[0-9]+}}(%rsp), %xmm5, %xmm5 ## 4-byte Folded Reload +; KNL-NEXT: vpinsrb $5, %r13d, %xmm4, %xmm4 ; KNL-NEXT: kmovw %k0, %r13d ; KNL-NEXT: kshiftlw $10, %k1, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: vpinsrb $6, %esi, %xmm5, %xmm5 -; KNL-NEXT: kmovw %k0, %esi -; KNL-NEXT: movl %esi, -{{[0-9]+}}(%rsp) ## 4-byte Spill +; KNL-NEXT: vpinsrb $6, %r8d, %xmm4, %xmm4 +; KNL-NEXT: 
kmovw %k0, %r8d ; KNL-NEXT: kshiftlw $9, %k1, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: vpinsrb $7, %edi, %xmm5, %xmm5 -; KNL-NEXT: kmovw %k0, %esi +; KNL-NEXT: vpinsrb $7, %r10d, %xmm4, %xmm4 +; KNL-NEXT: kmovw %k0, %r10d ; KNL-NEXT: kshiftlw $8, %k1, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: vpinsrb $8, %r8d, %xmm5, %xmm5 -; KNL-NEXT: kmovw %k0, %edi +; KNL-NEXT: vpinsrb $8, %r11d, %xmm4, %xmm4 +; KNL-NEXT: kmovw %k0, %r11d ; KNL-NEXT: kshiftlw $7, %k1, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: vpinsrb $9, %r9d, %xmm5, %xmm5 -; KNL-NEXT: kmovw %k0, %r8d +; KNL-NEXT: vpinsrb $9, %ebx, %xmm4, %xmm4 +; KNL-NEXT: kmovw %k0, %ebx ; KNL-NEXT: kshiftlw $6, %k1, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: vpinsrb $10, %r10d, %xmm5, %xmm5 -; KNL-NEXT: kmovw %k0, %r9d +; KNL-NEXT: vpinsrb $10, %ebp, %xmm4, %xmm4 +; KNL-NEXT: kmovw %k0, %ebp ; KNL-NEXT: kshiftlw $5, %k1, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: vpinsrb $11, %ebx, %xmm5, %xmm5 -; KNL-NEXT: kmovw %k0, %ebx +; KNL-NEXT: vpinsrb $11, %r14d, %xmm4, %xmm4 +; KNL-NEXT: kmovw %k0, %r14d ; KNL-NEXT: kshiftlw $4, %k1, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: vpinsrb $12, %ebp, %xmm5, %xmm5 -; KNL-NEXT: kmovw %k0, %ebp +; KNL-NEXT: vpinsrb $12, %r15d, %xmm4, %xmm4 +; KNL-NEXT: kmovw %k0, %r15d ; KNL-NEXT: kshiftlw $3, %k1, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: vpinsrb $13, %r11d, %xmm5, %xmm5 -; KNL-NEXT: kmovw %k0, %r10d +; KNL-NEXT: vpinsrb $13, %r9d, %xmm4, %xmm4 +; KNL-NEXT: kmovw %k0, %edx +; KNL-NEXT: movl %edx, -{{[0-9]+}}(%rsp) ## 4-byte Spill ; KNL-NEXT: kshiftlw $2, %k1, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: vpinsrb $14, %r14d, %xmm5, %xmm5 -; KNL-NEXT: kmovw %k0, %r11d +; KNL-NEXT: vpinsrb $14, %r12d, %xmm4, %xmm4 +; KNL-NEXT: kmovw %k0, %r12d ; KNL-NEXT: kshiftlw $1, %k1, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: vpinsrb $15, %r15d, %xmm5, %xmm5 -; KNL-NEXT: kmovw %k0, %r14d -; KNL-NEXT: vptestmd %zmm7, %zmm7, %k0 +; KNL-NEXT: vpinsrb $15, %eax, %xmm4, %xmm4 +; KNL-NEXT: kmovw %k0, %r9d +; KNL-NEXT: vptestmd %zmm6, %zmm6, %k0 ; KNL-NEXT: kshiftlw $0, %k1, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 -; KNL-NEXT: vmovd %eax, %xmm6 -; KNL-NEXT: kmovw %k1, %r15d +; KNL-NEXT: vmovd %ecx, %xmm5 +; KNL-NEXT: kmovw %k1, %edx ; KNL-NEXT: kshiftlw $14, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 -; KNL-NEXT: vpinsrb $1, %ecx, %xmm6, %xmm6 -; KNL-NEXT: kmovw %k1, %ecx +; KNL-NEXT: vpinsrb $1, -{{[0-9]+}}(%rsp), %xmm5, %xmm5 ## 4-byte Folded Reload +; KNL-NEXT: kmovw %k1, %eax +; KNL-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill ; KNL-NEXT: kshiftlw $15, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 -; KNL-NEXT: vpinsrb $2, %r12d, %xmm6, %xmm6 -; KNL-NEXT: kmovw %k1, %r12d +; KNL-NEXT: vpinsrb $2, %edi, %xmm5, %xmm5 +; KNL-NEXT: kmovw %k1, %eax ; KNL-NEXT: kshiftlw $13, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 -; KNL-NEXT: vpinsrb $3, %edx, %xmm6, %xmm6 -; KNL-NEXT: kmovw %k1, %edx +; KNL-NEXT: vpinsrb $3, %esi, %xmm5, %xmm5 +; KNL-NEXT: kmovw %k1, %edi ; KNL-NEXT: kshiftlw $12, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 -; KNL-NEXT: vpinsrb $4, %r13d, %xmm6, %xmm6 -; KNL-NEXT: kmovw %k1, %r13d +; KNL-NEXT: vpinsrb $4, %r13d, %xmm5, %xmm5 +; KNL-NEXT: kmovw %k1, %ecx ; KNL-NEXT: kshiftlw $11, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 -; KNL-NEXT: vpinsrb $5, -{{[0-9]+}}(%rsp), %xmm6, %xmm6 ## 4-byte Folded Reload -; KNL-NEXT: kmovw %k1, %eax +; KNL-NEXT: vpinsrb $5, %r8d, %xmm5, %xmm5 +; KNL-NEXT: kmovw %k1, %r8d ; KNL-NEXT: kshiftlw $10, 
%k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 -; KNL-NEXT: vpinsrb $6, %esi, %xmm6, %xmm6 -; KNL-NEXT: kmovw %k1, %esi +; KNL-NEXT: vpinsrb $6, %r10d, %xmm5, %xmm5 +; KNL-NEXT: kmovw %k1, %r13d ; KNL-NEXT: kshiftlw $9, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 -; KNL-NEXT: vpinsrb $7, %edi, %xmm6, %xmm6 -; KNL-NEXT: kmovw %k1, %edi +; KNL-NEXT: vpinsrb $7, %r11d, %xmm5, %xmm5 +; KNL-NEXT: kmovw %k1, %esi +; KNL-NEXT: movl %esi, -{{[0-9]+}}(%rsp) ## 4-byte Spill ; KNL-NEXT: kshiftlw $8, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 -; KNL-NEXT: vpinsrb $8, %r8d, %xmm6, %xmm6 -; KNL-NEXT: kmovw %k1, %r8d +; KNL-NEXT: vpinsrb $8, %ebx, %xmm5, %xmm5 +; KNL-NEXT: kmovw %k1, %ebx ; KNL-NEXT: kshiftlw $7, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 -; KNL-NEXT: vpinsrb $9, %r9d, %xmm6, %xmm6 -; KNL-NEXT: kmovw %k1, %r9d +; KNL-NEXT: vpinsrb $9, %ebp, %xmm5, %xmm5 +; KNL-NEXT: kmovw %k1, %ebp ; KNL-NEXT: kshiftlw $6, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 -; KNL-NEXT: vpinsrb $10, %ebx, %xmm6, %xmm6 -; KNL-NEXT: kmovw %k1, %ebx +; KNL-NEXT: vpinsrb $10, %r14d, %xmm5, %xmm5 +; KNL-NEXT: kmovw %k1, %r10d ; KNL-NEXT: kshiftlw $5, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 -; KNL-NEXT: vpinsrb $11, %ebp, %xmm6, %xmm6 -; KNL-NEXT: kmovw %k1, %ebp +; KNL-NEXT: vpinsrb $11, %r15d, %xmm5, %xmm5 +; KNL-NEXT: kmovw %k1, %r11d ; KNL-NEXT: kshiftlw $4, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 -; KNL-NEXT: vpinsrb $12, %r10d, %xmm6, %xmm6 -; KNL-NEXT: kmovw %k1, %r10d +; KNL-NEXT: vpinsrb $12, -{{[0-9]+}}(%rsp), %xmm5, %xmm5 ## 4-byte Folded Reload +; KNL-NEXT: kmovw %k1, %esi ; KNL-NEXT: kshiftlw $3, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 -; KNL-NEXT: vpinsrb $13, %r11d, %xmm6, %xmm6 -; KNL-NEXT: kmovw %k1, %r11d +; KNL-NEXT: vpinsrb $13, %r12d, %xmm5, %xmm5 +; KNL-NEXT: kmovw %k1, %r14d ; KNL-NEXT: kshiftlw $2, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 -; KNL-NEXT: vpinsrb $14, %r14d, %xmm6, %xmm6 -; KNL-NEXT: kmovw %k1, %r14d +; KNL-NEXT: vpinsrb $14, %r9d, %xmm5, %xmm5 +; KNL-NEXT: kmovw %k1, %r9d ; KNL-NEXT: kshiftlw $1, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 -; KNL-NEXT: vpinsrb $15, %r15d, %xmm6, %xmm6 +; KNL-NEXT: vpinsrb $15, %edx, %xmm5, %xmm5 ; KNL-NEXT: kmovw %k1, %r15d +; KNL-NEXT: vptestmd %zmm7, %zmm7, %k1 ; KNL-NEXT: kshiftlw $0, %k0, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: vmovd %r12d, %xmm7 +; KNL-NEXT: vmovd %eax, %xmm6 +; KNL-NEXT: kmovw %k0, %r12d +; KNL-NEXT: kshiftlw $14, %k1, %k0 +; KNL-NEXT: kshiftrw $15, %k0, %k0 +; KNL-NEXT: vpinsrb $1, -{{[0-9]+}}(%rsp), %xmm6, %xmm6 ## 4-byte Folded Reload +; KNL-NEXT: kmovw %k0, %eax +; KNL-NEXT: kshiftlw $15, %k1, %k0 +; KNL-NEXT: kshiftrw $15, %k0, %k0 +; KNL-NEXT: vpinsrb $2, %edi, %xmm6, %xmm6 +; KNL-NEXT: kmovw %k0, %edx +; KNL-NEXT: kshiftlw $13, %k1, %k0 +; KNL-NEXT: kshiftrw $15, %k0, %k0 +; KNL-NEXT: vpinsrb $3, %ecx, %xmm6, %xmm6 +; KNL-NEXT: kmovw %k0, %ecx +; KNL-NEXT: kshiftlw $12, %k1, %k0 +; KNL-NEXT: kshiftrw $15, %k0, %k0 +; KNL-NEXT: vpinsrb $4, %r8d, %xmm6, %xmm6 +; KNL-NEXT: kmovw %k0, %r8d +; KNL-NEXT: kshiftlw $11, %k1, %k0 +; KNL-NEXT: kshiftrw $15, %k0, %k0 +; KNL-NEXT: vpinsrb $5, %r13d, %xmm6, %xmm6 +; KNL-NEXT: kmovw %k0, %r13d +; KNL-NEXT: kshiftlw $10, %k1, %k0 +; KNL-NEXT: kshiftrw $15, %k0, %k0 +; KNL-NEXT: vpinsrb $6, -{{[0-9]+}}(%rsp), %xmm6, %xmm6 ## 4-byte Folded Reload +; KNL-NEXT: kmovw %k0, %edi +; KNL-NEXT: kshiftlw $9, %k1, %k0 +; KNL-NEXT: kshiftrw $15, %k0, %k0 +; KNL-NEXT: vpinsrb $7, %ebx, %xmm6, %xmm6 +; KNL-NEXT: kmovw %k0, %ebx +; KNL-NEXT: kshiftlw $8, %k1, %k0 +; 
KNL-NEXT: kshiftrw $15, %k0, %k0 +; KNL-NEXT: vpinsrb $8, %ebp, %xmm6, %xmm6 +; KNL-NEXT: kmovw %k0, %ebp +; KNL-NEXT: kshiftlw $7, %k1, %k0 +; KNL-NEXT: kshiftrw $15, %k0, %k0 +; KNL-NEXT: vpinsrb $9, %r10d, %xmm6, %xmm6 +; KNL-NEXT: kmovw %k0, %r10d +; KNL-NEXT: kshiftlw $6, %k1, %k0 +; KNL-NEXT: kshiftrw $15, %k0, %k0 +; KNL-NEXT: vpinsrb $10, %r11d, %xmm6, %xmm6 +; KNL-NEXT: kmovw %k0, %r11d +; KNL-NEXT: kshiftlw $5, %k1, %k0 +; KNL-NEXT: kshiftrw $15, %k0, %k0 +; KNL-NEXT: vpinsrb $11, %esi, %xmm6, %xmm6 +; KNL-NEXT: kmovw %k0, %esi +; KNL-NEXT: kshiftlw $4, %k1, %k0 +; KNL-NEXT: kshiftrw $15, %k0, %k0 +; KNL-NEXT: vpinsrb $12, %r14d, %xmm6, %xmm6 +; KNL-NEXT: kmovw %k0, %r14d +; KNL-NEXT: kshiftlw $3, %k1, %k0 +; KNL-NEXT: kshiftrw $15, %k0, %k0 +; KNL-NEXT: vpinsrb $13, %r9d, %xmm6, %xmm6 +; KNL-NEXT: kmovw %k0, %r9d +; KNL-NEXT: kshiftlw $2, %k1, %k0 +; KNL-NEXT: kshiftrw $15, %k0, %k0 +; KNL-NEXT: vpinsrb $14, %r15d, %xmm6, %xmm6 +; KNL-NEXT: kmovw %k0, %r15d +; KNL-NEXT: kshiftlw $1, %k1, %k0 +; KNL-NEXT: kshiftrw $15, %k0, %k0 +; KNL-NEXT: vpinsrb $15, %r12d, %xmm6, %xmm6 ; KNL-NEXT: kmovw %k0, %r12d -; KNL-NEXT: vpinsrb $1, %ecx, %xmm7, %xmm7 -; KNL-NEXT: vpinsrb $2, %edx, %xmm7, %xmm7 -; KNL-NEXT: vpinsrb $3, %r13d, %xmm7, %xmm7 -; KNL-NEXT: vpinsrb $4, %eax, %xmm7, %xmm7 -; KNL-NEXT: vpinsrb $5, %esi, %xmm7, %xmm7 -; KNL-NEXT: vpinsrb $6, %edi, %xmm7, %xmm7 -; KNL-NEXT: vpinsrb $7, %r8d, %xmm7, %xmm7 -; KNL-NEXT: vpinsrb $8, %r9d, %xmm7, %xmm7 -; KNL-NEXT: vpinsrb $9, %ebx, %xmm7, %xmm7 -; KNL-NEXT: vpinsrb $10, %ebp, %xmm7, %xmm7 -; KNL-NEXT: vpinsrb $11, %r10d, %xmm7, %xmm7 -; KNL-NEXT: vpinsrb $12, %r11d, %xmm7, %xmm7 -; KNL-NEXT: vpinsrb $13, %r14d, %xmm7, %xmm7 +; KNL-NEXT: kshiftlw $0, %k1, %k0 +; KNL-NEXT: kshiftrw $15, %k0, %k0 +; KNL-NEXT: vmovd %edx, %xmm7 +; KNL-NEXT: kmovw %k0, %edx +; KNL-NEXT: vpinsrb $1, %eax, %xmm7, %xmm7 +; KNL-NEXT: vpinsrb $2, %ecx, %xmm7, %xmm7 +; KNL-NEXT: vpinsrb $3, %r8d, %xmm7, %xmm7 +; KNL-NEXT: vpinsrb $4, %r13d, %xmm7, %xmm7 +; KNL-NEXT: vpinsrb $5, %edi, %xmm7, %xmm7 +; KNL-NEXT: vpinsrb $6, %ebx, %xmm7, %xmm7 +; KNL-NEXT: vpinsrb $7, %ebp, %xmm7, %xmm7 +; KNL-NEXT: vpinsrb $8, %r10d, %xmm7, %xmm7 +; KNL-NEXT: vpinsrb $9, %r11d, %xmm7, %xmm7 +; KNL-NEXT: vpinsrb $10, %esi, %xmm7, %xmm7 +; KNL-NEXT: vpinsrb $11, %r14d, %xmm7, %xmm7 +; KNL-NEXT: vpinsrb $12, %r9d, %xmm7, %xmm7 +; KNL-NEXT: vpinsrb $13, %r15d, %xmm7, %xmm7 ; KNL-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero ; KNL-NEXT: vpsllw $15, %ymm4, %ymm4 ; KNL-NEXT: vpsraw $15, %ymm4, %ymm4 @@ -1783,8 +1784,8 @@ define <64 x i16> @test21(<64 x i16> %x , <64 x i1> %mask) nounwind readnone { ; KNL-NEXT: vpsllw $15, %ymm4, %ymm4 ; KNL-NEXT: vpsraw $15, %ymm4, %ymm4 ; KNL-NEXT: vpand %ymm2, %ymm4, %ymm2 -; KNL-NEXT: vpinsrb $14, %r15d, %xmm7, %xmm4 -; KNL-NEXT: vpinsrb $15, %r12d, %xmm4, %xmm4 +; KNL-NEXT: vpinsrb $14, %r12d, %xmm7, %xmm4 +; KNL-NEXT: vpinsrb $15, %edx, %xmm4, %xmm4 ; KNL-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero ; KNL-NEXT: vpsllw $15, %ymm4, %ymm4 ; KNL-NEXT: vpsraw $15, %ymm4, %ymm4 diff --git a/llvm/test/CodeGen/X86/avx512-insert-extract.ll 
b/llvm/test/CodeGen/X86/avx512-insert-extract.ll index 44ecad01407..308673bc395 100644 --- a/llvm/test/CodeGen/X86/avx512-insert-extract.ll +++ b/llvm/test/CodeGen/X86/avx512-insert-extract.ll @@ -159,6 +159,7 @@ define i64 @test12(<16 x i64>%a, <16 x i64>%b, i64 %a1, i64 %b1) { ;CHECK-LABEL: test13 ;CHECK: cmpl %esi, %edi ;CHECK: setb %al +;CHECK: andl $1, %eax ;CHECK: kmovw %eax, %k0 ;CHECK: movw $-4 ;CHECK: korw diff --git a/llvm/test/CodeGen/X86/avx512-intrinsics.ll b/llvm/test/CodeGen/X86/avx512-intrinsics.ll index d1961fc96e6..cc8be256732 100644 --- a/llvm/test/CodeGen/X86/avx512-intrinsics.ll +++ b/llvm/test/CodeGen/X86/avx512-intrinsics.ll @@ -9,7 +9,9 @@ define i32 @test_kortestz(i16 %a0, i16 %a1) { ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: kortestw %k0, %k1 ; CHECK-NEXT: sete %al -; CHECK-NEXT: movzbl %al, %eax +; CHECK-NEXT: kmovw %eax, %k0 +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: andl $1, %eax ; CHECK-NEXT: retq %res = call i32 @llvm.x86.avx512.kortestz.w(i16 %a0, i16 %a1) ret i32 %res @@ -5089,6 +5091,7 @@ define i8@test_int_x86_avx512_mask_cmp_sd(<2 x double> %x0, <2 x double> %x1, i8 ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vcmpnltsd {sae}, %xmm1, %xmm0, %k0 {%k1} ; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: andl $1, %eax ; CHECK-NEXT: retq %res4 = call i8 @llvm.x86.avx512.mask.cmp.sd(<2 x double> %x0, <2 x double> %x1, i32 5, i8 %x3, i32 8) @@ -5109,6 +5112,7 @@ define i8@test_int_x86_avx512_mask_cmp_sd_all(<2 x double> %x0, <2 x double> %x1 ; CHECK-NEXT: kandw %k2, %k1, %k1 ; CHECK-NEXT: korw %k1, %k0, %k0 ; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: andl $1, %eax ; CHECK-NEXT: retq %res1 = call i8 @llvm.x86.avx512.mask.cmp.sd(<2 x double> %x0, <2 x double> %x1, i32 2, i8 -1, i32 4) @@ -5131,6 +5135,7 @@ define i8@test_int_x86_avx512_mask_cmp_ss(<4 x float> %x0, <4 x float> %x1, i8 % ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vcmpunordss %xmm1, %xmm0, %k0 {%k1} ; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: andl $1, %eax ; CHECK-NEXT: retq %res2 = call i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float> %x0, <4 x float> %x1, i32 3, i8 %x3, i32 4) @@ -5148,8 +5153,9 @@ define i8@test_int_x86_avx512_mask_cmp_ss_all(<4 x float> %x0, <4 x float> %x1, ; CHECK-NEXT: vcmpneqss %xmm1, %xmm0, %k2 {%k1} ; CHECK-NEXT: kmovw %k2, %ecx ; CHECK-NEXT: vcmpnltss {sae}, %xmm1, %xmm0, %k1 {%k1} -; CHECK-NEXT: kmovw %k1, %eax -; CHECK-NEXT: kmovw %k0, %edx +; CHECK-NEXT: kmovw %k1, %edx +; CHECK-NEXT: andl $1, %edx +; CHECK-NEXT: kmovw %k0, %eax ; CHECK-NEXT: andb %cl, %al ; CHECK-NEXT: andb %dl, %al ; CHECK-NEXT: retq diff --git a/llvm/test/CodeGen/X86/avx512-mask-op.ll b/llvm/test/CodeGen/X86/avx512-mask-op.ll index 939c338ac9a..f935270d767 100644 --- a/llvm/test/CodeGen/X86/avx512-mask-op.ll +++ b/llvm/test/CodeGen/X86/avx512-mask-op.ll @@ -173,35 +173,18 @@ define i32 @zext_test1(<16 x i32> %a, <16 x i32> %b) { ; CHECK-NEXT: kshiftlw $10, %k0, %k0 ; CHECK-NEXT: kshiftrw $15, %k0, %k0 ; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: andl $1, %eax ; CHECK-NEXT: retq %cmp_res = icmp ugt <16 x i32> %a, %b %cmp_res.i1 = extractelement <16 x i1> %cmp_res, i32 5 %res = zext i1 %cmp_res.i1 to i32 ret i32 %res -} - -define i16 @zext_test2(<16 x i32> %a, <16 x i32> %b) { -; CHECK-LABEL: zext_test2: -; CHECK: ## BB#0: -; CHECK-NEXT: vpcmpnleud %zmm1, %zmm0, %k0 -; CHECK-NEXT: kshiftlw $10, %k0, %k0 -; CHECK-NEXT: kshiftrw $15, %k0, %k0 -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: retq +}define i16 @zext_test2(<16 x i32> %a, <16 x i32> %b) { %cmp_res = icmp ugt <16 x i32> %a, %b %cmp_res.i1 = 
extractelement <16 x i1> %cmp_res, i32 5 %res = zext i1 %cmp_res.i1 to i16 ret i16 %res -} - -define i8 @zext_test3(<16 x i32> %a, <16 x i32> %b) { -; CHECK-LABEL: zext_test3: -; CHECK: ## BB#0: -; CHECK-NEXT: vpcmpnleud %zmm1, %zmm0, %k0 -; CHECK-NEXT: kshiftlw $10, %k0, %k0 -; CHECK-NEXT: kshiftrw $15, %k0, %k0 -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: retq +}define i8 @zext_test3(<16 x i32> %a, <16 x i32> %b) { %cmp_res = icmp ugt <16 x i32> %a, %b %cmp_res.i1 = extractelement <16 x i1> %cmp_res, i32 5 %res = zext i1 %cmp_res.i1 to i8 @@ -596,6 +579,7 @@ define <64 x i8> @test17(i64 %x, i32 %y, i32 %z) { ; SKX-NEXT: kmovq %rdi, %k0 ; SKX-NEXT: cmpl %edx, %esi ; SKX-NEXT: setg %al +; SKX-NEXT: andl $1, %eax ; SKX-NEXT: kmovw %eax, %k1 ; SKX-NEXT: kshiftlq $5, %k1, %k1 ; SKX-NEXT: korq %k1, %k0, %k0 @@ -1639,10 +1623,10 @@ define void @store_64i1(<64 x i1>* %a, <64 x i1> %v) { ; KNL-NEXT: kmovw %k1, %r8d ; KNL-NEXT: kshiftlw $15, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 -; KNL-NEXT: kmovw %k1, %r9d +; KNL-NEXT: kmovw %k1, %r10d ; KNL-NEXT: kshiftlw $13, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 -; KNL-NEXT: kmovw %k1, %r10d +; KNL-NEXT: kmovw %k1, %r9d ; KNL-NEXT: kshiftlw $12, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 ; KNL-NEXT: kmovw %k1, %r11d @@ -1669,22 +1653,22 @@ define void @store_64i1(<64 x i1>* %a, <64 x i1> %v) { ; KNL-NEXT: kmovw %k1, %eax ; KNL-NEXT: kshiftlw $4, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 -; KNL-NEXT: kmovw %k1, %ecx +; KNL-NEXT: kmovw %k1, %edx ; KNL-NEXT: kshiftlw $3, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 -; KNL-NEXT: kmovw %k1, %edx +; KNL-NEXT: kmovw %k1, %ecx ; KNL-NEXT: kshiftlw $2, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 ; KNL-NEXT: kmovw %k1, %esi ; KNL-NEXT: kshiftlw $1, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 -; KNL-NEXT: vmovd %r9d, %xmm3 -; KNL-NEXT: kmovw %k1, %r9d +; KNL-NEXT: vmovd %r10d, %xmm3 +; KNL-NEXT: kmovw %k1, %r10d ; KNL-NEXT: vptestmd %zmm2, %zmm2, %k2 ; KNL-NEXT: kshiftlw $0, %k0, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 ; KNL-NEXT: vpinsrb $1, %r8d, %xmm3, %xmm2 -; KNL-NEXT: vpinsrb $2, %r10d, %xmm2, %xmm2 +; KNL-NEXT: vpinsrb $2, %r9d, %xmm2, %xmm2 ; KNL-NEXT: vpinsrb $3, %r11d, %xmm2, %xmm2 ; KNL-NEXT: vpinsrb $4, %r14d, %xmm2, %xmm2 ; KNL-NEXT: vpinsrb $5, %r15d, %xmm2, %xmm2 @@ -1693,10 +1677,10 @@ define void @store_64i1(<64 x i1>* %a, <64 x i1> %v) { ; KNL-NEXT: vpinsrb $8, %ebx, %xmm2, %xmm2 ; KNL-NEXT: vpinsrb $9, %ebp, %xmm2, %xmm2 ; KNL-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2 -; KNL-NEXT: vpinsrb $11, %ecx, %xmm2, %xmm2 -; KNL-NEXT: vpinsrb $12, %edx, %xmm2, %xmm2 +; KNL-NEXT: vpinsrb $11, %edx, %xmm2, %xmm2 +; KNL-NEXT: vpinsrb $12, %ecx, %xmm2, %xmm2 ; KNL-NEXT: vpinsrb $13, %esi, %xmm2, %xmm2 -; KNL-NEXT: vpinsrb $14, %r9d, %xmm2, %xmm2 +; KNL-NEXT: vpinsrb $14, %r10d, %xmm2, %xmm2 ; KNL-NEXT: kmovw %k0, %eax ; KNL-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2 ; KNL-NEXT: vpmovsxbd %xmm2, %zmm2 @@ -1729,7 +1713,7 @@ define void @store_64i1(<64 x i1>* %a, <64 x i1> %v) { ; KNL-NEXT: kmovw %k0, %r13d ; KNL-NEXT: kshiftlw $7, %k2, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: kmovw %k0, %edx +; KNL-NEXT: kmovw %k0, %ecx ; KNL-NEXT: kshiftlw $6, %k2, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 ; KNL-NEXT: kmovw %k0, %esi @@ -1744,7 +1728,7 @@ define void @store_64i1(<64 x i1>* %a, <64 x i1> %v) { ; KNL-NEXT: kmovw %k0, %eax ; KNL-NEXT: kshiftlw $2, %k2, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: kmovw %k0, %ecx +; KNL-NEXT: kmovw %k0, %edx ; KNL-NEXT: kshiftlw $1, %k2, %k0 ; KNL-NEXT: kshiftrw $15, 
%k0, %k0 ; KNL-NEXT: vmovd %r10d, %xmm2 @@ -1759,12 +1743,12 @@ define void @store_64i1(<64 x i1>* %a, <64 x i1> %v) { ; KNL-NEXT: vpinsrb $5, %r15d, %xmm1, %xmm1 ; KNL-NEXT: vpinsrb $6, %r12d, %xmm1, %xmm1 ; KNL-NEXT: vpinsrb $7, %r13d, %xmm1, %xmm1 -; KNL-NEXT: vpinsrb $8, %edx, %xmm1, %xmm1 +; KNL-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1 ; KNL-NEXT: vpinsrb $9, %esi, %xmm1, %xmm1 ; KNL-NEXT: vpinsrb $10, %ebp, %xmm1, %xmm1 ; KNL-NEXT: vpinsrb $11, %ebx, %xmm1, %xmm1 ; KNL-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 -; KNL-NEXT: vpinsrb $13, %ecx, %xmm1, %xmm1 +; KNL-NEXT: vpinsrb $13, %edx, %xmm1, %xmm1 ; KNL-NEXT: vpinsrb $14, %r10d, %xmm1, %xmm1 ; KNL-NEXT: kmovw %k0, %eax ; KNL-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1 @@ -1798,7 +1782,7 @@ define void @store_64i1(<64 x i1>* %a, <64 x i1> %v) { ; KNL-NEXT: kmovw %k0, %r13d ; KNL-NEXT: kshiftlw $7, %k1, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: kmovw %k0, %edx +; KNL-NEXT: kmovw %k0, %ecx ; KNL-NEXT: kshiftlw $6, %k1, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 ; KNL-NEXT: kmovw %k0, %esi @@ -1813,7 +1797,7 @@ define void @store_64i1(<64 x i1>* %a, <64 x i1> %v) { ; KNL-NEXT: kmovw %k0, %eax ; KNL-NEXT: kshiftlw $2, %k1, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: kmovw %k0, %ecx +; KNL-NEXT: kmovw %k0, %edx ; KNL-NEXT: kshiftlw $1, %k1, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 ; KNL-NEXT: vmovd %r10d, %xmm1 @@ -1828,12 +1812,12 @@ define void @store_64i1(<64 x i1>* %a, <64 x i1> %v) { ; KNL-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 ; KNL-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 ; KNL-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 -; KNL-NEXT: vpinsrb $8, %edx, %xmm0, %xmm0 +; KNL-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ; KNL-NEXT: vpinsrb $9, %esi, %xmm0, %xmm0 ; KNL-NEXT: vpinsrb $10, %ebp, %xmm0, %xmm0 ; KNL-NEXT: vpinsrb $11, %ebx, %xmm0, %xmm0 ; KNL-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 -; KNL-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0 +; KNL-NEXT: vpinsrb $13, %edx, %xmm0, %xmm0 ; KNL-NEXT: vpinsrb $14, %r10d, %xmm0, %xmm0 ; KNL-NEXT: kmovw %k1, %eax ; KNL-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/avx512dq-intrinsics.ll b/llvm/test/CodeGen/X86/avx512dq-intrinsics.ll index af14b6b0d93..434495b9f72 100644 --- a/llvm/test/CodeGen/X86/avx512dq-intrinsics.ll +++ b/llvm/test/CodeGen/X86/avx512dq-intrinsics.ll @@ -490,6 +490,7 @@ define i8 @test_int_x86_avx512_mask_fpclass_sd(<2 x double> %x0, i8 %x1) { ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vfpclasssd $2, %xmm0, %k0 {%k1} ; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: andl $1, %eax ; CHECK-NEXT: testb %al, %al ; CHECK-NEXT: je LBB28_2 ; CHECK-NEXT: ## BB#1: @@ -497,6 +498,7 @@ define i8 @test_int_x86_avx512_mask_fpclass_sd(<2 x double> %x0, i8 %x1) { ; CHECK-NEXT: LBB28_2: ; CHECK-NEXT: vfpclasssd $4, %xmm0, %k0 ; CHECK-NEXT: kmovw %k0, %ecx +; CHECK-NEXT: andl $1, %ecx ; CHECK-NEXT: testb %cl, %cl ; CHECK-NEXT: je LBB28_4 ; CHECK-NEXT: ## BB#3: @@ -519,6 +521,7 @@ define i8 @test_int_x86_avx512_mask_fpclass_ss(<4 x float> %x0, i8 %x1) { ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vfpclassss $4, %xmm0, %k0 {%k1} ; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: andl $1, %eax ; CHECK-NEXT: testb %al, %al ; CHECK-NEXT: je LBB29_2 ; CHECK-NEXT: ## BB#1: @@ -526,6 +529,7 @@ define i8 @test_int_x86_avx512_mask_fpclass_ss(<4 x float> %x0, i8 %x1) { ; CHECK-NEXT: LBB29_2: ; CHECK-NEXT: vfpclassss $4, %xmm0, %k0 ; CHECK-NEXT: kmovw %k0, %ecx +; CHECK-NEXT: andl $1, %ecx ; CHECK-NEXT: testb %cl, %cl ; CHECK-NEXT: je LBB29_4 ; CHECK-NEXT: ## BB#3: diff --git 
a/llvm/test/CodeGen/X86/masked_gather_scatter.ll b/llvm/test/CodeGen/X86/masked_gather_scatter.ll index a8cf982323f..416ccdc68c7 100644 --- a/llvm/test/CodeGen/X86/masked_gather_scatter.ll +++ b/llvm/test/CodeGen/X86/masked_gather_scatter.ll @@ -1367,9 +1367,12 @@ define <3 x i32> @test30(<3 x i32*> %base, <3 x i32> %ind, <3 x i1> %mask, <3 x ; KNL_64-LABEL: test30: ; KNL_64: # BB#0: ; KNL_64-NEXT: andl $1, %edx +; KNL_64-NEXT: kmovw %edx, %k1 ; KNL_64-NEXT: andl $1, %esi +; KNL_64-NEXT: kmovw %esi, %k2 ; KNL_64-NEXT: movl %edi, %eax ; KNL_64-NEXT: andl $1, %eax +; KNL_64-NEXT: kmovw %eax, %k0 ; KNL_64-NEXT: vpmovsxdq %xmm1, %ymm1 ; KNL_64-NEXT: vpsllq $2, %ymm1, %ymm1 ; KNL_64-NEXT: vpaddq %ymm1, %ymm0, %ymm1 @@ -1377,76 +1380,81 @@ define <3 x i32> @test30(<3 x i32*> %base, <3 x i32> %ind, <3 x i1> %mask, <3 x ; KNL_64-NEXT: testb $1, %dil ; KNL_64-NEXT: je .LBB29_2 ; KNL_64-NEXT: # BB#1: # %cond.load -; KNL_64-NEXT: vmovq %xmm1, %rcx -; KNL_64-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; KNL_64-NEXT: vmovq %xmm1, %rax +; KNL_64-NEXT: vmovd (%rax), %xmm0 ; KNL_64-NEXT: .LBB29_2: # %else -; KNL_64-NEXT: testb %sil, %sil +; KNL_64-NEXT: kmovw %k2, %eax +; KNL_64-NEXT: movl %eax, %ecx +; KNL_64-NEXT: andl $1, %ecx +; KNL_64-NEXT: testb %cl, %cl ; KNL_64-NEXT: je .LBB29_4 ; KNL_64-NEXT: # BB#3: # %cond.load1 ; KNL_64-NEXT: vpextrq $1, %xmm1, %rcx ; KNL_64-NEXT: vpinsrd $1, (%rcx), %xmm0, %xmm0 ; KNL_64-NEXT: .LBB29_4: # %else2 +; KNL_64-NEXT: kmovw %k1, %ecx +; KNL_64-NEXT: movl %ecx, %edx +; KNL_64-NEXT: andl $1, %edx ; KNL_64-NEXT: testb %dl, %dl ; KNL_64-NEXT: je .LBB29_6 ; KNL_64-NEXT: # BB#5: # %cond.load4 ; KNL_64-NEXT: vextracti128 $1, %ymm1, %xmm1 -; KNL_64-NEXT: vmovq %xmm1, %rcx -; KNL_64-NEXT: vpinsrd $2, (%rcx), %xmm0, %xmm0 +; KNL_64-NEXT: vmovq %xmm1, %rdx +; KNL_64-NEXT: vpinsrd $2, (%rdx), %xmm0, %xmm0 ; KNL_64-NEXT: .LBB29_6: # %else5 -; KNL_64-NEXT: vmovd %eax, %xmm1 -; KNL_64-NEXT: vpinsrd $1, %esi, %xmm1, %xmm1 -; KNL_64-NEXT: vpinsrd $2, %edx, %xmm1, %xmm1 +; KNL_64-NEXT: kmovw %k0, %edx +; KNL_64-NEXT: vmovd %edx, %xmm1 +; KNL_64-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1 +; KNL_64-NEXT: vpinsrd $2, %ecx, %xmm1, %xmm1 ; KNL_64-NEXT: vpslld $31, %xmm1, %xmm1 ; KNL_64-NEXT: vblendvps %xmm1, %xmm0, %xmm2, %xmm0 ; KNL_64-NEXT: retq ; ; KNL_32-LABEL: test30: ; KNL_32: # BB#0: -; KNL_32-NEXT: pushl %ebx -; KNL_32-NEXT: .Ltmp0: -; KNL_32-NEXT: .cfi_def_cfa_offset 8 -; KNL_32-NEXT: pushl %esi -; KNL_32-NEXT: .Ltmp1: -; KNL_32-NEXT: .cfi_def_cfa_offset 12 -; KNL_32-NEXT: .Ltmp2: -; KNL_32-NEXT: .cfi_offset %esi, -12 -; KNL_32-NEXT: .Ltmp3: -; KNL_32-NEXT: .cfi_offset %ebx, -8 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax ; KNL_32-NEXT: andl $1, %eax -; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; KNL_32-NEXT: kmovw %eax, %k1 +; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax +; KNL_32-NEXT: andl $1, %eax +; KNL_32-NEXT: kmovw %eax, %k2 +; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax +; KNL_32-NEXT: movl %eax, %ecx ; KNL_32-NEXT: andl $1, %ecx -; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %ebx -; KNL_32-NEXT: movl %ebx, %edx -; KNL_32-NEXT: andl $1, %edx +; KNL_32-NEXT: kmovw %ecx, %k0 ; KNL_32-NEXT: vpslld $2, %xmm1, %xmm1 ; KNL_32-NEXT: vpaddd %xmm1, %xmm0, %xmm1 ; KNL_32-NEXT: # implicit-def: %XMM0 -; KNL_32-NEXT: testb $1, %bl +; KNL_32-NEXT: testb $1, %al ; KNL_32-NEXT: je .LBB29_2 ; KNL_32-NEXT: # BB#1: # %cond.load -; KNL_32-NEXT: vmovd %xmm1, %esi -; KNL_32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; KNL_32-NEXT: vmovd %xmm1, %eax +; KNL_32-NEXT: vmovd (%eax), %xmm0 ; KNL_32-NEXT: 
.LBB29_2: # %else +; KNL_32-NEXT: kmovw %k2, %eax +; KNL_32-NEXT: movl %eax, %ecx +; KNL_32-NEXT: andl $1, %ecx ; KNL_32-NEXT: testb %cl, %cl ; KNL_32-NEXT: je .LBB29_4 ; KNL_32-NEXT: # BB#3: # %cond.load1 -; KNL_32-NEXT: vpextrd $1, %xmm1, %esi -; KNL_32-NEXT: vpinsrd $1, (%esi), %xmm0, %xmm0 +; KNL_32-NEXT: vpextrd $1, %xmm1, %ecx +; KNL_32-NEXT: vpinsrd $1, (%ecx), %xmm0, %xmm0 ; KNL_32-NEXT: .LBB29_4: # %else2 -; KNL_32-NEXT: testb %al, %al +; KNL_32-NEXT: kmovw %k1, %ecx +; KNL_32-NEXT: movl %ecx, %edx +; KNL_32-NEXT: andl $1, %edx +; KNL_32-NEXT: testb %dl, %dl ; KNL_32-NEXT: je .LBB29_6 ; KNL_32-NEXT: # BB#5: # %cond.load4 -; KNL_32-NEXT: vpextrd $2, %xmm1, %esi -; KNL_32-NEXT: vpinsrd $2, (%esi), %xmm0, %xmm0 +; KNL_32-NEXT: vpextrd $2, %xmm1, %edx +; KNL_32-NEXT: vpinsrd $2, (%edx), %xmm0, %xmm0 ; KNL_32-NEXT: .LBB29_6: # %else5 +; KNL_32-NEXT: kmovw %k0, %edx ; KNL_32-NEXT: vmovd %edx, %xmm1 -; KNL_32-NEXT: vpinsrd $1, %ecx, %xmm1, %xmm1 -; KNL_32-NEXT: vpinsrd $2, %eax, %xmm1, %xmm1 +; KNL_32-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1 +; KNL_32-NEXT: vpinsrd $2, %ecx, %xmm1, %xmm1 ; KNL_32-NEXT: vpslld $31, %xmm1, %xmm1 ; KNL_32-NEXT: vblendvps %xmm1, %xmm0, %xmm2, %xmm0 -; KNL_32-NEXT: popl %esi -; KNL_32-NEXT: popl %ebx ; KNL_32-NEXT: retl ; ; SKX-LABEL: test30: @@ -1463,7 +1471,7 @@ define <3 x i32> @test30(<3 x i32*> %base, <3 x i32> %ind, <3 x i1> %mask, <3 x ; SKX-NEXT: je .LBB29_2 ; SKX-NEXT: # BB#1: # %cond.load ; SKX-NEXT: vmovq %xmm1, %rax -; SKX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SKX-NEXT: vmovd (%rax), %xmm0 ; SKX-NEXT: .LBB29_2: # %else ; SKX-NEXT: kmovb %k1, -{{[0-9]+}}(%rsp) ; SKX-NEXT: movb -{{[0-9]+}}(%rsp), %al @@ -1637,12 +1645,12 @@ define <16 x i64> @test_gather_16i64(<16 x i64*> %ptrs, <16 x i1> %mask, <16 x i ; KNL_32-LABEL: test_gather_16i64: ; KNL_32: # BB#0: ; KNL_32-NEXT: pushl %ebp -; KNL_32-NEXT: .Ltmp4: +; KNL_32-NEXT: .Ltmp0: ; KNL_32-NEXT: .cfi_def_cfa_offset 8 -; KNL_32-NEXT: .Ltmp5: +; KNL_32-NEXT: .Ltmp1: ; KNL_32-NEXT: .cfi_offset %ebp, -8 ; KNL_32-NEXT: movl %esp, %ebp -; KNL_32-NEXT: .Ltmp6: +; KNL_32-NEXT: .Ltmp2: ; KNL_32-NEXT: .cfi_def_cfa_register %ebp ; KNL_32-NEXT: andl $-64, %esp ; KNL_32-NEXT: subl $64, %esp @@ -1760,12 +1768,12 @@ define <16 x double> @test_gather_16f64(<16 x double*> %ptrs, <16 x i1> %mask, < ; KNL_32-LABEL: test_gather_16f64: ; KNL_32: # BB#0: ; KNL_32-NEXT: pushl %ebp -; KNL_32-NEXT: .Ltmp7: +; KNL_32-NEXT: .Ltmp3: ; KNL_32-NEXT: .cfi_def_cfa_offset 8 -; KNL_32-NEXT: .Ltmp8: +; KNL_32-NEXT: .Ltmp4: ; KNL_32-NEXT: .cfi_offset %ebp, -8 ; KNL_32-NEXT: movl %esp, %ebp -; KNL_32-NEXT: .Ltmp9: +; KNL_32-NEXT: .Ltmp5: ; KNL_32-NEXT: .cfi_def_cfa_register %ebp ; KNL_32-NEXT: andl $-64, %esp ; KNL_32-NEXT: subl $64, %esp @@ -1877,12 +1885,12 @@ define void @test_scatter_16i64(<16 x i64*> %ptrs, <16 x i1> %mask, <16 x i64> % ; KNL_32-LABEL: test_scatter_16i64: ; KNL_32: # BB#0: ; KNL_32-NEXT: pushl %ebp -; KNL_32-NEXT: .Ltmp10: +; KNL_32-NEXT: .Ltmp6: ; KNL_32-NEXT: .cfi_def_cfa_offset 8 -; KNL_32-NEXT: .Ltmp11: +; KNL_32-NEXT: .Ltmp7: ; KNL_32-NEXT: .cfi_offset %ebp, -8 ; KNL_32-NEXT: movl %esp, %ebp -; KNL_32-NEXT: .Ltmp12: +; KNL_32-NEXT: .Ltmp8: ; KNL_32-NEXT: .cfi_def_cfa_register %ebp ; KNL_32-NEXT: andl $-64, %esp ; KNL_32-NEXT: subl $64, %esp @@ -1991,12 +1999,12 @@ define void @test_scatter_16f64(<16 x double*> %ptrs, <16 x i1> %mask, <16 x dou ; KNL_32-LABEL: test_scatter_16f64: ; KNL_32: # BB#0: ; KNL_32-NEXT: pushl %ebp -; KNL_32-NEXT: .Ltmp13: +; KNL_32-NEXT: .Ltmp9: ; KNL_32-NEXT: 
.cfi_def_cfa_offset 8 -; KNL_32-NEXT: .Ltmp14: +; KNL_32-NEXT: .Ltmp10: ; KNL_32-NEXT: .cfi_offset %ebp, -8 ; KNL_32-NEXT: movl %esp, %ebp -; KNL_32-NEXT: .Ltmp15: +; KNL_32-NEXT: .Ltmp11: ; KNL_32-NEXT: .cfi_def_cfa_register %ebp ; KNL_32-NEXT: andl $-64, %esp ; KNL_32-NEXT: subl $64, %esp diff --git a/llvm/test/CodeGen/X86/pr27591.ll b/llvm/test/CodeGen/X86/pr27591.ll index 11f5de4956a..bbafe5960d9 100644 --- a/llvm/test/CodeGen/X86/pr27591.ll +++ b/llvm/test/CodeGen/X86/pr27591.ll @@ -3,48 +3,39 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" define void @test1(i32 %x) #0 { -; CHECK-LABEL: test1: -; CHECK: # BB#0: # %entry -; CHECK-NEXT: pushq %rax -; CHECK-NEXT: testl %edi, %edi -; CHECK-NEXT: setne %al -; CHECK-NEXT: movb %al, %cl -; CHECK-NEXT: kmovw %ecx, %k0 -; CHECK-NEXT: kmovb %k0, %eax -; CHECK-NEXT: andb $1, %al -; CHECK-NEXT: movzbl %al, %edi -; CHECK-NEXT: callq callee1 -; CHECK-NEXT: popq %rax -; CHECK-NEXT: retq entry: %tobool = icmp ne i32 %x, 0 call void @callee1(i1 zeroext %tobool) ret void } +; CHECK-LABEL: test1: +; CHECK: cmpl $0, %edi +; CHECK-NEXT: setne %al +; CHECK-NEXT: andb $1, %al +; CHECK-NEXT: movzbl %al, %edi +; CHECK-NEXT: callq callee1 + define void @test2(i32 %x) #0 { -; CHECK-LABEL: test2: -; CHECK: # BB#0: # %entry -; CHECK-NEXT: pushq %rax -; CHECK-NEXT: testl %edi, %edi -; CHECK-NEXT: setne %al -; CHECK-NEXT: movb %al, %cl -; CHECK-NEXT: kmovw %ecx, %k0 -; CHECK-NEXT: kmovw %k0, %ecx -; CHECK-NEXT: movb %cl, %al -; CHECK-NEXT: xorl %edi, %edi -; CHECK-NEXT: testb %al, %al -; CHECK-NEXT: movl $-1, %edx -; CHECK-NEXT: cmovnel %edx, %edi -; CHECK-NEXT: callq callee2 -; CHECK-NEXT: popq %rax -; CHECK-NEXT: retq entry: %tobool = icmp ne i32 %x, 0 call void @callee2(i1 signext %tobool) ret void } +; CHECK-LABEL: test2: +; CHECK: cmpl $0, %edi +; CHECK-NEXT: setne %al +; CHECK-NEXT: kmovb %eax, %k0 +; CHECK-NEXT: kmovw %k0, %edi +; CHECK-NEXT: andl $1, %edi +; CHECK-NEXT: movb %dil, %al +; CHECK-NEXT: xorl %edi, %edi +; CHECK-NEXT: testb %al, %al +; CHECK-NEXT: movl $-1, %ecx +; CHECK-NEXT: cmovnel %ecx, %edi +; CHECK-NEXT: callq callee2 + declare void @callee1(i1 zeroext) declare void @callee2(i1 signext) diff --git a/llvm/test/CodeGen/X86/pr28173.ll b/llvm/test/CodeGen/X86/pr28173.ll index 7c20d0857d9..81c10bb3757 100644 --- a/llvm/test/CodeGen/X86/pr28173.ll +++ b/llvm/test/CodeGen/X86/pr28173.ll @@ -5,12 +5,12 @@ target triple = "x86_64-unknown-linux-gnu" ; Note that the kmovs should really *not* appear in the output, this is an ; artifact of the current poor lowering. This is tracked by PR28175. 
+; CHECK-LABEL: @foo64 +; CHECK: kmov +; CHECK: kmov +; CHECK: orq $-2, %rax +; CHECK: ret define i64 @foo64(i1 zeroext %i, i32 %j) #0 { -; CHECK-LABEL: foo64: -; CHECK: # BB#0: -; CHECK-NEXT: orq $-2, %rdi -; CHECK-NEXT: movq %rdi, %rax -; CHECK-NEXT: retq br label %bb bb: @@ -22,12 +22,12 @@ end: ret i64 %v } +; CHECK-LABEL: @foo16 +; CHECK: kmov +; CHECK: kmov +; CHECK: orl $65534, %eax +; CHECK: retq define i16 @foo16(i1 zeroext %i, i32 %j) #0 { -; CHECK-LABEL: foo16: -; CHECK: # BB#0: -; CHECK-NEXT: orl $65534, %edi # imm = 0xFFFE -; CHECK-NEXT: movl %edi, %eax -; CHECK-NEXT: retq br label %bb bb: diff --git a/llvm/test/CodeGen/X86/xaluo.ll b/llvm/test/CodeGen/X86/xaluo.ll index 31e18989144..76e00a0993d 100644 --- a/llvm/test/CodeGen/X86/xaluo.ll +++ b/llvm/test/CodeGen/X86/xaluo.ll @@ -738,10 +738,10 @@ define i1 @bug27873(i64 %c1, i1 %c2) { ; KNL-LABEL: bug27873: ; KNL: ## BB#0: ; KNL-NEXT: andl $1, %esi -; KNL-NEXT: kmovw %esi, %k0 ; KNL-NEXT: movl $160, %ecx ; KNL-NEXT: movq %rdi, %rax ; KNL-NEXT: mulq %rcx +; KNL-NEXT: kmovw %esi, %k0 ; KNL-NEXT: seto %al ; KNL-NEXT: kmovw %eax, %k1 ; KNL-NEXT: korw %k1, %k0, %k0 |