-rw-r--r--  llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp  |  26
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp     |  36
-rw-r--r--  llvm/lib/Target/X86/X86InstrAVX512.td             |  29
-rw-r--r--  llvm/lib/Target/X86/X86InstrFoldTables.cpp        |   6
-rw-r--r--  llvm/lib/Target/X86/X86InstrSSE.td                |  57
-rw-r--r--  llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll     |   4
-rw-r--r--  llvm/test/CodeGen/X86/bitcast-setcc-256.ll        |  16
-rw-r--r--  llvm/test/CodeGen/X86/bitcast-setcc-512.ll        |   8
-rw-r--r--  llvm/test/CodeGen/X86/bitcast-vector-bool.ll      |  97
-rw-r--r--  llvm/test/CodeGen/X86/dagcombine-cse.ll           |  13
-rwxr-xr-x  llvm/test/CodeGen/X86/evex-to-vex-compress.mir    |  16
-rw-r--r--  llvm/test/CodeGen/X86/fast-isel-fneg.ll           |  13
-rw-r--r--  llvm/test/CodeGen/X86/masked_store.ll             |  82
-rw-r--r--  llvm/test/CodeGen/X86/movmsk-cmp.ll               | 112
-rw-r--r--  llvm/test/CodeGen/X86/peephole.mir                |  40
-rw-r--r--  llvm/test/CodeGen/X86/pr41619.ll                  |  27
16 files changed, 374 insertions(+), 208 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index a961685ff71..093778add49 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -1571,36 +1571,12 @@ bool TargetLowering::SimplifyDemandedBits(
if (SimplifyDemandedBits(Src, DemandedSrcBits, DemandedSrcElts,
KnownSrcBits, TLO, Depth + 1))
return true;
- } else if ((NumSrcEltBits % BitWidth) == 0 &&
- TLO.DAG.getDataLayout().isLittleEndian()) {
- unsigned Scale = NumSrcEltBits / BitWidth;
- unsigned NumSrcElts = SrcVT.isVector() ? SrcVT.getVectorNumElements() : 1;
- APInt DemandedSrcBits = APInt::getNullValue(NumSrcEltBits);
- APInt DemandedSrcElts = APInt::getNullValue(NumSrcElts);
- for (unsigned i = 0; i != NumElts; ++i)
- if (DemandedElts[i]) {
- unsigned Offset = (i % Scale) * BitWidth;
- DemandedSrcBits.insertBits(DemandedBits, Offset);
- DemandedSrcElts.setBit(i / Scale);
- }
-
- if (SrcVT.isVector()) {
- APInt KnownSrcUndef, KnownSrcZero;
- if (SimplifyDemandedVectorElts(Src, DemandedSrcElts, KnownSrcUndef,
- KnownSrcZero, TLO, Depth + 1))
- return true;
- }
-
- KnownBits KnownSrcBits;
- if (SimplifyDemandedBits(Src, DemandedSrcBits, DemandedSrcElts,
- KnownSrcBits, TLO, Depth + 1))
- return true;
}
// If this is a bitcast, let computeKnownBits handle it. Only do this on a
// recursive call where Known may be useful to the caller.
if (Depth > 0) {
- Known = TLO.DAG.computeKnownBits(Op, DemandedElts, Depth);
+ Known = TLO.DAG.computeKnownBits(Op, Depth);
return false;
}
break;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 409fbfa22f3..994e912ac9c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -3202,44 +3202,30 @@ SDValue AMDGPUTargetLowering::performSraCombine(SDNode *N,
SDValue AMDGPUTargetLowering::performSrlCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
- auto *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
- if (!RHS)
+ if (N->getValueType(0) != MVT::i64)
return SDValue();
- EVT VT = N->getValueType(0);
- SDValue LHS = N->getOperand(0);
- unsigned ShiftAmt = RHS->getZExtValue();
- SelectionDAG &DAG = DCI.DAG;
- SDLoc SL(N);
-
- // fold (srl (and x, c1 << c2), c2) -> (and (srl(x, c2), c1)
- // this improves the ability to match BFE patterns in isel.
- if (LHS.getOpcode() == ISD::AND) {
- if (auto *Mask = dyn_cast<ConstantSDNode>(LHS.getOperand(1))) {
- if (Mask->getAPIntValue().isShiftedMask() &&
- Mask->getAPIntValue().countTrailingZeros() == ShiftAmt) {
- return DAG.getNode(
- ISD::AND, SL, VT,
- DAG.getNode(ISD::SRL, SL, VT, LHS.getOperand(0), N->getOperand(1)),
- DAG.getNode(ISD::SRL, SL, VT, LHS.getOperand(1), N->getOperand(1)));
- }
- }
- }
-
- if (VT != MVT::i64)
+ const ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
+ if (!RHS)
return SDValue();
+ unsigned ShiftAmt = RHS->getZExtValue();
if (ShiftAmt < 32)
return SDValue();
// srl i64:x, C for C >= 32
// =>
// build_pair (srl hi_32(x), C - 32), 0
+
+ SelectionDAG &DAG = DCI.DAG;
+ SDLoc SL(N);
+
SDValue One = DAG.getConstant(1, SL, MVT::i32);
SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
- SDValue VecOp = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, LHS);
- SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, VecOp, One);
+ SDValue VecOp = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, N->getOperand(0));
+ SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32,
+ VecOp, One);
SDValue NewConst = DAG.getConstant(ShiftAmt - 32, SL, MVT::i32);
SDValue NewShift = DAG.getNode(ISD::SRL, SL, MVT::i32, Hi, NewConst);
diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td
index d0d255b6a7f..0165949a7e3 100644
--- a/llvm/lib/Target/X86/X86InstrAVX512.td
+++ b/llvm/lib/Target/X86/X86InstrAVX512.td
@@ -3832,6 +3832,14 @@ def VMOV64toPQIZrm : AVX512BI<0x6E, MRMSrcMem, (outs VR128X:$dst),
"vmovq\t{$src, $dst|$dst, $src}", []>,
EVEX, VEX_W, EVEX_CD8<64, CD8VT1>, Sched<[WriteVecLoad]>;
let isCodeGenOnly = 1 in {
+def VMOV64toSDZrr : AVX512BI<0x6E, MRMSrcReg, (outs FR64X:$dst), (ins GR64:$src),
+ "vmovq\t{$src, $dst|$dst, $src}",
+ [(set FR64X:$dst, (bitconvert GR64:$src))]>,
+ EVEX, VEX_W, Sched<[WriteVecMoveFromGpr]>;
+def VMOV64toSDZrm : AVX512XSI<0x7E, MRMSrcMem, (outs FR64X:$dst), (ins i64mem:$src),
+ "vmovq\t{$src, $dst|$dst, $src}",
+ [(set FR64X:$dst, (bitconvert (loadi64 addr:$src)))]>,
+ EVEX, VEX_W, EVEX_CD8<8, CD8VT8>, Sched<[WriteVecLoad]>;
def VMOVSDto64Zrr : AVX512BI<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64X:$src),
"vmovq\t{$src, $dst|$dst, $src}",
[(set GR64:$dst, (bitconvert FR64X:$src))]>,
@@ -3844,6 +3852,20 @@ def VMOVSDto64Zmr : AVX512BI<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, FR64X:$
}
} // ExeDomain = SSEPackedInt
+// Move Int Doubleword to Single Scalar
+//
+let ExeDomain = SSEPackedInt, isCodeGenOnly = 1 in {
+def VMOVDI2SSZrr : AVX512BI<0x6E, MRMSrcReg, (outs FR32X:$dst), (ins GR32:$src),
+ "vmovd\t{$src, $dst|$dst, $src}",
+ [(set FR32X:$dst, (bitconvert GR32:$src))]>,
+ EVEX, Sched<[WriteVecMoveFromGpr]>;
+
+def VMOVDI2SSZrm : AVX512BI<0x6E, MRMSrcMem, (outs FR32X:$dst), (ins i32mem:$src),
+ "vmovd\t{$src, $dst|$dst, $src}",
+ [(set FR32X:$dst, (bitconvert (loadi32 addr:$src)))]>,
+ EVEX, EVEX_CD8<32, CD8VT1>, Sched<[WriteVecLoad]>;
+} // ExeDomain = SSEPackedInt, isCodeGenOnly = 1
+
// Move doubleword from xmm register to r/m32
//
let ExeDomain = SSEPackedInt in {
@@ -3860,13 +3882,6 @@ def VMOVPDI2DIZmr : AVX512BI<0x7E, MRMDestMem, (outs),
EVEX, EVEX_CD8<32, CD8VT1>, Sched<[WriteVecStore]>;
} // ExeDomain = SSEPackedInt
-let Predicates = [HasAVX512] in {
- def : Pat<(f64 (bitconvert GR64:$src)),
- (COPY_TO_REGCLASS (VMOV64toPQIZrr GR64:$src), FR64X)>;
- def : Pat<(f32 (bitconvert GR32:$src)),
- (COPY_TO_REGCLASS (VMOVDI2PDIZrr GR32:$src), FR32X)>;
-}
-
// Move quadword from xmm1 register to r/m64
//
let ExeDomain = SSEPackedInt in {
diff --git a/llvm/lib/Target/X86/X86InstrFoldTables.cpp b/llvm/lib/Target/X86/X86InstrFoldTables.cpp
index 82adcd8e147..59e62da55f2 100644
--- a/llvm/lib/Target/X86/X86InstrFoldTables.cpp
+++ b/llvm/lib/Target/X86/X86InstrFoldTables.cpp
@@ -531,11 +531,13 @@ static const X86MemoryFoldTableEntry MemoryFoldTable1[] = {
{ X86::MOV32rr, X86::MOV32rm, 0 },
{ X86::MOV64rr, X86::MOV64rm, 0 },
{ X86::MOV64toPQIrr, X86::MOVQI2PQIrm, 0 },
+ { X86::MOV64toSDrr, X86::MOV64toSDrm, 0 },
{ X86::MOV8rr, X86::MOV8rm, 0 },
{ X86::MOVAPDrr, X86::MOVAPDrm, TB_ALIGN_16 },
{ X86::MOVAPSrr, X86::MOVAPSrm, TB_ALIGN_16 },
{ X86::MOVDDUPrr, X86::MOVDDUPrm, TB_NO_REVERSE },
{ X86::MOVDI2PDIrr, X86::MOVDI2PDIrm, 0 },
+ { X86::MOVDI2SSrr, X86::MOVDI2SSrm, 0 },
{ X86::MOVDQArr, X86::MOVDQArm, TB_ALIGN_16 },
{ X86::MOVDQUrr, X86::MOVDQUrm, 0 },
{ X86::MOVSHDUPrr, X86::MOVSHDUPrm, TB_ALIGN_16 },
@@ -816,6 +818,8 @@ static const X86MemoryFoldTableEntry MemoryFoldTable1[] = {
{ X86::VGETMANTPSZrri, X86::VGETMANTPSZrmi, 0 },
{ X86::VMOV64toPQIZrr, X86::VMOVQI2PQIZrm, 0 },
{ X86::VMOV64toPQIrr, X86::VMOVQI2PQIrm, 0 },
+ { X86::VMOV64toSDZrr, X86::VMOV64toSDZrm, 0 },
+ { X86::VMOV64toSDrr, X86::VMOV64toSDrm, 0 },
{ X86::VMOVAPDYrr, X86::VMOVAPDYrm, TB_ALIGN_32 },
{ X86::VMOVAPDZ128rr, X86::VMOVAPDZ128rm, TB_ALIGN_16 },
{ X86::VMOVAPDZ256rr, X86::VMOVAPDZ256rm, TB_ALIGN_32 },
@@ -833,6 +837,8 @@ static const X86MemoryFoldTableEntry MemoryFoldTable1[] = {
{ X86::VMOVDDUPrr, X86::VMOVDDUPrm, TB_NO_REVERSE },
{ X86::VMOVDI2PDIZrr, X86::VMOVDI2PDIZrm, 0 },
{ X86::VMOVDI2PDIrr, X86::VMOVDI2PDIrm, 0 },
+ { X86::VMOVDI2SSZrr, X86::VMOVDI2SSZrm, 0 },
+ { X86::VMOVDI2SSrr, X86::VMOVDI2SSrm, 0 },
{ X86::VMOVDQA32Z128rr, X86::VMOVDQA32Z128rm, TB_ALIGN_16 },
{ X86::VMOVDQA32Z256rr, X86::VMOVDQA32Z256rm, TB_ALIGN_32 },
{ X86::VMOVDQA32Zrr, X86::VMOVDQA32Zrm, TB_ALIGN_64 },
diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td
index feee34b7644..6a2e5cf1aa2 100644
--- a/llvm/lib/Target/X86/X86InstrSSE.td
+++ b/llvm/lib/Target/X86/X86InstrSSE.td
@@ -4109,6 +4109,11 @@ let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayLoad = 1 in
def VMOV64toPQIrm : VRS2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
"movq\t{$src, $dst|$dst, $src}", []>,
VEX, Sched<[WriteVecLoad]>;
+let isCodeGenOnly = 1 in
+def VMOV64toSDrr : VRS2I<0x6E, MRMSrcReg, (outs FR64:$dst), (ins GR64:$src),
+ "movq\t{$src, $dst|$dst, $src}",
+ [(set FR64:$dst, (bitconvert GR64:$src))]>,
+ VEX, Sched<[WriteVecMoveFromGpr]>;
def MOVDI2PDIrr : S2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src),
"movd\t{$src, $dst|$dst, $src}",
@@ -4129,9 +4134,38 @@ let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayLoad = 1 in
def MOV64toPQIrm : RS2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
"movq\t{$src, $dst|$dst, $src}", []>,
Sched<[WriteVecLoad]>;
+let isCodeGenOnly = 1 in
+def MOV64toSDrr : RS2I<0x6E, MRMSrcReg, (outs FR64:$dst), (ins GR64:$src),
+ "movq\t{$src, $dst|$dst, $src}",
+ [(set FR64:$dst, (bitconvert GR64:$src))]>,
+ Sched<[WriteVecMoveFromGpr]>;
} // ExeDomain = SSEPackedInt
//===---------------------------------------------------------------------===//
+// Move Int Doubleword to Single Scalar
+//
+let ExeDomain = SSEPackedInt, isCodeGenOnly = 1 in {
+ def VMOVDI2SSrr : VS2I<0x6E, MRMSrcReg, (outs FR32:$dst), (ins GR32:$src),
+ "movd\t{$src, $dst|$dst, $src}",
+ [(set FR32:$dst, (bitconvert GR32:$src))]>,
+ VEX, Sched<[WriteVecMoveFromGpr]>;
+
+ def VMOVDI2SSrm : VS2I<0x6E, MRMSrcMem, (outs FR32:$dst), (ins i32mem:$src),
+ "movd\t{$src, $dst|$dst, $src}",
+ [(set FR32:$dst, (bitconvert (loadi32 addr:$src)))]>,
+ VEX, Sched<[WriteVecLoad]>;
+ def MOVDI2SSrr : S2I<0x6E, MRMSrcReg, (outs FR32:$dst), (ins GR32:$src),
+ "movd\t{$src, $dst|$dst, $src}",
+ [(set FR32:$dst, (bitconvert GR32:$src))]>,
+ Sched<[WriteVecMoveFromGpr]>;
+
+ def MOVDI2SSrm : S2I<0x6E, MRMSrcMem, (outs FR32:$dst), (ins i32mem:$src),
+ "movd\t{$src, $dst|$dst, $src}",
+ [(set FR32:$dst, (bitconvert (loadi32 addr:$src)))]>,
+ Sched<[WriteVecLoad]>;
+} // ExeDomain = SSEPackedInt, isCodeGenOnly = 1
+
+//===---------------------------------------------------------------------===//
// Move Packed Doubleword Int to Packed Double Int
//
let ExeDomain = SSEPackedInt in {
@@ -4158,21 +4192,6 @@ def MOVPDI2DImr : S2I<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, VR128:$src),
Sched<[WriteVecStore]>;
} // ExeDomain = SSEPackedInt
-let Predicates = [UseAVX] in {
- def : Pat<(f64 (bitconvert GR64:$src)),
- (COPY_TO_REGCLASS (VMOV64toPQIrr GR64:$src), FR64)>;
- def : Pat<(f32 (bitconvert GR32:$src)),
- (COPY_TO_REGCLASS (VMOVDI2PDIrr GR32:$src), FR32)>;
-}
-
-let Predicates = [UseSSE2] in
-def : Pat<(f64 (bitconvert GR64:$src)),
- (COPY_TO_REGCLASS (MOV64toPQIrr GR64:$src), FR64)>;
-
-let Predicates = [UseSSE1] in
-def : Pat<(f32 (bitconvert GR32:$src)),
- (COPY_TO_REGCLASS (MOVDI2PDIrr GR32:$src), FR32)>;
-
//===---------------------------------------------------------------------===//
// Move Packed Doubleword Int first element to Doubleword Int
//
@@ -4206,6 +4225,10 @@ def MOVPQIto64mr : RS2I<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
//
let ExeDomain = SSEPackedInt, isCodeGenOnly = 1 in {
let Predicates = [UseAVX] in
+ def VMOV64toSDrm : VS2SI<0x7E, MRMSrcMem, (outs FR64:$dst), (ins i64mem:$src),
+ "movq\t{$src, $dst|$dst, $src}",
+ [(set FR64:$dst, (bitconvert (loadi64 addr:$src)))]>,
+ VEX, Sched<[WriteVecLoad]>;
def VMOVSDto64rr : VRS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64:$src),
"movq\t{$src, $dst|$dst, $src}",
[(set GR64:$dst, (bitconvert FR64:$src))]>,
@@ -4215,6 +4238,10 @@ let ExeDomain = SSEPackedInt, isCodeGenOnly = 1 in {
[(store (i64 (bitconvert FR64:$src)), addr:$dst)]>,
VEX, Sched<[WriteVecStore]>;
+ def MOV64toSDrm : S2SI<0x7E, MRMSrcMem, (outs FR64:$dst), (ins i64mem:$src),
+ "movq\t{$src, $dst|$dst, $src}",
+ [(set FR64:$dst, (bitconvert (loadi64 addr:$src)))]>,
+ Sched<[WriteVecLoad]>;
def MOVSDto64rr : RS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64:$src),
"movq\t{$src, $dst|$dst, $src}",
[(set GR64:$dst, (bitconvert FR64:$src))]>,
diff --git a/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll b/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll
index f5857a330fa..13380e03e32 100644
--- a/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll
+++ b/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll
@@ -86,8 +86,8 @@ define amdgpu_kernel void @local_store_i55(i55 addrspace(3)* %ptr, i55 %arg) #0
; GFX9-NEXT: v_mov_b32_e32 v2, s2
; GFX9-NEXT: ds_write_b16 v1, v2 offset:4
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_bfe_u32 v0, v0, 16, 7
-; GFX9-NEXT: ds_write_b8 v1, v0 offset:6
+; GFX9-NEXT: v_and_b32_e32 v0, 0x7f0000, v0
+; GFX9-NEXT: ds_write_b8_d16_hi v1, v0 offset:6
; GFX9-NEXT: ds_write_b32 v1, v3
; GFX9-NEXT: s_endpgm
store i55 %arg, i55 addrspace(3)* %ptr, align 8
diff --git a/llvm/test/CodeGen/X86/bitcast-setcc-256.ll b/llvm/test/CodeGen/X86/bitcast-setcc-256.ll
index f9a233a583b..41635f37528 100644
--- a/llvm/test/CodeGen/X86/bitcast-setcc-256.ll
+++ b/llvm/test/CodeGen/X86/bitcast-setcc-256.ll
@@ -448,6 +448,22 @@ define void @bitcast_8i32_store(i8* %p, <8 x i32> %a0) {
define void @bitcast_4i64_store(i4* %p, <4 x i64> %a0) {
; SSE2-SSSE3-LABEL: bitcast_4i64_store:
; SSE2-SSSE3: # %bb.0:
+; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
+; SSE2-SSSE3-NEXT: pxor %xmm2, %xmm1
+; SSE2-SSSE3-NEXT: movdqa %xmm2, %xmm3
+; SSE2-SSSE3-NEXT: pcmpeqd %xmm1, %xmm3
+; SSE2-SSSE3-NEXT: movdqa %xmm2, %xmm4
+; SSE2-SSSE3-NEXT: pcmpgtd %xmm1, %xmm4
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,0,2,2]
+; SSE2-SSSE3-NEXT: pand %xmm3, %xmm1
+; SSE2-SSSE3-NEXT: por %xmm4, %xmm1
+; SSE2-SSSE3-NEXT: pxor %xmm2, %xmm0
+; SSE2-SSSE3-NEXT: movdqa %xmm2, %xmm3
+; SSE2-SSSE3-NEXT: pcmpeqd %xmm0, %xmm3
+; SSE2-SSSE3-NEXT: pcmpgtd %xmm0, %xmm2
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2]
+; SSE2-SSSE3-NEXT: pand %xmm3, %xmm0
+; SSE2-SSSE3-NEXT: por %xmm2, %xmm0
; SSE2-SSSE3-NEXT: packssdw %xmm1, %xmm0
; SSE2-SSSE3-NEXT: movmskps %xmm0, %eax
; SSE2-SSSE3-NEXT: movb %al, (%rdi)
diff --git a/llvm/test/CodeGen/X86/bitcast-setcc-512.ll b/llvm/test/CodeGen/X86/bitcast-setcc-512.ll
index 177be1fd6a6..3c294345dd5 100644
--- a/llvm/test/CodeGen/X86/bitcast-setcc-512.ll
+++ b/llvm/test/CodeGen/X86/bitcast-setcc-512.ll
@@ -609,13 +609,15 @@ define void @bitcast_8i64_store(i8* %p, <8 x i64> %a0) {
;
; AVX1-LABEL: bitcast_8i64_store:
; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpcmpgtq %xmm1, %xmm3, %xmm1
+; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: vmovmskps %ymm0, %eax
; AVX1-NEXT: movb %al, (%rdi)
diff --git a/llvm/test/CodeGen/X86/bitcast-vector-bool.ll b/llvm/test/CodeGen/X86/bitcast-vector-bool.ll
index adcee2abe33..ed487ef8266 100644
--- a/llvm/test/CodeGen/X86/bitcast-vector-bool.ll
+++ b/llvm/test/CodeGen/X86/bitcast-vector-bool.ll
@@ -63,12 +63,12 @@ define i2 @bitcast_v4i32_to_v2i2(<4 x i32> %a0) nounwind {
; AVX12: # %bb.0:
; AVX12-NEXT: vmovmskps %xmm0, %eax
; AVX12-NEXT: movl %eax, %ecx
-; AVX12-NEXT: shrl $2, %ecx
-; AVX12-NEXT: vmovd %ecx, %xmm0
-; AVX12-NEXT: andl $3, %eax
-; AVX12-NEXT: vmovd %eax, %xmm1
-; AVX12-NEXT: vpextrb $0, %xmm1, %ecx
-; AVX12-NEXT: vpextrb $0, %xmm0, %eax
+; AVX12-NEXT: andl $3, %ecx
+; AVX12-NEXT: vmovq %rcx, %xmm0
+; AVX12-NEXT: shrl $2, %eax
+; AVX12-NEXT: vmovq %rax, %xmm1
+; AVX12-NEXT: vpextrb $0, %xmm0, %ecx
+; AVX12-NEXT: vpextrb $0, %xmm1, %eax
; AVX12-NEXT: addb %cl, %al
; AVX12-NEXT: # kill: def $al killed $al killed $eax
; AVX12-NEXT: retq
@@ -81,9 +81,10 @@ define i2 @bitcast_v4i32_to_v2i2(<4 x i32> %a0) nounwind {
; AVX512-NEXT: movzbl %al, %ecx
; AVX512-NEXT: shrl $2, %ecx
; AVX512-NEXT: andl $3, %ecx
-; AVX512-NEXT: vmovd %ecx, %xmm0
+; AVX512-NEXT: vmovq %rcx, %xmm0
+; AVX512-NEXT: movzwl %ax, %eax
; AVX512-NEXT: andl $3, %eax
-; AVX512-NEXT: vmovd %eax, %xmm1
+; AVX512-NEXT: vmovq %rax, %xmm1
; AVX512-NEXT: vpextrb $0, %xmm1, %ecx
; AVX512-NEXT: vpextrb $0, %xmm0, %eax
; AVX512-NEXT: addb %cl, %al
@@ -119,9 +120,9 @@ define i4 @bitcast_v8i16_to_v2i4(<8 x i16> %a0) nounwind {
; AVX12-NEXT: vpmovmskb %xmm0, %eax
; AVX12-NEXT: movzbl %al, %ecx
; AVX12-NEXT: shrl $4, %ecx
-; AVX12-NEXT: vmovd %ecx, %xmm0
+; AVX12-NEXT: vmovq %rcx, %xmm0
; AVX12-NEXT: andl $15, %eax
-; AVX12-NEXT: vmovd %eax, %xmm1
+; AVX12-NEXT: vmovq %rax, %xmm1
; AVX12-NEXT: vpextrb $0, %xmm1, %ecx
; AVX12-NEXT: vpextrb $0, %xmm0, %eax
; AVX12-NEXT: addb %cl, %al
@@ -134,9 +135,10 @@ define i4 @bitcast_v8i16_to_v2i4(<8 x i16> %a0) nounwind {
; AVX512-NEXT: kmovd %k0, %eax
; AVX512-NEXT: movzbl %al, %ecx
; AVX512-NEXT: shrl $4, %ecx
-; AVX512-NEXT: vmovd %ecx, %xmm0
+; AVX512-NEXT: vmovq %rcx, %xmm0
+; AVX512-NEXT: movzwl %ax, %eax
; AVX512-NEXT: andl $15, %eax
-; AVX512-NEXT: vmovd %eax, %xmm1
+; AVX512-NEXT: vmovq %rax, %xmm1
; AVX512-NEXT: vpextrb $0, %xmm1, %ecx
; AVX512-NEXT: vpextrb $0, %xmm0, %eax
; AVX512-NEXT: addb %cl, %al
@@ -208,6 +210,22 @@ define i8 @bitcast_v16i8_to_v2i8(<16 x i8> %a0) nounwind {
define i2 @bitcast_v4i64_to_v2i2(<4 x i64> %a0) nounwind {
; SSE2-SSSE3-LABEL: bitcast_v4i64_to_v2i2:
; SSE2-SSSE3: # %bb.0:
+; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
+; SSE2-SSSE3-NEXT: pxor %xmm2, %xmm1
+; SSE2-SSSE3-NEXT: movdqa %xmm2, %xmm3
+; SSE2-SSSE3-NEXT: pcmpeqd %xmm1, %xmm3
+; SSE2-SSSE3-NEXT: movdqa %xmm2, %xmm4
+; SSE2-SSSE3-NEXT: pcmpgtd %xmm1, %xmm4
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,0,2,2]
+; SSE2-SSSE3-NEXT: pand %xmm3, %xmm1
+; SSE2-SSSE3-NEXT: por %xmm4, %xmm1
+; SSE2-SSSE3-NEXT: pxor %xmm2, %xmm0
+; SSE2-SSSE3-NEXT: movdqa %xmm2, %xmm3
+; SSE2-SSSE3-NEXT: pcmpeqd %xmm0, %xmm3
+; SSE2-SSSE3-NEXT: pcmpgtd %xmm0, %xmm2
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2]
+; SSE2-SSSE3-NEXT: pand %xmm3, %xmm0
+; SSE2-SSSE3-NEXT: por %xmm2, %xmm0
; SSE2-SSSE3-NEXT: packssdw %xmm1, %xmm0
; SSE2-SSSE3-NEXT: movmskps %xmm0, %eax
; SSE2-SSSE3-NEXT: movl %eax, %ecx
@@ -225,12 +243,12 @@ define i2 @bitcast_v4i64_to_v2i2(<4 x i64> %a0) nounwind {
; AVX12: # %bb.0:
; AVX12-NEXT: vmovmskpd %ymm0, %eax
; AVX12-NEXT: movl %eax, %ecx
-; AVX12-NEXT: shrl $2, %ecx
-; AVX12-NEXT: vmovd %ecx, %xmm0
-; AVX12-NEXT: andl $3, %eax
-; AVX12-NEXT: vmovd %eax, %xmm1
-; AVX12-NEXT: vpextrb $0, %xmm1, %ecx
-; AVX12-NEXT: vpextrb $0, %xmm0, %eax
+; AVX12-NEXT: andl $3, %ecx
+; AVX12-NEXT: vmovq %rcx, %xmm0
+; AVX12-NEXT: shrl $2, %eax
+; AVX12-NEXT: vmovq %rax, %xmm1
+; AVX12-NEXT: vpextrb $0, %xmm0, %ecx
+; AVX12-NEXT: vpextrb $0, %xmm1, %eax
; AVX12-NEXT: addb %cl, %al
; AVX12-NEXT: # kill: def $al killed $al killed $eax
; AVX12-NEXT: vzeroupper
@@ -244,9 +262,10 @@ define i2 @bitcast_v4i64_to_v2i2(<4 x i64> %a0) nounwind {
; AVX512-NEXT: movzbl %al, %ecx
; AVX512-NEXT: shrl $2, %ecx
; AVX512-NEXT: andl $3, %ecx
-; AVX512-NEXT: vmovd %ecx, %xmm0
+; AVX512-NEXT: vmovq %rcx, %xmm0
+; AVX512-NEXT: movzwl %ax, %eax
; AVX512-NEXT: andl $3, %eax
-; AVX512-NEXT: vmovd %eax, %xmm1
+; AVX512-NEXT: vmovq %rax, %xmm1
; AVX512-NEXT: vpextrb $0, %xmm1, %ecx
; AVX512-NEXT: vpextrb $0, %xmm0, %eax
; AVX512-NEXT: addb %cl, %al
@@ -282,12 +301,12 @@ define i4 @bitcast_v8i32_to_v2i4(<8 x i32> %a0) nounwind {
; AVX12: # %bb.0:
; AVX12-NEXT: vmovmskps %ymm0, %eax
; AVX12-NEXT: movl %eax, %ecx
-; AVX12-NEXT: shrl $4, %ecx
-; AVX12-NEXT: vmovd %ecx, %xmm0
-; AVX12-NEXT: andl $15, %eax
-; AVX12-NEXT: vmovd %eax, %xmm1
-; AVX12-NEXT: vpextrb $0, %xmm1, %ecx
-; AVX12-NEXT: vpextrb $0, %xmm0, %eax
+; AVX12-NEXT: andl $15, %ecx
+; AVX12-NEXT: vmovq %rcx, %xmm0
+; AVX12-NEXT: shrl $4, %eax
+; AVX12-NEXT: vmovq %rax, %xmm1
+; AVX12-NEXT: vpextrb $0, %xmm0, %ecx
+; AVX12-NEXT: vpextrb $0, %xmm1, %eax
; AVX12-NEXT: addb %cl, %al
; AVX12-NEXT: # kill: def $al killed $al killed $eax
; AVX12-NEXT: vzeroupper
@@ -300,9 +319,10 @@ define i4 @bitcast_v8i32_to_v2i4(<8 x i32> %a0) nounwind {
; AVX512-NEXT: kmovd %k0, %eax
; AVX512-NEXT: movzbl %al, %ecx
; AVX512-NEXT: shrl $4, %ecx
-; AVX512-NEXT: vmovd %ecx, %xmm0
+; AVX512-NEXT: vmovq %rcx, %xmm0
+; AVX512-NEXT: movzwl %ax, %eax
; AVX512-NEXT: andl $15, %eax
-; AVX512-NEXT: vmovd %eax, %xmm1
+; AVX512-NEXT: vmovq %rax, %xmm1
; AVX512-NEXT: vpextrb $0, %xmm1, %ecx
; AVX512-NEXT: vpextrb $0, %xmm0, %eax
; AVX512-NEXT: addb %cl, %al
@@ -516,20 +536,22 @@ define i4 @bitcast_v8i64_to_v2i4(<8 x i64> %a0) nounwind {
;
; AVX1-LABEL: bitcast_v8i64_to_v2i4:
; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpcmpgtq %xmm1, %xmm3, %xmm1
+; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: vmovmskps %ymm0, %eax
; AVX1-NEXT: movl %eax, %ecx
; AVX1-NEXT: shrl $4, %ecx
-; AVX1-NEXT: vmovd %ecx, %xmm0
+; AVX1-NEXT: vmovq %rcx, %xmm0
; AVX1-NEXT: andl $15, %eax
-; AVX1-NEXT: vmovd %eax, %xmm1
+; AVX1-NEXT: vmovq %rax, %xmm1
; AVX1-NEXT: vpextrb $0, %xmm1, %ecx
; AVX1-NEXT: vpextrb $0, %xmm0, %eax
; AVX1-NEXT: addb %cl, %al
@@ -547,9 +569,9 @@ define i4 @bitcast_v8i64_to_v2i4(<8 x i64> %a0) nounwind {
; AVX2-NEXT: vmovmskps %ymm0, %eax
; AVX2-NEXT: movl %eax, %ecx
; AVX2-NEXT: shrl $4, %ecx
-; AVX2-NEXT: vmovd %ecx, %xmm0
+; AVX2-NEXT: vmovq %rcx, %xmm0
; AVX2-NEXT: andl $15, %eax
-; AVX2-NEXT: vmovd %eax, %xmm1
+; AVX2-NEXT: vmovq %rax, %xmm1
; AVX2-NEXT: vpextrb $0, %xmm1, %ecx
; AVX2-NEXT: vpextrb $0, %xmm0, %eax
; AVX2-NEXT: addb %cl, %al
@@ -564,9 +586,10 @@ define i4 @bitcast_v8i64_to_v2i4(<8 x i64> %a0) nounwind {
; AVX512-NEXT: kmovd %k0, %eax
; AVX512-NEXT: movzbl %al, %ecx
; AVX512-NEXT: shrl $4, %ecx
-; AVX512-NEXT: vmovd %ecx, %xmm0
+; AVX512-NEXT: vmovq %rcx, %xmm0
+; AVX512-NEXT: movzwl %ax, %eax
; AVX512-NEXT: andl $15, %eax
-; AVX512-NEXT: vmovd %eax, %xmm1
+; AVX512-NEXT: vmovq %rax, %xmm1
; AVX512-NEXT: vpextrb $0, %xmm1, %ecx
; AVX512-NEXT: vpextrb $0, %xmm0, %eax
; AVX512-NEXT: addb %cl, %al
diff --git a/llvm/test/CodeGen/X86/dagcombine-cse.ll b/llvm/test/CodeGen/X86/dagcombine-cse.ll
index a532d87170d..bf1dab35875 100644
--- a/llvm/test/CodeGen/X86/dagcombine-cse.ll
+++ b/llvm/test/CodeGen/X86/dagcombine-cse.ll
@@ -14,11 +14,18 @@ define i32 @t(i8* %ref_frame_ptr, i32 %ref_frame_stride, i32 %idxX, i32 %idxY) n
;
; X64-LABEL: t:
; X64: ## %bb.0: ## %entry
+; X64-NEXT: ## kill: def $edx killed $edx def $rdx
+; X64-NEXT: ## kill: def $esi killed $esi def $rsi
; X64-NEXT: imull %ecx, %esi
-; X64-NEXT: addl %edx, %esi
-; X64-NEXT: movslq %esi, %rax
+; X64-NEXT: leal (%rsi,%rdx), %eax
+; X64-NEXT: cltq
; X64-NEXT: movl (%rdi,%rax), %eax
-; X64-NEXT: movq %rax, %xmm0
+; X64-NEXT: leal 4(%rsi,%rdx), %ecx
+; X64-NEXT: movslq %ecx, %rcx
+; X64-NEXT: movzwl (%rdi,%rcx), %ecx
+; X64-NEXT: shlq $32, %rcx
+; X64-NEXT: orq %rax, %rcx
+; X64-NEXT: movq %rcx, %xmm0
; X64-NEXT: movd %xmm0, %eax
; X64-NEXT: retq
entry:
diff --git a/llvm/test/CodeGen/X86/evex-to-vex-compress.mir b/llvm/test/CodeGen/X86/evex-to-vex-compress.mir
index f1397383e89..9937ca08aaf 100755
--- a/llvm/test/CodeGen/X86/evex-to-vex-compress.mir
+++ b/llvm/test/CodeGen/X86/evex-to-vex-compress.mir
@@ -2216,6 +2216,12 @@ body: |
$edi = VCVTTSS2SIZrr $xmm0
; CHECK: $edi = VCVTTSS2SIrr_Int $xmm0
$edi = VCVTTSS2SIZrr_Int $xmm0
+ ; CHECK: $xmm0 = VMOV64toSDrr $rdi
+ $xmm0 = VMOV64toSDZrr $rdi
+ ; CHECK: $xmm0 = VMOVDI2SSrm $rip, $noreg, $noreg, $noreg, $noreg
+ $xmm0 = VMOVDI2SSZrm $rip, $noreg, $noreg, $noreg, $noreg
+ ; CHECK: $xmm0 = VMOVDI2SSrr $eax
+ $xmm0 = VMOVDI2SSZrr $eax
; CHECK: VMOVSDmr $rdi, $xmm0, $noreg, $noreg, $noreg, $noreg
VMOVSDZmr $rdi, $xmm0, $noreg, $noreg, $noreg, $noreg
; CHECK: $xmm0 = VMOVSDrm $rip, $noreg, $noreg, $noreg, $noreg
@@ -2244,6 +2250,8 @@ body: |
$xmm0 = VMOV64toPQIZrr $rdi
; CHECK: $xmm0 = VMOV64toPQIrm $rdi, $noreg, $noreg, $noreg, $noreg
$xmm0 = VMOV64toPQIZrm $rdi, $noreg, $noreg, $noreg, $noreg
+ ; CHECK: $xmm0 = VMOV64toSDrr $rdi
+ $xmm0 = VMOV64toSDZrr $rdi
; CHECK: $xmm0 = VMOVDI2PDIrm $rip, $noreg, $noreg, $noreg, $noreg
$xmm0 = VMOVDI2PDIZrm $rip, $noreg, $noreg, $noreg, $noreg
; CHECK: $xmm0 = VMOVDI2PDIrr $edi
@@ -4528,6 +4536,12 @@ body: |
$edi = VCVTTSS2SIZrr $xmm16
; CHECK: $edi = VCVTTSS2SIZrr_Int $xmm16
$edi = VCVTTSS2SIZrr_Int $xmm16
+ ; CHECK: $xmm16 = VMOV64toSDZrr $rdi
+ $xmm16 = VMOV64toSDZrr $rdi
+ ; CHECK: $xmm16 = VMOVDI2SSZrm $rip, $noreg, $noreg, $noreg, $noreg
+ $xmm16 = VMOVDI2SSZrm $rip, $noreg, $noreg, $noreg, $noreg
+ ; CHECK: $xmm16 = VMOVDI2SSZrr $eax
+ $xmm16 = VMOVDI2SSZrr $eax
; CHECK: VMOVSDZmr $rdi, $xmm16, $noreg, $noreg, $noreg, $noreg
VMOVSDZmr $rdi, $xmm16, $noreg, $noreg, $noreg, $noreg
; CHECK: $xmm16 = VMOVSDZrm $rip, $noreg, $noreg, $noreg, $noreg
@@ -4556,6 +4570,8 @@ body: |
$xmm16 = VMOV64toPQIZrr $rdi
; CHECK: $xmm16 = VMOV64toPQIZrm $rdi, $noreg, $noreg, $noreg, $noreg
$xmm16 = VMOV64toPQIZrm $rdi, $noreg, $noreg, $noreg, $noreg
+ ; CHECK: $xmm16 = VMOV64toSDZrr $rdi
+ $xmm16 = VMOV64toSDZrr $rdi
; CHECK: $xmm16 = VMOVDI2PDIZrm $rip, $noreg, $noreg, $noreg, $noreg
$xmm16 = VMOVDI2PDIZrm $rip, $noreg, $noreg, $noreg, $noreg
; CHECK: $xmm16 = VMOVDI2PDIZrr $edi
diff --git a/llvm/test/CodeGen/X86/fast-isel-fneg.ll b/llvm/test/CodeGen/X86/fast-isel-fneg.ll
index 302c6b5411f..0c2ce6df0a4 100644
--- a/llvm/test/CodeGen/X86/fast-isel-fneg.ll
+++ b/llvm/test/CodeGen/X86/fast-isel-fneg.ll
@@ -5,9 +5,10 @@
define double @doo(double %x) nounwind {
; CHECK-LABEL: doo:
; CHECK: ## %bb.0:
-; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
-; CHECK-NEXT: subsd %xmm0, %xmm1
-; CHECK-NEXT: movapd %xmm1, %xmm0
+; CHECK-NEXT: movq %xmm0, %rax
+; CHECK-NEXT: movabsq $-9223372036854775808, %rcx ## imm = 0x8000000000000000
+; CHECK-NEXT: xorq %rax, %rcx
+; CHECK-NEXT: movq %rcx, %xmm0
; CHECK-NEXT: retq
;
; SSE2-LABEL: doo:
@@ -30,9 +31,9 @@ define double @doo(double %x) nounwind {
define float @foo(float %x) nounwind {
; CHECK-LABEL: foo:
; CHECK: ## %bb.0:
-; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; CHECK-NEXT: subss %xmm0, %xmm1
-; CHECK-NEXT: movaps %xmm1, %xmm0
+; CHECK-NEXT: movd %xmm0, %eax
+; CHECK-NEXT: xorl $2147483648, %eax ## imm = 0x80000000
+; CHECK-NEXT: movd %eax, %xmm0
; CHECK-NEXT: retq
;
; SSE2-LABEL: foo:
diff --git a/llvm/test/CodeGen/X86/masked_store.ll b/llvm/test/CodeGen/X86/masked_store.ll
index efbb1ef8cc6..fa70edbf121 100644
--- a/llvm/test/CodeGen/X86/masked_store.ll
+++ b/llvm/test/CodeGen/X86/masked_store.ll
@@ -37,21 +37,25 @@ define void @store_v1f64_v1i64(<1 x i64> %trigger, <1 x double>* %addr, <1 x dou
define void @store_v2f64_v2i64(<2 x i64> %trigger, <2 x double>* %addr, <2 x double> %val) {
; SSE2-LABEL: store_v2f64_v2i64:
; SSE2: ## %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
-; SSE2-NEXT: pxor %xmm2, %xmm0
-; SSE2-NEXT: movdqa %xmm2, %xmm3
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm3
-; SSE2-NEXT: pcmpeqd %xmm2, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; SSE2-NEXT: pand %xmm3, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; SSE2-NEXT: por %xmm2, %xmm0
-; SSE2-NEXT: movd %xmm0, %eax
+; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648]
+; SSE2-NEXT: pxor %xmm3, %xmm0
+; SSE2-NEXT: movdqa %xmm3, %xmm2
+; SSE2-NEXT: pcmpgtd %xmm0, %xmm2
+; SSE2-NEXT: pcmpeqd %xmm3, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE2-NEXT: movdqa %xmm0, %xmm4
+; SSE2-NEXT: pand %xmm2, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
+; SSE2-NEXT: por %xmm3, %xmm4
+; SSE2-NEXT: movd %xmm4, %eax
; SSE2-NEXT: testb $1, %al
; SSE2-NEXT: je LBB1_2
; SSE2-NEXT: ## %bb.1: ## %cond.store
; SSE2-NEXT: movlpd %xmm1, (%rdi)
; SSE2-NEXT: LBB1_2: ## %else
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,2,2]
+; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: por %xmm3, %xmm0
; SSE2-NEXT: pextrw $4, %xmm0, %eax
; SSE2-NEXT: testb $1, %al
; SSE2-NEXT: je LBB1_4
@@ -120,16 +124,20 @@ define void @store_v4f64_v4i64(<4 x i64> %trigger, <4 x double>* %addr, <4 x dou
; SSE2-NEXT: movdqa %xmm4, %xmm5
; SSE2-NEXT: pcmpgtd %xmm0, %xmm5
; SSE2-NEXT: pcmpeqd %xmm4, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3]
-; SSE2-NEXT: pand %xmm5, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,3,3]
-; SSE2-NEXT: por %xmm6, %xmm0
-; SSE2-NEXT: movd %xmm0, %eax
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE2-NEXT: movdqa %xmm0, %xmm7
+; SSE2-NEXT: pand %xmm5, %xmm7
+; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[1,1,3,3]
+; SSE2-NEXT: por %xmm6, %xmm7
+; SSE2-NEXT: movd %xmm7, %eax
; SSE2-NEXT: testb $1, %al
; SSE2-NEXT: je LBB2_2
; SSE2-NEXT: ## %bb.1: ## %cond.store
; SSE2-NEXT: movlpd %xmm2, (%rdi)
; SSE2-NEXT: LBB2_2: ## %else
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,0,2,2]
+; SSE2-NEXT: pand %xmm5, %xmm0
+; SSE2-NEXT: por %xmm6, %xmm0
; SSE2-NEXT: pextrw $4, %xmm0, %eax
; SSE2-NEXT: testb $1, %al
; SSE2-NEXT: je LBB2_4
@@ -139,9 +147,10 @@ define void @store_v4f64_v4i64(<4 x i64> %trigger, <4 x double>* %addr, <4 x dou
; SSE2-NEXT: pxor %xmm4, %xmm1
; SSE2-NEXT: movdqa %xmm4, %xmm0
; SSE2-NEXT: pcmpgtd %xmm1, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2]
; SSE2-NEXT: pcmpeqd %xmm4, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT: pand %xmm0, %xmm1
+; SSE2-NEXT: pand %xmm2, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: pextrw $0, %xmm0, %eax
@@ -893,21 +902,25 @@ define void @store_v16f32_v16i32(<16 x float> %x, <16 x float>* %ptr, <16 x floa
define void @store_v2i64_v2i64(<2 x i64> %trigger, <2 x i64>* %addr, <2 x i64> %val) {
; SSE2-LABEL: store_v2i64_v2i64:
; SSE2: ## %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
-; SSE2-NEXT: pxor %xmm2, %xmm0
-; SSE2-NEXT: movdqa %xmm2, %xmm3
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm3
-; SSE2-NEXT: pcmpeqd %xmm2, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; SSE2-NEXT: pand %xmm3, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; SSE2-NEXT: por %xmm2, %xmm0
-; SSE2-NEXT: movd %xmm0, %eax
+; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648]
+; SSE2-NEXT: pxor %xmm3, %xmm0
+; SSE2-NEXT: movdqa %xmm3, %xmm2
+; SSE2-NEXT: pcmpgtd %xmm0, %xmm2
+; SSE2-NEXT: pcmpeqd %xmm3, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE2-NEXT: movdqa %xmm0, %xmm4
+; SSE2-NEXT: pand %xmm2, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
+; SSE2-NEXT: por %xmm3, %xmm4
+; SSE2-NEXT: movd %xmm4, %eax
; SSE2-NEXT: testb $1, %al
; SSE2-NEXT: je LBB7_2
; SSE2-NEXT: ## %bb.1: ## %cond.store
; SSE2-NEXT: movq %xmm1, (%rdi)
; SSE2-NEXT: LBB7_2: ## %else
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,2,2]
+; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: por %xmm3, %xmm0
; SSE2-NEXT: pextrw $4, %xmm0, %eax
; SSE2-NEXT: testb $1, %al
; SSE2-NEXT: je LBB7_4
@@ -982,16 +995,20 @@ define void @store_v4i64_v4i64(<4 x i64> %trigger, <4 x i64>* %addr, <4 x i64> %
; SSE2-NEXT: movdqa %xmm4, %xmm5
; SSE2-NEXT: pcmpgtd %xmm0, %xmm5
; SSE2-NEXT: pcmpeqd %xmm4, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3]
-; SSE2-NEXT: pand %xmm5, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,3,3]
-; SSE2-NEXT: por %xmm6, %xmm0
-; SSE2-NEXT: movd %xmm0, %eax
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE2-NEXT: movdqa %xmm0, %xmm7
+; SSE2-NEXT: pand %xmm5, %xmm7
+; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[1,1,3,3]
+; SSE2-NEXT: por %xmm6, %xmm7
+; SSE2-NEXT: movd %xmm7, %eax
; SSE2-NEXT: testb $1, %al
; SSE2-NEXT: je LBB8_2
; SSE2-NEXT: ## %bb.1: ## %cond.store
; SSE2-NEXT: movq %xmm2, (%rdi)
; SSE2-NEXT: LBB8_2: ## %else
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,0,2,2]
+; SSE2-NEXT: pand %xmm5, %xmm0
+; SSE2-NEXT: por %xmm6, %xmm0
; SSE2-NEXT: pextrw $4, %xmm0, %eax
; SSE2-NEXT: testb $1, %al
; SSE2-NEXT: je LBB8_4
@@ -1002,9 +1019,10 @@ define void @store_v4i64_v4i64(<4 x i64> %trigger, <4 x i64>* %addr, <4 x i64> %
; SSE2-NEXT: pxor %xmm4, %xmm1
; SSE2-NEXT: movdqa %xmm4, %xmm0
; SSE2-NEXT: pcmpgtd %xmm1, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2]
; SSE2-NEXT: pcmpeqd %xmm4, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT: pand %xmm0, %xmm1
+; SSE2-NEXT: pand %xmm2, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: pextrw $0, %xmm0, %eax
diff --git a/llvm/test/CodeGen/X86/movmsk-cmp.ll b/llvm/test/CodeGen/X86/movmsk-cmp.ll
index c8b3488af16..2e6123ff014 100644
--- a/llvm/test/CodeGen/X86/movmsk-cmp.ll
+++ b/llvm/test/CodeGen/X86/movmsk-cmp.ll
@@ -929,6 +929,22 @@ define i1 @allzeros_v16i32_sign(<16 x i32> %arg) {
define i1 @allones_v4i64_sign(<4 x i64> %arg) {
; SSE2-LABEL: allones_v4i64_sign:
; SSE2: # %bb.0:
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
+; SSE2-NEXT: pxor %xmm2, %xmm1
+; SSE2-NEXT: movdqa %xmm2, %xmm3
+; SSE2-NEXT: pcmpeqd %xmm1, %xmm3
+; SSE2-NEXT: movdqa %xmm2, %xmm4
+; SSE2-NEXT: pcmpgtd %xmm1, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,0,2,2]
+; SSE2-NEXT: pand %xmm3, %xmm1
+; SSE2-NEXT: por %xmm4, %xmm1
+; SSE2-NEXT: pxor %xmm2, %xmm0
+; SSE2-NEXT: movdqa %xmm2, %xmm3
+; SSE2-NEXT: pcmpeqd %xmm0, %xmm3
+; SSE2-NEXT: pcmpgtd %xmm0, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2]
+; SSE2-NEXT: pand %xmm3, %xmm0
+; SSE2-NEXT: por %xmm2, %xmm0
; SSE2-NEXT: packssdw %xmm1, %xmm0
; SSE2-NEXT: movmskps %xmm0, %eax
; SSE2-NEXT: cmpb $15, %al
@@ -973,6 +989,22 @@ define i1 @allones_v4i64_sign(<4 x i64> %arg) {
define i1 @allzeros_v4i64_sign(<4 x i64> %arg) {
; SSE2-LABEL: allzeros_v4i64_sign:
; SSE2: # %bb.0:
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
+; SSE2-NEXT: pxor %xmm2, %xmm1
+; SSE2-NEXT: movdqa %xmm2, %xmm3
+; SSE2-NEXT: pcmpeqd %xmm1, %xmm3
+; SSE2-NEXT: movdqa %xmm2, %xmm4
+; SSE2-NEXT: pcmpgtd %xmm1, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,0,2,2]
+; SSE2-NEXT: pand %xmm3, %xmm1
+; SSE2-NEXT: por %xmm4, %xmm1
+; SSE2-NEXT: pxor %xmm2, %xmm0
+; SSE2-NEXT: movdqa %xmm2, %xmm3
+; SSE2-NEXT: pcmpeqd %xmm0, %xmm3
+; SSE2-NEXT: pcmpgtd %xmm0, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2]
+; SSE2-NEXT: pand %xmm3, %xmm0
+; SSE2-NEXT: por %xmm2, %xmm0
; SSE2-NEXT: packssdw %xmm1, %xmm0
; SSE2-NEXT: movmskps %xmm0, %eax
; SSE2-NEXT: testb %al, %al
@@ -1063,13 +1095,15 @@ define i1 @allones_v8i64_sign(<8 x i64> %arg) {
;
; AVX1-LABEL: allones_v8i64_sign:
; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpcmpgtq %xmm1, %xmm3, %xmm1
+; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: vmovmskps %ymm0, %eax
; AVX1-NEXT: cmpb $-1, %al
@@ -1164,13 +1198,15 @@ define i1 @allzeros_v8i64_sign(<8 x i64> %arg) {
;
; AVX1-LABEL: allzeros_v8i64_sign:
; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpcmpgtq %xmm1, %xmm3, %xmm1
+; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: vmovmskps %ymm0, %eax
; AVX1-NEXT: testb %al, %al
@@ -2503,17 +2539,19 @@ define i1 @allones_v8i64_and1(<8 x i64> %arg) {
;
; AVX1-LABEL: allones_v8i64_and1:
; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vpsllq $63, %xmm2, %xmm2
; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpsllq $63, %xmm1, %xmm1
+; AVX1-NEXT: vpcmpgtq %xmm1, %xmm3, %xmm1
+; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpsllq $63, %xmm2, %xmm2
+; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpsllq $63, %xmm0, %xmm0
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT: vpsllq $63, %xmm2, %xmm2
-; AVX1-NEXT: vpsllq $63, %xmm1, %xmm1
-; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: vmovmskps %ymm0, %eax
; AVX1-NEXT: cmpb $-1, %al
@@ -2577,17 +2615,19 @@ define i1 @allzeros_v8i64_and1(<8 x i64> %arg) {
;
; AVX1-LABEL: allzeros_v8i64_and1:
; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vpsllq $63, %xmm2, %xmm2
; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpsllq $63, %xmm1, %xmm1
+; AVX1-NEXT: vpcmpgtq %xmm1, %xmm3, %xmm1
+; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpsllq $63, %xmm2, %xmm2
+; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpsllq $63, %xmm0, %xmm0
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT: vpsllq $63, %xmm2, %xmm2
-; AVX1-NEXT: vpsllq $63, %xmm1, %xmm1
-; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: vmovmskps %ymm0, %eax
; AVX1-NEXT: testb %al, %al
@@ -3922,17 +3962,19 @@ define i1 @allones_v8i64_and4(<8 x i64> %arg) {
;
; AVX1-LABEL: allones_v8i64_and4:
; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vpsllq $61, %xmm2, %xmm2
; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpsllq $61, %xmm1, %xmm1
+; AVX1-NEXT: vpcmpgtq %xmm1, %xmm3, %xmm1
+; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpsllq $61, %xmm2, %xmm2
+; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpsllq $61, %xmm0, %xmm0
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT: vpsllq $61, %xmm2, %xmm2
-; AVX1-NEXT: vpsllq $61, %xmm1, %xmm1
-; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: vmovmskps %ymm0, %eax
; AVX1-NEXT: cmpb $-1, %al
@@ -3996,17 +4038,19 @@ define i1 @allzeros_v8i64_and4(<8 x i64> %arg) {
;
; AVX1-LABEL: allzeros_v8i64_and4:
; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vpsllq $61, %xmm2, %xmm2
; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpsllq $61, %xmm1, %xmm1
+; AVX1-NEXT: vpcmpgtq %xmm1, %xmm3, %xmm1
+; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpsllq $61, %xmm2, %xmm2
+; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpsllq $61, %xmm0, %xmm0
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT: vpsllq $61, %xmm2, %xmm2
-; AVX1-NEXT: vpsllq $61, %xmm1, %xmm1
-; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: vmovmskps %ymm0, %eax
; AVX1-NEXT: testb %al, %al
@@ -4126,6 +4170,22 @@ define i32 @movmskps(<4 x float> %x) {
define i32 @movmskpd256(<4 x double> %x) {
; SSE2-LABEL: movmskpd256:
; SSE2: # %bb.0:
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
+; SSE2-NEXT: pxor %xmm2, %xmm1
+; SSE2-NEXT: movdqa %xmm2, %xmm3
+; SSE2-NEXT: pcmpeqd %xmm1, %xmm3
+; SSE2-NEXT: movdqa %xmm2, %xmm4
+; SSE2-NEXT: pcmpgtd %xmm1, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,0,2,2]
+; SSE2-NEXT: pand %xmm3, %xmm1
+; SSE2-NEXT: por %xmm4, %xmm1
+; SSE2-NEXT: pxor %xmm2, %xmm0
+; SSE2-NEXT: movdqa %xmm2, %xmm3
+; SSE2-NEXT: pcmpeqd %xmm0, %xmm3
+; SSE2-NEXT: pcmpgtd %xmm0, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2]
+; SSE2-NEXT: pand %xmm3, %xmm0
+; SSE2-NEXT: por %xmm2, %xmm0
; SSE2-NEXT: packssdw %xmm1, %xmm0
; SSE2-NEXT: movmskps %xmm0, %eax
; SSE2-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/peephole.mir b/llvm/test/CodeGen/X86/peephole.mir
new file mode 100644
index 00000000000..28ce9f1f0e8
--- /dev/null
+++ b/llvm/test/CodeGen/X86/peephole.mir
@@ -0,0 +1,40 @@
+# RUN: llc -mtriple=x86_64-- -run-pass=peephole-opt %s -o - | FileCheck %s
+--- |
+ define void @func() { ret void }
+...
+---
+# Check that instructions with MI.isBitcast() are only replaced by COPY if there
+# are no SUBREG_TO_REG users.
+# CHECK-LABEL: name: func
+name: func
+registers:
+ - { id: 0, class: gr32 }
+ - { id: 1, class: fr32 }
+ - { id: 2, class: gr32 }
+
+ - { id: 3, class: gr32 }
+ - { id: 4, class: fr32 }
+ - { id: 5, class: gr32 }
+ - { id: 6, class: gr64 }
+
+body: |
+ bb.0:
+ ; CHECK: %1:fr32 = VMOVDI2SSrr %0
+ ; CHECK: %7:gr32 = COPY %0
+ ; CHECK: NOOP implicit %7
+ %0 = MOV32ri 42
+ %1 = VMOVDI2SSrr %0
+ %2 = MOVSS2DIrr %1
+ NOOP implicit %2
+
+ ; CHECK: %4:fr32 = VMOVDI2SSrr %3
+ ; CHECK-NOT: COPY
+ ; CHECK: %5:gr32 = MOVSS2DIrr %4
+ ; CHECK: %6:gr64 = SUBREG_TO_REG %5, 0
+ ; CHECK: NOOP implicit %6
+ %3 = MOV32ri 42
+ %4 = VMOVDI2SSrr %3
+ %5 = MOVSS2DIrr %4
+ %6 = SUBREG_TO_REG %5, 0, %subreg.sub_32bit
+ NOOP implicit %6
+...
diff --git a/llvm/test/CodeGen/X86/pr41619.ll b/llvm/test/CodeGen/X86/pr41619.ll
deleted file mode 100644
index 7c71f2c1c29..00000000000
--- a/llvm/test/CodeGen/X86/pr41619.ll
+++ /dev/null
@@ -1,27 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-apple-macosx10.14.0 -mattr=avx2 | FileCheck %s
-
-define void @foo(double %arg) {
-; CHECK-LABEL: foo:
-; CHECK: ## %bb.0: ## %bb
-; CHECK-NEXT: vmovq %xmm0, %rax
-; CHECK-NEXT: vmovd %eax, %xmm0
-; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vmovq %xmm0, %rax
-; CHECK-NEXT: movl %eax, (%rax)
-; CHECK-NEXT: vmovlps %xmm1, (%rax)
-; CHECK-NEXT: retq
-bb:
- %tmp = bitcast double %arg to i64
- %tmp1 = trunc i64 %tmp to i32
- %tmp2 = bitcast i32 %tmp1 to float
- %tmp3 = insertelement <4 x float> zeroinitializer, float %tmp2, i32 2
- %tmp4 = bitcast <4 x float> %tmp3 to <2 x double>
- %tmp5 = extractelement <2 x double> %tmp4, i32 0
- %tmp6 = extractelement <2 x double> %tmp4, i32 1
- %tmp7 = bitcast double %tmp6 to i64
- %tmp8 = trunc i64 %tmp7 to i32
- store i32 %tmp8, i32* undef, align 4
- store double %tmp5, double* undef, align 16
- ret void
-}