author     Simon Pilgrim <llvm-dev@redking.me.uk>    2017-02-09 11:50:19 +0000
committer  Simon Pilgrim <llvm-dev@redking.me.uk>    2017-02-09 11:50:19 +0000
commit     563e23e66e5fd383147f5e0d9b4fa67f265a422a (patch)
tree       c66ae478fe531c34f8f6a4d172e17671c8ad32e7
parent     4200948c5a6e45a4a076378651f40f08dfdc33e1 (diff)
[X86][SSE] Attempt to break register dependencies during lowerBuildVector
LowerBuildVectorv16i8/LowerBuildVectorv8i16 insert values into an UNDEF vector if the build vector doesn't contain any zero elements, resulting in register dependencies with a previous use of the register.

This patch attempts to break the register dependency by either always zeroing the vector beforehand or (if we're inserting to the 0th element) by using VZEXT_MOVL(SCALAR_TO_VECTOR(i32 AEXT(Elt))), which lowers to (V)MOVD and performs a similar function. Additionally, (V)MOVD is a shorter instruction than PINSRB/PINSRW. We already do something similar for SSE41 PINSRD.

On pre-SSE41 LowerBuildVectorv16i8 we go a little further and use VZEXT_MOVL(SCALAR_TO_VECTOR(i32 ZEXT(Elt))) if the build vector contains zeros, avoiding the vector zeroing at the cost of a scalar zero extension. This can probably be brought over to some of the other cases (load folding etc.) in a future patch.

Differential Revision: https://reviews.llvm.org/D29720

llvm-svn: 294581
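At the instruction level, the change shows up throughout the test updates below: PINSRB/PINSRW read-modify-write the destination register, so an insertion into element 0 of an undefined vector still depends on the register's last writer, while MOVD writes the whole register and carries no such dependency (it is also a shorter encoding):

    vpinsrb $0, %ecx, %xmm0, %xmm0   # before: reads %xmm0, dependent on its last writer
    vmovd %ecx, %xmm0                # after: writes all of %xmm0, dependency broken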
-rw-r--r--  llvm/lib/Target/X86/X86ISelLowering.cpp         | 51
-rw-r--r--  llvm/test/CodeGen/X86/avx512-intrinsics.ll      |  8
-rw-r--r--  llvm/test/CodeGen/X86/avx512bwvl-intrinsics.ll  |  8
-rw-r--r--  llvm/test/CodeGen/X86/avx512vl-intrinsics.ll    | 32
-rw-r--r--  llvm/test/CodeGen/X86/buildvec-insertvec.ll     |  7
-rw-r--r--  llvm/test/CodeGen/X86/promote-vec3.ll           | 23
-rw-r--r--  llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll   |  6
-rw-r--r--  llvm/test/CodeGen/X86/vec_cast2.ll              | 16
-rw-r--r--  llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll | 20
-rw-r--r--  llvm/test/CodeGen/X86/widen_bitops-0.ll         | 12
-rw-r--r--  llvm/test/CodeGen/X86/widen_conv-3.ll           |  4
-rw-r--r--  llvm/test/CodeGen/X86/widen_conv-4.ll           |  4
12 files changed, 110 insertions(+), 81 deletions(-)
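The zeroing path is easiest to see in the buildvec-insertvec.ll update below. As an illustrative sketch (the IR function name and element choices here are mine, not from the patch), a partial build vector whose first insertion is not at index 0 now starts from a zeroed register instead of an undefined one:

define <8 x i16> @partial_sketch(i16 %a1, i16 %a3) {
  %v0 = insertelement <8 x i16> undef, i16 %a1, i32 1
  %v1 = insertelement <8 x i16> %v0, i16 %a3, i32 3
  ret <8 x i16> %v1
}
; expected SSE lowering after this patch (assumption, following the updated CHECK lines):
;   pxor %xmm0, %xmm0        # zero the vector first to break the register dependency
;   pinsrw $1, %edi, %xmm0
;   pinsrw $3, %esi, %xmm0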
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 807acd5b5f8..6f9f6771e33 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -5942,12 +5942,21 @@ static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
     for (unsigned i = 0; i < 16; ++i) {
       bool IsNonZero = (NonZeros & (1 << i)) != 0;
       if (IsNonZero) {
+        // If the build vector contains zeros or our first insertion is not the
+        // first index then insert into zero vector to break any register
+        // dependency else use SCALAR_TO_VECTOR/VZEXT_MOVL.
         if (First) {
-          if (NumZero)
-            V = getZeroVector(MVT::v16i8, Subtarget, DAG, dl);
-          else
-            V = DAG.getUNDEF(MVT::v16i8);
           First = false;
+          if (NumZero || 0 != i)
+            V = getZeroVector(MVT::v16i8, Subtarget, DAG, dl);
+          else {
+            assert(0 == i && "Expected insertion into zero-index");
+            V = DAG.getAnyExtOrTrunc(Op.getOperand(i), dl, MVT::i32);
+            V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, V);
+            V = DAG.getNode(X86ISD::VZEXT_MOVL, dl, MVT::v4i32, V);
+            V = DAG.getBitcast(MVT::v16i8, V);
+            continue;
+          }
         }
         V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v16i8, V,
                         Op.getOperand(i), DAG.getIntPtrConstant(i, dl));
@@ -5969,6 +5978,8 @@ static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
     }

     if ((i & 1) != 0) {
+      // FIXME: Investigate extending to i32 instead of just i16.
+      // FIXME: Investigate combining the first 4 bytes as a i32 instead.
       SDValue ThisElt, LastElt;
       bool LastIsNonZero = (NonZeros & (1 << (i - 1))) != 0;
       if (LastIsNonZero) {
@@ -5984,9 +5995,18 @@ static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
       } else
         ThisElt = LastElt;

-      if (ThisElt)
-        V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt,
-                        DAG.getIntPtrConstant(i / 2, dl));
+      if (ThisElt) {
+        if (1 == i) {
+          V = NumZero ? DAG.getZExtOrTrunc(ThisElt, dl, MVT::i32)
+                      : DAG.getAnyExtOrTrunc(ThisElt, dl, MVT::i32);
+          V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, V);
+          V = DAG.getNode(X86ISD::VZEXT_MOVL, dl, MVT::v4i32, V);
+          V = DAG.getBitcast(MVT::v8i16, V);
+        } else {
+          V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt,
+                          DAG.getIntPtrConstant(i / 2, dl));
+        }
+      }
     }
   }
@@ -6007,12 +6027,21 @@ static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros,
   for (unsigned i = 0; i < 8; ++i) {
     bool IsNonZero = (NonZeros & (1 << i)) != 0;
     if (IsNonZero) {
+      // If the build vector contains zeros or our first insertion is not the
+      // first index then insert into zero vector to break any register
+      // dependency else use SCALAR_TO_VECTOR/VZEXT_MOVL.
       if (First) {
-        if (NumZero)
-          V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
-        else
-          V = DAG.getUNDEF(MVT::v8i16);
         First = false;
+        if (NumZero || 0 != i)
+          V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
+        else {
+          assert(0 == i && "Expected insertion into zero-index");
+          V = DAG.getAnyExtOrTrunc(Op.getOperand(i), dl, MVT::i32);
+          V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, V);
+          V = DAG.getNode(X86ISD::VZEXT_MOVL, dl, MVT::v4i32, V);
+          V = DAG.getBitcast(MVT::v8i16, V);
+          continue;
+        }
       }
       V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V,
                       Op.getOperand(i), DAG.getIntPtrConstant(i, dl));
diff --git a/llvm/test/CodeGen/X86/avx512-intrinsics.ll b/llvm/test/CodeGen/X86/avx512-intrinsics.ll
index 1213fb1ec66..f889526baa3 100644
--- a/llvm/test/CodeGen/X86/avx512-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/avx512-intrinsics.ll
@@ -1062,7 +1062,7 @@ define <8 x i8> @test_cmp_q_512(<8 x i64> %a0, <8 x i64> %a1) {
; CHECK-NEXT: vpcmpordq %zmm1, %zmm0, %k0
; CHECK-NEXT: kmovw %k4, %eax
; CHECK-NEXT: kmovw %k3, %ecx
-; CHECK-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0
+; CHECK-NEXT: vmovd %ecx, %xmm0
; CHECK-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
; CHECK-NEXT: kmovw %k5, %eax
; CHECK-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
@@ -1110,7 +1110,7 @@ define <8 x i8> @test_mask_cmp_q_512(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
; CHECK-NEXT: vpcmpordq %zmm1, %zmm0, %k3 {%k3}
; CHECK-NEXT: kmovw %k5, %eax
; CHECK-NEXT: kmovw %k4, %ecx
-; CHECK-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0
+; CHECK-NEXT: vmovd %ecx, %xmm0
; CHECK-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
; CHECK-NEXT: kmovw %k6, %eax
; CHECK-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
@@ -1159,7 +1159,7 @@ define <8 x i8> @test_ucmp_q_512(<8 x i64> %a0, <8 x i64> %a1) {
; CHECK-NEXT: vpcmporduq %zmm1, %zmm0, %k0
; CHECK-NEXT: kmovw %k4, %eax
; CHECK-NEXT: kmovw %k3, %ecx
-; CHECK-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0
+; CHECK-NEXT: vmovd %ecx, %xmm0
; CHECK-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
; CHECK-NEXT: kmovw %k5, %eax
; CHECK-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
@@ -1207,7 +1207,7 @@ define <8 x i8> @test_mask_ucmp_q_512(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
; CHECK-NEXT: vpcmporduq %zmm1, %zmm0, %k3 {%k3}
; CHECK-NEXT: kmovw %k5, %eax
; CHECK-NEXT: kmovw %k4, %ecx
-; CHECK-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0
+; CHECK-NEXT: vmovd %ecx, %xmm0
; CHECK-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
; CHECK-NEXT: kmovw %k6, %eax
; CHECK-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
diff --git a/llvm/test/CodeGen/X86/avx512bwvl-intrinsics.ll b/llvm/test/CodeGen/X86/avx512bwvl-intrinsics.ll
index c34fac3c994..79d8e53c514 100644
--- a/llvm/test/CodeGen/X86/avx512bwvl-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/avx512bwvl-intrinsics.ll
@@ -696,7 +696,7 @@ define <8 x i8> @test_cmp_w_128(<8 x i16> %a0, <8 x i16> %a1) {
; CHECK-NEXT: vpcmpordw %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf3,0xfd,0x08,0x3f,0xc1,0x07]
; CHECK-NEXT: kmovw %k4, %eax ## encoding: [0xc5,0xf8,0x93,0xc4]
; CHECK-NEXT: kmovw %k3, %ecx ## encoding: [0xc5,0xf8,0x93,0xcb]
-; CHECK-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x00]
+; CHECK-NEXT: vmovd %ecx, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1]
; CHECK-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02]
; CHECK-NEXT: kmovw %k5, %eax ## encoding: [0xc5,0xf8,0x93,0xc5]
; CHECK-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04]
@@ -744,7 +744,7 @@ define <8 x i8> @test_mask_cmp_w_128(<8 x i16> %a0, <8 x i16> %a1, i8 %mask) {
; CHECK-NEXT: vpcmpordw %xmm1, %xmm0, %k3 {%k3} ## encoding: [0x62,0xf3,0xfd,0x0b,0x3f,0xd9,0x07]
; CHECK-NEXT: kmovw %k5, %eax ## encoding: [0xc5,0xf8,0x93,0xc5]
; CHECK-NEXT: kmovw %k4, %ecx ## encoding: [0xc5,0xf8,0x93,0xcc]
-; CHECK-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x00]
+; CHECK-NEXT: vmovd %ecx, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1]
; CHECK-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02]
; CHECK-NEXT: kmovw %k6, %eax ## encoding: [0xc5,0xf8,0x93,0xc6]
; CHECK-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04]
@@ -793,7 +793,7 @@ define <8 x i8> @test_ucmp_w_128(<8 x i16> %a0, <8 x i16> %a1) {
; CHECK-NEXT: vpcmporduw %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf3,0xfd,0x08,0x3e,0xc1,0x07]
; CHECK-NEXT: kmovw %k4, %eax ## encoding: [0xc5,0xf8,0x93,0xc4]
; CHECK-NEXT: kmovw %k3, %ecx ## encoding: [0xc5,0xf8,0x93,0xcb]
-; CHECK-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x00]
+; CHECK-NEXT: vmovd %ecx, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1]
; CHECK-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02]
; CHECK-NEXT: kmovw %k5, %eax ## encoding: [0xc5,0xf8,0x93,0xc5]
; CHECK-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04]
@@ -841,7 +841,7 @@ define <8 x i8> @test_mask_ucmp_w_128(<8 x i16> %a0, <8 x i16> %a1, i8 %mask) {
; CHECK-NEXT: vpcmporduw %xmm1, %xmm0, %k3 {%k3} ## encoding: [0x62,0xf3,0xfd,0x0b,0x3e,0xd9,0x07]
; CHECK-NEXT: kmovw %k5, %eax ## encoding: [0xc5,0xf8,0x93,0xc5]
; CHECK-NEXT: kmovw %k4, %ecx ## encoding: [0xc5,0xf8,0x93,0xcc]
-; CHECK-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x00]
+; CHECK-NEXT: vmovd %ecx, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1]
; CHECK-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02]
; CHECK-NEXT: kmovw %k6, %eax ## encoding: [0xc5,0xf8,0x93,0xc6]
; CHECK-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04]
diff --git a/llvm/test/CodeGen/X86/avx512vl-intrinsics.ll b/llvm/test/CodeGen/X86/avx512vl-intrinsics.ll
index 6510e6d7ac2..a067e0ad27b 100644
--- a/llvm/test/CodeGen/X86/avx512vl-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/avx512vl-intrinsics.ll
@@ -16,7 +16,7 @@ define <8 x i8> @test_cmp_d_256(<8 x i32> %a0, <8 x i32> %a1) {
; CHECK-NEXT: vpcmpordd %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x28,0x1f,0xc1,0x07]
; CHECK-NEXT: kmovw %k4, %eax ## encoding: [0xc5,0xf8,0x93,0xc4]
; CHECK-NEXT: kmovw %k3, %ecx ## encoding: [0xc5,0xf8,0x93,0xcb]
-; CHECK-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x00]
+; CHECK-NEXT: vmovd %ecx, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1]
; CHECK-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02]
; CHECK-NEXT: kmovw %k5, %eax ## encoding: [0xc5,0xf8,0x93,0xc5]
; CHECK-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04]
@@ -64,7 +64,7 @@ define <8 x i8> @test_mask_cmp_d_256(<8 x i32> %a0, <8 x i32> %a1, i8 %mask) {
; CHECK-NEXT: vpcmpordd %ymm1, %ymm0, %k3 {%k3} ## encoding: [0x62,0xf3,0x7d,0x2b,0x1f,0xd9,0x07]
; CHECK-NEXT: kmovw %k5, %eax ## encoding: [0xc5,0xf8,0x93,0xc5]
; CHECK-NEXT: kmovw %k4, %ecx ## encoding: [0xc5,0xf8,0x93,0xcc]
-; CHECK-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x00]
+; CHECK-NEXT: vmovd %ecx, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1]
; CHECK-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02]
; CHECK-NEXT: kmovw %k6, %eax ## encoding: [0xc5,0xf8,0x93,0xc6]
; CHECK-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04]
@@ -113,7 +113,7 @@ define <8 x i8> @test_ucmp_d_256(<8 x i32> %a0, <8 x i32> %a1) {
; CHECK-NEXT: vpcmpordud %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x28,0x1e,0xc1,0x07]
; CHECK-NEXT: kmovw %k4, %eax ## encoding: [0xc5,0xf8,0x93,0xc4]
; CHECK-NEXT: kmovw %k3, %ecx ## encoding: [0xc5,0xf8,0x93,0xcb]
-; CHECK-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x00]
+; CHECK-NEXT: vmovd %ecx, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1]
; CHECK-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02]
; CHECK-NEXT: kmovw %k5, %eax ## encoding: [0xc5,0xf8,0x93,0xc5]
; CHECK-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04]
@@ -161,7 +161,7 @@ define <8 x i8> @test_mask_ucmp_d_256(<8 x i32> %a0, <8 x i32> %a1, i8 %mask) {
; CHECK-NEXT: vpcmpordud %ymm1, %ymm0, %k3 {%k3} ## encoding: [0x62,0xf3,0x7d,0x2b,0x1e,0xd9,0x07]
; CHECK-NEXT: kmovw %k5, %eax ## encoding: [0xc5,0xf8,0x93,0xc5]
; CHECK-NEXT: kmovw %k4, %ecx ## encoding: [0xc5,0xf8,0x93,0xcc]
-; CHECK-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x00]
+; CHECK-NEXT: vmovd %ecx, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1]
; CHECK-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02]
; CHECK-NEXT: kmovw %k6, %eax ## encoding: [0xc5,0xf8,0x93,0xc6]
; CHECK-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04]
@@ -210,7 +210,7 @@ define <8 x i8> @test_cmp_q_256(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-NEXT: vpcmpordq %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf3,0xfd,0x28,0x1f,0xc1,0x07]
; CHECK-NEXT: kmovw %k7, %eax ## encoding: [0xc5,0xf8,0x93,0xc7]
; CHECK-NEXT: kmovw %k5, %ecx ## encoding: [0xc5,0xf8,0x93,0xcd]
-; CHECK-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x00]
+; CHECK-NEXT: vmovd %ecx, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1]
; CHECK-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02]
; CHECK-NEXT: kmovw %k6, %eax ## encoding: [0xc5,0xf8,0x93,0xc6]
; CHECK-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04]
@@ -258,7 +258,7 @@ define <8 x i8> @test_mask_cmp_q_256(<4 x i64> %a0, <4 x i64> %a1, i8 %mask) {
; CHECK-NEXT: vpcmpordq %ymm1, %ymm0, %k7 {%k7} ## encoding: [0x62,0xf3,0xfd,0x2f,0x1f,0xf9,0x07]
; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
; CHECK-NEXT: kmovw %k5, %ecx ## encoding: [0xc5,0xf8,0x93,0xcd]
-; CHECK-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x00]
+; CHECK-NEXT: vmovd %ecx, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1]
; CHECK-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02]
; CHECK-NEXT: kmovw %k6, %eax ## encoding: [0xc5,0xf8,0x93,0xc6]
; CHECK-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04]
@@ -307,7 +307,7 @@ define <8 x i8> @test_ucmp_q_256(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-NEXT: vpcmporduq %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf3,0xfd,0x28,0x1e,0xc1,0x07]
; CHECK-NEXT: kmovw %k7, %eax ## encoding: [0xc5,0xf8,0x93,0xc7]
; CHECK-NEXT: kmovw %k5, %ecx ## encoding: [0xc5,0xf8,0x93,0xcd]
-; CHECK-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x00]
+; CHECK-NEXT: vmovd %ecx, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1]
; CHECK-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02]
; CHECK-NEXT: kmovw %k6, %eax ## encoding: [0xc5,0xf8,0x93,0xc6]
; CHECK-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04]
@@ -355,7 +355,7 @@ define <8 x i8> @test_mask_ucmp_q_256(<4 x i64> %a0, <4 x i64> %a1, i8 %mask) {
; CHECK-NEXT: vpcmporduq %ymm1, %ymm0, %k7 {%k7} ## encoding: [0x62,0xf3,0xfd,0x2f,0x1e,0xf9,0x07]
; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
; CHECK-NEXT: kmovw %k5, %ecx ## encoding: [0xc5,0xf8,0x93,0xcd]
-; CHECK-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x00]
+; CHECK-NEXT: vmovd %ecx, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1]
; CHECK-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02]
; CHECK-NEXT: kmovw %k6, %eax ## encoding: [0xc5,0xf8,0x93,0xc6]
; CHECK-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04]
@@ -406,7 +406,7 @@ define <8 x i8> @test_cmp_d_128(<4 x i32> %a0, <4 x i32> %a1) {
; CHECK-NEXT: vpcmpordd %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x08,0x1f,0xc1,0x07]
; CHECK-NEXT: kmovw %k7, %eax ## encoding: [0xc5,0xf8,0x93,0xc7]
; CHECK-NEXT: kmovw %k5, %ecx ## encoding: [0xc5,0xf8,0x93,0xcd]
-; CHECK-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x00]
+; CHECK-NEXT: vmovd %ecx, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1]
; CHECK-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02]
; CHECK-NEXT: kmovw %k6, %eax ## encoding: [0xc5,0xf8,0x93,0xc6]
; CHECK-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04]
@@ -454,7 +454,7 @@ define <8 x i8> @test_mask_cmp_d_128(<4 x i32> %a0, <4 x i32> %a1, i8 %mask) {
; CHECK-NEXT: vpcmpordd %xmm1, %xmm0, %k7 {%k7} ## encoding: [0x62,0xf3,0x7d,0x0f,0x1f,0xf9,0x07]
; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
; CHECK-NEXT: kmovw %k5, %ecx ## encoding: [0xc5,0xf8,0x93,0xcd]
-; CHECK-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x00]
+; CHECK-NEXT: vmovd %ecx, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1]
; CHECK-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02]
; CHECK-NEXT: kmovw %k6, %eax ## encoding: [0xc5,0xf8,0x93,0xc6]
; CHECK-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04]
@@ -503,7 +503,7 @@ define <8 x i8> @test_ucmp_d_128(<4 x i32> %a0, <4 x i32> %a1) {
; CHECK-NEXT: vpcmpordud %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x08,0x1e,0xc1,0x07]
; CHECK-NEXT: kmovw %k7, %eax ## encoding: [0xc5,0xf8,0x93,0xc7]
; CHECK-NEXT: kmovw %k5, %ecx ## encoding: [0xc5,0xf8,0x93,0xcd]
-; CHECK-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x00]
+; CHECK-NEXT: vmovd %ecx, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1]
; CHECK-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02]
; CHECK-NEXT: kmovw %k6, %eax ## encoding: [0xc5,0xf8,0x93,0xc6]
; CHECK-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04]
@@ -551,7 +551,7 @@ define <8 x i8> @test_mask_ucmp_d_128(<4 x i32> %a0, <4 x i32> %a1, i8 %mask) {
; CHECK-NEXT: vpcmpordud %xmm1, %xmm0, %k7 {%k7} ## encoding: [0x62,0xf3,0x7d,0x0f,0x1e,0xf9,0x07]
; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
; CHECK-NEXT: kmovw %k5, %ecx ## encoding: [0xc5,0xf8,0x93,0xcd]
-; CHECK-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x00]
+; CHECK-NEXT: vmovd %ecx, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1]
; CHECK-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02]
; CHECK-NEXT: kmovw %k6, %eax ## encoding: [0xc5,0xf8,0x93,0xc6]
; CHECK-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04]
@@ -600,7 +600,7 @@ define <8 x i8> @test_cmp_q_128(<2 x i64> %a0, <2 x i64> %a1) {
; CHECK-NEXT: vpcmpordq %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf3,0xfd,0x08,0x1f,0xc1,0x07]
; CHECK-NEXT: kmovw %k7, %eax ## encoding: [0xc5,0xf8,0x93,0xc7]
; CHECK-NEXT: kmovw %k5, %ecx ## encoding: [0xc5,0xf8,0x93,0xcd]
-; CHECK-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x00]
+; CHECK-NEXT: vmovd %ecx, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1]
; CHECK-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02]
; CHECK-NEXT: kmovw %k6, %eax ## encoding: [0xc5,0xf8,0x93,0xc6]
; CHECK-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04]
@@ -648,7 +648,7 @@ define <8 x i8> @test_mask_cmp_q_128(<2 x i64> %a0, <2 x i64> %a1, i8 %mask) {
; CHECK-NEXT: vpcmpordq %xmm1, %xmm0, %k7 {%k7} ## encoding: [0x62,0xf3,0xfd,0x0f,0x1f,0xf9,0x07]
; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
; CHECK-NEXT: kmovw %k5, %ecx ## encoding: [0xc5,0xf8,0x93,0xcd]
-; CHECK-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x00]
+; CHECK-NEXT: vmovd %ecx, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1]
; CHECK-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02]
; CHECK-NEXT: kmovw %k6, %eax ## encoding: [0xc5,0xf8,0x93,0xc6]
; CHECK-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04]
@@ -697,7 +697,7 @@ define <8 x i8> @test_ucmp_q_128(<2 x i64> %a0, <2 x i64> %a1) {
; CHECK-NEXT: vpcmporduq %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf3,0xfd,0x08,0x1e,0xc1,0x07]
; CHECK-NEXT: kmovw %k7, %eax ## encoding: [0xc5,0xf8,0x93,0xc7]
; CHECK-NEXT: kmovw %k5, %ecx ## encoding: [0xc5,0xf8,0x93,0xcd]
-; CHECK-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x00]
+; CHECK-NEXT: vmovd %ecx, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1]
; CHECK-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02]
; CHECK-NEXT: kmovw %k6, %eax ## encoding: [0xc5,0xf8,0x93,0xc6]
; CHECK-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04]
@@ -745,7 +745,7 @@ define <8 x i8> @test_mask_ucmp_q_128(<2 x i64> %a0, <2 x i64> %a1, i8 %mask) {
; CHECK-NEXT: vpcmporduq %xmm1, %xmm0, %k7 {%k7} ## encoding: [0x62,0xf3,0xfd,0x0f,0x1e,0xf9,0x07]
; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
; CHECK-NEXT: kmovw %k5, %ecx ## encoding: [0xc5,0xf8,0x93,0xcd]
-; CHECK-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x00]
+; CHECK-NEXT: vmovd %ecx, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1]
; CHECK-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02]
; CHECK-NEXT: kmovw %k6, %eax ## encoding: [0xc5,0xf8,0x93,0xc6]
; CHECK-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04]
diff --git a/llvm/test/CodeGen/X86/buildvec-insertvec.ll b/llvm/test/CodeGen/X86/buildvec-insertvec.ll
index 61d8a4fdea4..730376acdc9 100644
--- a/llvm/test/CodeGen/X86/buildvec-insertvec.ll
+++ b/llvm/test/CodeGen/X86/buildvec-insertvec.ll
@@ -270,6 +270,7 @@ define <8 x i16> @test_buildvector_v8i16_register(i16 %a0, i16 %a1, i16 %a2, i16
define <8 x i16> @test_buildvector_v8i16_partial(i16 %a1, i16 %a3, i16 %a4, i16 %a5) {
; CHECK-LABEL: test_buildvector_v8i16_partial:
; CHECK: # BB#0:
+; CHECK-NEXT: pxor %xmm0, %xmm0
; CHECK-NEXT: pinsrw $1, %edi, %xmm0
; CHECK-NEXT: pinsrw $3, %esi, %xmm0
; CHECK-NEXT: pinsrw $4, %edx, %xmm0
@@ -419,6 +420,7 @@ define <16 x i8> @test_buildvector_v16i8_partial(i8 %a2, i8 %a6, i8 %a8, i8 %a11
;
; SSE41-LABEL: test_buildvector_v16i8_partial:
; SSE41: # BB#0:
+; SSE41-NEXT: pxor %xmm0, %xmm0
; SSE41-NEXT: pinsrb $2, %edi, %xmm0
; SSE41-NEXT: pinsrb $6, %esi, %xmm0
; SSE41-NEXT: pinsrb $8, %edx, %xmm0
@@ -448,10 +450,9 @@ define <16 x i8> @test_buildvector_v16i8_partial(i8 %a2, i8 %a6, i8 %a8, i8 %a11
define <16 x i8> @test_buildvector_v16i8_register_zero(i8 %a0, i8 %a4, i8 %a6, i8 %a8, i8 %a11, i8 %a12, i8 %a15) {
; SSE2-LABEL: test_buildvector_v16i8_register_zero:
; SSE2: # BB#0:
-; SSE2-NEXT: movzbl %dil, %eax
-; SSE2-NEXT: pxor %xmm0, %xmm0
-; SSE2-NEXT: pinsrw $0, %eax, %xmm0
; SSE2-NEXT: movzbl %sil, %eax
+; SSE2-NEXT: movzbl %dil, %esi
+; SSE2-NEXT: movd %esi, %xmm0
; SSE2-NEXT: pinsrw $2, %eax, %xmm0
; SSE2-NEXT: movzbl %dl, %eax
; SSE2-NEXT: pinsrw $3, %eax, %xmm0
diff --git a/llvm/test/CodeGen/X86/promote-vec3.ll b/llvm/test/CodeGen/X86/promote-vec3.ll
index 7a496714622..5483090dab6 100644
--- a/llvm/test/CodeGen/X86/promote-vec3.ll
+++ b/llvm/test/CodeGen/X86/promote-vec3.ll
@@ -9,17 +9,16 @@ define <3 x i16> @zext_i8(<3 x i8>) {
; SSE3-LABEL: zext_i8:
; SSE3: # BB#0:
; SSE3-NEXT: movzbl {{[0-9]+}}(%esp), %eax
-; SSE3-NEXT: pxor %xmm0, %xmm0
-; SSE3-NEXT: pxor %xmm1, %xmm1
-; SSE3-NEXT: pinsrw $0, %eax, %xmm1
+; SSE3-NEXT: movd %eax, %xmm0
; SSE3-NEXT: movzbl {{[0-9]+}}(%esp), %eax
-; SSE3-NEXT: pinsrw $1, %eax, %xmm1
+; SSE3-NEXT: pinsrw $1, %eax, %xmm0
; SSE3-NEXT: movzbl {{[0-9]+}}(%esp), %eax
-; SSE3-NEXT: pinsrw $2, %eax, %xmm1
-; SSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSE3-NEXT: movd %xmm1, %eax
-; SSE3-NEXT: pextrw $2, %xmm1, %edx
-; SSE3-NEXT: pextrw $4, %xmm1, %ecx
+; SSE3-NEXT: pinsrw $2, %eax, %xmm0
+; SSE3-NEXT: pxor %xmm1, %xmm1
+; SSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE3-NEXT: movd %xmm0, %eax
+; SSE3-NEXT: pextrw $2, %xmm0, %edx
+; SSE3-NEXT: pextrw $4, %xmm0, %ecx
; SSE3-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; SSE3-NEXT: # kill: %DX<def> %DX<kill> %EDX<kill>
; SSE3-NEXT: # kill: %CX<def> %CX<kill> %ECX<kill>
@@ -74,7 +73,7 @@ define <3 x i16> @sext_i8(<3 x i8>) {
; SSE3-LABEL: sext_i8:
; SSE3: # BB#0:
; SSE3-NEXT: movzbl {{[0-9]+}}(%esp), %eax
-; SSE3-NEXT: pinsrw $0, %eax, %xmm0
+; SSE3-NEXT: movd %eax, %xmm0
; SSE3-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; SSE3-NEXT: pinsrw $1, %eax, %xmm0
; SSE3-NEXT: movzbl {{[0-9]+}}(%esp), %eax
@@ -93,7 +92,7 @@ define <3 x i16> @sext_i8(<3 x i8>) {
;
; SSE41-LABEL: sext_i8:
; SSE41: # BB#0:
-; SSE41-NEXT: pinsrb $0, {{[0-9]+}}(%esp), %xmm0
+; SSE41-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE41-NEXT: pinsrb $4, {{[0-9]+}}(%esp), %xmm0
; SSE41-NEXT: pinsrb $8, {{[0-9]+}}(%esp), %xmm0
; SSE41-NEXT: pslld $24, %xmm0
@@ -108,7 +107,7 @@ define <3 x i16> @sext_i8(<3 x i8>) {
;
; AVX-32-LABEL: sext_i8:
; AVX-32: # BB#0:
-; AVX-32-NEXT: vpinsrb $0, {{[0-9]+}}(%esp), %xmm0, %xmm0
+; AVX-32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-32-NEXT: vpinsrb $4, {{[0-9]+}}(%esp), %xmm0, %xmm0
; AVX-32-NEXT: vpinsrb $8, {{[0-9]+}}(%esp), %xmm0, %xmm0
; AVX-32-NEXT: vpslld $24, %xmm0, %xmm0
diff --git a/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll b/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll
index 923290411ae..8f9b8c156d3 100644
--- a/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll
+++ b/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll
@@ -482,7 +482,7 @@ define void @shuffle_v64i8_to_v8i8(<64 x i8>* %L, <8 x i8>* %S) nounwind {
; AVX512BW-NEXT: vpextrb $0, %xmm1, %ecx
; AVX512BW-NEXT: vpextrb $8, %xmm0, %edx
; AVX512BW-NEXT: vpextrb $0, %xmm0, %edi
-; AVX512BW-NEXT: vpinsrb $0, %edi, %xmm0, %xmm0
+; AVX512BW-NEXT: vmovd %edi, %xmm0
; AVX512BW-NEXT: vpinsrb $1, %edx, %xmm0, %xmm0
; AVX512BW-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0
; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
@@ -496,9 +496,9 @@ define void @shuffle_v64i8_to_v8i8(<64 x i8>* %L, <8 x i8>* %S) nounwind {
; AVX512BWVL-LABEL: shuffle_v64i8_to_v8i8:
; AVX512BWVL: # BB#0:
; AVX512BWVL-NEXT: vmovdqu8 (%rdi), %zmm0
-; AVX512BWVL-NEXT: vpextrb $0, %xmm0, %eax
-; AVX512BWVL-NEXT: vpinsrb $0, %eax, %xmm0, %xmm1
; AVX512BWVL-NEXT: vpextrb $8, %xmm0, %eax
+; AVX512BWVL-NEXT: vpextrb $0, %xmm0, %ecx
+; AVX512BWVL-NEXT: vmovd %ecx, %xmm1
; AVX512BWVL-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1
; AVX512BWVL-NEXT: vextracti32x4 $1, %zmm0, %xmm2
; AVX512BWVL-NEXT: vpextrb $0, %xmm2, %eax
diff --git a/llvm/test/CodeGen/X86/vec_cast2.ll b/llvm/test/CodeGen/X86/vec_cast2.ll
index dda50b7b94b..f723672141a 100644
--- a/llvm/test/CodeGen/X86/vec_cast2.ll
+++ b/llvm/test/CodeGen/X86/vec_cast2.ll
@@ -97,10 +97,10 @@ define <8 x i8> @foo3_8(<8 x float> %src) {
;
; CHECK-WIDE-LABEL: foo3_8:
; CHECK-WIDE: ## BB#0:
-; CHECK-WIDE-NEXT: vcvttss2si %xmm0, %eax
-; CHECK-WIDE-NEXT: vpinsrb $0, %eax, %xmm0, %xmm1
-; CHECK-WIDE-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; CHECK-WIDE-NEXT: vcvttss2si %xmm2, %eax
+; CHECK-WIDE-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; CHECK-WIDE-NEXT: vcvttss2si %xmm1, %eax
+; CHECK-WIDE-NEXT: vcvttss2si %xmm0, %ecx
+; CHECK-WIDE-NEXT: vmovd %ecx, %xmm1
; CHECK-WIDE-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1
; CHECK-WIDE-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; CHECK-WIDE-NEXT: vcvttss2si %xmm2, %eax
@@ -134,10 +134,10 @@ define <4 x i8> @foo3_4(<4 x float> %src) {
;
; CHECK-WIDE-LABEL: foo3_4:
; CHECK-WIDE: ## BB#0:
-; CHECK-WIDE-NEXT: vcvttss2si %xmm0, %eax
-; CHECK-WIDE-NEXT: vpinsrb $0, %eax, %xmm0, %xmm1
-; CHECK-WIDE-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; CHECK-WIDE-NEXT: vcvttss2si %xmm2, %eax
+; CHECK-WIDE-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; CHECK-WIDE-NEXT: vcvttss2si %xmm1, %eax
+; CHECK-WIDE-NEXT: vcvttss2si %xmm0, %ecx
+; CHECK-WIDE-NEXT: vmovd %ecx, %xmm1
; CHECK-WIDE-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1
; CHECK-WIDE-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; CHECK-WIDE-NEXT: vcvttss2si %xmm2, %eax
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll b/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll
index cb710c8205b..47590cb8447 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll
@@ -1718,17 +1718,17 @@ define <16 x i8> @PR31364(i8* nocapture readonly %a, i8* nocapture readonly %b)
; SSE2-NEXT: movzbl (%rsi), %ecx
; SSE2-NEXT: shll $8, %ecx
; SSE2-NEXT: orl %eax, %ecx
-; SSE2-NEXT: pxor %xmm0, %xmm0
+; SSE2-NEXT: movzwl %cx, %eax
+; SSE2-NEXT: movd %eax, %xmm0
; SSE2-NEXT: pxor %xmm1, %xmm1
-; SSE2-NEXT: pinsrw $0, %ecx, %xmm1
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,0,3]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,5,4,4,4]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,0,1]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,0,3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,1,1,1,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,4,4,4]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,7]
-; SSE2-NEXT: packuswb %xmm2, %xmm0
+; SSE2-NEXT: packuswb %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: PR31364:
@@ -1737,8 +1737,8 @@ define <16 x i8> @PR31364(i8* nocapture readonly %a, i8* nocapture readonly %b)
; SSSE3-NEXT: movzbl (%rsi), %ecx
; SSSE3-NEXT: shll $8, %ecx
; SSSE3-NEXT: orl %eax, %ecx
-; SSSE3-NEXT: pxor %xmm0, %xmm0
-; SSSE3-NEXT: pinsrw $0, %ecx, %xmm0
+; SSSE3-NEXT: movzwl %cx, %eax
+; SSSE3-NEXT: movd %eax, %xmm0
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[1,1,1,1,1,1,1],zero,xmm0[1,1,1,1,1,0,0,0]
; SSSE3-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/widen_bitops-0.ll b/llvm/test/CodeGen/X86/widen_bitops-0.ll
index f8316d0e1ea..132a2fd928f 100644
--- a/llvm/test/CodeGen/X86/widen_bitops-0.ll
+++ b/llvm/test/CodeGen/X86/widen_bitops-0.ll
@@ -131,10 +131,10 @@ define i24 @or_i24_as_v8i3(i24 %a, i24 %b) nounwind {
define <3 x i8> @and_v3i8_as_i24(<3 x i8> %a, <3 x i8> %b) nounwind {
; X32-SSE-LABEL: and_v3i8_as_i24:
; X32-SSE: # BB#0:
-; X32-SSE-NEXT: pinsrb $0, {{[0-9]+}}(%esp), %xmm0
+; X32-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32-SSE-NEXT: pinsrb $4, {{[0-9]+}}(%esp), %xmm0
; X32-SSE-NEXT: pinsrb $8, {{[0-9]+}}(%esp), %xmm0
-; X32-SSE-NEXT: pinsrb $0, {{[0-9]+}}(%esp), %xmm1
+; X32-SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X32-SSE-NEXT: pinsrb $4, {{[0-9]+}}(%esp), %xmm1
; X32-SSE-NEXT: pinsrb $8, {{[0-9]+}}(%esp), %xmm1
; X32-SSE-NEXT: pand %xmm0, %xmm1
@@ -172,10 +172,10 @@ define <3 x i8> @and_v3i8_as_i24(<3 x i8> %a, <3 x i8> %b) nounwind {
define <3 x i8> @xor_v3i8_as_i24(<3 x i8> %a, <3 x i8> %b) nounwind {
; X32-SSE-LABEL: xor_v3i8_as_i24:
; X32-SSE: # BB#0:
-; X32-SSE-NEXT: pinsrb $0, {{[0-9]+}}(%esp), %xmm0
+; X32-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32-SSE-NEXT: pinsrb $4, {{[0-9]+}}(%esp), %xmm0
; X32-SSE-NEXT: pinsrb $8, {{[0-9]+}}(%esp), %xmm0
-; X32-SSE-NEXT: pinsrb $0, {{[0-9]+}}(%esp), %xmm1
+; X32-SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X32-SSE-NEXT: pinsrb $4, {{[0-9]+}}(%esp), %xmm1
; X32-SSE-NEXT: pinsrb $8, {{[0-9]+}}(%esp), %xmm1
; X32-SSE-NEXT: pxor %xmm0, %xmm1
@@ -213,10 +213,10 @@ define <3 x i8> @xor_v3i8_as_i24(<3 x i8> %a, <3 x i8> %b) nounwind {
define <3 x i8> @or_v3i8_as_i24(<3 x i8> %a, <3 x i8> %b) nounwind {
; X32-SSE-LABEL: or_v3i8_as_i24:
; X32-SSE: # BB#0:
-; X32-SSE-NEXT: pinsrb $0, {{[0-9]+}}(%esp), %xmm0
+; X32-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32-SSE-NEXT: pinsrb $4, {{[0-9]+}}(%esp), %xmm0
; X32-SSE-NEXT: pinsrb $8, {{[0-9]+}}(%esp), %xmm0
-; X32-SSE-NEXT: pinsrb $0, {{[0-9]+}}(%esp), %xmm1
+; X32-SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X32-SSE-NEXT: pinsrb $4, {{[0-9]+}}(%esp), %xmm1
; X32-SSE-NEXT: pinsrb $8, {{[0-9]+}}(%esp), %xmm1
; X32-SSE-NEXT: por %xmm0, %xmm1
diff --git a/llvm/test/CodeGen/X86/widen_conv-3.ll b/llvm/test/CodeGen/X86/widen_conv-3.ll
index f2e29337e6a..4ae19b8f5d2 100644
--- a/llvm/test/CodeGen/X86/widen_conv-3.ll
+++ b/llvm/test/CodeGen/X86/widen_conv-3.ll
@@ -65,7 +65,7 @@ define void @convert_v3i8_to_v3f32(<3 x float>* %dst.addr, <3 x i8>* %src.addr)
; X86-SSE2-NEXT: shll $8, %edx
; X86-SSE2-NEXT: movzbl (%esp), %esi
; X86-SSE2-NEXT: orl %edx, %esi
-; X86-SSE2-NEXT: pinsrw $0, %esi, %xmm0
+; X86-SSE2-NEXT: movd %esi, %xmm0
; X86-SSE2-NEXT: movzbl 2(%ecx), %ecx
; X86-SSE2-NEXT: pinsrw $1, %ecx, %xmm0
; X86-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
@@ -115,7 +115,7 @@ define void @convert_v3i8_to_v3f32(<3 x float>* %dst.addr, <3 x i8>* %src.addr)
; X64-SSE2-NEXT: shll $8, %eax
; X64-SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
; X64-SSE2-NEXT: orl %eax, %ecx
-; X64-SSE2-NEXT: pinsrw $0, %ecx, %xmm0
+; X64-SSE2-NEXT: movd %ecx, %xmm0
; X64-SSE2-NEXT: movzbl 2(%rsi), %eax
; X64-SSE2-NEXT: pinsrw $1, %eax, %xmm0
; X64-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
diff --git a/llvm/test/CodeGen/X86/widen_conv-4.ll b/llvm/test/CodeGen/X86/widen_conv-4.ll
index 90c4bbe6bb7..e574407f980 100644
--- a/llvm/test/CodeGen/X86/widen_conv-4.ll
+++ b/llvm/test/CodeGen/X86/widen_conv-4.ll
@@ -91,7 +91,7 @@ define void @convert_v3i8_to_v3f32(<3 x float>* %dst.addr, <3 x i8>* %src.addr)
; X86-SSE2-NEXT: shll $8, %edx
; X86-SSE2-NEXT: movzbl (%esp), %esi
; X86-SSE2-NEXT: orl %edx, %esi
-; X86-SSE2-NEXT: pinsrw $0, %esi, %xmm0
+; X86-SSE2-NEXT: movd %esi, %xmm0
; X86-SSE2-NEXT: movzbl 2(%ecx), %ecx
; X86-SSE2-NEXT: pinsrw $1, %ecx, %xmm0
; X86-SSE2-NEXT: pxor %xmm1, %xmm1
@@ -140,7 +140,7 @@ define void @convert_v3i8_to_v3f32(<3 x float>* %dst.addr, <3 x i8>* %src.addr)
; X64-SSE2-NEXT: shll $8, %eax
; X64-SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
; X64-SSE2-NEXT: orl %eax, %ecx
-; X64-SSE2-NEXT: pinsrw $0, %ecx, %xmm0
+; X64-SSE2-NEXT: movd %ecx, %xmm0
; X64-SSE2-NEXT: movzbl 2(%rsi), %eax
; X64-SSE2-NEXT: pinsrw $1, %eax, %xmm0
; X64-SSE2-NEXT: pxor %xmm1, %xmm1