summaryrefslogtreecommitdiffstats
path: root/llvm
diff options
context:
space:
mode:
Diffstat (limited to 'llvm')
-rw-r--r--llvm/include/llvm/Target/TargetLowering.h4
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp133
-rw-r--r--llvm/lib/Target/X86/X86ISelLowering.cpp3
-rw-r--r--llvm/lib/Target/X86/X86ISelLowering.h4
-rw-r--r--llvm/test/CodeGen/X86/vector-sext.ll223
-rw-r--r--llvm/test/CodeGen/X86/vector-zext.ll42
6 files changed, 189 insertions, 220 deletions
diff --git a/llvm/include/llvm/Target/TargetLowering.h b/llvm/include/llvm/Target/TargetLowering.h
index 12c4f9bb547..840dd03822b 100644
--- a/llvm/include/llvm/Target/TargetLowering.h
+++ b/llvm/include/llvm/Target/TargetLowering.h
@@ -1518,6 +1518,10 @@ public:
return false;
}
+ /// Return true if folding a vector load into ExtVal (a sign, zero, or any
+ /// extend node) is profitable.
+ virtual bool isVectorLoadExtDesirable(SDValue ExtVal) const { return false; }
+
/// Return true if an fneg operation is free to the point where it is never
/// worthwhile to replace it with a bitwise operation.
virtual bool isFNegFree(EVT VT) const {
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 466e3606070..420883beafc 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -327,6 +327,7 @@ namespace {
SDValue SimplifyNodeWithTwoResults(SDNode *N, unsigned LoOp,
unsigned HiOp);
SDValue CombineConsecutiveLoads(SDNode *N, EVT VT);
+ SDValue CombineExtLoad(SDNode *N);
SDValue ConstantFoldBITCASTofBUILD_VECTOR(SDNode *, EVT);
SDValue BuildSDIV(SDNode *N);
SDValue BuildSDIVPow2(SDNode *N);
@@ -5307,6 +5308,102 @@ void DAGCombiner::ExtendSetCCUses(const SmallVectorImpl<SDNode *> &SetCCs,
}
}
+// FIXME: Bring more similar combines here, common to sext/zext (maybe aext?).
+SDValue DAGCombiner::CombineExtLoad(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ EVT DstVT = N->getValueType(0);
+ EVT SrcVT = N0.getValueType();
+
+ assert((N->getOpcode() == ISD::SIGN_EXTEND ||
+ N->getOpcode() == ISD::ZERO_EXTEND) &&
+ "Unexpected node type (not an extend)!");
+
+ // fold (sext (load x)) to multiple smaller sextloads; same for zext.
+ // For example, on a target with legal v4i32, but illegal v8i32, turn:
+ // (v8i32 (sext (v8i16 (load x))))
+ // into:
+ // (v8i32 (concat_vectors (v4i32 (sextload x)),
+ // (v4i32 (sextload (x + 16)))))
+ // Where uses of the original load, i.e.:
+ // (v8i16 (load x))
+ // are replaced with:
+ // (v8i16 (truncate
+ // (v8i32 (concat_vectors (v4i32 (sextload x)),
+ // (v4i32 (sextload (x + 16)))))))
+ //
+ // This combine is only applicable to illegal, but splittable, vectors.
+ // All legal types, and illegal non-vector types, are handled elsewhere.
+ // This combine is controlled by TargetLowering::isVectorLoadExtDesirable.
+ //
+ if (N0->getOpcode() != ISD::LOAD)
+ return SDValue();
+
+ LoadSDNode *LN0 = cast<LoadSDNode>(N0);
+
+ if (!ISD::isNON_EXTLoad(LN0) || !ISD::isUNINDEXEDLoad(LN0) ||
+ !N0.hasOneUse() || LN0->isVolatile() || !DstVT.isVector() ||
+ !DstVT.isPow2VectorType() || !TLI.isVectorLoadExtDesirable(SDValue(N, 0)))
+ return SDValue();
+
+ SmallVector<SDNode *, 4> SetCCs;
+ if (!ExtendUsesToFormExtLoad(N, N0, N->getOpcode(), SetCCs, TLI))
+ return SDValue();
+
+ ISD::LoadExtType ExtType =
+ N->getOpcode() == ISD::SIGN_EXTEND ? ISD::SEXTLOAD : ISD::ZEXTLOAD;
+
+ // Try to split the vector types to get down to legal types.
+ EVT SplitSrcVT = SrcVT;
+ EVT SplitDstVT = DstVT;
+ while (!TLI.isLoadExtLegalOrCustom(ExtType, SplitDstVT, SplitSrcVT) &&
+ SplitSrcVT.getVectorNumElements() > 1) {
+ SplitDstVT = DAG.GetSplitDestVTs(SplitDstVT).first;
+ SplitSrcVT = DAG.GetSplitDestVTs(SplitSrcVT).first;
+ }
+
+ if (!TLI.isLoadExtLegalOrCustom(ExtType, SplitDstVT, SplitSrcVT))
+ return SDValue();
+
+ SDLoc DL(N);
+ const unsigned NumSplits =
+ DstVT.getVectorNumElements() / SplitDstVT.getVectorNumElements();
+ const unsigned Stride = SplitSrcVT.getStoreSize();
+ SmallVector<SDValue, 4> Loads;
+ SmallVector<SDValue, 4> Chains;
+
+ SDValue BasePtr = LN0->getBasePtr();
+ for (unsigned Idx = 0; Idx < NumSplits; Idx++) {
+ const unsigned Offset = Idx * Stride;
+ const unsigned Align = MinAlign(LN0->getAlignment(), Offset);
+
+ SDValue SplitLoad = DAG.getExtLoad(
+ ExtType, DL, SplitDstVT, LN0->getChain(), BasePtr,
+ LN0->getPointerInfo().getWithOffset(Offset), SplitSrcVT,
+ LN0->isVolatile(), LN0->isNonTemporal(), LN0->isInvariant(),
+ Align, LN0->getAAInfo());
+
+ BasePtr = DAG.getNode(ISD::ADD, DL, BasePtr.getValueType(), BasePtr,
+ DAG.getConstant(Stride, BasePtr.getValueType()));
+
+ Loads.push_back(SplitLoad.getValue(0));
+ Chains.push_back(SplitLoad.getValue(1));
+ }
+
+ SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
+ SDValue NewValue = DAG.getNode(ISD::CONCAT_VECTORS, DL, DstVT, Loads);
+
+ CombineTo(N, NewValue);
+
+ // Replace uses of the original load (before extension)
+ // with a truncate of the concatenated sextloaded vectors.
+ SDValue Trunc =
+ DAG.getNode(ISD::TRUNCATE, SDLoc(N0), N0.getValueType(), NewValue);
+ CombineTo(N0.getNode(), Trunc, NewChain);
+ ExtendSetCCUses(SetCCs, Trunc, NewValue, DL,
+ (ISD::NodeType)N->getOpcode());
+ return SDValue(N, 0); // Return N so it doesn't get rechecked!
+}
+
SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) {
SDValue N0 = N->getOperand(0);
EVT VT = N->getValueType(0);
@@ -5373,17 +5470,18 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) {
}
// fold (sext (load x)) -> (sext (truncate (sextload x)))
- // None of the supported targets knows how to perform load and sign extend
- // on vectors in one instruction. We only perform this transformation on
- // scalars.
- if (ISD::isNON_EXTLoad(N0.getNode()) && !VT.isVector() &&
- ISD::isUNINDEXEDLoad(N0.getNode()) &&
- ((!LegalOperations && !cast<LoadSDNode>(N0)->isVolatile()) ||
+ // Only generate vector extloads when 1) they're legal, and 2) they are
+ // deemed desirable by the target.
+ if (ISD::isNON_EXTLoad(N0.getNode()) && ISD::isUNINDEXEDLoad(N0.getNode()) &&
+ ((!LegalOperations && !VT.isVector() &&
+ !cast<LoadSDNode>(N0)->isVolatile()) ||
TLI.isLoadExtLegal(ISD::SEXTLOAD, VT, N0.getValueType()))) {
bool DoXform = true;
SmallVector<SDNode*, 4> SetCCs;
if (!N0.hasOneUse())
DoXform = ExtendUsesToFormExtLoad(N, N0, ISD::SIGN_EXTEND, SetCCs, TLI);
+ if (VT.isVector())
+ DoXform &= TLI.isVectorLoadExtDesirable(SDValue(N, 0));
if (DoXform) {
LoadSDNode *LN0 = cast<LoadSDNode>(N0);
SDValue ExtLoad = DAG.getExtLoad(ISD::SEXTLOAD, SDLoc(N), VT,
@@ -5400,6 +5498,11 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) {
}
}
+ // fold (sext (load x)) to multiple smaller sextloads.
+ // Only on illegal but splittable vectors.
+ if (SDValue ExtLoad = CombineExtLoad(N))
+ return ExtLoad;
+
// fold (sext (sextload x)) -> (sext (truncate (sextload x)))
// fold (sext ( extload x)) -> (sext (truncate (sextload x)))
if ((ISD::isSEXTLoad(N0.getNode()) || ISD::isEXTLoad(N0.getNode())) &&
@@ -5663,17 +5766,18 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) {
}
// fold (zext (load x)) -> (zext (truncate (zextload x)))
- // None of the supported targets knows how to perform load and vector_zext
- // on vectors in one instruction. We only perform this transformation on
- // scalars.
- if (ISD::isNON_EXTLoad(N0.getNode()) && !VT.isVector() &&
- ISD::isUNINDEXEDLoad(N0.getNode()) &&
- ((!LegalOperations && !cast<LoadSDNode>(N0)->isVolatile()) ||
+ // Only generate vector extloads when 1) they're legal, and 2) they are
+ // deemed desirable by the target.
+ if (ISD::isNON_EXTLoad(N0.getNode()) && ISD::isUNINDEXEDLoad(N0.getNode()) &&
+ ((!LegalOperations && !VT.isVector() &&
+ !cast<LoadSDNode>(N0)->isVolatile()) ||
TLI.isLoadExtLegal(ISD::ZEXTLOAD, VT, N0.getValueType()))) {
bool DoXform = true;
SmallVector<SDNode*, 4> SetCCs;
if (!N0.hasOneUse())
DoXform = ExtendUsesToFormExtLoad(N, N0, ISD::ZERO_EXTEND, SetCCs, TLI);
+ if (VT.isVector())
+ DoXform &= TLI.isVectorLoadExtDesirable(SDValue(N, 0));
if (DoXform) {
LoadSDNode *LN0 = cast<LoadSDNode>(N0);
SDValue ExtLoad = DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(N), VT,
@@ -5691,6 +5795,11 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) {
}
}
+ // fold (zext (load x)) to multiple smaller zextloads.
+ // Only on illegal but splittable vectors.
+ if (SDValue ExtLoad = CombineExtLoad(N))
+ return ExtLoad;
+
// fold (zext (and/or/xor (load x), cst)) ->
// (and/or/xor (zextload x), (zext cst))
if ((N0.getOpcode() == ISD::AND || N0.getOpcode() == ISD::OR ||
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index fa19e7d0635..08b2d04c5d0 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -16294,6 +16294,7 @@ static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget *Subtarget,
// may emit an illegal shuffle but the expansion is still better than scalar
// code. We generate X86ISD::VSEXT for SEXTLOADs if it's available, otherwise
// we'll emit a shuffle and a arithmetic shift.
+// FIXME: Is the expansion actually better than scalar code? It doesn't seem so.
// TODO: It is possible to support ZExt by zeroing the undef values during
// the shuffle phase or after the shuffle.
static SDValue LowerExtendedLoad(SDValue Op, const X86Subtarget *Subtarget,
@@ -20399,6 +20400,8 @@ bool X86TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
return false;
}
+bool X86TargetLowering::isVectorLoadExtDesirable(SDValue) const { return true; }
+
bool
X86TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
if (!(Subtarget->hasFMA() || Subtarget->hasFMA4()))
diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h
index d90dcc71451..eb904f31788 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -744,6 +744,10 @@ namespace llvm {
bool isZExtFree(EVT VT1, EVT VT2) const override;
bool isZExtFree(SDValue Val, EVT VT2) const override;
+ /// Return true if folding a vector load into ExtVal (a sign, zero, or any
+ /// extend node) is profitable.
+ bool isVectorLoadExtDesirable(SDValue) const override;
+
/// Return true if an FMA operation is faster than a pair of fmul and fadd
/// instructions. fmuladd intrinsics will be expanded to FMAs when this
/// method returns true, otherwise fmuladd is expanded to fmul + fadd.
diff --git a/llvm/test/CodeGen/X86/vector-sext.ll b/llvm/test/CodeGen/X86/vector-sext.ll
index 758833155a9..962d0381cd2 100644
--- a/llvm/test/CodeGen/X86/vector-sext.ll
+++ b/llvm/test/CodeGen/X86/vector-sext.ll
@@ -523,46 +523,35 @@ define <4 x i64> @sext_4i1_to_4i64(<4 x i1> %mask) {
define <16 x i16> @sext_16i8_to_16i16(<16 x i8> *%ptr) {
; SSE2-LABEL: sext_16i8_to_16i16:
; SSE2: # BB#0: # %entry
-; SSE2-NEXT: movdqa (%rdi), %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: movq (%rdi), %xmm0
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: psllw $8, %xmm0
; SSE2-NEXT: psraw $8, %xmm0
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; SSE2-NEXT: psllw $8, %xmm1
+; SSE2-NEXT: movq 8(%rdi), %xmm1
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: psraw $8, %xmm1
; SSE2-NEXT: retq
;
; SSSE3-LABEL: sext_16i8_to_16i16:
; SSSE3: # BB#0: # %entry
-; SSSE3-NEXT: movdqa (%rdi), %xmm1
-; SSSE3-NEXT: movdqa %xmm1, %xmm0
+; SSSE3-NEXT: movq (%rdi), %xmm0
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSSE3-NEXT: psllw $8, %xmm0
; SSSE3-NEXT: psraw $8, %xmm0
-; SSSE3-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; SSSE3-NEXT: psllw $8, %xmm1
+; SSSE3-NEXT: movq 8(%rdi), %xmm1
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSSE3-NEXT: psraw $8, %xmm1
; SSSE3-NEXT: retq
;
; SSE41-LABEL: sext_16i8_to_16i16:
; SSE41: # BB#0: # %entry
-; SSE41-NEXT: movdqa (%rdi), %xmm1
-; SSE41-NEXT: pmovzxbw %xmm1, %xmm0
-; SSE41-NEXT: psllw $8, %xmm0
-; SSE41-NEXT: psraw $8, %xmm0
-; SSE41-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; SSE41-NEXT: psllw $8, %xmm1
-; SSE41-NEXT: psraw $8, %xmm1
+; SSE41-NEXT: pmovsxbw (%rdi), %xmm0
+; SSE41-NEXT: pmovsxbw 8(%rdi), %xmm1
; SSE41-NEXT: retq
;
; AVX1-LABEL: sext_16i8_to_16i16:
; AVX1: # BB#0: # %entry
-; AVX1-NEXT: vmovdqa (%rdi), %xmm0
-; AVX1-NEXT: vpmovsxbw %xmm0, %xmm1
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; AVX1-NEXT: vpmovsxbw %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: vpmovsxbw (%rdi), %xmm0
+; AVX1-NEXT: vpmovsxbw 8(%rdi), %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: sext_16i8_to_16i16:
@@ -573,13 +562,8 @@ define <16 x i16> @sext_16i8_to_16i16(<16 x i8> *%ptr) {
; X32-SSE41-LABEL: sext_16i8_to_16i16:
; X32-SSE41: # BB#0: # %entry
; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-SSE41-NEXT: movdqa (%eax), %xmm1
-; X32-SSE41-NEXT: pmovzxbw %xmm1, %xmm0
-; X32-SSE41-NEXT: psllw $8, %xmm0
-; X32-SSE41-NEXT: psraw $8, %xmm0
-; X32-SSE41-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; X32-SSE41-NEXT: psllw $8, %xmm1
-; X32-SSE41-NEXT: psraw $8, %xmm1
+; X32-SSE41-NEXT: pmovsxbw (%eax), %xmm0
+; X32-SSE41-NEXT: pmovsxbw 8(%eax), %xmm1
; X32-SSE41-NEXT: retl
entry:
%X = load <16 x i8>* %ptr
@@ -705,73 +689,36 @@ define <4 x i64> @sext_4i8_to_4i64(<4 x i8> %mask) {
define <4 x i64> @load_sext_4i8_to_4i64(<4 x i8> *%ptr) {
; SSE2-LABEL: load_sext_4i8_to_4i64:
; SSE2: # BB#0: # %entry
-; SSE2-NEXT: movd (%rdi), %xmm1
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,1,1,3]
-; SSE2-NEXT: movd %xmm2, %rax
-; SSE2-NEXT: movsbq %al, %rax
+; SSE2-NEXT: movsbq 1(%rdi), %rax
+; SSE2-NEXT: movd %rax, %xmm1
+; SSE2-NEXT: movsbq (%rdi), %rax
; SSE2-NEXT: movd %rax, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
-; SSE2-NEXT: movd %xmm2, %rax
-; SSE2-NEXT: movsbq %al, %rax
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-NEXT: movsbq 3(%rdi), %rax
; SSE2-NEXT: movd %rax, %xmm2
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,2,3,3]
-; SSE2-NEXT: movd %xmm2, %rax
-; SSE2-NEXT: movsbq %al, %rax
+; SSE2-NEXT: movsbq 2(%rdi), %rax
; SSE2-NEXT: movd %rax, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
-; SSE2-NEXT: movd %xmm2, %rax
-; SSE2-NEXT: movsbq %al, %rax
-; SSE2-NEXT: movd %rax, %xmm2
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: load_sext_4i8_to_4i64:
; SSSE3: # BB#0: # %entry
-; SSSE3-NEXT: movd (%rdi), %xmm1
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,1,1,3]
-; SSSE3-NEXT: movd %xmm2, %rax
-; SSSE3-NEXT: movsbq %al, %rax
+; SSSE3-NEXT: movsbq 1(%rdi), %rax
+; SSSE3-NEXT: movd %rax, %xmm1
+; SSSE3-NEXT: movsbq (%rdi), %rax
; SSSE3-NEXT: movd %rax, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
-; SSSE3-NEXT: movd %xmm2, %rax
-; SSSE3-NEXT: movsbq %al, %rax
+; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSSE3-NEXT: movsbq 3(%rdi), %rax
; SSSE3-NEXT: movd %rax, %xmm2
-; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
-; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,2,3,3]
-; SSSE3-NEXT: movd %xmm2, %rax
-; SSSE3-NEXT: movsbq %al, %rax
+; SSSE3-NEXT: movsbq 2(%rdi), %rax
; SSSE3-NEXT: movd %rax, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
-; SSSE3-NEXT: movd %xmm2, %rax
-; SSSE3-NEXT: movsbq %al, %rax
-; SSSE3-NEXT: movd %rax, %xmm2
; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: load_sext_4i8_to_4i64:
; SSE41: # BB#0: # %entry
-; SSE41-NEXT: pmovzxbd (%rdi), %xmm1
-; SSE41-NEXT: pmovzxdq %xmm1, %xmm0
-; SSE41-NEXT: pextrq $1, %xmm0, %rax
-; SSE41-NEXT: movsbq %al, %rax
-; SSE41-NEXT: movd %rax, %xmm2
-; SSE41-NEXT: movd %xmm0, %rax
-; SSE41-NEXT: movsbq %al, %rax
-; SSE41-NEXT: movd %rax, %xmm0
-; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
-; SSE41-NEXT: pextrq $1, %xmm1, %rax
-; SSE41-NEXT: movsbq %al, %rax
-; SSE41-NEXT: movd %rax, %xmm2
-; SSE41-NEXT: movd %xmm1, %rax
-; SSE41-NEXT: movsbq %al, %rax
-; SSE41-NEXT: movd %rax, %xmm1
-; SSE41-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; SSE41-NEXT: pmovsxbq (%rdi), %xmm0
+; SSE41-NEXT: pmovsxbq 2(%rdi), %xmm1
; SSE41-NEXT: retq
;
; AVX1-LABEL: load_sext_4i8_to_4i64:
@@ -791,30 +738,8 @@ define <4 x i64> @load_sext_4i8_to_4i64(<4 x i8> *%ptr) {
; X32-SSE41-LABEL: load_sext_4i8_to_4i64:
; X32-SSE41: # BB#0: # %entry
; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-SSE41-NEXT: movd (%eax), %xmm0
-; X32-SSE41-NEXT: pmovzxbd %xmm0, %xmm1
-; X32-SSE41-NEXT: pmovzxbq %xmm0, %xmm2
-; X32-SSE41-NEXT: movd %xmm2, %eax
-; X32-SSE41-NEXT: movsbl %al, %eax
-; X32-SSE41-NEXT: movd %eax, %xmm0
-; X32-SSE41-NEXT: sarl $31, %eax
-; X32-SSE41-NEXT: pinsrd $1, %eax, %xmm0
-; X32-SSE41-NEXT: pextrd $2, %xmm2, %eax
-; X32-SSE41-NEXT: movsbl %al, %eax
-; X32-SSE41-NEXT: pinsrd $2, %eax, %xmm0
-; X32-SSE41-NEXT: sarl $31, %eax
-; X32-SSE41-NEXT: pinsrd $3, %eax, %xmm0
-; X32-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,2,3,3]
-; X32-SSE41-NEXT: movd %xmm2, %eax
-; X32-SSE41-NEXT: movsbl %al, %eax
-; X32-SSE41-NEXT: movd %eax, %xmm1
-; X32-SSE41-NEXT: sarl $31, %eax
-; X32-SSE41-NEXT: pinsrd $1, %eax, %xmm1
-; X32-SSE41-NEXT: pextrd $2, %xmm2, %eax
-; X32-SSE41-NEXT: movsbl %al, %eax
-; X32-SSE41-NEXT: pinsrd $2, %eax, %xmm1
-; X32-SSE41-NEXT: sarl $31, %eax
-; X32-SSE41-NEXT: pinsrd $3, %eax, %xmm1
+; X32-SSE41-NEXT: pmovsxbq (%eax), %xmm0
+; X32-SSE41-NEXT: pmovsxbq 2(%eax), %xmm1
; X32-SSE41-NEXT: retl
entry:
%X = load <4 x i8>* %ptr
@@ -825,72 +750,36 @@ entry:
define <4 x i64> @load_sext_4i16_to_4i64(<4 x i16> *%ptr) {
; SSE2-LABEL: load_sext_4i16_to_4i64:
; SSE2: # BB#0: # %entry
-; SSE2-NEXT: movq (%rdi), %xmm1
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,1,1,3]
-; SSE2-NEXT: movd %xmm2, %rax
-; SSE2-NEXT: movswq %ax, %rax
+; SSE2-NEXT: movswq 2(%rdi), %rax
+; SSE2-NEXT: movd %rax, %xmm1
+; SSE2-NEXT: movswq (%rdi), %rax
; SSE2-NEXT: movd %rax, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
-; SSE2-NEXT: movd %xmm2, %rax
-; SSE2-NEXT: movswq %ax, %rax
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-NEXT: movswq 6(%rdi), %rax
; SSE2-NEXT: movd %rax, %xmm2
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,2,3,3]
-; SSE2-NEXT: movd %xmm2, %rax
-; SSE2-NEXT: movswq %ax, %rax
+; SSE2-NEXT: movswq 4(%rdi), %rax
; SSE2-NEXT: movd %rax, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
-; SSE2-NEXT: movd %xmm2, %rax
-; SSE2-NEXT: movswq %ax, %rax
-; SSE2-NEXT: movd %rax, %xmm2
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: load_sext_4i16_to_4i64:
; SSSE3: # BB#0: # %entry
-; SSSE3-NEXT: movq (%rdi), %xmm1
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,1,1,3]
-; SSSE3-NEXT: movd %xmm2, %rax
-; SSSE3-NEXT: movswq %ax, %rax
+; SSSE3-NEXT: movswq 2(%rdi), %rax
+; SSSE3-NEXT: movd %rax, %xmm1
+; SSSE3-NEXT: movswq (%rdi), %rax
; SSSE3-NEXT: movd %rax, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
-; SSSE3-NEXT: movd %xmm2, %rax
-; SSSE3-NEXT: movswq %ax, %rax
+; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSSE3-NEXT: movswq 6(%rdi), %rax
; SSSE3-NEXT: movd %rax, %xmm2
-; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
-; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,2,3,3]
-; SSSE3-NEXT: movd %xmm2, %rax
-; SSSE3-NEXT: movswq %ax, %rax
+; SSSE3-NEXT: movswq 4(%rdi), %rax
; SSSE3-NEXT: movd %rax, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
-; SSSE3-NEXT: movd %xmm2, %rax
-; SSSE3-NEXT: movswq %ax, %rax
-; SSSE3-NEXT: movd %rax, %xmm2
; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: load_sext_4i16_to_4i64:
; SSE41: # BB#0: # %entry
-; SSE41-NEXT: movq (%rdi), %xmm0
-; SSE41-NEXT: pmovzxwd %xmm0, %xmm1
-; SSE41-NEXT: pmovzxwq %xmm0, %xmm0
-; SSE41-NEXT: pextrq $1, %xmm0, %rax
-; SSE41-NEXT: movswq %ax, %rax
-; SSE41-NEXT: movd %rax, %xmm2
-; SSE41-NEXT: movd %xmm0, %rax
-; SSE41-NEXT: movswq %ax, %rax
-; SSE41-NEXT: movd %rax, %xmm0
-; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
-; SSE41-NEXT: pextrq $1, %xmm1, %rax
-; SSE41-NEXT: movswq %ax, %rax
-; SSE41-NEXT: movd %rax, %xmm2
-; SSE41-NEXT: movd %xmm1, %rax
-; SSE41-NEXT: movswq %ax, %rax
-; SSE41-NEXT: movd %rax, %xmm1
-; SSE41-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; SSE41-NEXT: pmovsxwq (%rdi), %xmm0
+; SSE41-NEXT: pmovsxwq 4(%rdi), %xmm1
; SSE41-NEXT: retq
;
; AVX1-LABEL: load_sext_4i16_to_4i64:
@@ -910,30 +799,8 @@ define <4 x i64> @load_sext_4i16_to_4i64(<4 x i16> *%ptr) {
; X32-SSE41-LABEL: load_sext_4i16_to_4i64:
; X32-SSE41: # BB#0: # %entry
; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-SSE41-NEXT: movsd (%eax), %xmm0
-; X32-SSE41-NEXT: pmovzxwd %xmm0, %xmm1
-; X32-SSE41-NEXT: pmovzxwq %xmm0, %xmm2
-; X32-SSE41-NEXT: movd %xmm2, %eax
-; X32-SSE41-NEXT: cwtl
-; X32-SSE41-NEXT: movd %eax, %xmm0
-; X32-SSE41-NEXT: sarl $31, %eax
-; X32-SSE41-NEXT: pinsrd $1, %eax, %xmm0
-; X32-SSE41-NEXT: pextrd $2, %xmm2, %eax
-; X32-SSE41-NEXT: cwtl
-; X32-SSE41-NEXT: pinsrd $2, %eax, %xmm0
-; X32-SSE41-NEXT: sarl $31, %eax
-; X32-SSE41-NEXT: pinsrd $3, %eax, %xmm0
-; X32-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,2,3,3]
-; X32-SSE41-NEXT: movd %xmm2, %eax
-; X32-SSE41-NEXT: cwtl
-; X32-SSE41-NEXT: movd %eax, %xmm1
-; X32-SSE41-NEXT: sarl $31, %eax
-; X32-SSE41-NEXT: pinsrd $1, %eax, %xmm1
-; X32-SSE41-NEXT: pextrd $2, %xmm2, %eax
-; X32-SSE41-NEXT: cwtl
-; X32-SSE41-NEXT: pinsrd $2, %eax, %xmm1
-; X32-SSE41-NEXT: sarl $31, %eax
-; X32-SSE41-NEXT: pinsrd $3, %eax, %xmm1
+; X32-SSE41-NEXT: pmovsxwq (%eax), %xmm0
+; X32-SSE41-NEXT: pmovsxwq 4(%eax), %xmm1
; X32-SSE41-NEXT: retl
entry:
%X = load <4 x i16>* %ptr
diff --git a/llvm/test/CodeGen/X86/vector-zext.ll b/llvm/test/CodeGen/X86/vector-zext.ll
index 2ec3048af35..3c0e5ddf035 100644
--- a/llvm/test/CodeGen/X86/vector-zext.ll
+++ b/llvm/test/CodeGen/X86/vector-zext.ll
@@ -230,20 +230,14 @@ define <16 x i16> @load_zext_16i8_to_16i16(<16 x i8> *%ptr) {
; SSE41-LABEL: load_zext_16i8_to_16i16:
; SSE41: # BB#0: # %entry
-; SSE41-NEXT: movdqa (%rdi), %xmm1
-; SSE41-NEXT: pmovzxbw %xmm1, %xmm0
-; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
-; SSE41-NEXT: pand %xmm2, %xmm0
-; SSE41-NEXT: punpckhbw %xmm1, %xmm1 # xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; SSE41-NEXT: pand %xmm2, %xmm1
+; SSE41-NEXT: pmovzxbw (%rdi), %xmm0
+; SSE41-NEXT: pmovzxbw 8(%rdi), %xmm1
; SSE41-NEXT: retq
; AVX1-LABEL: load_zext_16i8_to_16i16:
; AVX1: # BB#0: # %entry
-; AVX1-NEXT: vmovdqa (%rdi), %xmm0
-; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vpunpckhbw %xmm1, %xmm0, %xmm1 # xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
-; AVX1-NEXT: vpmovzxbw %xmm0, %xmm0
+; AVX1-NEXT: vpmovzxbw (%rdi), %xmm0
+; AVX1-NEXT: vpmovzxbw 8(%rdi), %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
@@ -282,20 +276,14 @@ define <8 x i32> @load_zext_8i16_to_8i32(<8 x i16> *%ptr) {
; SSE41-LABEL: load_zext_8i16_to_8i32:
; SSE41: # BB#0: # %entry
-; SSE41-NEXT: movdqa (%rdi), %xmm1
-; SSE41-NEXT: pmovzxwd %xmm1, %xmm0
-; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535]
-; SSE41-NEXT: pand %xmm2, %xmm0
-; SSE41-NEXT: punpckhwd %xmm1, %xmm1 # xmm1 = xmm1[4,4,5,5,6,6,7,7]
-; SSE41-NEXT: pand %xmm2, %xmm1
+; SSE41-NEXT: pmovzxwd (%rdi), %xmm0
+; SSE41-NEXT: pmovzxwd 8(%rdi), %xmm1
; SSE41-NEXT: retq
; AVX1-LABEL: load_zext_8i16_to_8i32:
; AVX1: # BB#0: # %entry
-; AVX1-NEXT: vmovdqa (%rdi), %xmm0
-; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vpunpckhwd %xmm1, %xmm0, %xmm1 # xmm1 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; AVX1-NEXT: vpmovzxwd %xmm0, %xmm0
+; AVX1-NEXT: vpmovzxwd (%rdi), %xmm0
+; AVX1-NEXT: vpmovzxwd 8(%rdi), %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
@@ -332,20 +320,14 @@ define <4 x i64> @load_zext_4i32_to_4i64(<4 x i32> *%ptr) {
; SSE41-LABEL: load_zext_4i32_to_4i64:
; SSE41: # BB#0: # %entry
-; SSE41-NEXT: movdqa (%rdi), %xmm1
-; SSE41-NEXT: pmovzxdq %xmm1, %xmm0
-; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [4294967295,4294967295]
-; SSE41-NEXT: pand %xmm2, %xmm0
-; SSE41-NEXT: pshufd $250, %xmm1, %xmm1 # xmm1 = xmm1[2,2,3,3]
-; SSE41-NEXT: pand %xmm2, %xmm1
+; SSE41-NEXT: pmovzxdq (%rdi), %xmm0
+; SSE41-NEXT: pmovzxdq 8(%rdi), %xmm1
; SSE41-NEXT: retq
; AVX1-LABEL: load_zext_4i32_to_4i64:
; AVX1: # BB#0: # %entry
-; AVX1-NEXT: vmovdqa (%rdi), %xmm0
-; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vpunpckhdq %xmm1, %xmm0, %xmm1 # xmm1 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX1-NEXT: vpmovzxdq %xmm0, %xmm0
+; AVX1-NEXT: vpmovzxdq (%rdi), %xmm0
+; AVX1-NEXT: vpmovzxdq 8(%rdi), %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
OpenPOWER on IntegriCloud