-rw-r--r--  llvm/lib/Target/X86/X86ISelLowering.cpp              | 68
-rw-r--r--  llvm/lib/Target/X86/X86InstrAVX512.td                |  4
-rw-r--r--  llvm/lib/Target/X86/X86InstrSSE.td                   |  2
-rw-r--r--  llvm/test/CodeGen/X86/insertps-combine.ll            |  4
-rw-r--r--  llvm/test/CodeGen/X86/merge-consecutive-loads-128.ll | 34
-rw-r--r--  llvm/test/CodeGen/X86/merge-consecutive-loads-256.ll | 14
-rw-r--r--  llvm/test/CodeGen/X86/merge-consecutive-loads-512.ll |  8
-rw-r--r--  llvm/test/CodeGen/X86/vec_loadsingles.ll             | 85
-rw-r--r--  llvm/test/CodeGen/X86/vec_set-6.ll                   |  2
9 files changed, 118 insertions, 103 deletions
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 9c0b5185017..6675cafd862 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -5642,44 +5642,46 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
 
   // VZEXT_LOAD - consecutive load/undefs followed by zeros/undefs.
   if (IsConsecutiveLoad && FirstLoadedElt == 0 && LoadSize == 64 &&
-      ((VT.is128BitVector() && TLI.isTypeLegal(MVT::v2i64)) ||
-       (VT.is256BitVector() && TLI.isTypeLegal(MVT::v4i64)) ||
-       (VT.is512BitVector() && TLI.isTypeLegal(MVT::v8i64)))) {
-    MVT VecVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
-    SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
-    SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
-    SDValue ResNode =
-        DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops, MVT::i64,
-                                LDBase->getPointerInfo(),
-                                LDBase->getAlignment(),
-                                false/*isVolatile*/, true/*ReadMem*/,
-                                false/*WriteMem*/);
-
-    // Make sure the newly-created LOAD is in the same position as LDBase in
-    // terms of dependency. We create a TokenFactor for LDBase and ResNode, and
-    // update uses of LDBase's output chain to use the TokenFactor.
-    if (LDBase->hasAnyUseOfValue(1)) {
-      SDValue NewChain =
-          DAG.getNode(ISD::TokenFactor, DL, MVT::Other, SDValue(LDBase, 1),
-                      SDValue(ResNode.getNode(), 1));
-      DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain);
-      DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1),
-                             SDValue(ResNode.getNode(), 1));
-    }
+      ((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))) {
+    MVT VecSVT = VT.isFloatingPoint() ? MVT::f64 : MVT::i64;
+    MVT VecVT = MVT::getVectorVT(VecSVT, VT.getSizeInBits() / 64);
+    if (TLI.isTypeLegal(VecVT)) {
+      SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
+      SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
+      SDValue ResNode =
+          DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops, VecSVT,
+                                  LDBase->getPointerInfo(),
+                                  LDBase->getAlignment(),
+                                  false/*isVolatile*/, true/*ReadMem*/,
+                                  false/*WriteMem*/);
+
+      // Make sure the newly-created LOAD is in the same position as LDBase in
+      // terms of dependency. We create a TokenFactor for LDBase and ResNode,
+      // and update uses of LDBase's output chain to use the TokenFactor.
+      if (LDBase->hasAnyUseOfValue(1)) {
+        SDValue NewChain =
+            DAG.getNode(ISD::TokenFactor, DL, MVT::Other, SDValue(LDBase, 1),
+                        SDValue(ResNode.getNode(), 1));
+        DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain);
+        DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1),
+                               SDValue(ResNode.getNode(), 1));
+      }
 
-    return DAG.getBitcast(VT, ResNode);
+      return DAG.getBitcast(VT, ResNode);
+    }
   }
 
   // VZEXT_MOVL - consecutive 32-bit load/undefs followed by zeros/undefs.
   if (IsConsecutiveLoad && FirstLoadedElt == 0 && LoadSize == 32 &&
-      ((VT.is128BitVector() && TLI.isTypeLegal(MVT::v4i32)) ||
-       (VT.is256BitVector() && TLI.isTypeLegal(MVT::v8i32)) ||
-       (VT.is512BitVector() && TLI.isTypeLegal(MVT::v16i32)))) {
-    MVT VecVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
-    SDValue V = CreateLoad(MVT::i32, LDBase);
-    V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, V);
-    V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, VecVT, V);
-    return DAG.getBitcast(VT, V);
+      ((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))) {
+    MVT VecSVT = VT.isFloatingPoint() ? MVT::f32 : MVT::i32;
+    MVT VecVT = MVT::getVectorVT(VecSVT, VT.getSizeInBits() / 32);
+    if (TLI.isTypeLegal(VecVT)) {
+      SDValue V = CreateLoad(VecSVT, LDBase);
+      V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, V);
+      V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, VecVT, V);
+      return DAG.getBitcast(VT, V);
+    }
   }
 
   return SDValue();
diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td
index 4b61edde262..273b2e80335 100644
--- a/llvm/lib/Target/X86/X86InstrAVX512.td
+++ b/llvm/lib/Target/X86/X86InstrAVX512.td
@@ -3046,6 +3046,8 @@ let Predicates = [HasAVX512] in {
   def : Pat<(v4f64 (X86vzmovl (insert_subvector undef,
                    (v2f64 (scalar_to_vector (loadf64 addr:$src))), (iPTR 0)))),
             (SUBREG_TO_REG (i32 0), (VMOVSDZrm addr:$src), sub_xmm)>;
+  def : Pat<(v4f64 (X86vzload addr:$src)),
+            (SUBREG_TO_REG (i32 0), (VMOVSDZrm addr:$src), sub_xmm)>;
 
   // Represent the same patterns above but in the form they appear for
   // 512-bit types
@@ -3058,6 +3060,8 @@ let Predicates = [HasAVX512] in {
   def : Pat<(v8f64 (X86vzmovl (insert_subvector undef,
                    (v2f64 (scalar_to_vector (loadf64 addr:$src))), (iPTR 0)))),
             (SUBREG_TO_REG (i32 0), (VMOVSDZrm addr:$src), sub_xmm)>;
+  def : Pat<(v8f64 (X86vzload addr:$src)),
+            (SUBREG_TO_REG (i32 0), (VMOVSDZrm addr:$src), sub_xmm)>;
 }
 def : Pat<(v8f32 (X86vzmovl (insert_subvector undef,
                  (v4f32 (scalar_to_vector FR32X:$src)), (iPTR 0)))),
diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td
index 45f923011a6..085b2c5655b 100644
--- a/llvm/lib/Target/X86/X86InstrSSE.td
+++ b/llvm/lib/Target/X86/X86InstrSSE.td
@@ -649,6 +649,8 @@ let Predicates = [UseAVX] in {
   def : Pat<(v4f64 (X86vzmovl (insert_subvector undef,
                    (v2f64 (scalar_to_vector (loadf64 addr:$src))), (iPTR 0)))),
             (SUBREG_TO_REG (i32 0), (VMOVSDrm addr:$src), sub_xmm)>;
+  def : Pat<(v4f64 (X86vzload addr:$src)),
+            (SUBREG_TO_REG (i32 0), (VMOVSDrm addr:$src), sub_xmm)>;
 }
 
 // Extract and store.
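For illustration, a minimal IR function (hypothetical, modeled on the merge_4f32_f32_45zz test updated below) that exercises the rewritten EltsFromConsecutiveLoads path: the two consecutive float loads plus trailing zeros now become an f64 VZEXT_LOAD, so instruction selection stays in the floating-point domain and emits (v)movsd instead of the integer-domain (v)movq.

; Two consecutive float loads fill lanes 0-1 of a zeroed <4 x float>; with
; this patch the whole sequence should select a single movsd/vmovsd.
define <4 x float> @merge_2f32_01zz(float* %p) {
  %p1 = getelementptr inbounds float, float* %p, i64 1
  %f0 = load float, float* %p
  %f1 = load float, float* %p1
  %v0 = insertelement <4 x float> zeroinitializer, float %f0, i32 0
  %v1 = insertelement <4 x float> %v0, float %f1, i32 1
  ret <4 x float> %v1
}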
diff --git a/llvm/test/CodeGen/X86/insertps-combine.ll b/llvm/test/CodeGen/X86/insertps-combine.ll
index fa18ca60405..51d987bfdbd 100644
--- a/llvm/test/CodeGen/X86/insertps-combine.ll
+++ b/llvm/test/CodeGen/X86/insertps-combine.ll
@@ -133,12 +133,12 @@ define <4 x float> @insertps_undef_input1(<4 x float> %a0, <4 x float> %a1) {
 define <4 x float> @consecutive_load_insertps_04zz(float* %p) {
 ; SSE-LABEL: consecutive_load_insertps_04zz:
 ; SSE:       # BB#0:
-; SSE-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: consecutive_load_insertps_04zz:
 ; AVX:       # BB#0:
-; AVX-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
 ; AVX-NEXT:    retq
   %p0 = getelementptr inbounds float, float* %p, i64 1
   %p1 = getelementptr inbounds float, float* %p, i64 2
diff --git a/llvm/test/CodeGen/X86/merge-consecutive-loads-128.ll b/llvm/test/CodeGen/X86/merge-consecutive-loads-128.ll
index 56107973f03..2d991049727 100644
--- a/llvm/test/CodeGen/X86/merge-consecutive-loads-128.ll
+++ b/llvm/test/CodeGen/X86/merge-consecutive-loads-128.ll
@@ -115,18 +115,18 @@ define <4 x float> @merge_4f32_f32_3zuu(float* %ptr) nounwind uwtable noinline s
 define <4 x float> @merge_4f32_f32_34uu(float* %ptr) nounwind uwtable noinline ssp {
 ; SSE-LABEL: merge_4f32_f32_34uu:
 ; SSE:       # BB#0:
-; SSE-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: merge_4f32_f32_34uu:
 ; AVX:       # BB#0:
-; AVX-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
 ; AVX-NEXT:    retq
 ;
 ; X32-SSE-LABEL: merge_4f32_f32_34uu:
 ; X32-SSE:       # BB#0:
 ; X32-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-SSE-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; X32-SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
 ; X32-SSE-NEXT:    retl
   %ptr0 = getelementptr inbounds float, float* %ptr, i64 3
   %ptr1 = getelementptr inbounds float, float* %ptr, i64 4
@@ -140,23 +140,23 @@ define <4 x float> @merge_4f32_f32_34uu(float* %ptr) nounwind uwtable noinline s
 define <4 x float> @merge_4f32_f32_34z6(float* %ptr) nounwind uwtable noinline ssp {
 ; SSE-LABEL: merge_4f32_f32_34z6:
 ; SSE:       # BB#0:
-; SSE-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
 ; SSE-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
 ; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[1,0]
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: merge_4f32_f32_34z6:
 ; AVX:       # BB#0:
-; AVX-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
-; AVX-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[1,0]
+; AVX-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
+; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm1[0,1],xmm0[1,0]
 ; AVX-NEXT:    retq
 ;
 ; X32-SSE-LABEL: merge_4f32_f32_34z6:
 ; X32-SSE:       # BB#0:
 ; X32-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-SSE-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
 ; X32-SSE-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X32-SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
 ; X32-SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[1,0]
 ; X32-SSE-NEXT:    retl
   %ptr0 = getelementptr inbounds float, float* %ptr, i64 3
@@ -174,18 +174,18 @@ define <4 x float> @merge_4f32_f32_34z6(float* %ptr) nounwind uwtable noinline s
 define <4 x float> @merge_4f32_f32_45zz(float* %ptr) nounwind uwtable noinline ssp {
 ; SSE-LABEL: merge_4f32_f32_45zz:
 ; SSE:       # BB#0:
-; SSE-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: merge_4f32_f32_45zz:
 ; AVX:       # BB#0:
-; AVX-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
 ; AVX-NEXT:    retq
 ;
 ; X32-SSE-LABEL: merge_4f32_f32_45zz:
 ; X32-SSE:       # BB#0:
 ; X32-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-SSE-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; X32-SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
 ; X32-SSE-NEXT:    retl
   %ptr0 = getelementptr inbounds float, float* %ptr, i64 4
   %ptr1 = getelementptr inbounds float, float* %ptr, i64 5
@@ -207,20 +207,20 @@ define <4 x float> @merge_4f32_f32_012u(float* %ptr) nounwind uwtable noinline s
 ;
 ; SSE41-LABEL: merge_4f32_f32_012u:
 ; SSE41:       # BB#0:
-; SSE41-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; SSE41-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
 ; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: merge_4f32_f32_012u:
 ; AVX:       # BB#0:
-; AVX-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
 ; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
 ; AVX-NEXT:    retq
 ;
 ; X32-SSE-LABEL: merge_4f32_f32_012u:
 ; X32-SSE:       # BB#0:
 ; X32-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-SSE-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; X32-SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
 ; X32-SSE-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
 ; X32-SSE-NEXT:    retl
   %ptr0 = getelementptr inbounds float, float* %ptr, i64 0
@@ -248,20 +248,20 @@ define <4 x float> @merge_4f32_f32_019u(float* %ptr) nounwind uwtable noinline s
 ;
 ; SSE41-LABEL: merge_4f32_f32_019u:
 ; SSE41:       # BB#0:
-; SSE41-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; SSE41-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
 ; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: merge_4f32_f32_019u:
 ; AVX:       # BB#0:
-; AVX-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
 ; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
 ; AVX-NEXT:    retq
 ;
 ; X32-SSE-LABEL: merge_4f32_f32_019u:
 ; X32-SSE:       # BB#0:
 ; X32-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-SSE-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; X32-SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
 ; X32-SSE-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
 ; X32-SSE-NEXT:    retl
   %ptr0 = getelementptr inbounds float, float* %ptr, i64 0
diff --git a/llvm/test/CodeGen/X86/merge-consecutive-loads-256.ll b/llvm/test/CodeGen/X86/merge-consecutive-loads-256.ll
index 86e6222e36d..f7354b5ae6a 100644
--- a/llvm/test/CodeGen/X86/merge-consecutive-loads-256.ll
+++ b/llvm/test/CodeGen/X86/merge-consecutive-loads-256.ll
@@ -298,11 +298,11 @@ define <8 x float> @merge_8f32_2f32_23z5(<2 x float>* %ptr) nounwind uwtable noi
 ; X32-AVX-LABEL: merge_8f32_2f32_23z5:
 ; X32-AVX:       # BB#0:
 ; X32-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-AVX-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
-; X32-AVX-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
-; X32-AVX-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; X32-AVX-NEXT:    vmovupd 16(%eax), %xmm1
-; X32-AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; X32-AVX-NEXT:    vmovupd 16(%eax), %xmm0
+; X32-AVX-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
+; X32-AVX-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
+; X32-AVX-NEXT:    vunpcklpd {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; X32-AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; X32-AVX-NEXT:    retl
   %ptr0 = getelementptr inbounds <2 x float>, <2 x float>* %ptr, i64 2
   %ptr1 = getelementptr inbounds <2 x float>, <2 x float>* %ptr, i64 3
@@ -338,13 +338,13 @@ define <8 x float> @merge_8f32_4f32_z2(<4 x float>* %ptr) nounwind uwtable noinl
 define <8 x float> @merge_8f32_f32_12zzuuzz(float* %ptr) nounwind uwtable noinline ssp {
 ; AVX-LABEL: merge_8f32_f32_12zzuuzz:
 ; AVX:       # BB#0:
-; AVX-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
 ; AVX-NEXT:    retq
 ;
 ; X32-AVX-LABEL: merge_8f32_f32_12zzuuzz:
 ; X32-AVX:       # BB#0:
 ; X32-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-AVX-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
+; X32-AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
 ; X32-AVX-NEXT:    retl
   %ptr0 = getelementptr inbounds float, float* %ptr, i64 1
   %ptr1 = getelementptr inbounds float, float* %ptr, i64 2
diff --git a/llvm/test/CodeGen/X86/merge-consecutive-loads-512.ll b/llvm/test/CodeGen/X86/merge-consecutive-loads-512.ll
index 1b5e410ad6e..629af7121ce 100644
--- a/llvm/test/CodeGen/X86/merge-consecutive-loads-512.ll
+++ b/llvm/test/CodeGen/X86/merge-consecutive-loads-512.ll
@@ -271,13 +271,13 @@ define <8 x i64> @merge_8i64_i64_1u3u5zu8(i64* %ptr) nounwind uwtable noinline s
 define <16 x float> @merge_16f32_f32_89zzzuuuuuuuuuuuz(float* %ptr) nounwind uwtable noinline ssp {
 ; ALL-LABEL: merge_16f32_f32_89zzzuuuuuuuuuuuz:
 ; ALL:       # BB#0:
-; ALL-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
+; ALL-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
 ; ALL-NEXT:    retq
 ;
 ; X32-AVX512F-LABEL: merge_16f32_f32_89zzzuuuuuuuuuuuz:
 ; X32-AVX512F:       # BB#0:
 ; X32-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-AVX512F-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
+; X32-AVX512F-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
 ; X32-AVX512F-NEXT:    retl
   %ptr0 = getelementptr inbounds float, float* %ptr, i64 8
   %ptr1 = getelementptr inbounds float, float* %ptr, i64 9
@@ -347,7 +347,7 @@ define <16 x float> @merge_16f32_f32_0uu3uuuuuuuuCuEF(float* %ptr) nounwind uwta
 define <16 x float> @merge_16f32_f32_0uu3zzuuuuuzCuEF(float* %ptr) nounwind uwtable noinline ssp {
 ; ALL-LABEL: merge_16f32_f32_0uu3zzuuuuuzCuEF:
 ; ALL:       # BB#0:
-; ALL-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
+; ALL-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
 ; ALL-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
 ; ALL-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
 ; ALL-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
@@ -360,7 +360,7 @@ define <16 x float> @merge_16f32_f32_0uu3zzuuuuuzCuEF(float* %ptr) nounwind uwta
 ; X32-AVX512F-LABEL: merge_16f32_f32_0uu3zzuuuuuzCuEF:
 ; X32-AVX512F:       # BB#0:
 ; X32-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-AVX512F-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
+; X32-AVX512F-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
 ; X32-AVX512F-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
 ; X32-AVX512F-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
 ; X32-AVX512F-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
diff --git a/llvm/test/CodeGen/X86/vec_loadsingles.ll b/llvm/test/CodeGen/X86/vec_loadsingles.ll
index ecae5d96282..b0d95c5d00d 100644
--- a/llvm/test/CodeGen/X86/vec_loadsingles.ll
+++ b/llvm/test/CodeGen/X86/vec_loadsingles.ll
@@ -1,22 +1,28 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,-slow-unaligned-mem-32 | FileCheck %s --check-prefix=ALL --check-prefix=FAST32
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+slow-unaligned-mem-32 | FileCheck %s --check-prefix=ALL --check-prefix=SLOW32
 
 define <4 x float> @merge_2_floats(float* nocapture %p) nounwind readonly {
+; ALL-LABEL: merge_2_floats:
+; ALL:       # BB#0:
+; ALL-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; ALL-NEXT:    retq
   %tmp1 = load float, float* %p
   %vecins = insertelement <4 x float> undef, float %tmp1, i32 0
   %add.ptr = getelementptr float, float* %p, i32 1
   %tmp5 = load float, float* %add.ptr
   %vecins7 = insertelement <4 x float> %vecins, float %tmp5, i32 1
   ret <4 x float> %vecins7
-
-; ALL-LABEL: merge_2_floats
-; ALL: vmovq
-; ALL-NEXT: retq
 }
 
 ; Test-case generated due to a crash when trying to treat loading the first
 ; two i64s of a <4 x i64> as a load of two i32s.
 define <4 x i64> @merge_2_floats_into_4() {
+; ALL-LABEL: merge_2_floats_into_4:
+; ALL:       # BB#0:
+; ALL-NEXT:    movq (%rax), %rax
+; ALL-NEXT:    vmovups (%rax), %xmm0
+; ALL-NEXT:    retq
   %1 = load i64*, i64** undef, align 8
   %2 = getelementptr inbounds i64, i64* %1, i64 0
   %3 = load i64, i64* %2
@@ -27,13 +33,13 @@ define <4 x i64> @merge_2_floats_into_4() {
   %8 = insertelement <4 x i64> %4, i64 %7, i32 1
   %9 = shufflevector <4 x i64> %8, <4 x i64> undef, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
   ret <4 x i64> %9
-
-; ALL-LABEL: merge_2_floats_into_4
-; ALL: vmovups
-; ALL-NEXT: retq
 }
 
 define <4 x float> @merge_4_floats(float* %ptr) {
+; ALL-LABEL: merge_4_floats:
+; ALL:       # BB#0:
+; ALL-NEXT:    vmovups (%rdi), %xmm0
+; ALL-NEXT:    retq
   %a = load float, float* %ptr, align 8
   %vec = insertelement <4 x float> undef, float %a, i32 0
   %idx1 = getelementptr inbounds float, float* %ptr, i64 1
@@ -46,18 +52,24 @@ define <4 x float> @merge_4_floats(float* %ptr) {
   %d = load float, float* %idx5, align 8
   %vec6 = insertelement <4 x float> %vec4, float %d, i32 3
   ret <4 x float> %vec6
-
-; ALL-LABEL: merge_4_floats
-; ALL: vmovups
-; ALL-NEXT: retq
 }
 
-; PR21710 ( http://llvm.org/bugs/show_bug.cgi?id=21710 )
+; PR21710 ( http://llvm.org/bugs/show_bug.cgi?id=21710 )
 ; Make sure that 32-byte vectors are handled efficiently.
 ; If the target has slow 32-byte accesses, we should still generate
 ; 16-byte loads.
 
 define <8 x float> @merge_8_floats(float* %ptr) {
+; FAST32-LABEL: merge_8_floats:
+; FAST32:       # BB#0:
+; FAST32-NEXT:    vmovups (%rdi), %ymm0
+; FAST32-NEXT:    retq
+;
+; SLOW32-LABEL: merge_8_floats:
+; SLOW32:       # BB#0:
+; SLOW32-NEXT:    vmovups (%rdi), %xmm0
+; SLOW32-NEXT:    vinsertf128 $1, 16(%rdi), %ymm0, %ymm0
+; SLOW32-NEXT:    retq
   %a = load float, float* %ptr, align 4
   %vec = insertelement <8 x float> undef, float %a, i32 0
   %idx1 = getelementptr inbounds float, float* %ptr, i64 1
@@ -82,18 +94,19 @@ define <8 x float> @merge_8_floats(float* %ptr) {
   %h = load float, float* %idx13, align 4
   %vec14 = insertelement <8 x float> %vec12, float %h, i32 7
   ret <8 x float> %vec14
-
-; ALL-LABEL: merge_8_floats
-
-; FAST32: vmovups
-; FAST32-NEXT: retq
-
-; SLOW32: vmovups
-; SLOW32-NEXT: vinsertf128
-; SLOW32-NEXT: retq
 }
 
 define <4 x double> @merge_4_doubles(double* %ptr) {
+; FAST32-LABEL: merge_4_doubles:
+; FAST32:       # BB#0:
+; FAST32-NEXT:    vmovups (%rdi), %ymm0
+; FAST32-NEXT:    retq
+;
+; SLOW32-LABEL: merge_4_doubles:
+; SLOW32:       # BB#0:
+; SLOW32-NEXT:    vmovups (%rdi), %xmm0
+; SLOW32-NEXT:    vinsertf128 $1, 16(%rdi), %ymm0, %ymm0
+; SLOW32-NEXT:    retq
   %a = load double, double* %ptr, align 8
   %vec = insertelement <4 x double> undef, double %a, i32 0
   %idx1 = getelementptr inbounds double, double* %ptr, i64 1
@@ -106,20 +119,22 @@ define <4 x double> @merge_4_doubles(double* %ptr) {
   %d = load double, double* %idx5, align 8
   %vec6 = insertelement <4 x double> %vec4, double %d, i32 3
   ret <4 x double> %vec6
-
-; ALL-LABEL: merge_4_doubles
-; FAST32: vmovups
-; FAST32-NEXT: retq
-
-; SLOW32: vmovups
-; SLOW32-NEXT: vinsertf128
-; SLOW32-NEXT: retq
 }
 
-; PR21771 ( http://llvm.org/bugs/show_bug.cgi?id=21771 )
+; PR21771 ( http://llvm.org/bugs/show_bug.cgi?id=21771 )
 ; Recognize and combine consecutive loads even when the
 ; first of the combined loads is offset from the base address.
 define <4 x double> @merge_4_doubles_offset(double* %ptr) {
+; FAST32-LABEL: merge_4_doubles_offset:
+; FAST32:       # BB#0:
+; FAST32-NEXT:    vmovups 32(%rdi), %ymm0
+; FAST32-NEXT:    retq
+;
+; SLOW32-LABEL: merge_4_doubles_offset:
+; SLOW32:       # BB#0:
+; SLOW32-NEXT:    vmovups 32(%rdi), %xmm0
+; SLOW32-NEXT:    vinsertf128 $1, 48(%rdi), %ymm0, %ymm0
+; SLOW32-NEXT:    retq
   %arrayidx4 = getelementptr inbounds double, double* %ptr, i64 4
   %arrayidx5 = getelementptr inbounds double, double* %ptr, i64 5
   %arrayidx6 = getelementptr inbounds double, double* %ptr, i64 6
@@ -133,13 +148,5 @@ define <4 x double> @merge_4_doubles_offset(double* %ptr) {
   %vecinit6 = insertelement <4 x double> %vecinit5, double %g, i32 2
   %vecinit7 = insertelement <4 x double> %vecinit6, double %h, i32 3
   ret <4 x double> %vecinit7
-
-; ALL-LABEL: merge_4_doubles_offset
-; FAST32: vmovups
-; FAST32-NEXT: retq
-
-; SLOW32: vmovups
-; SLOW32-NEXT: vinsertf128
-; SLOW32-NEXT: retq
 }
 
diff --git a/llvm/test/CodeGen/X86/vec_set-6.ll b/llvm/test/CodeGen/X86/vec_set-6.ll
index 0713d956ee4..1d8c76d586c 100644
--- a/llvm/test/CodeGen/X86/vec_set-6.ll
+++ b/llvm/test/CodeGen/X86/vec_set-6.ll
@@ -1,6 +1,6 @@
 ; RUN: llc < %s -march=x86 -mattr=+sse2 -o %t
 ; RUN: grep movss %t | count 1
-; RUN: grep movq %t | count 1
+; RUN: grep movsd %t | count 1
 ; RUN: grep shufps %t | count 1
 
 define <4 x float> @test(float %a, float %b, float %c) nounwind {
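As a usage sketch (a hypothetical test file in the style of those above, not part of this patch), the new v4f64 X86vzload pattern is intended to let a zero-extended 64-bit load into a 256-bit FP vector select a single vmovsd, with SUBREG_TO_REG implicitly zeroing the upper lanes:

; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s

; One double loaded into lane 0 of a zeroed <4 x double>; expected to
; select vmovsd rather than a load plus shuffle/blend sequence.
define <4 x double> @merge_1f64_zzz(double* %p) {
; CHECK-LABEL: merge_1f64_zzz:
; CHECK:       vmovsd {{.*#+}} xmm0 = mem[0],zero
  %d = load double, double* %p
  %v = insertelement <4 x double> zeroinitializer, double %d, i32 0
  ret <4 x double> %v
}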