summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Chandler Carruth <chandlerc@gmail.com> 2014-10-02 21:37:14 +0000
committer: Chandler Carruth <chandlerc@gmail.com> 2014-10-02 21:37:14 +0000
commit: 75e182b4149c9faa340089d216b576da6b932c9e (patch)
tree: 9885b6da670d4680041f9c0206f1b3553a8c756a
parent: 1b0d24e03abf765ba4d84b523b259bb60b328920 (diff)
download: bcm5719-llvm-75e182b4149c9faa340089d216b576da6b932c9e.tar.gz
download: bcm5719-llvm-75e182b4149c9faa340089d216b576da6b932c9e.zip
[x86] Teach the new vector shuffle lowering to widen floating point elements as well as integer elements in order to form simpler shuffle patterns.

This is the primary reason why we were failing to match some of the 2-and-2 floating point shuffles such as PR21140. Even after fixing this we need to support some extra patterns in the backend in order to match the resulting X86ISD::UNPCKL nodes into the correct instructions.

This commit should fix PR21140 and includes more comprehensive testing of insertion patterns in v4 shuffles.

Not all of the added tests are beautiful. For example, we don't have clever instructions to insert-via-load in the integer domain. There are also some places where we aren't sufficiently cunning with our use of movq and movd, but that's future work.

llvm-svn: 218911
-rw-r--r-- llvm/lib/Target/X86/X86ISelLowering.cpp | 17
-rw-r--r-- llvm/lib/Target/X86/X86InstrSSE.td | 10
-rw-r--r-- llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll | 185
-rw-r--r-- llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll | 19
4 files changed, 213 insertions, 18 deletions
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index cb27a43558f..9089d138ddc 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -10252,16 +10252,17 @@ static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget *Subtarget,
return DAG.getVectorShuffle(VT, dl, V1, V2, NewMask);
}
- // For integer vector shuffles, try to collapse them into a shuffle of fewer
- // lanes but wider integers. We cap this to not form integers larger than i64
- // but it might be interesting to form i128 integers to handle flipping the
- // low and high halves of AVX 256-bit vectors.
+ // Try to collapse shuffles into using a vector type with fewer elements but
+ // wider element types. We cap this to not form integers or floating point
+ // elements wider than 64 bits, but it might be interesting to form i128
+ // integers to handle flipping the low and high halves of AVX 256-bit vectors.
SmallVector<int, 16> WidenedMask;
- if (VT.isInteger() && VT.getScalarSizeInBits() < 64 &&
+ if (VT.getScalarSizeInBits() < 64 &&
canWidenShuffleElements(Mask, WidenedMask)) {
- MVT NewVT =
- MVT::getVectorVT(MVT::getIntegerVT(VT.getScalarSizeInBits() * 2),
- VT.getVectorNumElements() / 2);
+ MVT NewEltVT = VT.isFloatingPoint()
+ ? MVT::getFloatingPointVT(VT.getScalarSizeInBits() * 2)
+ : MVT::getIntegerVT(VT.getScalarSizeInBits() * 2);
+ MVT NewVT = MVT::getVectorVT(NewEltVT, VT.getVectorNumElements() / 2);
V1 = DAG.getNode(ISD::BITCAST, dl, NewVT, V1);
V2 = DAG.getNode(ISD::BITCAST, dl, NewVT, V2);
return DAG.getNode(ISD::BITCAST, dl, VT,
diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td
index 486beddb4a9..f833d043027 100644
--- a/llvm/lib/Target/X86/X86InstrSSE.td
+++ b/llvm/lib/Target/X86/X86InstrSSE.td
@@ -1378,6 +1378,11 @@ let Predicates = [HasAVX] in {
def : Pat<(v2f64 (X86Unpckl VR128:$src1,
(scalar_to_vector (loadf64 addr:$src2)))),
(VMOVHPDrm VR128:$src1, addr:$src2)>;
+ // Also handle an i64 load because that may get selected as a faster way to
+ // load the data.
+ def : Pat<(v2f64 (X86Unpckl VR128:$src1,
+ (bc_v2f64 (v2i64 (scalar_to_vector (loadi64 addr:$src2)))))),
+ (VMOVHPDrm VR128:$src1, addr:$src2)>;
}
let Predicates = [UseSSE1] in {
@@ -1398,6 +1403,11 @@ let Predicates = [UseSSE2] in {
def : Pat<(v2f64 (X86Unpckl VR128:$src1,
(scalar_to_vector (loadf64 addr:$src2)))),
(MOVHPDrm VR128:$src1, addr:$src2)>;
+ // Also handle an i64 load because that may get selected as a faster way to
+ // load the data.
+ def : Pat<(v2f64 (X86Unpckl VR128:$src1,
+ (bc_v2f64 (v2i64 (scalar_to_vector (loadi64 addr:$src2)))))),
+ (MOVHPDrm VR128:$src1, addr:$src2)>;
}
//===----------------------------------------------------------------------===//
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll b/llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll
index 948f1d2e47a..a359ce7b1b8 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll
@@ -1062,6 +1062,191 @@ define <4 x float> @insert_mem_and_zero_v4f32(float* %ptr) {
ret <4 x float> %shuffle
}
+define <4 x i32> @insert_reg_lo_v4i32(i64 %a, <4 x i32> %b) {
+; SSE2-LABEL: insert_reg_lo_v4i32:
+; SSE2: # BB#0:
+; SSE2-NEXT: movd %rdi, %xmm1
+; SSE2-NEXT: movsd %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE3-LABEL: insert_reg_lo_v4i32:
+; SSE3: # BB#0:
+; SSE3-NEXT: movd %rdi, %xmm1
+; SSE3-NEXT: movsd %xmm1, %xmm0
+; SSE3-NEXT: retq
+;
+; SSSE3-LABEL: insert_reg_lo_v4i32:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: movd %rdi, %xmm1
+; SSSE3-NEXT: movsd %xmm1, %xmm0
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: insert_reg_lo_v4i32:
+; SSE41: # BB#0:
+; SSE41-NEXT: movd %rdi, %xmm1
+; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5,6,7]
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: insert_reg_lo_v4i32:
+; AVX1: # BB#0:
+; AVX1-NEXT: vmovq %rdi, %xmm1
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: insert_reg_lo_v4i32:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovq %rdi, %xmm1
+; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; AVX2-NEXT: retq
+ %a.cast = bitcast i64 %a to <2 x i32>
+ %v = shufflevector <2 x i32> %a.cast, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+ %shuffle = shufflevector <4 x i32> %v, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
+ ret <4 x i32> %shuffle
+}
+
+define <4 x i32> @insert_mem_lo_v4i32(<2 x i32>* %ptr, <4 x i32> %b) {
+; SSE2-LABEL: insert_mem_lo_v4i32:
+; SSE2: # BB#0:
+; SSE2-NEXT: movlpd (%rdi), %xmm0
+; SSE2-NEXT: retq
+;
+; SSE3-LABEL: insert_mem_lo_v4i32:
+; SSE3: # BB#0:
+; SSE3-NEXT: movlpd (%rdi), %xmm0
+; SSE3-NEXT: retq
+;
+; SSSE3-LABEL: insert_mem_lo_v4i32:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: movlpd (%rdi), %xmm0
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: insert_mem_lo_v4i32:
+; SSE41: # BB#0:
+; SSE41-NEXT: movq (%rdi), %xmm1
+; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5,6,7]
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: insert_mem_lo_v4i32:
+; AVX1: # BB#0:
+; AVX1-NEXT: vmovq (%rdi), %xmm1
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: insert_mem_lo_v4i32:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovq (%rdi), %xmm1
+; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; AVX2-NEXT: retq
+ %a = load <2 x i32>* %ptr
+ %v = shufflevector <2 x i32> %a, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+ %shuffle = shufflevector <4 x i32> %v, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
+ ret <4 x i32> %shuffle
+}
+
+define <4 x i32> @insert_reg_hi_v4i32(i64 %a, <4 x i32> %b) {
+; SSE-LABEL: insert_reg_hi_v4i32:
+; SSE: # BB#0:
+; SSE-NEXT: movd %rdi, %xmm1
+; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: insert_reg_hi_v4i32:
+; AVX: # BB#0:
+; AVX-NEXT: vmovq %rdi, %xmm1
+; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX-NEXT: retq
+ %a.cast = bitcast i64 %a to <2 x i32>
+ %v = shufflevector <2 x i32> %a.cast, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+ %shuffle = shufflevector <4 x i32> %v, <4 x i32> %b, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
+ ret <4 x i32> %shuffle
+}
+
+define <4 x i32> @insert_mem_hi_v4i32(<2 x i32>* %ptr, <4 x i32> %b) {
+; SSE-LABEL: insert_mem_hi_v4i32:
+; SSE: # BB#0:
+; SSE-NEXT: movq (%rdi), %xmm1
+; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: insert_mem_hi_v4i32:
+; AVX: # BB#0:
+; AVX-NEXT: vmovq (%rdi), %xmm1
+; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX-NEXT: retq
+ %a = load <2 x i32>* %ptr
+ %v = shufflevector <2 x i32> %a, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+ %shuffle = shufflevector <4 x i32> %v, <4 x i32> %b, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
+ ret <4 x i32> %shuffle
+}
+
+define <4 x float> @insert_reg_lo_v4f32(double %a, <4 x float> %b) {
+; SSE-LABEL: insert_reg_lo_v4f32:
+; SSE: # BB#0:
+; SSE-NEXT: movsd %xmm0, %xmm1
+; SSE-NEXT: movaps %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: insert_reg_lo_v4f32:
+; AVX: # BB#0:
+; AVX-NEXT: vmovsd %xmm0, %xmm1, %xmm0
+; AVX-NEXT: retq
+ %a.cast = bitcast double %a to <2 x float>
+ %v = shufflevector <2 x float> %a.cast, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+ %shuffle = shufflevector <4 x float> %v, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
+ ret <4 x float> %shuffle
+}
+
+define <4 x float> @insert_mem_lo_v4f32(<2 x float>* %ptr, <4 x float> %b) {
+; SSE-LABEL: insert_mem_lo_v4f32:
+; SSE: # BB#0:
+; SSE-NEXT: movlpd (%rdi), %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: insert_mem_lo_v4f32:
+; AVX: # BB#0:
+; AVX-NEXT: vmovlpd (%rdi), %xmm0, %xmm0
+; AVX-NEXT: retq
+ %a = load <2 x float>* %ptr
+ %v = shufflevector <2 x float> %a, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+ %shuffle = shufflevector <4 x float> %v, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
+ ret <4 x float> %shuffle
+}
+
+define <4 x float> @insert_reg_hi_v4f32(double %a, <4 x float> %b) {
+; SSE-LABEL: insert_reg_hi_v4f32:
+; SSE: # BB#0:
+; SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; SSE-NEXT: movapd %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: insert_reg_hi_v4f32:
+; AVX: # BB#0:
+; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX-NEXT: retq
+ %a.cast = bitcast double %a to <2 x float>
+ %v = shufflevector <2 x float> %a.cast, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+ %shuffle = shufflevector <4 x float> %v, <4 x float> %b, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
+ ret <4 x float> %shuffle
+}
+
+define <4 x float> @insert_mem_hi_v4f32(<2 x float>* %ptr, <4 x float> %b) {
+; SSE-LABEL: insert_mem_hi_v4f32:
+; SSE: # BB#0:
+; SSE-NEXT: movhpd (%rdi), %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: insert_mem_hi_v4f32:
+; AVX: # BB#0:
+; AVX-NEXT: vmovhpd (%rdi), %xmm0, %xmm0
+; AVX-NEXT: retq
+ %a = load <2 x float>* %ptr
+ %v = shufflevector <2 x float> %a, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+ %shuffle = shufflevector <4 x float> %v, <4 x float> %b, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
+ ret <4 x float> %shuffle
+}
+
define <4 x float> @shuffle_mem_v4f32_3210(<4 x float>* %ptr) {
; SSE-LABEL: shuffle_mem_v4f32_3210:
; SSE: # BB#0:
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll
index f11554cc2fc..c5f3e93283c 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll
@@ -148,7 +148,7 @@ define <8 x float> @shuffle_v8f32_70000000(<8 x float> %a, <8 x float> %b) {
define <8 x float> @shuffle_v8f32_01014545(<8 x float> %a, <8 x float> %b) {
; ALL-LABEL: shuffle_v8f32_01014545:
; ALL: # BB#0:
-; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5]
+; ALL-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[0,0,2,2]
; ALL-NEXT: retq
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 4, i32 5>
ret <8 x float> %shuffle
@@ -202,7 +202,7 @@ define <8 x float> @shuffle_v8f32_08080808(<8 x float> %a, <8 x float> %b) {
; AVX1: # BB#0:
; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,0,2,0]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1
-; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,3]
+; AVX1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0,0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
; AVX1-NEXT: retq
@@ -210,7 +210,7 @@ define <8 x float> @shuffle_v8f32_08080808(<8 x float> %a, <8 x float> %b) {
; AVX2-LABEL: shuffle_v8f32_08080808:
; AVX2: # BB#0:
; AVX2-NEXT: vbroadcastss %xmm1, %ymm1
-; AVX2-NEXT: vbroadcastss %xmm0, %ymm0
+; AVX2-NEXT: vbroadcastsd %xmm0, %ymm0
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
; AVX2-NEXT: retq
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 8, i32 0, i32 8, i32 0, i32 8, i32 0, i32 8>
@@ -651,7 +651,7 @@ define <8 x float> @shuffle_v8f32_c348cda0(<8 x float> %a, <8 x float> %b) {
; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,3],ymm2[0,0],ymm0[4,7],ymm2[4,4]
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm1[2,3,0,1]
; AVX1-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
-; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3,4,5],ymm2[6],ymm1[7]
+; AVX1-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3]
; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4,5,6],ymm0[7]
; AVX1-NEXT: retq
;
@@ -671,9 +671,9 @@ define <8 x float> @shuffle_v8f32_f511235a(<8 x float> %a, <8 x float> %b) {
; AVX1-LABEL: shuffle_v8f32_f511235a:
; AVX1: # BB#0:
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3,0,1]
-; AVX1-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[u,1,u,u,6,7,u,u]
+; AVX1-NEXT: vpermilpd {{.*#+}} ymm2 = ymm2[0,0,3,2]
; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,1,1,1,4,5,5,5]
-; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4,5],ymm0[6,7]
+; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2],ymm0[3]
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,0,1]
; AVX1-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[3,1,2,2,7,5,6,6]
; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7]
@@ -900,7 +900,7 @@ define <8 x i32> @shuffle_v8i32_08080808(<8 x i32> %a, <8 x i32> %b) {
; AVX1: # BB#0:
; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,0,2,0]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1
-; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,3]
+; AVX1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0,0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
; AVX1-NEXT: retq
@@ -1564,9 +1564,8 @@ define <8 x i32> @shuffle_v8i32_6caa87e5(<8 x i32> %a, <8 x i32> %b) {
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm1[2,3,0,1]
; AVX1-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,0],ymm1[2,2],ymm2[4,4],ymm1[6,6]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[2,1,2,3]
-; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,3,2,1]
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
+; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4],ymm0[5],ymm1[6],ymm0[7]
; AVX1-NEXT: retq
;
OpenPOWER on IntegriCloud