ARM: Improve codegen for generic vselect.

Fall back to by-element inserts rather than building the vector up on the stack. rdar://14351991 llvm-svn: 185846
author: Jim Grosbach <grosbach@apple.com> 2013-07-08 18:18:52 +0000
committer: Jim Grosbach <grosbach@apple.com> 2013-07-08 18:18:52 +0000
commit: 24e102a947d169b6415627d5d0a547965355a18a (patch)
tree: 28045b86bf5342b3954b49e4ca14fc23cfaddabf
parent: 4b2967ff9f6ffeb41b3965d396f469d3e95ee74e (diff)
download: bcm5719-llvm-24e102a947d169b6415627d5d0a547965355a18a.tar.gz
bcm5719-llvm-24e102a947d169b6415627d5d0a547965355a18a.zip
3 files changed, 57 insertions, 30 deletions
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index 991a703f818..8c4a3f13d13 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -4734,6 +4734,24 @@ SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
     return DAG.getNode(ISD::BITCAST, dl, VT, Val);
   }
 
+  // If all else fails, just use a sequence of INSERT_VECTOR_ELT when we
+  // know the default expansion would otherwise fall back on something even
+  // worse. For a vector with one or two non-undef values, the default is
+  // scalar_to_vector for the elements followed by a shuffle (provided the
+  // shuffle is valid for the target); for everything else, it is
+  // element-by-element materialization on the stack followed by a load.
+  if (!isConstant && !usesOnlyOneValue) {
+    SDValue Vec = DAG.getUNDEF(VT);
+    for (unsigned i = 0 ; i < NumElts; ++i) {
+      SDValue V = Op.getOperand(i);
+      if (V.getOpcode() == ISD::UNDEF)
+        continue;
+      SDValue LaneIdx = DAG.getConstant(i, MVT::i32);
+      Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Vec, V, LaneIdx);
+    }
+    return Vec;
+  }
+
   return SDValue();
 }
 
diff --git a/llvm/test/CodeGen/ARM/vext.ll b/llvm/test/CodeGen/ARM/vext.ll
index f404eb8be5b..ef22a3ba534 100644
--- a/llvm/test/CodeGen/ARM/vext.ll
+++ b/llvm/test/CodeGen/ARM/vext.ll
@@ -136,20 +136,26 @@ define <4 x i16> @test_undef(<8 x i16>* %A, <8 x i16>* %B) nounwind {
 
 ; We should ignore a build_vector with more than two sources.
 ; Use illegal <32 x i16> type to produce such a shuffle after legalizing types.
-; Try to look for fallback to stack expansion.
+; Try to look for fallback to by-element inserts.
 define <4 x i16> @test_multisource(<32 x i16>* %B) nounwind {
 ;CHECK: test_multisource:
-;CHECK: vst1.16
+;CHECK: vmov.16 [[REG:d[0-9]+]][0]
+;CHECK: vmov.16 [[REG]][1]
+;CHECK: vmov.16 [[REG]][2]
+;CHECK: vmov.16 [[REG]][3]
         %tmp1 = load <32 x i16>* %B
         %tmp2 = shufflevector <32 x i16> %tmp1, <32 x i16> undef, <4 x i32> <i32 0, i32 8, i32 16, i32 24>
         ret <4 x i16> %tmp2
 }
 
 ; We don't handle shuffles using more than half of a 128-bit vector.
-; Again, test for fallback to stack expansion
+; Again, test for fallback to by-element inserts.
 define <4 x i16> @test_largespan(<8 x i16>* %B) nounwind {
 ;CHECK: test_largespan:
-;CHECK: vst1.16
+;CHECK: vmov.16 [[REG:d[0-9]+]][0]
+;CHECK: vmov.16 [[REG]][1]
+;CHECK: vmov.16 [[REG]][2]
+;CHECK: vmov.16 [[REG]][3]
         %tmp1 = load <8 x i16>* %B
         %tmp2 = shufflevector <8 x i16> %tmp1, <8 x i16> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
         ret <4 x i16> %tmp2
@@ -160,7 +166,14 @@ define <4 x i16> @test_largespan(<8 x i16>* %B) nounwind {
 ; lowering loop can result otherwise).
 define <8 x i16> @test_illegal(<8 x i16>* %A, <8 x i16>* %B) nounwind {
 ;CHECK: test_illegal:
-;CHECK: vst1.16
+;CHECK: vmov.16 [[REG:d[0-9]+]][0]
+;CHECK: vmov.16 [[REG]][1]
+;CHECK: vmov.16 [[REG]][2]
+;CHECK: vmov.16 [[REG]][3]
+;CHECK: vmov.16 [[REG2:d[0-9]+]][0]
+;CHECK: vmov.16 [[REG2]][1]
+;CHECK: vmov.16 [[REG2]][2]
+;CHECK: vmov.16 [[REG2]][3]
        %tmp1 = load <8 x i16>* %A
        %tmp2 = load <8 x i16>* %B
        %tmp3 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i32> <i32 0, i32 7, i32 5, i32 13, i32 3, i32 2, i32 2, i32 9>
diff --git a/llvm/test/CodeGen/ARM/vselect_imax.ll b/llvm/test/CodeGen/ARM/vselect_imax.ll
index 7e79d6c68c2..9744f4dde88 100644
--- a/llvm/test/CodeGen/ARM/vselect_imax.ll
+++ b/llvm/test/CodeGen/ARM/vselect_imax.ll
@@ -1,3 +1,4 @@
+; RUN: opt < %s  -cost-model -analyze -mtriple=thumbv7-apple-ios6.0.0 -march=arm -mcpu=cortex-a8 | FileCheck %s --check-prefix=COST
 ; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
 ; Make sure that ARM backend with NEON handles vselect.
 
@@ -20,11 +21,8 @@ define void @func_blend10(%T0_10* %loadaddr, %T0_10* %loadaddr2,
   %v0 = load %T0_10* %loadaddr
   %v1 = load %T0_10* %loadaddr2
   %c = icmp slt %T0_10 %v0, %v1
-; CHECK: vst1
-; CHECK: vst1
-; CHECK: vst1
-; CHECK: vst1
-; CHECK: vld
+; CHECK: vbsl
+; CHECK: vbsl
 ; COST: func_blend10
 ; COST: cost of 40 {{.*}} select
   %r = select %T1_10 %c, %T0_10 %v0, %T0_10 %v1
@@ -39,10 +37,8 @@ define void @func_blend14(%T0_14* %loadaddr, %T0_14* %loadaddr2,
   %v0 = load %T0_14* %loadaddr
   %v1 = load %T0_14* %loadaddr2
   %c = icmp slt %T0_14 %v0, %v1
-; CHECK: strb
-; CHECK: strb
-; CHECK: strb
-; CHECK: strb
+; CHECK: vbsl
+; CHECK: vbsl
 ; COST: func_blend14
 ; COST: cost of 41 {{.*}} select
   %r = select %T1_14 %c, %T0_14 %v0, %T0_14 %v1
@@ -54,13 +50,11 @@ define void @func_blend14(%T0_14* %loadaddr, %T0_14* %loadaddr2,
 ; CHECK: func_blend15:
 define void @func_blend15(%T0_15* %loadaddr, %T0_15* %loadaddr2,
                            %T1_15* %blend, %T0_15* %storeaddr) {
+; CHECK: vbsl
+; CHECK: vbsl
   %v0 = load %T0_15* %loadaddr
   %v1 = load %T0_15* %loadaddr2
   %c = icmp slt %T0_15 %v0, %v1
-; CHECK: strb
-; CHECK: strb
-; CHECK: strb
-; CHECK: strb
 ; COST: func_blend15
 ; COST: cost of 82 {{.*}} select
   %r = select %T1_15 %c, %T0_15 %v0, %T0_15 %v1
@@ -72,13 +66,11 @@ define void @func_blend15(%T0_15* %loadaddr, %T0_15* %loadaddr2,
 ; CHECK: func_blend18:
 define void @func_blend18(%T0_18* %loadaddr, %T0_18* %loadaddr2,
                            %T1_18* %blend, %T0_18* %storeaddr) {
+; CHECK: vbsl
+; CHECK: vbsl
   %v0 = load %T0_18* %loadaddr
   %v1 = load %T0_18* %loadaddr2
   %c = icmp slt %T0_18 %v0, %v1
-; CHECK: strh
-; CHECK: strh
-; CHECK: strh
-; CHECK: strh
 ; COST: func_blend18
 ; COST: cost of 19 {{.*}} select
   %r = select %T1_18 %c, %T0_18 %v0, %T0_18 %v1
@@ -90,13 +82,13 @@ define void @func_blend18(%T0_18* %loadaddr, %T0_18* %loadaddr2,
 ; CHECK: func_blend19:
 define void @func_blend19(%T0_19* %loadaddr, %T0_19* %loadaddr2,
                            %T1_19* %blend, %T0_19* %storeaddr) {
+; CHECK: vbsl
+; CHECK: vbsl
+; CHECK: vbsl
+; CHECK: vbsl
   %v0 = load %T0_19* %loadaddr
   %v1 = load %T0_19* %loadaddr2
   %c = icmp slt %T0_19 %v0, %v1
-; CHECK: strb
-; CHECK: strb
-; CHECK: strb
-; CHECK: strb
 ; COST: func_blend19
 ; COST: cost of 50 {{.*}} select
   %r = select %T1_19 %c, %T0_19 %v0, %T0_19 %v1
@@ -108,13 +100,17 @@ define void @func_blend19(%T0_19* %loadaddr, %T0_19* %loadaddr2,
 ; CHECK: func_blend20:
 define void @func_blend20(%T0_20* %loadaddr, %T0_20* %loadaddr2,
                            %T1_20* %blend, %T0_20* %storeaddr) {
+; CHECK: vbsl
+; CHECK: vbsl
+; CHECK: vbsl
+; CHECK: vbsl
+; CHECK: vbsl
+; CHECK: vbsl
+; CHECK: vbsl
+; CHECK: vbsl
   %v0 = load %T0_20* %loadaddr
   %v1 = load %T0_20* %loadaddr2
   %c = icmp slt %T0_20 %v0, %v1
-; CHECK: strb
-; CHECK: strb
-; CHECK: strb
-; CHECK: strb
 ; COST: func_blend20
 ; COST: cost of 100 {{.*}} select
   %r = select %T1_20 %c, %T0_20 %v0, %T0_20 %v1
author	Jim Grosbach <grosbach@apple.com>	2013-07-08 18:18:52 +0000
committer	Jim Grosbach <grosbach@apple.com>	2013-07-08 18:18:52 +0000
commit	24e102a947d169b6415627d5d0a547965355a18a (patch)
tree	28045b86bf5342b3954b49e4ca14fc23cfaddabf
parent	4b2967ff9f6ffeb41b3965d396f469d3e95ee74e (diff)
download	bcm5719-llvm-24e102a947d169b6415627d5d0a547965355a18a.tar.gz bcm5719-llvm-24e102a947d169b6415627d5d0a547965355a18a.zip