author     Eli Friedman <efriedma@codeaurora.org>   2016-12-16 18:44:08 +0000
committer  Eli Friedman <efriedma@codeaurora.org>   2016-12-16 18:44:08 +0000
commit     f624ec27b75701f1227eb03a44f3da84920c798d (patch)
tree       d10b422365e2b461b34e1a53f1f29a72cb5b5b00 /llvm/test
parent     79b4f0ad9cc5b019cf71bac388f1da1de4dd4e34 (diff)
[ARM] Add ARMISD::VLD1DUP to match vld1_dup more consistently.
Currently, there are substantial problems forming vld1_dup even if the VDUP survives legalization. The lack of an actual node leads to terrible results: not only can we not form post-increment vld1_dup instructions, but we form scalar pre-increment and post-increment loads which force the loaded value into a GPR. This patch fixes that by combining the vdup+load into an ARMISD node before DAGCombine messes it up.

Also includes a crash fix for vld2_dup (see testcase @vld2dupi8_postinc_variable).

Recommitting with a fix to avoid forming vld1dup if the type of the load doesn't match the type of the vdup (see https://llvm.org/bugs/show_bug.cgi?id=31404).

Differential Revision: https://reviews.llvm.org/D27694

llvm-svn: 289972
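For context, the splat-of-load shape exercised by the new tests usually originates from the NEON vld1_dup family of intrinsics. The C sketch below is illustrative only: it is not part of the patch, and the function name and loop are hypothetical. It shows a per-iteration one-byte splat followed by a pointer advance, which is the shape the new ARMISD::VLD1DUP node lets the backend select as a post-increment "vld1.8 {d16[]}, [rN]!" instead of a scalar load into a GPR.

#include <arm_neon.h>
#include <stdint.h>

/* Hypothetical example (not from the patch): each iteration splats one
 * scale byte across a d-register and then advances the pointer.  With
 * ARMISD::VLD1DUP the pointer update can be folded into a post-increment
 * "vld1.8 {d16[]}, [rN]!" rather than an ldrb into a core register
 * followed by vdup. */
void scale_rows(uint8_t *dst, const uint8_t *src, const uint8_t *scales,
                int rows) {
  for (int i = 0; i < rows; ++i) {
    uint8x8_t s = vld1_dup_u8(scales);   /* splat one byte to all 8 lanes */
    scales += 1;                         /* post-increment candidate */
    uint8x8_t v = vld1_u8(src + 8 * i);
    vst1_u8(dst + 8 * i, vmul_u8(v, s)); /* vector * splatted scalar */
  }
}

Before this change, the splatted byte went through a GPR and vdup and no pre/post-increment addressing could be formed; the vld1dupi8_preinc and vld1dupi8_postinc_* tests added below check the improved selection.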
Diffstat (limited to 'llvm/test')
-rw-r--r--   llvm/test/CodeGen/ARM/vlddup.ll   204
-rw-r--r--   llvm/test/CodeGen/ARM/vmul.ll      25
2 files changed, 223 insertions, 6 deletions
diff --git a/llvm/test/CodeGen/ARM/vlddup.ll b/llvm/test/CodeGen/ARM/vlddup.ll
index c115a3863d0..c6d5747f350 100644
--- a/llvm/test/CodeGen/ARM/vlddup.ll
+++ b/llvm/test/CodeGen/ARM/vlddup.ll
@@ -10,6 +10,84 @@ define <8 x i8> @vld1dupi8(i8* %A) nounwind {
ret <8 x i8> %tmp3
}
+define <8 x i8> @vld1dupi8_preinc(i8** noalias nocapture %a, i32 %b) nounwind {
+entry:
+;CHECK-LABEL: vld1dupi8_preinc:
+;CHECK: vld1.8 {d16[]}, [r1]
+ %0 = load i8*, i8** %a, align 4
+ %add.ptr = getelementptr inbounds i8, i8* %0, i32 %b
+ %1 = load i8, i8* %add.ptr, align 1
+ %2 = insertelement <8 x i8> undef, i8 %1, i32 0
+ %lane = shufflevector <8 x i8> %2, <8 x i8> undef, <8 x i32> zeroinitializer
+ store i8* %add.ptr, i8** %a, align 4
+ ret <8 x i8> %lane
+}
+
+define <8 x i8> @vld1dupi8_postinc_fixed(i8** noalias nocapture %a) nounwind {
+entry:
+;CHECK-LABEL: vld1dupi8_postinc_fixed:
+;CHECK: vld1.8 {d16[]}, [r1]!
+ %0 = load i8*, i8** %a, align 4
+ %1 = load i8, i8* %0, align 1
+ %2 = insertelement <8 x i8> undef, i8 %1, i32 0
+ %lane = shufflevector <8 x i8> %2, <8 x i8> undef, <8 x i32> zeroinitializer
+ %add.ptr = getelementptr inbounds i8, i8* %0, i32 1
+ store i8* %add.ptr, i8** %a, align 4
+ ret <8 x i8> %lane
+}
+
+define <8 x i8> @vld1dupi8_postinc_register(i8** noalias nocapture %a, i32 %n) nounwind {
+entry:
+;CHECK-LABEL: vld1dupi8_postinc_register:
+;CHECK: vld1.8 {d16[]}, [r2], r1
+ %0 = load i8*, i8** %a, align 4
+ %1 = load i8, i8* %0, align 1
+ %2 = insertelement <8 x i8> undef, i8 %1, i32 0
+ %lane = shufflevector <8 x i8> %2, <8 x i8> undef, <8 x i32> zeroinitializer
+ %add.ptr = getelementptr inbounds i8, i8* %0, i32 %n
+ store i8* %add.ptr, i8** %a, align 4
+ ret <8 x i8> %lane
+}
+
+define <16 x i8> @vld1dupqi8_preinc(i8** noalias nocapture %a, i32 %b) nounwind {
+entry:
+;CHECK-LABEL: vld1dupqi8_preinc:
+;CHECK: vld1.8 {d16[], d17[]}, [r1]
+ %0 = load i8*, i8** %a, align 4
+ %add.ptr = getelementptr inbounds i8, i8* %0, i32 %b
+ %1 = load i8, i8* %add.ptr, align 1
+ %2 = insertelement <16 x i8> undef, i8 %1, i32 0
+ %lane = shufflevector <16 x i8> %2, <16 x i8> undef, <16 x i32> zeroinitializer
+ store i8* %add.ptr, i8** %a, align 4
+ ret <16 x i8> %lane
+}
+
+define <16 x i8> @vld1dupqi8_postinc_fixed(i8** noalias nocapture %a) nounwind {
+entry:
+;CHECK-LABEL: vld1dupqi8_postinc_fixed:
+;CHECK: vld1.8 {d16[], d17[]}, [r1]!
+ %0 = load i8*, i8** %a, align 4
+ %1 = load i8, i8* %0, align 1
+ %2 = insertelement <16 x i8> undef, i8 %1, i32 0
+ %lane = shufflevector <16 x i8> %2, <16 x i8> undef, <16 x i32> zeroinitializer
+ %add.ptr = getelementptr inbounds i8, i8* %0, i32 1
+ store i8* %add.ptr, i8** %a, align 4
+ ret <16 x i8> %lane
+}
+
+define <16 x i8> @vld1dupqi8_postinc_register(i8** noalias nocapture %a, i32 %n) nounwind {
+entry:
+;CHECK-LABEL: vld1dupqi8_postinc_register:
+;CHECK: vld1.8 {d16[], d17[]}, [r2], r1
+ %0 = load i8*, i8** %a, align 4
+ %1 = load i8, i8* %0, align 1
+ %2 = insertelement <16 x i8> undef, i8 %1, i32 0
+ %lane = shufflevector <16 x i8> %2, <16 x i8> undef, <16 x i32> zeroinitializer
+ %add.ptr = getelementptr inbounds i8, i8* %0, i32 %n
+ store i8* %add.ptr, i8** %a, align 4
+ ret <16 x i8> %lane
+}
+
define <4 x i16> @vld1dupi16(i16* %A) nounwind {
;CHECK-LABEL: vld1dupi16:
;Check the alignment value. Max for this instruction is 16 bits:
@@ -20,6 +98,51 @@ define <4 x i16> @vld1dupi16(i16* %A) nounwind {
ret <4 x i16> %tmp3
}
+define <4 x i16> @vld1dupi16_misaligned(i16* %A) nounwind {
+;CHECK-LABEL: vld1dupi16_misaligned:
+;CHECK: vld1.16 {d16[]}, [r0]
+ %tmp1 = load i16, i16* %A, align 1
+ %tmp2 = insertelement <4 x i16> undef, i16 %tmp1, i32 0
+ %tmp3 = shufflevector <4 x i16> %tmp2, <4 x i16> undef, <4 x i32> zeroinitializer
+ ret <4 x i16> %tmp3
+}
+
+; This sort of looks like a vld1dup, but there's an extension in the way.
+define <4 x i16> @load_i16_dup_zext(i8* %A) nounwind {
+;CHECK-LABEL: load_i16_dup_zext:
+;CHECK: ldrb r0, [r0]
+;CHECK-NEXT: vdup.16 d16, r0
+ %tmp1 = load i8, i8* %A, align 1
+ %tmp2 = zext i8 %tmp1 to i16
+ %tmp3 = insertelement <4 x i16> undef, i16 %tmp2, i32 0
+ %tmp4 = shufflevector <4 x i16> %tmp3, <4 x i16> undef, <4 x i32> zeroinitializer
+ ret <4 x i16> %tmp4
+}
+
+; This sort of looks like a vld1dup, but there's an extension in the way.
+define <4 x i16> @load_i16_dup_sext(i8* %A) nounwind {
+;CHECK-LABEL: load_i16_dup_sext:
+;CHECK: ldrsb r0, [r0]
+;CHECK-NEXT: vdup.16 d16, r0
+ %tmp1 = load i8, i8* %A, align 1
+ %tmp2 = sext i8 %tmp1 to i16
+ %tmp3 = insertelement <4 x i16> undef, i16 %tmp2, i32 0
+ %tmp4 = shufflevector <4 x i16> %tmp3, <4 x i16> undef, <4 x i32> zeroinitializer
+ ret <4 x i16> %tmp4
+}
+
+; This sort of looks like a vld1dup, but there's an extension in the way.
+define <8 x i16> @load_i16_dupq_zext(i8* %A) nounwind {
+;CHECK-LABEL: load_i16_dupq_zext:
+;CHECK: ldrb r0, [r0]
+;CHECK-NEXT: vdup.16 q8, r0
+ %tmp1 = load i8, i8* %A, align 1
+ %tmp2 = zext i8 %tmp1 to i16
+ %tmp3 = insertelement <8 x i16> undef, i16 %tmp2, i32 0
+ %tmp4 = shufflevector <8 x i16> %tmp3, <8 x i16> undef, <8 x i32> zeroinitializer
+ ret <8 x i16> %tmp4
+}
+
define <2 x i32> @vld1dupi32(i32* %A) nounwind {
;CHECK-LABEL: vld1dupi32:
;Check the alignment value. Max for this instruction is 32 bits:
@@ -30,6 +153,30 @@ define <2 x i32> @vld1dupi32(i32* %A) nounwind {
ret <2 x i32> %tmp3
}
+; This sort of looks like a vld1dup, but there's an extension in the way.
+define <4 x i32> @load_i32_dup_zext(i8* %A) nounwind {
+;CHECK-LABEL: load_i32_dup_zext:
+;CHECK: ldrb r0, [r0]
+;CHECK-NEXT: vdup.32 q8, r0
+ %tmp1 = load i8, i8* %A, align 1
+ %tmp2 = zext i8 %tmp1 to i32
+ %tmp3 = insertelement <4 x i32> undef, i32 %tmp2, i32 0
+ %tmp4 = shufflevector <4 x i32> %tmp3, <4 x i32> undef, <4 x i32> zeroinitializer
+ ret <4 x i32> %tmp4
+}
+
+; This sort of looks like a vld1dup, but there's an extension in the way.
+define <4 x i32> @load_i32_dup_sext(i8* %A) nounwind {
+;CHECK-LABEL: load_i32_dup_sext:
+;CHECK: ldrsb r0, [r0]
+;CHECK-NEXT: vdup.32 q8, r0
+ %tmp1 = load i8, i8* %A, align 1
+ %tmp2 = sext i8 %tmp1 to i32
+ %tmp3 = insertelement <4 x i32> undef, i32 %tmp2, i32 0
+ %tmp4 = shufflevector <4 x i32> %tmp3, <4 x i32> undef, <4 x i32> zeroinitializer
+ ret <4 x i32> %tmp4
+}
+
define <2 x float> @vld1dupf(float* %A) nounwind {
;CHECK-LABEL: vld1dupf:
;CHECK: vld1.32 {d16[]}, [r0:32]
@@ -75,6 +222,63 @@ define <8 x i8> @vld2dupi8(i8* %A) nounwind {
ret <8 x i8> %tmp5
}
+define void @vld2dupi8_preinc(%struct.__neon_int8x8x2_t* noalias nocapture sret %agg.result, i8** noalias nocapture %a, i32 %b) nounwind {
+;CHECK-LABEL: vld2dupi8_preinc:
+;CHECK: vld2.8 {d16[], d17[]}, [r2]
+entry:
+ %0 = load i8*, i8** %a, align 4
+ %add.ptr = getelementptr inbounds i8, i8* %0, i32 %b
+ %vld_dup = tail call %struct.__neon_int8x8x2_t @llvm.arm.neon.vld2lane.v8i8.p0i8(i8* %add.ptr, <8 x i8> undef, <8 x i8> undef, i32 0, i32 1)
+ %1 = extractvalue %struct.__neon_int8x8x2_t %vld_dup, 0
+ %lane = shufflevector <8 x i8> %1, <8 x i8> undef, <8 x i32> zeroinitializer
+ %2 = extractvalue %struct.__neon_int8x8x2_t %vld_dup, 1
+ %lane1 = shufflevector <8 x i8> %2, <8 x i8> undef, <8 x i32> zeroinitializer
+ store i8* %add.ptr, i8** %a, align 4
+ %r8 = getelementptr inbounds %struct.__neon_int8x8x2_t, %struct.__neon_int8x8x2_t* %agg.result, i32 0, i32 0
+ store <8 x i8> %lane, <8 x i8>* %r8, align 8
+ %r11 = getelementptr inbounds %struct.__neon_int8x8x2_t, %struct.__neon_int8x8x2_t* %agg.result, i32 0, i32 1
+ store <8 x i8> %lane1, <8 x i8>* %r11, align 8
+ ret void
+}
+
+define void @vld2dupi8_postinc_fixed(%struct.__neon_int8x8x2_t* noalias nocapture sret %agg.result, i8** noalias nocapture %a) nounwind {
+entry:
+;CHECK-LABEL: vld2dupi8_postinc_fixed:
+;CHECK: vld2.8 {d16[], d17[]}, [r2]!
+ %0 = load i8*, i8** %a, align 4
+ %vld_dup = tail call %struct.__neon_int8x8x2_t @llvm.arm.neon.vld2lane.v8i8.p0i8(i8* %0, <8 x i8> undef, <8 x i8> undef, i32 0, i32 1)
+ %1 = extractvalue %struct.__neon_int8x8x2_t %vld_dup, 0
+ %lane = shufflevector <8 x i8> %1, <8 x i8> undef, <8 x i32> zeroinitializer
+ %2 = extractvalue %struct.__neon_int8x8x2_t %vld_dup, 1
+ %lane1 = shufflevector <8 x i8> %2, <8 x i8> undef, <8 x i32> zeroinitializer
+ %add.ptr = getelementptr inbounds i8, i8* %0, i32 2
+ store i8* %add.ptr, i8** %a, align 4
+ %r7 = getelementptr inbounds %struct.__neon_int8x8x2_t, %struct.__neon_int8x8x2_t* %agg.result, i32 0, i32 0
+ store <8 x i8> %lane, <8 x i8>* %r7, align 8
+ %r10 = getelementptr inbounds %struct.__neon_int8x8x2_t, %struct.__neon_int8x8x2_t* %agg.result, i32 0, i32 1
+ store <8 x i8> %lane1, <8 x i8>* %r10, align 8
+ ret void
+}
+
+define void @vld2dupi8_postinc_variable(%struct.__neon_int8x8x2_t* noalias nocapture sret %agg.result, i8** noalias nocapture %a, i32 %n) nounwind {
+entry:
+;CHECK-LABEL: vld2dupi8_postinc_variable:
+;CHECK: vld2.8 {d16[], d17[]}, [r3], r2
+ %0 = load i8*, i8** %a, align 4
+ %vld_dup = tail call %struct.__neon_int8x8x2_t @llvm.arm.neon.vld2lane.v8i8.p0i8(i8* %0, <8 x i8> undef, <8 x i8> undef, i32 0, i32 1)
+ %1 = extractvalue %struct.__neon_int8x8x2_t %vld_dup, 0
+ %lane = shufflevector <8 x i8> %1, <8 x i8> undef, <8 x i32> zeroinitializer
+ %2 = extractvalue %struct.__neon_int8x8x2_t %vld_dup, 1
+ %lane1 = shufflevector <8 x i8> %2, <8 x i8> undef, <8 x i32> zeroinitializer
+ %add.ptr = getelementptr inbounds i8, i8* %0, i32 %n
+ store i8* %add.ptr, i8** %a, align 4
+ %r7 = getelementptr inbounds %struct.__neon_int8x8x2_t, %struct.__neon_int8x8x2_t* %agg.result, i32 0, i32 0
+ store <8 x i8> %lane, <8 x i8>* %r7, align 8
+ %r10 = getelementptr inbounds %struct.__neon_int8x8x2_t, %struct.__neon_int8x8x2_t* %agg.result, i32 0, i32 1
+ store <8 x i8> %lane1, <8 x i8>* %r10, align 8
+ ret void
+}
+
define <4 x i16> @vld2dupi16(i8* %A) nounwind {
;CHECK-LABEL: vld2dupi16:
;Check that a power-of-two alignment smaller than the total size of the memory
diff --git a/llvm/test/CodeGen/ARM/vmul.ll b/llvm/test/CodeGen/ARM/vmul.ll
index 0455190b4c9..fcffe175e2b 100644
--- a/llvm/test/CodeGen/ARM/vmul.ll
+++ b/llvm/test/CodeGen/ARM/vmul.ll
@@ -635,13 +635,26 @@ entry:
ret void
}
-define void @foo(<4 x float> * %a, <4 x float>* nocapture %dst, float* nocapture readonly %src) nounwind {
-; Look for doing a normal scalar FP load rather than an to-all-lanes load.
-; e.g., "ldr s0, [r2]" rathern than "vld1.32 {d18[], d19[]}, [r2:32]"
-; Then check that the vector multiply has folded the splat to all lanes
-; and used a vector * scalar instruction.
-; CHECK: vldr {{s[0-9]+}}, [r2]
+define void @fmul_splat(<4 x float> * %a, <4 x float>* nocapture %dst, float %tmp) nounwind {
+; Look for a scalar float rather than a splat, then a vector*scalar multiply.
+; CHECK: vmov s0, r2
; CHECK: vmul.f32 q8, q8, d0[0]
+ %tmp5 = load <4 x float>, <4 x float>* %a, align 4
+ %tmp6 = insertelement <4 x float> undef, float %tmp, i32 0
+ %tmp7 = insertelement <4 x float> %tmp6, float %tmp, i32 1
+ %tmp8 = insertelement <4 x float> %tmp7, float %tmp, i32 2
+ %tmp9 = insertelement <4 x float> %tmp8, float %tmp, i32 3
+ %tmp10 = fmul <4 x float> %tmp9, %tmp5
+ store <4 x float> %tmp10, <4 x float>* %dst, align 4
+ ret void
+}
+
+define void @fmul_splat_load(<4 x float> * %a, <4 x float>* nocapture %dst, float* nocapture readonly %src) nounwind {
+; Look for doing a normal scalar FP load rather than a to-all-lanes load,
+; then a vector*scalar multiply.
+; FIXME: Temporarily broken due to splat representation changes.
+; CHECK: vld1.32 {d18[], d19[]}, [r2:32]
+; CHECK: vmul.f32 q8, q9, q8
%tmp = load float, float* %src, align 4
%tmp5 = load <4 x float>, <4 x float>* %a, align 4
%tmp6 = insertelement <4 x float> undef, float %tmp, i32 0