diff options
author | Eli Friedman <efriedma@codeaurora.org> | 2016-12-16 18:44:08 +0000 |
---|---|---|
committer | Eli Friedman <efriedma@codeaurora.org> | 2016-12-16 18:44:08 +0000 |
commit | f624ec27b75701f1227eb03a44f3da84920c798d (patch) | |
tree | d10b422365e2b461b34e1a53f1f29a72cb5b5b00 /llvm/test/CodeGen/ARM/vlddup.ll | |
parent | 79b4f0ad9cc5b019cf71bac388f1da1de4dd4e34 (diff) | |
download | bcm5719-llvm-f624ec27b75701f1227eb03a44f3da84920c798d.tar.gz bcm5719-llvm-f624ec27b75701f1227eb03a44f3da84920c798d.zip |
[ARM] Add ARMISD::VLD1DUP to match vld1_dup more consistently.
Currently, there are substantial problems forming vld1_dup even if the
VDUP survives legalization. The lack of an actual node
leads to terrible results: not only can we not form post-increment vld1_dup
instructions, but we form scalar pre-increment and post-increment
loads which force the loaded value into a GPR. This patch fixes that
by combining the vdup+load into an ARMISD node before DAGCombine
messes it up.
Also includes a crash fix for vld2_dup (see testcase @vld2dupi8_postinc_variable).
Recommitting with fix to avoid forming vld1dup if the type of the load
doesn't match the type of the vdup (see
https://llvm.org/bugs/show_bug.cgi?id=31404).
Differential Revision: https://reviews.llvm.org/D27694
llvm-svn: 289972
Diffstat (limited to 'llvm/test/CodeGen/ARM/vlddup.ll')
-rw-r--r-- | llvm/test/CodeGen/ARM/vlddup.ll | 204 |
1 file changed, 204 insertions, 0 deletions
diff --git a/llvm/test/CodeGen/ARM/vlddup.ll b/llvm/test/CodeGen/ARM/vlddup.ll index c115a3863d0..c6d5747f350 100644 --- a/llvm/test/CodeGen/ARM/vlddup.ll +++ b/llvm/test/CodeGen/ARM/vlddup.ll @@ -10,6 +10,84 @@ define <8 x i8> @vld1dupi8(i8* %A) nounwind { ret <8 x i8> %tmp3 } +define <8 x i8> @vld1dupi8_preinc(i8** noalias nocapture %a, i32 %b) nounwind { +entry: +;CHECK-LABEL: vld1dupi8_preinc: +;CHECK: vld1.8 {d16[]}, [r1] + %0 = load i8*, i8** %a, align 4 + %add.ptr = getelementptr inbounds i8, i8* %0, i32 %b + %1 = load i8, i8* %add.ptr, align 1 + %2 = insertelement <8 x i8> undef, i8 %1, i32 0 + %lane = shufflevector <8 x i8> %2, <8 x i8> undef, <8 x i32> zeroinitializer + store i8* %add.ptr, i8** %a, align 4 + ret <8 x i8> %lane +} + +define <8 x i8> @vld1dupi8_postinc_fixed(i8** noalias nocapture %a) nounwind { +entry: +;CHECK-LABEL: vld1dupi8_postinc_fixed: +;CHECK: vld1.8 {d16[]}, [r1]! + %0 = load i8*, i8** %a, align 4 + %1 = load i8, i8* %0, align 1 + %2 = insertelement <8 x i8> undef, i8 %1, i32 0 + %lane = shufflevector <8 x i8> %2, <8 x i8> undef, <8 x i32> zeroinitializer + %add.ptr = getelementptr inbounds i8, i8* %0, i32 1 + store i8* %add.ptr, i8** %a, align 4 + ret <8 x i8> %lane +} + +define <8 x i8> @vld1dupi8_postinc_register(i8** noalias nocapture %a, i32 %n) nounwind { +entry: +;CHECK-LABEL: vld1dupi8_postinc_register: +;CHECK: vld1.8 {d16[]}, [r2], r1 + %0 = load i8*, i8** %a, align 4 + %1 = load i8, i8* %0, align 1 + %2 = insertelement <8 x i8> undef, i8 %1, i32 0 + %lane = shufflevector <8 x i8> %2, <8 x i8> undef, <8 x i32> zeroinitializer + %add.ptr = getelementptr inbounds i8, i8* %0, i32 %n + store i8* %add.ptr, i8** %a, align 4 + ret <8 x i8> %lane +} + +define <16 x i8> @vld1dupqi8_preinc(i8** noalias nocapture %a, i32 %b) nounwind { +entry: +;CHECK-LABEL: vld1dupqi8_preinc: +;CHECK: vld1.8 {d16[], d17[]}, [r1] + %0 = load i8*, i8** %a, align 4 + %add.ptr = getelementptr inbounds i8, i8* %0, i32 %b + %1 = load i8, i8* %add.ptr, 
align 1 + %2 = insertelement <16 x i8> undef, i8 %1, i32 0 + %lane = shufflevector <16 x i8> %2, <16 x i8> undef, <16 x i32> zeroinitializer + store i8* %add.ptr, i8** %a, align 4 + ret <16 x i8> %lane +} + +define <16 x i8> @vld1dupqi8_postinc_fixed(i8** noalias nocapture %a) nounwind { +entry: +;CHECK-LABEL: vld1dupqi8_postinc_fixed: +;CHECK: vld1.8 {d16[], d17[]}, [r1]! + %0 = load i8*, i8** %a, align 4 + %1 = load i8, i8* %0, align 1 + %2 = insertelement <16 x i8> undef, i8 %1, i32 0 + %lane = shufflevector <16 x i8> %2, <16 x i8> undef, <16 x i32> zeroinitializer + %add.ptr = getelementptr inbounds i8, i8* %0, i32 1 + store i8* %add.ptr, i8** %a, align 4 + ret <16 x i8> %lane +} + +define <16 x i8> @vld1dupqi8_postinc_register(i8** noalias nocapture %a, i32 %n) nounwind { +entry: +;CHECK-LABEL: vld1dupqi8_postinc_register: +;CHECK: vld1.8 {d16[], d17[]}, [r2], r1 + %0 = load i8*, i8** %a, align 4 + %1 = load i8, i8* %0, align 1 + %2 = insertelement <16 x i8> undef, i8 %1, i32 0 + %lane = shufflevector <16 x i8> %2, <16 x i8> undef, <16 x i32> zeroinitializer + %add.ptr = getelementptr inbounds i8, i8* %0, i32 %n + store i8* %add.ptr, i8** %a, align 4 + ret <16 x i8> %lane +} + define <4 x i16> @vld1dupi16(i16* %A) nounwind { ;CHECK-LABEL: vld1dupi16: ;Check the alignment value. Max for this instruction is 16 bits: @@ -20,6 +98,51 @@ define <4 x i16> @vld1dupi16(i16* %A) nounwind { ret <4 x i16> %tmp3 } +define <4 x i16> @vld1dupi16_misaligned(i16* %A) nounwind { +;CHECK-LABEL: vld1dupi16_misaligned: +;CHECK: vld1.16 {d16[]}, [r0] + %tmp1 = load i16, i16* %A, align 1 + %tmp2 = insertelement <4 x i16> undef, i16 %tmp1, i32 0 + %tmp3 = shufflevector <4 x i16> %tmp2, <4 x i16> undef, <4 x i32> zeroinitializer + ret <4 x i16> %tmp3 +} + +; This sort of looks like a vld1dup, but there's an extension in the way. 
+define <4 x i16> @load_i16_dup_zext(i8* %A) nounwind { +;CHECK-LABEL: load_i16_dup_zext: +;CHECK: ldrb r0, [r0] +;CHECK-NEXT: vdup.16 d16, r0 + %tmp1 = load i8, i8* %A, align 1 + %tmp2 = zext i8 %tmp1 to i16 + %tmp3 = insertelement <4 x i16> undef, i16 %tmp2, i32 0 + %tmp4 = shufflevector <4 x i16> %tmp3, <4 x i16> undef, <4 x i32> zeroinitializer + ret <4 x i16> %tmp4 +} + +; This sort of looks like a vld1dup, but there's an extension in the way. +define <4 x i16> @load_i16_dup_sext(i8* %A) nounwind { +;CHECK-LABEL: load_i16_dup_sext: +;CHECK: ldrsb r0, [r0] +;CHECK-NEXT: vdup.16 d16, r0 + %tmp1 = load i8, i8* %A, align 1 + %tmp2 = sext i8 %tmp1 to i16 + %tmp3 = insertelement <4 x i16> undef, i16 %tmp2, i32 0 + %tmp4 = shufflevector <4 x i16> %tmp3, <4 x i16> undef, <4 x i32> zeroinitializer + ret <4 x i16> %tmp4 +} + +; This sort of looks like a vld1dup, but there's an extension in the way. +define <8 x i16> @load_i16_dupq_zext(i8* %A) nounwind { +;CHECK-LABEL: load_i16_dupq_zext: +;CHECK: ldrb r0, [r0] +;CHECK-NEXT: vdup.16 q8, r0 + %tmp1 = load i8, i8* %A, align 1 + %tmp2 = zext i8 %tmp1 to i16 + %tmp3 = insertelement <8 x i16> undef, i16 %tmp2, i32 0 + %tmp4 = shufflevector <8 x i16> %tmp3, <8 x i16> undef, <8 x i32> zeroinitializer + ret <8 x i16> %tmp4 +} + define <2 x i32> @vld1dupi32(i32* %A) nounwind { ;CHECK-LABEL: vld1dupi32: ;Check the alignment value. Max for this instruction is 32 bits: @@ -30,6 +153,30 @@ define <2 x i32> @vld1dupi32(i32* %A) nounwind { ret <2 x i32> %tmp3 } +; This sort of looks like a vld1dup, but there's an extension in the way. 
+define <4 x i32> @load_i32_dup_zext(i8* %A) nounwind { +;CHECK-LABEL: load_i32_dup_zext: +;CHECK: ldrb r0, [r0] +;CHECK-NEXT: vdup.32 q8, r0 + %tmp1 = load i8, i8* %A, align 1 + %tmp2 = zext i8 %tmp1 to i32 + %tmp3 = insertelement <4 x i32> undef, i32 %tmp2, i32 0 + %tmp4 = shufflevector <4 x i32> %tmp3, <4 x i32> undef, <4 x i32> zeroinitializer + ret <4 x i32> %tmp4 +} + +; This sort of looks like a vld1dup, but there's an extension in the way. +define <4 x i32> @load_i32_dup_sext(i8* %A) nounwind { +;CHECK-LABEL: load_i32_dup_sext: +;CHECK: ldrsb r0, [r0] +;CHECK-NEXT: vdup.32 q8, r0 + %tmp1 = load i8, i8* %A, align 1 + %tmp2 = sext i8 %tmp1 to i32 + %tmp3 = insertelement <4 x i32> undef, i32 %tmp2, i32 0 + %tmp4 = shufflevector <4 x i32> %tmp3, <4 x i32> undef, <4 x i32> zeroinitializer + ret <4 x i32> %tmp4 +} + define <2 x float> @vld1dupf(float* %A) nounwind { ;CHECK-LABEL: vld1dupf: ;CHECK: vld1.32 {d16[]}, [r0:32] @@ -75,6 +222,63 @@ define <8 x i8> @vld2dupi8(i8* %A) nounwind { ret <8 x i8> %tmp5 } +define void @vld2dupi8_preinc(%struct.__neon_int8x8x2_t* noalias nocapture sret %agg.result, i8** noalias nocapture %a, i32 %b) nounwind { +;CHECK-LABEL: vld2dupi8_preinc: +;CHECK: vld2.8 {d16[], d17[]}, [r2] +entry: + %0 = load i8*, i8** %a, align 4 + %add.ptr = getelementptr inbounds i8, i8* %0, i32 %b + %vld_dup = tail call %struct.__neon_int8x8x2_t @llvm.arm.neon.vld2lane.v8i8.p0i8(i8* %add.ptr, <8 x i8> undef, <8 x i8> undef, i32 0, i32 1) + %1 = extractvalue %struct.__neon_int8x8x2_t %vld_dup, 0 + %lane = shufflevector <8 x i8> %1, <8 x i8> undef, <8 x i32> zeroinitializer + %2 = extractvalue %struct.__neon_int8x8x2_t %vld_dup, 1 + %lane1 = shufflevector <8 x i8> %2, <8 x i8> undef, <8 x i32> zeroinitializer + store i8* %add.ptr, i8** %a, align 4 + %r8 = getelementptr inbounds %struct.__neon_int8x8x2_t, %struct.__neon_int8x8x2_t* %agg.result, i32 0, i32 0 + store <8 x i8> %lane, <8 x i8>* %r8, align 8 + %r11 = getelementptr inbounds 
%struct.__neon_int8x8x2_t, %struct.__neon_int8x8x2_t* %agg.result, i32 0, i32 1 + store <8 x i8> %lane1, <8 x i8>* %r11, align 8 + ret void +} + +define void @vld2dupi8_postinc_fixed(%struct.__neon_int8x8x2_t* noalias nocapture sret %agg.result, i8** noalias nocapture %a) nounwind { +entry: +;CHECK-LABEL: vld2dupi8_postinc_fixed: +;CHECK: vld2.8 {d16[], d17[]}, [r2]! + %0 = load i8*, i8** %a, align 4 + %vld_dup = tail call %struct.__neon_int8x8x2_t @llvm.arm.neon.vld2lane.v8i8.p0i8(i8* %0, <8 x i8> undef, <8 x i8> undef, i32 0, i32 1) + %1 = extractvalue %struct.__neon_int8x8x2_t %vld_dup, 0 + %lane = shufflevector <8 x i8> %1, <8 x i8> undef, <8 x i32> zeroinitializer + %2 = extractvalue %struct.__neon_int8x8x2_t %vld_dup, 1 + %lane1 = shufflevector <8 x i8> %2, <8 x i8> undef, <8 x i32> zeroinitializer + %add.ptr = getelementptr inbounds i8, i8* %0, i32 2 + store i8* %add.ptr, i8** %a, align 4 + %r7 = getelementptr inbounds %struct.__neon_int8x8x2_t, %struct.__neon_int8x8x2_t* %agg.result, i32 0, i32 0 + store <8 x i8> %lane, <8 x i8>* %r7, align 8 + %r10 = getelementptr inbounds %struct.__neon_int8x8x2_t, %struct.__neon_int8x8x2_t* %agg.result, i32 0, i32 1 + store <8 x i8> %lane1, <8 x i8>* %r10, align 8 + ret void +} + +define void @vld2dupi8_postinc_variable(%struct.__neon_int8x8x2_t* noalias nocapture sret %agg.result, i8** noalias nocapture %a, i32 %n) nounwind { +entry: +;CHECK-LABEL: vld2dupi8_postinc_variable: +;CHECK: vld2.8 {d16[], d17[]}, [r3], r2 + %0 = load i8*, i8** %a, align 4 + %vld_dup = tail call %struct.__neon_int8x8x2_t @llvm.arm.neon.vld2lane.v8i8.p0i8(i8* %0, <8 x i8> undef, <8 x i8> undef, i32 0, i32 1) + %1 = extractvalue %struct.__neon_int8x8x2_t %vld_dup, 0 + %lane = shufflevector <8 x i8> %1, <8 x i8> undef, <8 x i32> zeroinitializer + %2 = extractvalue %struct.__neon_int8x8x2_t %vld_dup, 1 + %lane1 = shufflevector <8 x i8> %2, <8 x i8> undef, <8 x i32> zeroinitializer + %add.ptr = getelementptr inbounds i8, i8* %0, i32 %n + store i8* 
%add.ptr, i8** %a, align 4 + %r7 = getelementptr inbounds %struct.__neon_int8x8x2_t, %struct.__neon_int8x8x2_t* %agg.result, i32 0, i32 0 + store <8 x i8> %lane, <8 x i8>* %r7, align 8 + %r10 = getelementptr inbounds %struct.__neon_int8x8x2_t, %struct.__neon_int8x8x2_t* %agg.result, i32 0, i32 1 + store <8 x i8> %lane1, <8 x i8>* %r10, align 8 + ret void +} + define <4 x i16> @vld2dupi16(i8* %A) nounwind { ;CHECK-LABEL: vld2dupi16: ;Check that a power-of-two alignment smaller than the total size of the memory |