From ab99b59e8ca28f5366fb95b497e64ae44d67a9ca Mon Sep 17 00:00:00 2001
From: Jeroen Ketema
Date: Wed, 30 Sep 2015 10:56:37 +0000
Subject: [ARM][NEON] Use address space in vld([1234]|[234]lane) and
 vst([1234]|[234]lane) instructions

This commit changes the interface of the vld[1234], vld[234]lane, and
vst[1234], vst[234]lane ARM NEON intrinsics and associates an address
space with the pointer that these intrinsics take. This changes, e.g.,

  <2 x i32> @llvm.arm.neon.vld1.v2i32(i8*, i32)

to

  <2 x i32> @llvm.arm.neon.vld1.v2i32.p0i8(i8*, i32)

This change ensures that address spaces are fully taken into account in
the ARM target during lowering of interleaved loads and stores.

Differential Revision: http://reviews.llvm.org/D12985

llvm-svn: 248887
---
 llvm/test/CodeGen/ARM/2010-05-20-NEONSpillCrash.ll |  24 ++--
 llvm/test/CodeGen/ARM/2010-05-21-BuildVector.ll    |   4 +-
 .../test/CodeGen/ARM/2010-06-11-vmovdrr-bitcast.ll |   4 +-
 .../ARM/2010-06-29-PartialRedefFastAlloc.ll        |   4 +-
 .../test/CodeGen/ARM/2011-08-12-vmovqqqq-pseudo.ll |   4 +-
 .../CodeGen/ARM/2012-01-24-RegSequenceLiveRange.ll |  10 +-
 .../CodeGen/ARM/2012-05-10-PreferVMOVtoVDUP32.ll   |   4 +-
 .../CodeGen/ARM/2012-08-27-CopyPhysRegCrash.ll     |  14 +--
 llvm/test/CodeGen/ARM/2013-10-11-select-stalls.ll  |   6 +-
 .../ARM/2014-01-09-pseudo_expand_implicit_reg.ll   |   4 +-
 llvm/test/CodeGen/ARM/arm-interleaved-accesses.ll  |  21 ++++
 llvm/test/CodeGen/ARM/coalesce-subregs.ll          |  38 +++---
 llvm/test/CodeGen/ARM/dagcombine-concatvector.ll   |   4 +-
 llvm/test/CodeGen/ARM/neon_spill.ll                |   6 +-
 llvm/test/CodeGen/ARM/out-of-registers.ll          |   8 +-
 llvm/test/CodeGen/ARM/reg_sequence.ll              |  64 +++++-----
 llvm/test/CodeGen/ARM/spill-q.ll                   |  28 ++---
 llvm/test/CodeGen/ARM/vcge.ll                      |   4 +-
 llvm/test/CodeGen/ARM/vector-DAGCombine.ll         |   4 +-
 llvm/test/CodeGen/ARM/vld-vst-upgrade.ll           | 139 +++++++++++++++++++++
 llvm/test/CodeGen/ARM/vld1.ll                      |  52 ++++----
 llvm/test/CodeGen/ARM/vld2.ll                      |  40 +++---
 llvm/test/CodeGen/ARM/vld3.ll                      |  42 +++----
 llvm/test/CodeGen/ARM/vld4.ll                      |  42 +++----
 llvm/test/CodeGen/ARM/vlddup.ll                    |  30 ++---
 llvm/test/CodeGen/ARM/vldlane.ll                   |  92 +++++++-------
 llvm/test/CodeGen/ARM/vmov.ll                      |   4 +-
 llvm/test/CodeGen/ARM/vmul.ll                      |  14 +--
 llvm/test/CodeGen/ARM/vst1.ll                      |  48 +++----
 llvm/test/CodeGen/ARM/vst2.ll                      |  44 +++----
 llvm/test/CodeGen/ARM/vst3.ll                      |  42 +++----
 llvm/test/CodeGen/ARM/vst4.ll                      |  42 +++----
 llvm/test/CodeGen/ARM/vstlane.ll                   |  90 ++++------
 33 files changed, 568 insertions(+), 408 deletions(-)
 create mode 100644 llvm/test/CodeGen/ARM/vld-vst-upgrade.ll

diff --git a/llvm/test/CodeGen/ARM/2010-05-20-NEONSpillCrash.ll b/llvm/test/CodeGen/ARM/2010-05-20-NEONSpillCrash.ll
index cfaffd8234b..171b6d2bcc5 100644
--- a/llvm/test/CodeGen/ARM/2010-05-20-NEONSpillCrash.ll
+++ b/llvm/test/CodeGen/ARM/2010-05-20-NEONSpillCrash.ll
@@ -1,36 +1,36 @@
 ; RUN: llc -mtriple=arm-eabi -mattr=+neon -O0 -optimize-regalloc -regalloc=basic %s -o /dev/null

 ; This test would crash the rewriter when trying to handle a spill after one of
-; the @llvm.arm.neon.vld3.v8i8 defined three parts of a register.
+; the @llvm.arm.neon.vld3.v8i8.p0i8 defined three parts of a register.

 %struct.__neon_int8x8x3_t = type { <8 x i8>, <8 x i8>, <8 x i8> }

-declare %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8(i8*, i32) nounwind readonly
+declare %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8.p0i8(i8*, i32) nounwind readonly

-declare void @llvm.arm.neon.vst3.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, i32) nounwind
+declare void @llvm.arm.neon.vst3.p0i8.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, i32) nounwind

 define <8 x i8> @t3(i8* %A1, i8* %A2, i8* %A3, i8* %A4, i8* %A5, i8* %A6, i8* %A7, i8* %A8, i8* %B) nounwind {
-  %tmp1b = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8(i8* %A2, i32 1) ; <%struct.__neon_int8x8x3_t> [#uses=2]
+  %tmp1b = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8.p0i8(i8* %A2, i32 1) ; <%struct.__neon_int8x8x3_t> [#uses=2]
   %tmp2b = extractvalue %struct.__neon_int8x8x3_t %tmp1b, 0 ; <<8 x i8>> [#uses=1]
   %tmp4b = extractvalue %struct.__neon_int8x8x3_t %tmp1b, 1 ; <<8 x i8>> [#uses=1]
-  %tmp1d = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8(i8* %A4, i32 1) ; <%struct.__neon_int8x8x3_t> [#uses=2]
+  %tmp1d = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8.p0i8(i8* %A4, i32 1) ; <%struct.__neon_int8x8x3_t> [#uses=2]
   %tmp2d = extractvalue %struct.__neon_int8x8x3_t %tmp1d, 0 ; <<8 x i8>> [#uses=1]
   %tmp4d = extractvalue %struct.__neon_int8x8x3_t %tmp1d, 1 ; <<8 x i8>> [#uses=1]
-  %tmp1e = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8(i8* %A5, i32 1) ; <%struct.__neon_int8x8x3_t> [#uses=1]
+  %tmp1e = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8.p0i8(i8* %A5, i32 1) ; <%struct.__neon_int8x8x3_t> [#uses=1]
   %tmp2e = extractvalue %struct.__neon_int8x8x3_t %tmp1e, 0 ; <<8 x i8>> [#uses=1]
-  %tmp1f = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8(i8* %A6, i32 1) ; <%struct.__neon_int8x8x3_t> [#uses=1]
+  %tmp1f = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8.p0i8(i8* %A6, i32 1) ; <%struct.__neon_int8x8x3_t> [#uses=1]
   %tmp2f = extractvalue %struct.__neon_int8x8x3_t %tmp1f, 0 ; <<8 x i8>> [#uses=1]
-  %tmp1g = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8(i8* %A7, i32 1) ; <%struct.__neon_int8x8x3_t> [#uses=2]
+  %tmp1g = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8.p0i8(i8* %A7, i32 1) ; <%struct.__neon_int8x8x3_t> [#uses=2]
   %tmp2g = extractvalue %struct.__neon_int8x8x3_t %tmp1g, 0 ; <<8 x i8>> [#uses=1]
   %tmp4g = extractvalue %struct.__neon_int8x8x3_t %tmp1g, 1 ; <<8 x i8>> [#uses=1]
-  %tmp1h = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8(i8* %A8, i32 1) ; <%struct.__neon_int8x8x3_t> [#uses=2]
+  %tmp1h = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8.p0i8(i8* %A8, i32 1) ; <%struct.__neon_int8x8x3_t> [#uses=2]
   %tmp2h = extractvalue %struct.__neon_int8x8x3_t %tmp1h, 0 ; <<8 x i8>> [#uses=1]
   %tmp3h = extractvalue %struct.__neon_int8x8x3_t %tmp1h, 2 ; <<8 x i8>> [#uses=1]
   %tmp2bd = add <8 x i8> %tmp2b, %tmp2d ; <<8 x i8>> [#uses=1]
   %tmp4bd = add <8 x i8> %tmp4b, %tmp4d ; <<8 x i8>> [#uses=1]
   %tmp2abcd = mul <8 x i8> undef, %tmp2bd ; <<8 x i8>> [#uses=1]
   %tmp4abcd = mul <8 x i8> undef, %tmp4bd ; <<8 x i8>> [#uses=2]
-  call void @llvm.arm.neon.vst3.v8i8(i8* %A1, <8 x i8> %tmp4abcd, <8 x i8> zeroinitializer, <8 x i8> %tmp2abcd, i32 1)
+  call void @llvm.arm.neon.vst3.p0i8.v8i8(i8* %A1, <8 x i8> %tmp4abcd, <8 x i8> zeroinitializer, <8 x i8> %tmp2abcd, i32 1)
   %tmp2ef = sub <8 x i8> %tmp2e, %tmp2f ; <<8 x i8>> [#uses=1]
   %tmp2gh = sub <8 x i8> %tmp2g, %tmp2h ; <<8 x i8>> [#uses=1]
   %tmp3gh = sub <8 x i8> zeroinitializer, %tmp3h ; <<8 x i8>> [#uses=1]
@@ -38,8 +38,8 @@ define <8 x i8> @t3(i8* %A1, i8* %A2, i8* %A3, i8* %A4, i8* %A5, i8* %A6, i8* %A
   %tmp2efgh = mul <8 x i8> %tmp2ef, %tmp2gh ; <<8 x i8>> [#uses=1]
   %tmp3efgh = mul <8 x i8> undef, %tmp3gh ; <<8 x i8>> [#uses=1]
   %tmp4efgh = mul <8 x i8> %tmp4ef, undef ; <<8 x i8>> [#uses=2]
-  call void @llvm.arm.neon.vst3.v8i8(i8* %A2, <8 x i8> %tmp4efgh, <8 x i8> %tmp3efgh, <8 x i8> %tmp2efgh, i32 1)
+  call void @llvm.arm.neon.vst3.p0i8.v8i8(i8* %A2, <8 x i8> %tmp4efgh, <8 x i8> %tmp3efgh, <8 x i8> %tmp2efgh, i32 1)
   %tmp4 = sub <8 x i8> %tmp4efgh, %tmp4abcd ; <<8 x i8>> [#uses=1]
-  tail call void @llvm.arm.neon.vst3.v8i8(i8* %B, <8 x i8> zeroinitializer, <8 x i8> undef, <8 x i8> undef, i32 1)
+  tail call void @llvm.arm.neon.vst3.p0i8.v8i8(i8* %B, <8 x i8> zeroinitializer, <8 x i8> undef, <8 x i8> undef, i32 1)
   ret <8 x i8> %tmp4
 }
diff --git a/llvm/test/CodeGen/ARM/2010-05-21-BuildVector.ll b/llvm/test/CodeGen/ARM/2010-05-21-BuildVector.ll
index 6a6ccf3d0a0..c6c0e2caee4 100644
--- a/llvm/test/CodeGen/ARM/2010-05-21-BuildVector.ll
+++ b/llvm/test/CodeGen/ARM/2010-05-21-BuildVector.ll
@@ -36,8 +36,8 @@ entry:
   %tmp5 = insertelement <4 x float> %tmp7, float %18, i32 3
   %19 = fmul <4 x float> %tmp5, %2
   %20 = bitcast float* %fltp to i8*
-  tail call void @llvm.arm.neon.vst1.v4f32(i8* %20, <4 x float> %19, i32 1)
+  tail call void @llvm.arm.neon.vst1.p0i8.v4f32(i8* %20, <4 x float> %19, i32 1)
   ret void
 }

-declare void @llvm.arm.neon.vst1.v4f32(i8*, <4 x float>, i32) nounwind
+declare void @llvm.arm.neon.vst1.p0i8.v4f32(i8*, <4 x float>, i32) nounwind
diff --git a/llvm/test/CodeGen/ARM/2010-06-11-vmovdrr-bitcast.ll b/llvm/test/CodeGen/ARM/2010-06-11-vmovdrr-bitcast.ll
index f86c3ba9ef6..1deb98631a4 100644
--- a/llvm/test/CodeGen/ARM/2010-06-11-vmovdrr-bitcast.ll
+++ b/llvm/test/CodeGen/ARM/2010-06-11-vmovdrr-bitcast.ll
@@ -12,8 +12,8 @@ entry:
   %tmp9 = trunc i128 %tmp8 to i64 ; <i64> [#uses=1]
   %tmp16.i = bitcast i64 %tmp6 to <8 x i8> ; <<8 x i8>> [#uses=1]
   %tmp20.i = bitcast i64 %tmp9 to <8 x i8> ; <<8 x i8>> [#uses=1]
-  tail call void @llvm.arm.neon.vst2.v8i8(i8* %b, <8 x i8> %tmp16.i, <8 x i8> %tmp20.i, i32 1) nounwind
+  tail call void @llvm.arm.neon.vst2.p0i8.v8i8(i8* %b, <8 x i8> %tmp16.i, <8 x i8> %tmp20.i, i32 1) nounwind
   ret void
 }

-declare void @llvm.arm.neon.vst2.v8i8(i8*, <8 x i8>, <8 x i8>, i32) nounwind
+declare void @llvm.arm.neon.vst2.p0i8.v8i8(i8*, <8 x i8>, <8 x i8>, i32) nounwind
diff --git a/llvm/test/CodeGen/ARM/2010-06-29-PartialRedefFastAlloc.ll b/llvm/test/CodeGen/ARM/2010-06-29-PartialRedefFastAlloc.ll
index 1aee5088eee..130221d38c2 100644
--- a/llvm/test/CodeGen/ARM/2010-06-29-PartialRedefFastAlloc.ll
+++ b/llvm/test/CodeGen/ARM/2010-06-29-PartialRedefFastAlloc.ll
@@ -16,10 +16,10 @@ target triple = "thumbv7-apple-darwin10"

 define i32 @test(i8* %arg) nounwind {
 entry:
-  %0 = call <2 x i64> @llvm.arm.neon.vld1.v2i64(i8* %arg, i32 1)
+  %0 = call <2 x i64> @llvm.arm.neon.vld1.v2i64.p0i8(i8* %arg, i32 1)
   %1 = shufflevector <2 x i64> undef, <2 x i64> %0, <2 x i32> <i32 1, i32 2>
   store <2 x i64> %1, <2 x i64>* undef, align 16
   ret i32 undef
 }

-declare <2 x i64> @llvm.arm.neon.vld1.v2i64(i8*, i32) nounwind readonly
+declare <2 x i64> @llvm.arm.neon.vld1.v2i64.p0i8(i8*, i32) nounwind readonly
diff --git a/llvm/test/CodeGen/ARM/2011-08-12-vmovqqqq-pseudo.ll b/llvm/test/CodeGen/ARM/2011-08-12-vmovqqqq-pseudo.ll
index 3cbc4cdcd70..d702af7c0c7 100644
--- a/llvm/test/CodeGen/ARM/2011-08-12-vmovqqqq-pseudo.ll
+++ b/llvm/test/CodeGen/ARM/2011-08-12-vmovqqqq-pseudo.ll
@@ -4,9 +4,9 @@
 define void @test_vmovqqqq_pseudo() nounwind ssp {
 entry:
-  %vld3_lane = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld3lane.v8i16(i8* undef, <8 x i16> undef, <8 x i16> undef, <8 x i16> zeroinitializer, i32 7, i32 2)
+  %vld3_lane = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld3lane.v8i16.p0i8(i8* undef, <8 x i16> undef, <8 x i16> undef, <8 x i16> zeroinitializer, i32 7, i32 2)
   store { <8 x i16>, <8 x i16>, <8 x i16> } %vld3_lane, { <8 x i16>, <8 x i16>, <8 x i16> }* undef
   ret void
 }

-declare { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld3lane.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>, i32, i32) nounwind readonly
+declare { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld3lane.v8i16.p0i8(i8*, <8 x i16>, <8 x i16>, <8 x i16>, i32, i32) nounwind readonly
diff --git a/llvm/test/CodeGen/ARM/2012-01-24-RegSequenceLiveRange.ll b/llvm/test/CodeGen/ARM/2012-01-24-RegSequenceLiveRange.ll
index b70b7f6f3b2..f622ceb584e 100644
--- a/llvm/test/CodeGen/ARM/2012-01-24-RegSequenceLiveRange.ll
+++ b/llvm/test/CodeGen/ARM/2012-01-24-RegSequenceLiveRange.ll
@@ -52,8 +52,8 @@ cond.end295: ; preds = %entry
   %shuffle.i35.i.i = shufflevector <2 x i64> undef, <2 x i64> undef, <1 x i32> zeroinitializer
   %shuffle.i34.i.i = shufflevector <1 x i64> %shuffle.i36.i.i, <1 x i64> %shuffle.i35.i.i, <2 x i32> <i32 0, i32 1>
   %2 = bitcast <2 x i64> %shuffle.i34.i.i to <4 x float>
-  tail call void @llvm.arm.neon.vst1.v4f32(i8* undef, <4 x float> %0, i32 4) nounwind
-  tail call void @llvm.arm.neon.vst1.v4f32(i8* undef, <4 x float> %2, i32 4) nounwind
+  tail call void @llvm.arm.neon.vst1.p0i8.v4f32(i8* undef, <4 x float> %0, i32 4) nounwind
+  tail call void @llvm.arm.neon.vst1.p0i8.v4f32(i8* undef, <4 x float> %2, i32 4) nounwind
   unreachable

 for.end: ; preds = %entry
@@ -63,10 +63,10 @@ for.end: ; preds = %entry
 ; Check that pseudo-expansion preserves flags.
 define void @foo3(i8* %p) nounwind ssp {
 entry:
-  tail call void @llvm.arm.neon.vst2.v4f32(i8* %p, <4 x float> undef, <4 x float> undef, i32 4)
+  tail call void @llvm.arm.neon.vst2.p0i8.v4f32(i8* %p, <4 x float> undef, <4 x float> undef, i32 4)
   ret void
 }

 declare arm_aapcs_vfpcc void @bar(i8*, float, float, float)
-declare void @llvm.arm.neon.vst1.v4f32(i8*, <4 x float>, i32) nounwind
-declare void @llvm.arm.neon.vst2.v4f32(i8*, <4 x float>, <4 x float>, i32) nounwind
+declare void @llvm.arm.neon.vst1.p0i8.v4f32(i8*, <4 x float>, i32) nounwind
+declare void @llvm.arm.neon.vst2.p0i8.v4f32(i8*, <4 x float>, <4 x float>, i32) nounwind
diff --git a/llvm/test/CodeGen/ARM/2012-05-10-PreferVMOVtoVDUP32.ll b/llvm/test/CodeGen/ARM/2012-05-10-PreferVMOVtoVDUP32.ll
index 7f30ae10e43..606af47a3d8 100644
--- a/llvm/test/CodeGen/ARM/2012-05-10-PreferVMOVtoVDUP32.ll
+++ b/llvm/test/CodeGen/ARM/2012-05-10-PreferVMOVtoVDUP32.ll
@@ -7,8 +7,8 @@ entry:
   %vecinit.i = insertelement <2 x i32> undef, i32 %x, i32 0
   %vecinit1.i = insertelement <2 x i32> %vecinit.i, i32 %x, i32 1
   %0 = bitcast i32* %p to i8*
-  tail call void @llvm.arm.neon.vst1.v2i32(i8* %0, <2 x i32> %vecinit1.i, i32 4)
+  tail call void @llvm.arm.neon.vst1.p0i8.v2i32(i8* %0, <2 x i32> %vecinit1.i, i32 4)
   ret void
 }

-declare void @llvm.arm.neon.vst1.v2i32(i8*, <2 x i32>, i32) nounwind
+declare void @llvm.arm.neon.vst1.p0i8.v2i32(i8*, <2 x i32>, i32) nounwind
diff --git a/llvm/test/CodeGen/ARM/2012-08-27-CopyPhysRegCrash.ll b/llvm/test/CodeGen/ARM/2012-08-27-CopyPhysRegCrash.ll
index 545bfc73c59..6cff67614c6 100644
--- a/llvm/test/CodeGen/ARM/2012-08-27-CopyPhysRegCrash.ll
+++ b/llvm/test/CodeGen/ARM/2012-08-27-CopyPhysRegCrash.ll
@@ -5,9 +5,9 @@
 target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:32:64-v128:32:128-a0:0:32-n32-S32"
 target triple = "thumbv7-apple-ios5.1.0"

-declare { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld3.v16i8(i8*, i32) nounwind readonly
+declare { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld3.v16i8.p0i8(i8*, i32) nounwind readonly

-declare void @llvm.arm.neon.vst1.v16i8(i8*, <16 x i8>, i32) nounwind
+declare void @llvm.arm.neon.vst1.p0i8.v16i8(i8*, <16 x i8>, i32) nounwind

 define void @findEdges(i8*) nounwind ssp {
   %2 = icmp sgt i32 undef, 0
@@ -19,16 +19,16 @@ define void @findEdges(i8*) nounwind ssp {
 ;
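
A note on the naming scheme used throughout the diffs above: the trailing .p0i8 token mangles the pointer parameter into the intrinsic name (i8 pointer in address space 0), which is what makes non-default address spaces expressible at all. A minimal sketch of how a non-default address space would look under this scheme; the .p1i8 variant and the @load_v2i32_as1 wrapper are hypothetical illustrations, not declarations taken from this patch:

  ; Old style: address space 0 is hard-wired into the i8* parameter.
  ;   declare <2 x i32> @llvm.arm.neon.vld1.v2i32(i8*, i32)
  ;
  ; New style: .p1i8 spells out "i8 pointer in address space 1".
  declare <2 x i32> @llvm.arm.neon.vld1.v2i32.p1i8(i8 addrspace(1)*, i32)

  ; Hypothetical caller: loads <2 x i32> from address space 1 with
  ; 4-byte alignment (the trailing i32 argument).
  define <2 x i32> @load_v2i32_as1(i8 addrspace(1)* %p) {
    %v = call <2 x i32> @llvm.arm.neon.vld1.v2i32.p1i8(i8 addrspace(1)* %p, i32 4)
    ret <2 x i32> %v
  }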