diff options
author | Artem Belevich <tra@google.com> | 2017-02-21 22:56:05 +0000 |
---|---|---|
committer | Artem Belevich <tra@google.com> | 2017-02-21 22:56:05 +0000 |
commit | 29bbdc1c32498459bb87d26dde267f5c3ec06603 (patch) | |
tree | e0b55471270c001fa99fdf17c9d91fc3d8d62950 /llvm/test | |
parent | 7d6b71db4f7297940f0cf7787c0e2ad2ddb41252 (diff) | |
download | bcm5719-llvm-29bbdc1c32498459bb87d26dde267f5c3ec06603.tar.gz bcm5719-llvm-29bbdc1c32498459bb87d26dde267f5c3ec06603.zip |
[NVPTX] Unify vectorization of load/stores of aggregate arguments and return values.
Original code only used vector loads/stores for explicit vector arguments.
It could also do more loads/stores than necessary (e.g. v5f32 would
touch 8 f32 values). Aggregate types were loaded one element at a time,
even the vectors contained within them.
This change attempts to generalize (and simplify) parameter space
loads/stores so that vector loads/stores can be used more broadly.
Functionality of the patch has been verified by compiling the Thrust
test suite and manually checking the differences between the PTX
generated by LLVM with and without the patch.
General algorithm:
* ComputePTXValueVTs() flattens each input/output argument into a flat
list of scalars to load/store and returns their types and offsets.
* VectorizePTXValueVTs() uses that data to create a vectorization plan,
returning an array of flags that marks the boundaries of vectorized
loads/stores. Scalars are represented as 1-element vectors.
* Code that generates loads/stores implements a simple state machine
that constructs a vector according to the plan.
Differential Revision: https://reviews.llvm.org/D30011
llvm-svn: 295784
Diffstat (limited to 'llvm/test')
-rw-r--r-- | llvm/test/CodeGen/NVPTX/aggregate-return.ll | 35 | ||||
-rw-r--r-- | llvm/test/CodeGen/NVPTX/f16-instructions.ll | 2 | ||||
-rw-r--r-- | llvm/test/CodeGen/NVPTX/ldparam-v4.ll | 5 | ||||
-rw-r--r-- | llvm/test/CodeGen/NVPTX/lower-aggr-copies.ll | 27 | ||||
-rw-r--r-- | llvm/test/CodeGen/NVPTX/param-load-store.ll | 813 | ||||
-rw-r--r-- | llvm/test/CodeGen/NVPTX/vec-param-load.ll | 83 | ||||
-rw-r--r-- | llvm/test/CodeGen/NVPTX/vec8.ll | 13 | ||||
-rw-r--r-- | llvm/test/CodeGen/NVPTX/vector-call.ll | 22 |
8 files changed, 964 insertions, 36 deletions
diff --git a/llvm/test/CodeGen/NVPTX/aggregate-return.ll b/llvm/test/CodeGen/NVPTX/aggregate-return.ll index 527c5c9aa85..785b4d6d90d 100644 --- a/llvm/test/CodeGen/NVPTX/aggregate-return.ll +++ b/llvm/test/CodeGen/NVPTX/aggregate-return.ll @@ -1,21 +1,40 @@ ; RUN: llc < %s -march=nvptx64 -mcpu=sm_35 | FileCheck %s declare <2 x float> @barv(<2 x float> %input) +declare <3 x float> @barv3(<3 x float> %input) declare [2 x float] @bara([2 x float] %input) declare {float, float} @bars({float, float} %input) -define void @foov(<2 x float> %input, <2 x float>* %output) { -; CHECK-LABEL: @foov +define void @test_v2f32(<2 x float> %input, <2 x float>* %output) { +; CHECK-LABEL: @test_v2f32 %call = tail call <2 x float> @barv(<2 x float> %input) ; CHECK: .param .align 8 .b8 retval0[8]; -; CHECK: ld.param.v2.f32 {[[ELEMV1:%f[0-9]+]], [[ELEMV2:%f[0-9]+]]}, [retval0+0]; +; CHECK: ld.param.v2.f32 {[[E0:%f[0-9]+]], [[E1:%f[0-9]+]]}, [retval0+0]; store <2 x float> %call, <2 x float>* %output, align 8 -; CHECK: st.v2.f32 [{{%rd[0-9]+}}], {[[ELEMV1]], [[ELEMV2]]} +; CHECK: st.v2.f32 [{{%rd[0-9]+}}], {[[E0]], [[E1]]} ret void } -define void @fooa([2 x float] %input, [2 x float]* %output) { -; CHECK-LABEL: @fooa +define void @test_v3f32(<3 x float> %input, <3 x float>* %output) { +; CHECK-LABEL: @test_v3f32 +; + %call = tail call <3 x float> @barv3(<3 x float> %input) +; CHECK: .param .align 16 .b8 retval0[16]; +; CHECK-DAG: ld.param.v2.f32 {[[E0:%f[0-9]+]], [[E1:%f[0-9]+]]}, [retval0+0]; +; CHECK-DAG: ld.param.f32 [[E2:%f[0-9]+]], [retval0+8]; +; Make sure we don't load more values than than we need to. +; CHECK-NOT: ld.param.f32 [[E3:%f[0-9]+]], [retval0+12]; + store <3 x float> %call, <3 x float>* %output, align 8 +; CHECK-DAG: st.f32 [{{%rd[0-9]}}+8], +; -- This is suboptimal. We should do st.v2.f32 instead +; of combining 2xf32 info i64. 
+; CHECK-DAG: st.u64 [{{%rd[0-9]}}], +; CHECK: ret; + ret void +} + +define void @test_a2f32([2 x float] %input, [2 x float]* %output) { +; CHECK-LABEL: @test_a2f32 %call = tail call [2 x float] @bara([2 x float] %input) ; CHECK: .param .align 4 .b8 retval0[8]; ; CHECK-DAG: ld.param.f32 [[ELEMA1:%f[0-9]+]], [retval0+0]; @@ -28,8 +47,8 @@ define void @fooa([2 x float] %input, [2 x float]* %output) { ; CHECK: ret } -define void @foos({float, float} %input, {float, float}* %output) { -; CHECK-LABEL: @foos +define void @test_s2f32({float, float} %input, {float, float}* %output) { +; CHECK-LABEL: @test_s2f32 %call = tail call {float, float} @bars({float, float} %input) ; CHECK: .param .align 4 .b8 retval0[8]; ; CHECK-DAG: ld.param.f32 [[ELEMS1:%f[0-9]+]], [retval0+0]; diff --git a/llvm/test/CodeGen/NVPTX/f16-instructions.ll b/llvm/test/CodeGen/NVPTX/f16-instructions.ll index b94fd17e91f..5fce2961127 100644 --- a/llvm/test/CodeGen/NVPTX/f16-instructions.ll +++ b/llvm/test/CodeGen/NVPTX/f16-instructions.ll @@ -229,7 +229,7 @@ define half @test_tailcall_flipped(half %a, half %b) #0 { ; CHECK-LABEL: test_select( ; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_select_param_0]; ; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_select_param_1]; -; CHECK: setp.eq.b16 [[PRED:%p[0-9]+]], %rs{{.*}}, 1; +; CHECK-DAG: setp.eq.b16 [[PRED:%p[0-9]+]], %rs{{.*}}, 1; ; CHECK-NEXT: selp.b16 [[R:%h[0-9]+]], [[A]], [[B]], [[PRED]]; ; CHECK-NEXT: st.param.b16 [func_retval0+0], [[R]]; ; CHECK-NEXT: ret; diff --git a/llvm/test/CodeGen/NVPTX/ldparam-v4.ll b/llvm/test/CodeGen/NVPTX/ldparam-v4.ll index ec306aafe85..4d082f6e9a5 100644 --- a/llvm/test/CodeGen/NVPTX/ldparam-v4.ll +++ b/llvm/test/CodeGen/NVPTX/ldparam-v4.ll @@ -2,8 +2,11 @@ declare <4 x float> @bar() +; CHECK-LABEL: .func foo( define void @foo(<4 x float>* %ptr) { -; CHECK: ld.param.v4.f32 +; CHECK: ld.param.u32 %[[PTR:r[0-9]+]], [foo_param_0]; +; CHECK: ld.param.v4.f32 {[[E0:%f[0-9]+]], [[E1:%f[0-9]+]], [[E2:%f[0-9]+]], 
[[E3:%f[0-9]+]]}, [retval0+0]; +; CHECK: st.v4.f32 [%[[PTR]]], {[[E0]], [[E1]], [[E2]], [[E3]]} %val = tail call <4 x float> @bar() store <4 x float> %val, <4 x float>* %ptr ret void diff --git a/llvm/test/CodeGen/NVPTX/lower-aggr-copies.ll b/llvm/test/CodeGen/NVPTX/lower-aggr-copies.ll index c487c9bab4b..f593969f2a4 100644 --- a/llvm/test/CodeGen/NVPTX/lower-aggr-copies.ll +++ b/llvm/test/CodeGen/NVPTX/lower-aggr-copies.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -march=nvptx64 -mcpu=sm_35 | FileCheck %s --check-prefix PTX +; RUN: llc < %s -march=nvptx64 -mcpu=sm_35 -O0 | FileCheck %s --check-prefix PTX ; RUN: opt < %s -S -nvptx-lower-aggr-copies | FileCheck %s --check-prefix IR ; Verify that the NVPTXLowerAggrCopies pass works as expected - calls to @@ -27,9 +27,9 @@ entry: ; PTX: LBB[[LABEL:[_0-9]+]]: ; PTX: ld.u8 %rs[[REG:[0-9]+]] ; PTX: st.u8 [%rd{{[0-9]+}}], %rs[[REG]] -; PTX: add.s64 %rd[[COUNTER:[0-9]+]], %rd[[COUNTER]], 1 -; PTX-NEXT: setp.lt.u64 %p[[PRED:[0-9]+]], %rd[[COUNTER]], %rd -; PTX-NEXT: @%p[[PRED]] bra LBB[[LABEL]] +; PTX: add.s64 %rd[[COUNTER:[0-9]+]], %rd{{[0-9]+}}, 1 +; PTX: setp.lt.u64 %p[[PRED:[0-9]+]], %rd[[COUNTER]], %rd +; PTX: @%p[[PRED]] bra LBB[[LABEL]] } define i8* @memcpy_volatile_caller(i8* %dst, i8* %src, i64 %n) #0 { @@ -45,9 +45,9 @@ entry: ; PTX: LBB[[LABEL:[_0-9]+]]: ; PTX: ld.volatile.u8 %rs[[REG:[0-9]+]] ; PTX: st.volatile.u8 [%rd{{[0-9]+}}], %rs[[REG]] -; PTX: add.s64 %rd[[COUNTER:[0-9]+]], %rd[[COUNTER]], 1 -; PTX-NEXT: setp.lt.u64 %p[[PRED:[0-9]+]], %rd[[COUNTER]], %rd -; PTX-NEXT: @%p[[PRED]] bra LBB[[LABEL]] +; PTX: add.s64 %rd[[COUNTER:[0-9]+]], %rd{{[0-9]+}}, 1 +; PTX: setp.lt.u64 %p[[PRED:[0-9]+]], %rd[[COUNTER]], %rd +; PTX: @%p[[PRED]] bra LBB[[LABEL]] } define i8* @memcpy_casting_caller(i32* %dst, i32* %src, i64 %n) #0 { @@ -78,12 +78,13 @@ entry: ; IR-NEXT: store i8 [[VAL]], i8* [[STOREPTR]] ; PTX-LABEL: .visible .func (.param .b64 func_retval0) memset_caller( -; PTX: ld.param.u8 %rs[[REG:[0-9]+]] +; PTX: ld.param.u32 
%r[[C:[0-9]+]] +; PTX: cvt.u16.u32 %rs[[REG:[0-9]+]], %r[[C]]; ; PTX: LBB[[LABEL:[_0-9]+]]: ; PTX: st.u8 [%rd{{[0-9]+}}], %rs[[REG]] -; PTX: add.s64 %rd[[COUNTER:[0-9]+]], %rd[[COUNTER]], 1 -; PTX-NEXT: setp.lt.u64 %p[[PRED:[0-9]+]], %rd[[COUNTER]], %rd -; PTX-NEXT: @%p[[PRED]] bra LBB[[LABEL]] +; PTX: add.s64 %rd[[COUNTER:[0-9]+]], %rd{{[0-9]+}}, 1 +; PTX: setp.lt.u64 %p[[PRED:[0-9]+]], %rd[[COUNTER]], %rd +; PTX: @%p[[PRED]] bra LBB[[LABEL]] } define i8* @volatile_memset_caller(i8* %dst, i32 %c, i64 %n) #0 { @@ -118,7 +119,7 @@ entry: ; PTX-NEXT: @%p[[SRC_GT_THAN_DST]] bra LBB[[FORWARD_BB:[0-9_]+]] ; -- this is the backwards copying BB ; PTX: @%p[[NEQ0]] bra LBB[[EXIT:[0-9_]+]] -; PTX: add.s64 %rd[[N]], %rd[[N]], -1 +; PTX: add.s64 %rd{{[0-9]}}, %rd{{[0-9]}}, -1 ; PTX: ld.u8 %rs[[ELEMENT:[0-9]+]] ; PTX: st.u8 [%rd{{[0-9]+}}], %rs[[ELEMENT]] ; -- this is the forwards copying BB @@ -126,7 +127,7 @@ entry: ; PTX: @%p[[NEQ0]] bra LBB[[EXIT]] ; PTX: ld.u8 %rs[[ELEMENT2:[0-9]+]] ; PTX: st.u8 [%rd{{[0-9]+}}], %rs[[ELEMENT2]] -; PTX: add.s64 %rd[[INDEX:[0-9]+]], %rd[[INDEX]], 1 +; PTX: add.s64 %rd{{[0-9]+}}, %rd{{[0-9]+}}, 1 ; -- exit block ; PTX: LBB[[EXIT]]: ; PTX-NEXT: st.param.b64 [func_retval0 diff --git a/llvm/test/CodeGen/NVPTX/param-load-store.ll b/llvm/test/CodeGen/NVPTX/param-load-store.ll new file mode 100644 index 00000000000..a6defdcb478 --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/param-load-store.ll @@ -0,0 +1,813 @@ +; Verifies correctness of load/store of parameters and return values. +; RUN: llc < %s -march=nvptx64 -mcpu=sm_35 -O0 | FileCheck %s + +%s_i1 = type { i1 } +%s_i8 = type { i8 } +%s_i16 = type { i16 } +%s_half = type { half } +%s_i32 = type { i32 } +%s_float = type { float } +%s_i64 = type { i64 } +%s_f64 = type { double } + +; More complicated types. i64 is used to increase natural alignment +; requirement for the type. 
+%s_i32x4 = type { i32, i32, i32, i32, i64} +%s_i32f32 = type { i32, float, i32, float, i64} +%s_i8i32x4 = type { i32, i32, i8, i32, i32, i64} +%s_i8i32x4p = type <{ i32, i32, i8, i32, i32, i64}> +%s_crossfield = type { i32, [2 x i32], <4 x i32>, [3 x {i32, i32, i32}]} +; All scalar parameters must be at least 32 bits in size. +; i1 is loaded/stored as i8. + +; CHECK: .func (.param .b32 func_retval0) +; CHECK-LABEL: test_i1( +; CHECK-NEXT: .param .b32 test_i1_param_0 +; CHECK: ld.param.u8 [[A8:%r[0-9]+]], [test_i1_param_0]; +; CHECK: and.b32 [[A:%r[0-9]+]], [[A8]], 1; +; CHECK: .param .b32 param0; +; CHECK: st.param.b32 [param0+0], [[A]] +; CHECK: .param .b32 retval0; +; CHECK: call.uni +; CHECK-NEXT: test_i1, +; CHECK: ld.param.b32 [[R8:%r[0-9]+]], [retval0+0]; +; CHECK: and.b32 [[R:%r[0-9]+]], [[R8]], 1; +; CHECK: st.param.b32 [func_retval0+0], [[R]]; +; CHECK: ret; +define i1 @test_i1(i1 %a) { + %r = tail call i1 @test_i1(i1 %a); + ret i1 %r; +} + +; Signed i1 is a somewhat special case. We only care about one bit and +; then us neg.s32 to convert it to 32-bit -1 if it's set. +; CHECK: .func (.param .b32 func_retval0) +; CHECK-LABEL: test_i1s( +; CHECK-NEXT: .param .b32 test_i1s_param_0 +; CHECK: ld.param.u8 [[A8:%rs[0-9]+]], [test_i1s_param_0]; +; CHECK: cvt.u32.u16 [[A32:%r[0-9]+]], [[A8]]; +; CHECK: and.b32 [[A1:%r[0-9]+]], [[A32]], 1; +; CHECK: neg.s32 [[A:%r[0-9]+]], [[A1]]; +; CHECK: .param .b32 param0; +; CHECK: st.param.b32 [param0+0], [[A]]; +; CHECK: .param .b32 retval0; +; CHECK: call.uni +; CHECK: ld.param.b32 [[R8:%r[0-9]+]], [retval0+0]; +; CHECK: and.b32 [[R1:%r[0-9]+]], [[R8]], 1; +; CHECK: neg.s32 [[R:%r[0-9]+]], [[R1]]; +; CHECK: st.param.b32 [func_retval0+0], [[R]]; +; CHECK-NEXT: ret; +define signext i1 @test_i1s(i1 signext %a) { + %r = tail call signext i1 @test_i1s(i1 signext %a); + ret i1 %r; +} + +; Make sure that i1 loads are vectorized as i8 loads, respecting each element alignment. 
+; CHECK: .func (.param .align 4 .b8 func_retval0[4]) +; CHECK-LABEL: test_v3i1( +; CHECK-NEXT: .param .align 4 .b8 test_v3i1_param_0[4] +; CHECK-DAG: ld.param.u8 [[E2:%rs[0-9]+]], [test_v3i1_param_0+2]; +; CHECK-DAG: ld.param.v2.u8 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]]}, [test_v3i1_param_0] +; CHECK: .param .align 4 .b8 param0[4]; +; CHECK-DAG: st.param.v2.b8 [param0+0], {[[E0]], [[E1]]}; +; CHECK-DAG: st.param.b8 [param0+2], [[E2]]; +; CHECK: .param .align 4 .b8 retval0[4]; +; CHECK: call.uni (retval0), +; CHECK-NEXT: test_v3i1, +; CHECK-DAG: ld.param.v2.b8 {[[RE0:%rs[0-9]+]], [[RE1:%rs[0-9]+]]}, [retval0+0]; +; CHECK-DAG: ld.param.b8 [[RE2:%rs[0-9]+]], [retval0+2]; +; CHECK-DAG: st.param.v2.b8 [func_retval0+0], {[[RE0]], [[RE1]]} +; CHECK-DAG: st.param.b8 [func_retval0+2], [[RE2]]; +; CHECK-NEXT: ret; +define <3 x i1> @test_v3i1(<3 x i1> %a) { + %r = tail call <3 x i1> @test_v3i1(<3 x i1> %a); + ret <3 x i1> %r; +} + +; CHECK: .func (.param .align 4 .b8 func_retval0[4]) +; CHECK-LABEL: test_v4i1( +; CHECK-NEXT: .param .align 4 .b8 test_v4i1_param_0[4] +; CHECK: ld.param.v4.u8 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]], [[E2:%rs[0-9]+]], [[E3:%rs[0-9]+]]}, [test_v4i1_param_0] +; CHECK: .param .align 4 .b8 param0[4]; +; CHECK: st.param.v4.b8 [param0+0], {[[E0]], [[E1]], [[E2]], [[E3]]}; +; CHECK: .param .align 4 .b8 retval0[4]; +; CHECK: call.uni (retval0), +; CHECK: test_v4i1, +; CHECK: ld.param.v4.b8 {[[RE0:%rs[0-9]+]], [[RE1:%rs[0-9]+]], [[RE2:%rs[0-9]+]], [[RE3:%rs[0-9]+]]}, [retval0+0]; +; CHECK: st.param.v4.b8 [func_retval0+0], {[[RE0]], [[RE1]], [[RE2]], [[RE3]]}; +; CHECK-NEXT: ret; +define <4 x i1> @test_v4i1(<4 x i1> %a) { + %r = tail call <4 x i1> @test_v4i1(<4 x i1> %a); + ret <4 x i1> %r; +} + +; CHECK: .func (.param .align 8 .b8 func_retval0[8]) +; CHECK-LABEL: test_v5i1( +; CHECK-NEXT: .param .align 8 .b8 test_v5i1_param_0[8] +; CHECK-DAG: ld.param.u8 [[E4:%rs[0-9]+]], [test_v5i1_param_0+4]; +; CHECK-DAG: ld.param.v4.u8 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]], 
[[E2:%rs[0-9]+]], [[E3:%rs[0-9]+]]}, [test_v5i1_param_0] +; CHECK: .param .align 8 .b8 param0[8]; +; CHECK-DAG: st.param.v4.b8 [param0+0], {[[E0]], [[E1]], [[E2]], [[E3]]}; +; CHECK-DAG: st.param.b8 [param0+4], [[E4]]; +; CHECK: .param .align 8 .b8 retval0[8]; +; CHECK: call.uni (retval0), +; CHECK-NEXT: test_v5i1, +; CHECK-DAG: ld.param.v4.b8 {[[RE0:%rs[0-9]+]], [[RE1:%rs[0-9]+]], [[RE2:%rs[0-9]+]], [[RE3:%rs[0-9]+]]}, [retval0+0]; +; CHECK-DAG: ld.param.b8 [[RE4:%rs[0-9]+]], [retval0+4]; +; CHECK-DAG: st.param.v4.b8 [func_retval0+0], {[[RE0]], [[RE1]], [[RE2]], [[RE3]]} +; CHECK-DAG: st.param.b8 [func_retval0+4], [[RE4]]; +; CHECK-NEXT: ret; +define <5 x i1> @test_v5i1(<5 x i1> %a) { + %r = tail call <5 x i1> @test_v5i1(<5 x i1> %a); + ret <5 x i1> %r; +} + +; Unsigned i8 is loaded directly into 32-bit register. +; CHECK: .func (.param .b32 func_retval0) +; CHECK-LABEL: test_i8( +; CHECK-NEXT: .param .b32 test_i8_param_0 +; CHECK: ld.param.u8 [[A8:%rs[0-9]+]], [test_i8_param_0]; +; CHECK: cvt.u32.u16 [[A32:%r[0-9]+]], [[A8]]; +; CHECK: and.b32 [[A:%r[0-9]+]], [[A32]], 255; +; CHECK: .param .b32 param0; +; CHECK: st.param.b32 [param0+0], [[A]]; +; CHECK: .param .b32 retval0; +; CHECK: call.uni (retval0), +; CHECK: test_i8, +; CHECK: ld.param.b32 [[R32:%r[0-9]+]], [retval0+0]; +; CHECK: and.b32 [[R:%r[0-9]+]], [[R32]], 255; +; CHECK: st.param.b32 [func_retval0+0], [[R]]; +; CHECK-NEXT: ret; +define i8 @test_i8(i8 %a) { + %r = tail call i8 @test_i8(i8 %a); + ret i8 %r; +} + +; signed i8 is loaded into 16-bit register which is then sign-extended to i32. 
+; CHECK: .func (.param .b32 func_retval0) +; CHECK-LABEL: test_i8s( +; CHECK-NEXT: .param .b32 test_i8s_param_0 +; CHECK: ld.param.s8 [[A8:%rs[0-9]+]], [test_i8s_param_0]; +; CHECK: cvt.s32.s16 [[A:%r[0-9]+]], [[A8]]; +; CHECK: .param .b32 param0; +; CHECK: st.param.b32 [param0+0], [[A]]; +; CHECK: .param .b32 retval0; +; CHECK: call.uni (retval0), +; CHECK: test_i8s, +; CHECK: ld.param.b32 [[R32:%r[0-9]+]], [retval0+0]; +; -- This is suspicious (though correct) -- why not cvt.u8.u32, cvt.s8.s32 ? +; CHECK: cvt.u16.u32 [[R16:%rs[0-9]+]], [[R32]]; +; CHECK: cvt.s32.s16 [[R:%r[0-9]+]], [[R16]]; +; CHECK: st.param.b32 [func_retval0+0], [[R]]; +; CHECK-NEXT: ret; +define signext i8 @test_i8s(i8 signext %a) { + %r = tail call signext i8 @test_i8s(i8 signext %a); + ret i8 %r; +} + +; CHECK: .func (.param .align 4 .b8 func_retval0[4]) +; CHECK-LABEL: test_v3i8( +; CHECK-NEXT: .param .align 4 .b8 test_v3i8_param_0[4] +; CHECK-DAG: ld.param.u8 [[E2:%rs[0-9]+]], [test_v3i8_param_0+2]; +; CHECK-DAG: ld.param.v2.u8 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]]}, [test_v3i8_param_0]; +; CHECK: .param .align 4 .b8 param0[4]; +; CHECK: st.param.v2.b8 [param0+0], {[[E0]], [[E1]]}; +; CHECK: st.param.b8 [param0+2], [[E2]]; +; CHECK: .param .align 4 .b8 retval0[4]; +; CHECK: call.uni (retval0), +; CHECK-NEXT: test_v3i8, +; CHECK-DAG: ld.param.v2.b8 {[[RE0:%rs[0-9]+]], [[RE1:%rs[0-9]+]]}, [retval0+0]; +; CHECK-DAG: ld.param.b8 [[RE2:%rs[0-9]+]], [retval0+2]; +; CHECK-DAG: st.param.v2.b8 [func_retval0+0], {[[RE0]], [[RE1]]}; +; CHECK-DAG: st.param.b8 [func_retval0+2], [[RE2]]; +; CHECK-NEXT: ret; +define <3 x i8> @test_v3i8(<3 x i8> %a) { + %r = tail call <3 x i8> @test_v3i8(<3 x i8> %a); + ret <3 x i8> %r; +} + +; CHECK: .func (.param .align 4 .b8 func_retval0[4]) +; CHECK-LABEL: test_v4i8( +; CHECK-NEXT: .param .align 4 .b8 test_v4i8_param_0[4] +; CHECK: ld.param.v4.u8 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]], [[E2:%rs[0-9]+]], [[E3:%rs[0-9]+]]}, [test_v4i8_param_0] +; CHECK: .param .align 4 .b8 
param0[4]; +; CHECK: st.param.v4.b8 [param0+0], {[[E0]], [[E1]], [[E2]], [[E3]]}; +; CHECK: .param .align 4 .b8 retval0[4]; +; CHECK: call.uni (retval0), +; CHECK-NEXT: test_v4i8, +; CHECK: ld.param.v4.b8 {[[RE0:%rs[0-9]+]], [[RE1:%rs[0-9]+]], [[RE2:%rs[0-9]+]], [[RE3:%rs[0-9]+]]}, [retval0+0]; +; CHECK: st.param.v4.b8 [func_retval0+0], {[[RE0]], [[RE1]], [[RE2]], [[RE3]]} +; CHECK-NEXT: ret; +define <4 x i8> @test_v4i8(<4 x i8> %a) { + %r = tail call <4 x i8> @test_v4i8(<4 x i8> %a); + ret <4 x i8> %r; +} + +; CHECK: .func (.param .align 8 .b8 func_retval0[8]) +; CHECK-LABEL: test_v5i8( +; CHECK-NEXT: .param .align 8 .b8 test_v5i8_param_0[8] +; CHECK-DAG: ld.param.u8 [[E4:%rs[0-9]+]], [test_v5i8_param_0+4]; +; CHECK-DAG ld.param.v4.u8 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]], [[E2:%rs[0-9]+]], [[E3:%rs[0-9]+]]}, [test_v5i8_param_0] +; CHECK: .param .align 8 .b8 param0[8]; +; CHECK-DAG: st.param.v4.b8 [param0+0], {[[E0]], [[E1]], [[E2]], [[E3]]}; +; CHECK-DAG: st.param.b8 [param0+4], [[E4]]; +; CHECK: .param .align 8 .b8 retval0[8]; +; CHECK: call.uni (retval0), +; CHECK-NEXT: test_v5i8, +; CHECK-DAG: ld.param.v4.b8 {[[RE0:%rs[0-9]+]], [[RE1:%rs[0-9]+]], [[RE2:%rs[0-9]+]], [[RE3:%rs[0-9]+]]}, [retval0+0]; +; CHECK-DAG: ld.param.b8 [[RE4:%rs[0-9]+]], [retval0+4]; +; CHECK-DAG: st.param.v4.b8 [func_retval0+0], {[[RE0]], [[RE1]], [[RE2]], [[RE3]]} +; CHECK-DAG: st.param.b8 [func_retval0+4], [[RE4]]; +; CHECK-NEXT: ret; +define <5 x i8> @test_v5i8(<5 x i8> %a) { + %r = tail call <5 x i8> @test_v5i8(<5 x i8> %a); + ret <5 x i8> %r; +} + +; CHECK: .func (.param .b32 func_retval0) +; CHECK-LABEL: test_i16( +; CHECK-NEXT: .param .b32 test_i16_param_0 +; CHECK: ld.param.u16 [[E16:%rs[0-9]+]], [test_i16_param_0]; +; CHECK: cvt.u32.u16 [[E32:%r[0-9]+]], [[E16]]; +; CHECK: .param .b32 param0; +; CHECK: st.param.b32 [param0+0], [[E32]]; +; CHECK: .param .b32 retval0; +; CHECK: call.uni (retval0), +; CHECK-NEXT: test_i16, +; CHECK: ld.param.b32 [[RE32:%r[0-9]+]], [retval0+0]; +; 
CHECK: and.b32 [[R:%r[0-9]+]], [[RE32]], 65535; +; CHECK: st.param.b32 [func_retval0+0], [[R]]; +; CHECK-NEXT: ret; +define i16 @test_i16(i16 %a) { + %r = tail call i16 @test_i16(i16 %a); + ret i16 %r; +} + +; CHECK: .func (.param .b32 func_retval0) +; CHECK-LABEL: test_i16s( +; CHECK-NEXT: .param .b32 test_i16s_param_0 +; CHECK: ld.param.u16 [[E16:%rs[0-9]+]], [test_i16s_param_0]; +; CHECK: cvt.s32.s16 [[E32:%r[0-9]+]], [[E16]]; +; CHECK: .param .b32 param0; +; CHECK: st.param.b32 [param0+0], [[E32]]; +; CHECK: .param .b32 retval0; +; CHECK: call.uni (retval0), +; CHECK-NEXT: test_i16s, +; CHECK: ld.param.b32 [[RE32:%r[0-9]+]], [retval0+0]; +; CHECK: cvt.s32.s16 [[R:%r[0-9]+]], [[RE32]]; +; CHECK: st.param.b32 [func_retval0+0], [[R]]; +; CHECK-NEXT: ret; +define signext i16 @test_i16s(i16 signext %a) { + %r = tail call signext i16 @test_i16s(i16 signext %a); + ret i16 %r; +} + +; CHECK: .func (.param .align 8 .b8 func_retval0[8]) +; CHECK-LABEL: test_v3i16( +; CHECK-NEXT: .param .align 8 .b8 test_v3i16_param_0[8] +; CHECK-DAG: ld.param.u16 [[E2:%rs[0-9]+]], [test_v3i16_param_0+4]; +; CHECK-DAG: ld.param.v2.u16 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]]}, [test_v3i16_param_0]; +; CHECK: .param .align 8 .b8 param0[8]; +; CHECK: st.param.v2.b16 [param0+0], {[[E0]], [[E1]]}; +; CHECK: st.param.b16 [param0+4], [[E2]]; +; CHECK: .param .align 8 .b8 retval0[8]; +; CHECK: call.uni (retval0), +; CHECK-NEXT: test_v3i16, +; CHECK: ld.param.v2.b16 {[[RE0:%rs[0-9]+]], [[RE1:%rs[0-9]+]]}, [retval0+0]; +; CHECK: ld.param.b16 [[RE2:%rs[0-9]+]], [retval0+4]; +; CHECK-DAG: st.param.v2.b16 [func_retval0+0], {[[RE0]], [[RE1]]}; +; CHECK-DAG: st.param.b16 [func_retval0+4], [[RE2]]; +; CHECK-NEXT: ret; +define <3 x i16> @test_v3i16(<3 x i16> %a) { + %r = tail call <3 x i16> @test_v3i16(<3 x i16> %a); + ret <3 x i16> %r; +} + +; CHECK: .func (.param .align 8 .b8 func_retval0[8]) +; CHECK-LABEL: test_v4i16( +; CHECK-NEXT: .param .align 8 .b8 test_v4i16_param_0[8] +; CHECK: ld.param.v4.u16 
{[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]], [[E2:%rs[0-9]+]], [[E3:%rs[0-9]+]]}, [test_v4i16_param_0] +; CHECK: .param .align 8 .b8 param0[8]; +; CHECK: st.param.v4.b16 [param0+0], {[[E0]], [[E1]], [[E2]], [[E3]]}; +; CHECK: .param .align 8 .b8 retval0[8]; +; CHECK: call.uni (retval0), +; CHECK-NEXT: test_v4i16, +; CHECK: ld.param.v4.b16 {[[RE0:%rs[0-9]+]], [[RE1:%rs[0-9]+]], [[RE2:%rs[0-9]+]], [[RE3:%rs[0-9]+]]}, [retval0+0]; +; CHECK: st.param.v4.b16 [func_retval0+0], {[[RE0]], [[RE1]], [[RE2]], [[RE3]]} +; CHECK-NEXT: ret; +define <4 x i16> @test_v4i16(<4 x i16> %a) { + %r = tail call <4 x i16> @test_v4i16(<4 x i16> %a); + ret <4 x i16> %r; +} + +; CHECK: .func (.param .align 16 .b8 func_retval0[16]) +; CHECK-LABEL: test_v5i16( +; CHECK-NEXT: .param .align 16 .b8 test_v5i16_param_0[16] +; CHECK-DAG: ld.param.u16 [[E4:%rs[0-9]+]], [test_v5i16_param_0+8]; +; CHECK-DAG ld.param.v4.u16 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]], [[E2:%rs[0-9]+]], [[E3:%rs[0-9]+]]}, [test_v5i16_param_0] +; CHECK: .param .align 16 .b8 param0[16]; +; CHECK-DAG: st.param.v4.b16 [param0+0], {[[E0]], [[E1]], [[E2]], [[E3]]}; +; CHECK-DAG: st.param.b16 [param0+8], [[E4]]; +; CHECK: .param .align 16 .b8 retval0[16]; +; CHECK: call.uni (retval0), +; CHECK-NEXT: test_v5i16, +; CHECK-DAG: ld.param.v4.b16 {[[RE0:%rs[0-9]+]], [[RE1:%rs[0-9]+]], [[RE2:%rs[0-9]+]], [[RE3:%rs[0-9]+]]}, [retval0+0]; +; CHECK-DAG: ld.param.b16 [[RE4:%rs[0-9]+]], [retval0+8]; +; CHECK-DAG: st.param.v4.b16 [func_retval0+0], {[[RE0]], [[RE1]], [[RE2]], [[RE3]]} +; CHECK-DAG: st.param.b16 [func_retval0+8], [[RE4]]; +; CHECK-NEXT: ret; +define <5 x i16> @test_v5i16(<5 x i16> %a) { + %r = tail call <5 x i16> @test_v5i16(<5 x i16> %a); + ret <5 x i16> %r; +} + +; CHECK: .func (.param .b32 func_retval0) +; CHECK-LABEL: test_half( +; CHECK-NEXT: .param .b32 test_half_param_0 +; CHECK: ld.param.b16 [[E:%h[0-9]+]], [test_half_param_0]; +; CHECK: .param .b32 param0; +; CHECK: st.param.b16 [param0+0], [[E]]; +; CHECK: .param .b32 retval0; +; 
CHECK: call.uni (retval0), +; CHECK-NEXT: test_half, +; CHECK: ld.param.b16 [[R:%h[0-9]+]], [retval0+0]; +; CHECK: st.param.b16 [func_retval0+0], [[R]] +; CHECK-NEXT: ret; +define half @test_half(half %a) { + %r = tail call half @test_half(half %a); + ret half %r; +} + +; CHECK: .func (.param .b32 func_retval0) +; CHECK-LABEL: test_i32( +; CHECK-NEXT: .param .b32 test_i32_param_0 +; CHECK: ld.param.u32 [[E:%r[0-9]+]], [test_i32_param_0]; +; CHECK: .param .b32 param0; +; CHECK: st.param.b32 [param0+0], [[E]]; +; CHECK: .param .b32 retval0; +; CHECK: call.uni (retval0), +; CHECK-NEXT: test_i32, +; CHECK: ld.param.b32 [[R:%r[0-9]+]], [retval0+0]; +; CHECK: st.param.b32 [func_retval0+0], [[R]]; +; CHECK-NEXT: ret; +define i32 @test_i32(i32 %a) { + %r = tail call i32 @test_i32(i32 %a); + ret i32 %r; +} + +; CHECK: .func (.param .align 16 .b8 func_retval0[16]) +; CHECK-LABEL: test_v3i32( +; CHECK-NEXT: .param .align 16 .b8 test_v3i32_param_0[16] +; CHECK-DAG: ld.param.u32 [[E2:%r[0-9]+]], [test_v3i32_param_0+8]; +; CHECK-DAG: ld.param.v2.u32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]]}, [test_v3i32_param_0]; +; CHECK: .param .align 16 .b8 param0[16]; +; CHECK: st.param.v2.b32 [param0+0], {[[E0]], [[E1]]}; +; CHECK: st.param.b32 [param0+8], [[E2]]; +; CHECK: .param .align 16 .b8 retval0[16]; +; CHECK: call.uni (retval0), +; CHECK-NEXT: test_v3i32, +; CHECK: ld.param.v2.b32 {[[RE0:%r[0-9]+]], [[RE1:%r[0-9]+]]}, [retval0+0]; +; CHECK: ld.param.b32 [[RE2:%r[0-9]+]], [retval0+8]; +; CHECK-DAG: st.param.v2.b32 [func_retval0+0], {[[RE0]], [[RE1]]}; +; CHECK-DAG: st.param.b32 [func_retval0+8], [[RE2]]; +; CHECK-NEXT: ret; +define <3 x i32> @test_v3i32(<3 x i32> %a) { + %r = tail call <3 x i32> @test_v3i32(<3 x i32> %a); + ret <3 x i32> %r; +} + +; CHECK: .func (.param .align 16 .b8 func_retval0[16]) +; CHECK-LABEL: test_v4i32( +; CHECK-NEXT: .param .align 16 .b8 test_v4i32_param_0[16] +; CHECK: ld.param.v4.u32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]], [[E2:%r[0-9]+]], [[E3:%r[0-9]+]]}, 
[test_v4i32_param_0] +; CHECK: .param .align 16 .b8 param0[16]; +; CHECK: st.param.v4.b32 [param0+0], {[[E0]], [[E1]], [[E2]], [[E3]]}; +; CHECK: .param .align 16 .b8 retval0[16]; +; CHECK: call.uni (retval0), +; CHECK-NEXT: test_v4i32, +; CHECK: ld.param.v4.b32 {[[RE0:%r[0-9]+]], [[RE1:%r[0-9]+]], [[RE2:%r[0-9]+]], [[RE3:%r[0-9]+]]}, [retval0+0]; +; CHECK: st.param.v4.b32 [func_retval0+0], {[[RE0]], [[RE1]], [[RE2]], [[RE3]]} +; CHCK-NEXT: ret; +define <4 x i32> @test_v4i32(<4 x i32> %a) { + %r = tail call <4 x i32> @test_v4i32(<4 x i32> %a); + ret <4 x i32> %r; +} + +; CHECK: .func (.param .align 32 .b8 func_retval0[32]) +; CHECK-LABEL: test_v5i32( +; CHECK-NEXT: .param .align 32 .b8 test_v5i32_param_0[32] +; CHECK-DAG: ld.param.u32 [[E4:%r[0-9]+]], [test_v5i32_param_0+16]; +; CHECK-DAG ld.param.v4.u32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]], [[E2:%r[0-9]+]], [[E3:%r[0-9]+]]}, [test_v5i32_param_0] +; CHECK: .param .align 32 .b8 param0[32]; +; CHECK-DAG: st.param.v4.b32 [param0+0], {[[E0]], [[E1]], [[E2]], [[E3]]}; +; CHECK-DAG: st.param.b32 [param0+16], [[E4]]; +; CHECK: .param .align 32 .b8 retval0[32]; +; CHECK: call.uni (retval0), +; CHECK-NEXT: test_v5i32, +; CHECK-DAG: ld.param.v4.b32 {[[RE0:%r[0-9]+]], [[RE1:%r[0-9]+]], [[RE2:%r[0-9]+]], [[RE3:%r[0-9]+]]}, [retval0+0]; +; CHECK-DAG: ld.param.b32 [[RE4:%r[0-9]+]], [retval0+16]; +; CHECK-DAG: st.param.v4.b32 [func_retval0+0], {[[RE0]], [[RE1]], [[RE2]], [[RE3]]} +; CHECK-DAG: st.param.b32 [func_retval0+16], [[RE4]]; +; CHECK-NEXT: ret; +define <5 x i32> @test_v5i32(<5 x i32> %a) { + %r = tail call <5 x i32> @test_v5i32(<5 x i32> %a); + ret <5 x i32> %r; +} + +; CHECK: .func (.param .b32 func_retval0) +; CHECK-LABEL: test_float( +; CHECK-NEXT: .param .b32 test_float_param_0 +; CHECK: ld.param.f32 [[E:%f[0-9]+]], [test_float_param_0]; +; CHECK: .param .b32 param0; +; CHECK: st.param.f32 [param0+0], [[E]]; +; CHECK: .param .b32 retval0; +; CHECK: call.uni (retval0), +; CHECK-NEXT: test_float, +; CHECK: ld.param.f32 
[[R:%f[0-9]+]], [retval0+0]; +; CHECK: st.param.f32 [func_retval0+0], [[R]]; +; CHECK-NEXT: ret; +define float @test_float(float %a) { + %r = tail call float @test_float(float %a); + ret float %r; +} + +; CHECK: .func (.param .b64 func_retval0) +; CHECK-LABEL: test_i64( +; CHECK-NEXT: .param .b64 test_i64_param_0 +; CHECK: ld.param.u64 [[E:%rd[0-9]+]], [test_i64_param_0]; +; CHECK: .param .b64 param0; +; CHECK: st.param.b64 [param0+0], [[E]]; +; CHECK: .param .b64 retval0; +; CHECK: call.uni (retval0), +; CHECK-NEXT: test_i64, +; CHECK: ld.param.b64 [[R:%rd[0-9]+]], [retval0+0]; +; CHECK: st.param.b64 [func_retval0+0], [[R]]; +; CHECK-NEXT: ret; +define i64 @test_i64(i64 %a) { + %r = tail call i64 @test_i64(i64 %a); + ret i64 %r; +} + +; CHECK: .func (.param .align 32 .b8 func_retval0[32]) +; CHECK-LABEL: test_v3i64( +; CHECK-NEXT: .param .align 32 .b8 test_v3i64_param_0[32] +; CHECK-DAG: ld.param.u64 [[E2:%rd[0-9]+]], [test_v3i64_param_0+16]; +; CHECK-DAG: ld.param.v2.u64 {[[E0:%rd[0-9]+]], [[E1:%rd[0-9]+]]}, [test_v3i64_param_0]; +; CHECK: .param .align 32 .b8 param0[32]; +; CHECK: st.param.v2.b64 [param0+0], {[[E0]], [[E1]]}; +; CHECK: st.param.b64 [param0+16], [[E2]]; +; CHECK: .param .align 32 .b8 retval0[32]; +; CHECK: call.uni (retval0), +; CHECK-NEXT: test_v3i64, +; CHECK: ld.param.v2.b64 {[[RE0:%rd[0-9]+]], [[RE1:%rd[0-9]+]]}, [retval0+0]; +; CHECK: ld.param.b64 [[RE2:%rd[0-9]+]], [retval0+16]; +; CHECK-DAG: st.param.v2.b64 [func_retval0+0], {[[RE0]], [[RE1]]}; +; CHECK-DAG: st.param.b64 [func_retval0+16], [[RE2]]; +; CHECK-DAG: st.param.v2.b64 [func_retval0+0], {[[RE0]], [[RE1]]}; +; CHECK-DAG: st.param.b64 [func_retval0+16], [[RE2]]; +; CHECK-NEXT: ret; +define <3 x i64> @test_v3i64(<3 x i64> %a) { + %r = tail call <3 x i64> @test_v3i64(<3 x i64> %a); + ret <3 x i64> %r; +} + +; For i64 vector loads are limited by PTX to 2 elements. 
+; CHECK: .func (.param .align 32 .b8 func_retval0[32]) +; CHECK-LABEL: test_v4i64( +; CHECK-NEXT: .param .align 32 .b8 test_v4i64_param_0[32] +; CHECK-DAG: ld.param.v2.u64 {[[E2:%rd[0-9]+]], [[E3:%rd[0-9]+]]}, [test_v4i64_param_0+16]; +; CHECK-DAG: ld.param.v2.u64 {[[E0:%rd[0-9]+]], [[E1:%rd[0-9]+]]}, [test_v4i64_param_0]; +; CHECK: .param .align 32 .b8 param0[32]; +; CHECK: st.param.v2.b64 [param0+0], {[[E0]], [[E1]]}; +; CHECK: st.param.v2.b64 [param0+16], {[[E2]], [[E3]]}; +; CHECK: .param .align 32 .b8 retval0[32]; +; CHECK: call.uni (retval0), +; CHECK-NEXT: test_v4i64, +; CHECK: ld.param.v2.b64 {[[RE0:%rd[0-9]+]], [[RE1:%rd[0-9]+]]}, [retval0+0]; +; CHECK: ld.param.v2.b64 {[[RE2:%rd[0-9]+]], [[RE3:%rd[0-9]+]]}, [retval0+16]; +; CHECK-DAG: st.param.v2.b64 [func_retval0+16], {[[RE2]], [[RE3]]}; +; CHECK-DAG: st.param.v2.b64 [func_retval0+0], {[[RE0]], [[RE1]]}; +; CHECK-NEXT: ret; +define <4 x i64> @test_v4i64(<4 x i64> %a) { + %r = tail call <4 x i64> @test_v4i64(<4 x i64> %a); + ret <4 x i64> %r; +} + +; Aggregates, on the other hand, do not get extended. 
+ +; CHECK: .func (.param .align 1 .b8 func_retval0[1]) +; CHECK-LABEL: test_s_i1( +; CHECK-NEXT: .align 1 .b8 test_s_i1_param_0[1] +; CHECK: ld.param.u8 [[A:%rs[0-9]+]], [test_s_i1_param_0]; +; CHECK: .param .align 1 .b8 param0[1]; +; CHECK: st.param.b8 [param0+0], [[A]] +; CHECK: .param .align 1 .b8 retval0[1]; +; CHECK: call.uni +; CHECK-NEXT: test_s_i1, +; CHECK: ld.param.b8 [[R:%rs[0-9]+]], [retval0+0]; +; CHECK: st.param.b8 [func_retval0+0], [[R]]; +; CHECK-NEXT: ret; +define %s_i1 @test_s_i1(%s_i1 %a) { + %r = tail call %s_i1 @test_s_i1(%s_i1 %a); + ret %s_i1 %r; +} + +; CHECK: .func (.param .align 1 .b8 func_retval0[1]) +; CHECK-LABEL: test_s_i8( +; CHECK-NEXT: .param .align 1 .b8 test_s_i8_param_0[1] +; CHECK: ld.param.u8 [[A:%rs[0-9]+]], [test_s_i8_param_0]; +; CHECK: .param .align 1 .b8 param0[1]; +; CHECK: st.param.b8 [param0+0], [[A]] +; CHECK: .param .align 1 .b8 retval0[1]; +; CHECK: call.uni +; CHECK-NEXT: test_s_i8, +; CHECK: ld.param.b8 [[R:%rs[0-9]+]], [retval0+0]; +; CHECK: st.param.b8 [func_retval0+0], [[R]]; +; CHECK-NEXT: ret; +define %s_i8 @test_s_i8(%s_i8 %a) { + %r = tail call %s_i8 @test_s_i8(%s_i8 %a); + ret %s_i8 %r; +} + +; CHECK: .func (.param .align 2 .b8 func_retval0[2]) +; CHECK-LABEL: test_s_i16( +; CHECK-NEXT: .param .align 2 .b8 test_s_i16_param_0[2] +; CHECK: ld.param.u16 [[A:%rs[0-9]+]], [test_s_i16_param_0]; +; CHECK: .param .align 2 .b8 param0[2]; +; CHECK: st.param.b16 [param0+0], [[A]] +; CHECK: .param .align 2 .b8 retval0[2]; +; CHECK: call.uni +; CHECK-NEXT: test_s_i16, +; CHECK: ld.param.b16 [[R:%rs[0-9]+]], [retval0+0]; +; CHECK: st.param.b16 [func_retval0+0], [[R]]; +; CHECK-NEXT: ret; +define %s_i16 @test_s_i16(%s_i16 %a) { + %r = tail call %s_i16 @test_s_i16(%s_i16 %a); + ret %s_i16 %r; +} + +; CHECK: .func (.param .align 2 .b8 func_retval0[2]) +; CHECK-LABEL: test_s_half( +; CHECK-NEXT: .param .align 2 .b8 test_s_half_param_0[2] +; CHECK: ld.param.b16 [[A:%h[0-9]+]], [test_s_half_param_0]; +; CHECK: .param .align 2 
.b8 param0[2]; +; CHECK: st.param.b16 [param0+0], [[A]] +; CHECK: .param .align 2 .b8 retval0[2]; +; CHECK: call.uni +; CHECK-NEXT: test_s_half, +; CHECK: ld.param.b16 [[R:%h[0-9]+]], [retval0+0]; +; CHECK: st.param.b16 [func_retval0+0], [[R]]; +; CHECK-NEXT: ret; +define %s_half @test_s_half(%s_half %a) { + %r = tail call %s_half @test_s_half(%s_half %a); + ret %s_half %r; +} + +; CHECK: .func (.param .align 4 .b8 func_retval0[4]) +; CHECK-LABEL: test_s_i32( +; CHECK-NEXT: .param .align 4 .b8 test_s_i32_param_0[4] +; CHECK: ld.param.u32 [[E:%r[0-9]+]], [test_s_i32_param_0]; +; CHECK: .param .align 4 .b8 param0[4] +; CHECK: st.param.b32 [param0+0], [[E]]; +; CHECK: .param .align 4 .b8 retval0[4]; +; CHECK: call.uni (retval0), +; CHECK-NEXT: test_s_i32, +; CHECK: ld.param.b32 [[R:%r[0-9]+]], [retval0+0]; +; CHECK: st.param.b32 [func_retval0+0], [[R]]; +; CHECK-NEXT: ret; +define %s_i32 @test_s_i32(%s_i32 %a) { + %r = tail call %s_i32 @test_s_i32(%s_i32 %a); + ret %s_i32 %r; +} + +; CHECK: .func (.param .align 4 .b8 func_retval0[4]) +; CHECK-LABEL: test_s_float( +; CHECK-NEXT: .param .align 4 .b8 test_s_float_param_0[4] +; CHECK: ld.param.f32 [[E:%f[0-9]+]], [test_s_float_param_0]; +; CHECK: .param .align 4 .b8 param0[4] +; CHECK: st.param.f32 [param0+0], [[E]]; +; CHECK: .param .align 4 .b8 retval0[4]; +; CHECK: call.uni (retval0), +; CHECK-NEXT: test_s_float, +; CHECK: ld.param.f32 [[R:%f[0-9]+]], [retval0+0]; +; CHECK: st.param.f32 [func_retval0+0], [[R]]; +; CHECK-NEXT: ret; +define %s_float @test_s_float(%s_float %a) { + %r = tail call %s_float @test_s_float(%s_float %a); + ret %s_float %r; +} + +; CHECK: .func (.param .align 8 .b8 func_retval0[8]) +; CHECK-LABEL: test_s_i64( +; CHECK-NEXT: .param .align 8 .b8 test_s_i64_param_0[8] +; CHECK: ld.param.u64 [[E:%rd[0-9]+]], [test_s_i64_param_0]; +; CHECK: .param .align 8 .b8 param0[8]; +; CHECK: st.param.b64 [param0+0], [[E]]; +; CHECK: .param .align 8 .b8 retval0[8]; +; CHECK: call.uni (retval0), +; CHECK-NEXT: 
test_s_i64, +; CHECK: ld.param.b64 [[R:%rd[0-9]+]], [retval0+0]; +; CHECK: st.param.b64 [func_retval0+0], [[R]]; +; CHECK-NEXT: ret; +define %s_i64 @test_s_i64(%s_i64 %a) { + %r = tail call %s_i64 @test_s_i64(%s_i64 %a); + ret %s_i64 %r; +} + +; Fields that have different types, but identical sizes are not vectorized. +; CHECK: .func (.param .align 8 .b8 func_retval0[24]) +; CHECK-LABEL: test_s_i32f32( +; CHECK: .param .align 8 .b8 test_s_i32f32_param_0[24] +; CHECK-DAG: ld.param.u64 [[E4:%rd[0-9]+]], [test_s_i32f32_param_0+16]; +; CHECK-DAG: ld.param.f32 [[E3:%f[0-9]+]], [test_s_i32f32_param_0+12]; +; CHECK-DAG: ld.param.u32 [[E2:%r[0-9]+]], [test_s_i32f32_param_0+8]; +; CHECK-DAG: ld.param.f32 [[E1:%f[0-9]+]], [test_s_i32f32_param_0+4]; +; CHECK-DAG: ld.param.u32 [[E0:%r[0-9]+]], [test_s_i32f32_param_0]; +; CHECK: .param .align 8 .b8 param0[24]; +; CHECK-DAG: st.param.b32 [param0+0], [[E0]]; +; CHECK-DAG: st.param.f32 [param0+4], [[E1]]; +; CHECK-DAG: st.param.b32 [param0+8], [[E2]]; +; CHECK-DAG: st.param.f32 [param0+12], [[E3]]; +; CHECK-DAG: st.param.b64 [param0+16], [[E4]]; +; CHECK: .param .align 8 .b8 retval0[24]; +; CHECK: call.uni (retval0), +; CHECK-NEXT: test_s_i32f32, +; CHECK-DAG: ld.param.b32 [[RE0:%r[0-9]+]], [retval0+0]; +; CHECK-DAG: ld.param.f32 [[RE1:%f[0-9]+]], [retval0+4]; +; CHECK-DAG: ld.param.b32 [[RE2:%r[0-9]+]], [retval0+8]; +; CHECK-DAG: ld.param.f32 [[RE3:%f[0-9]+]], [retval0+12]; +; CHECK-DAG: ld.param.b64 [[RE4:%rd[0-9]+]], [retval0+16]; +; CHECK-DAG: st.param.b32 [func_retval0+0], [[RE0]]; +; CHECK-DAG: st.param.f32 [func_retval0+4], [[RE1]]; +; CHECK-DAG: st.param.b32 [func_retval0+8], [[RE2]]; +; CHECK-DAG: st.param.f32 [func_retval0+12], [[RE3]]; +; CHECK-DAG: st.param.b64 [func_retval0+16], [[RE4]]; +; CHECK: ret; +define %s_i32f32 @test_s_i32f32(%s_i32f32 %a) { + %r = tail call %s_i32f32 @test_s_i32f32(%s_i32f32 %a); + ret %s_i32f32 %r; +} + +; We do vectorize consecutive fields with matching types. 
+; CHECK:.visible .func (.param .align 8 .b8 func_retval0[24]) +; CHECK-LABEL: test_s_i32x4( +; CHECK: .param .align 8 .b8 test_s_i32x4_param_0[24] +; CHECK-DAG: ld.param.u64 [[RD1:%rd[0-9]+]], [test_s_i32x4_param_0+16]; +; CHECK-DAG: ld.param.v2.u32 {[[E2:%r[0-9]+]], [[E3:%r[0-9]+]]}, [test_s_i32x4_param_0+8]; +; CHECK-DAG: ld.param.v2.u32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]]}, [test_s_i32x4_param_0]; +; CHECK: .param .align 8 .b8 param0[24]; +; CHECK: st.param.v2.b32 [param0+0], {[[E0]], [[E1]]}; +; CHECK: st.param.v2.b32 [param0+8], {[[E2]], [[E3]]}; +; CHECK: st.param.b64 [param0+16], [[E4]]; +; CHECK: .param .align 8 .b8 retval0[24]; +; CHECK: call.uni (retval0), +; CHECK-NEXT: test_s_i32x4, +; CHECK: ld.param.v2.b32 {[[RE0:%r[0-9]+]], [[RE1:%r[0-9]+]]}, [retval0+0]; +; CHECK: ld.param.v2.b32 {[[RE2:%r[0-9]+]], [[RE3:%r[0-9]+]]}, [retval0+8]; +; CHECK: ld.param.b64 [[RE4:%rd[0-9]+]], [retval0+16]; +; CHECK-DAG: st.param.v2.b32 [func_retval0+0], {[[RE0]], [[RE1]]}; +; CHECK-DAG: st.param.v2.b32 [func_retval0+8], {[[RE2]], [[RE3]]}; +; CHECK-DAG: st.param.b64 [func_retval0+16], [[RE4]]; +; CHECK: ret; + +define %s_i32x4 @test_s_i32x4(%s_i32x4 %a) { + %r = tail call %s_i32x4 @test_s_i32x4(%s_i32x4 %a); + ret %s_i32x4 %r; +} + +; CHECK:.visible .func (.param .align 8 .b8 func_retval0[32]) +; CHECK-LABEL: test_s_i1i32x4( +; CHECK: .param .align 8 .b8 test_s_i1i32x4_param_0[32] +; CHECK: ld.param.u64 [[E5:%rd[0-9]+]], [test_s_i1i32x4_param_0+24]; +; CHECK: ld.param.u32 [[E4:%r[0-9]+]], [test_s_i1i32x4_param_0+16]; +; CHECK: ld.param.u32 [[E3:%r[0-9]+]], [test_s_i1i32x4_param_0+12]; +; CHECK: ld.param.u8 [[E2:%rs[0-9]+]], [test_s_i1i32x4_param_0+8]; +; CHECK: ld.param.v2.u32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]]}, [test_s_i1i32x4_param_0]; +; CHECK: .param .align 8 .b8 param0[32]; +; CHECK: st.param.v2.b32 [param0+0], {[[E0]], [[E1]]}; +; CHECK: st.param.b8 [param0+8], [[E2]]; +; CHECK: st.param.b32 [param0+12], [[E3]]; +; CHECK: st.param.b32 [param0+16], [[E4]]; +; CHECK: 
st.param.b64 [param0+24], [[E5]]; +; CHECK: .param .align 8 .b8 retval0[32]; +; CHECK: call.uni (retval0), +; CHECK: test_s_i1i32x4, +; CHECK: ( +; CHECK: param0 +; CHECK: ); +; CHECK: ld.param.v2.b32 {[[RE0:%r[0-9]+]], [[RE1:%r[0-9]+]]}, [retval0+0]; +; CHECK: ld.param.b8 [[RE2:%rs[0-9]+]], [retval0+8]; +; CHECK: ld.param.b32 [[RE3:%r[0-9]+]], [retval0+12]; +; CHECK: ld.param.b32 [[RE4:%r[0-9]+]], [retval0+16]; +; CHECK: ld.param.b64 [[RE5:%rd[0-9]+]], [retval0+24]; +; CHECK: st.param.v2.b32 [func_retval0+0], {[[RE0]], [[RE1]]}; +; CHECK: st.param.b8 [func_retval0+8], [[RE2]]; +; CHECK: st.param.b32 [func_retval0+12], [[RE3]]; +; CHECK: st.param.b32 [func_retval0+16], [[RE4]]; +; CHECK: st.param.b64 [func_retval0+24], [[RE5]]; +; CHECK: ret; + +define %s_i8i32x4 @test_s_i1i32x4(%s_i8i32x4 %a) { + %r = tail call %s_i8i32x4 @test_s_i1i32x4(%s_i8i32x4 %a); + ret %s_i8i32x4 %r; +} + +; -- All loads/stores from parameters aligned by one must be done one +; -- byte at a time. +; CHECK:.visible .func (.param .align 1 .b8 func_retval0[25]) +; CHECK-LABEL: test_s_i1i32x4p( +; CHECK-DAG: .param .align 1 .b8 test_s_i1i32x4p_param_0[25] +; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+24]; +; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+23]; +; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+22]; +; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+21]; +; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+20]; +; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+19]; +; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+18]; +; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+17]; +; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+16]; +; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+15]; +; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+14]; +; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+13]; +; CHECK-DAG: ld.param.u8 %r{{.*}}, 
[test_s_i1i32x4p_param_0+12]; +; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+11]; +; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+10]; +; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+9]; +; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+8]; +; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+7]; +; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+6]; +; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+5]; +; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+4]; +; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+3]; +; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+2]; +; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+1]; +; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0]; +; --- TODO +; --- Unaligned parameter store/ return value load is broken in both nvcc +; --- and llvm and needs to be fixed. +; CHECK: .param .align 1 .b8 param0[25]; +; CHECK-DAG: st.param.b32 [param0+0], +; CHECK-DAG: st.param.b32 [param0+4], +; CHECK-DAG: st.param.b8 [param0+8], +; CHECK-DAG: st.param.b32 [param0+9], +; CHECK-DAG: st.param.b32 [param0+13], +; CHECK-DAG: st.param.b64 [param0+17], +; CHECK: .param .align 1 .b8 retval0[25]; +; CHECK: call.uni (retval0), +; CHECK-NEXT: test_s_i1i32x4p, +; CHECK-DAG: ld.param.b32 %r41, [retval0+0]; +; CHECK-DAG: ld.param.b32 %r42, [retval0+4]; +; CHECK-DAG: ld.param.b8 %rs2, [retval0+8]; +; CHECK-DAG: ld.param.b32 %r43, [retval0+9]; +; CHECK-DAG: ld.param.b32 %r44, [retval0+13]; +; CHECK-DAG: ld.param.b64 %rd23, [retval0+17]; +; CHECK-DAG: st.param.b32 [func_retval0+0], +; CHECK-DAG: st.param.b32 [func_retval0+4], +; CHECK-DAG: st.param.b8 [func_retval0+8], +; CHECK-DAG: st.param.b32 [func_retval0+9], +; CHECK-DAG: st.param.b32 [func_retval0+13], +; CHECK-DAG: st.param.b64 [func_retval0+17], + +define %s_i8i32x4p @test_s_i1i32x4p(%s_i8i32x4p %a) { + %r = tail call %s_i8i32x4p @test_s_i1i32x4p(%s_i8i32x4p %a); + ret 
%s_i8i32x4p %r; +} + +; Check that we can vectorize loads that span multiple aggregate fields. +; CHECK:.visible .func (.param .align 16 .b8 func_retval0[80]) +; CHECK-LABEL: test_s_crossfield( +; CHECK: .param .align 16 .b8 test_s_crossfield_param_0[80] +; CHECK: ld.param.u32 [[E15:%r[0-9]+]], [test_s_crossfield_param_0+64]; +; CHECK: ld.param.v4.u32 {[[E11:%r[0-9]+]], [[E12:%r[0-9]+]], [[E13:%r[0-9]+]], [[E14:%r[0-9]+]]}, [test_s_crossfield_param_0+48]; +; CHECK: ld.param.v4.u32 {[[E7:%r[0-9]+]], [[E8:%r[0-9]+]], [[E9:%r[0-9]+]], [[E10:%r[0-9]+]]}, [test_s_crossfield_param_0+32]; +; CHECK: ld.param.v4.u32 {[[E3:%r[0-9]+]], [[E4:%r[0-9]+]], [[E5:%r[0-9]+]], [[E6:%r[0-9]+]]}, [test_s_crossfield_param_0+16]; +; CHECK: ld.param.u32 [[E2:%r[0-9]+]], [test_s_crossfield_param_0+8]; +; CHECK: ld.param.v2.u32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]]}, [test_s_crossfield_param_0]; +; CHECK: .param .align 16 .b8 param0[80]; +; CHECK: st.param.v2.b32 [param0+0], {[[E0]], [[E1]]}; +; CHECK: st.param.b32 [param0+8], [[E2]]; +; CHECK: st.param.v4.b32 [param0+16], {[[E3]], [[E4]], [[E5]], [[E6]]}; +; CHECK: st.param.v4.b32 [param0+32], {[[E7]], [[E8]], [[E9]], [[E10]]}; +; CHECK: st.param.v4.b32 [param0+48], {[[E11]], [[E12]], [[E13]], [[E14]]}; +; CHECK: st.param.b32 [param0+64], [[E15]]; +; CHECK: .param .align 16 .b8 retval0[80]; +; CHECK: call.uni (retval0), +; CHECK: test_s_crossfield, +; CHECK: ld.param.v2.b32 {[[RE0:%r[0-9]+]], [[RE1:%r[0-9]+]]}, [retval0+0]; +; CHECK: ld.param.b32 [[RE2:%r[0-9]+]], [retval0+8]; +; CHECK: ld.param.v4.b32 {[[RE3:%r[0-9]+]], [[RE4:%r[0-9]+]], [[RE5:%r[0-9]+]], [[RE6:%r[0-9]+]]}, [retval0+16]; +; CHECK: ld.param.v4.b32 {[[RE7:%r[0-9]+]], [[RE8:%r[0-9]+]], [[RE9:%r[0-9]+]], [[RE10:%r[0-9]+]]}, [retval0+32]; +; CHECK: ld.param.v4.b32 {[[RE11:%r[0-9]+]], [[RE12:%r[0-9]+]], [[RE13:%r[0-9]+]], [[RE14:%r[0-9]+]]}, [retval0+48]; +; CHECK: ld.param.b32 [[RE15:%r[0-9]+]], [retval0+64]; +; CHECK: st.param.v2.b32 [func_retval0+0], {[[RE0]], [[RE1]]}; +; 
CHECK: st.param.b32 [func_retval0+8], [[RE2]]; +; CHECK: st.param.v4.b32 [func_retval0+16], {[[RE3]], [[RE4]], [[RE5]], [[RE6]]}; +; CHECK: st.param.v4.b32 [func_retval0+32], {[[RE7]], [[RE8]], [[RE9]], [[RE10]]}; +; CHECK: st.param.v4.b32 [func_retval0+48], {[[RE11]], [[RE12]], [[RE13]], [[RE14]]}; +; CHECK: st.param.b32 [func_retval0+64], [[RE15]]; +; CHECK: ret; + +define %s_crossfield @test_s_crossfield(%s_crossfield %a) { + %r = tail call %s_crossfield @test_s_crossfield(%s_crossfield %a); + ret %s_crossfield %r; +} diff --git a/llvm/test/CodeGen/NVPTX/vec-param-load.ll b/llvm/test/CodeGen/NVPTX/vec-param-load.ll index 4193ac4085c..bf26e5ff1bd 100644 --- a/llvm/test/CodeGen/NVPTX/vec-param-load.ll +++ b/llvm/test/CodeGen/NVPTX/vec-param-load.ll @@ -2,12 +2,81 @@ target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64" - -define <16 x float> @foo(<16 x float> %a) { -; Make sure we index into vectors properly -; CHECK: ld.param.v4.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}}, [foo_param_0+48]; -; CHECK: ld.param.v4.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}}, [foo_param_0+32]; -; CHECK: ld.param.v4.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}}, [foo_param_0+16]; -; CHECK: ld.param.v4.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}}, [foo_param_0]; +define <16 x float> @test_v16f32(<16 x float> %a) { +; CHECK-LABEL: test_v16f32( +; CHECK-DAG: ld.param.v4.f32 {[[V_12_15:(%f[0-9]+[, ]*){4}]]}, [test_v16f32_param_0+48]; +; CHECK-DAG: ld.param.v4.f32 {[[V_8_11:(%f[0-9]+[, ]*){4}]]}, [test_v16f32_param_0+32]; +; CHECK-DAG: ld.param.v4.f32 {[[V_4_7:(%f[0-9]+[, ]*){4}]]}, [test_v16f32_param_0+16]; +; CHECK-DAG: ld.param.v4.f32 {[[V_0_3:(%f[0-9]+[, ]*){4}]]}, [test_v16f32_param_0]; +; CHECK-DAG: st.param.v4.f32 [func_retval0+0], {[[V_0_3]]} +; CHECK-DAG: st.param.v4.f32 [func_retval0+16], {[[V_4_7]]} +; CHECK-DAG: 
st.param.v4.f32 [func_retval0+32], {[[V_8_11]]} +; CHECK-DAG: st.param.v4.f32 [func_retval0+48], {[[V_12_15]]} +; CHECK: ret; ret <16 x float> %a } + +define <8 x float> @test_v8f32(<8 x float> %a) { +; CHECK-LABEL: test_v8f32( +; CHECK-DAG: ld.param.v4.f32 {[[V_4_7:(%f[0-9]+[, ]*){4}]]}, [test_v8f32_param_0+16]; +; CHECK-DAG: ld.param.v4.f32 {[[V_0_3:(%f[0-9]+[, ]*){4}]]}, [test_v8f32_param_0]; +; CHECK-DAG: st.param.v4.f32 [func_retval0+0], {[[V_0_3]]} +; CHECK-DAG: st.param.v4.f32 [func_retval0+16], {[[V_4_7]]} +; CHECK: ret; + ret <8 x float> %a +} + +define <4 x float> @test_v4f32(<4 x float> %a) { +; CHECK-LABEL: test_v4f32( +; CHECK-DAG: ld.param.v4.f32 {[[V_0_3:(%f[0-9]+[, ]*){4}]]}, [test_v4f32_param_0]; +; CHECK-DAG: st.param.v4.f32 [func_retval0+0], {[[V_0_3]]} +; CHECK: ret; + ret <4 x float> %a +} + +define <2 x float> @test_v2f32(<2 x float> %a) { +; CHECK-LABEL: test_v2f32( +; CHECK-DAG: ld.param.v2.f32 {[[V_0_3:(%f[0-9]+[, ]*){2}]]}, [test_v2f32_param_0]; +; CHECK-DAG: st.param.v2.f32 [func_retval0+0], {[[V_0_3]]} +; CHECK: ret; + ret <2 x float> %a +} + +; Oddly shaped vectors should not load any extra elements. 
+define <3 x float> @test_v3f32(<3 x float> %a) { +; CHECK-LABEL: test_v3f32( +; CHECK-DAG: ld.param.f32 [[V_2:%f[0-9]+]], [test_v3f32_param_0+8]; +; CHECK-DAG: ld.param.v2.f32 {[[V_0_1:(%f[0-9]+[, ]*){2}]]}, [test_v3f32_param_0]; +; CHECK-DAG: st.param.v2.f32 [func_retval0+0], {[[V_0_1]]} +; CHECK-DAG: st.param.f32 [func_retval0+8], [[V_2]] +; CHECK: ret; + ret <3 x float> %a +} + +define <8 x i64> @test_v8i64(<8 x i64> %a) { +; CHECK-LABEL: test_v8i64( +; CHECK-DAG: ld.param.v2.u64 {[[V_6_7:(%rd[0-9]+[, ]*){2}]]}, [test_v8i64_param_0+48]; +; CHECK-DAG: ld.param.v2.u64 {[[V_4_5:(%rd[0-9]+[, ]*){2}]]}, [test_v8i64_param_0+32]; +; CHECK-DAG: ld.param.v2.u64 {[[V_2_3:(%rd[0-9]+[, ]*){2}]]}, [test_v8i64_param_0+16]; +; CHECK-DAG: ld.param.v2.u64 {[[V_0_1:(%rd[0-9]+[, ]*){2}]]}, [test_v8i64_param_0]; +; CHECK-DAG: st.param.v2.b64 [func_retval0+0], {[[V_0_1]]} +; CHECK-DAG: st.param.v2.b64 [func_retval0+16], {[[V_2_3]]} +; CHECK-DAG: st.param.v2.b64 [func_retval0+32], {[[V_4_5]]} +; CHECK-DAG: st.param.v2.b64 [func_retval0+48], {[[V_6_7]]} +; CHECK: ret; + ret <8 x i64> %a +} + +define <16 x i16> @test_v16i16(<16 x i16> %a) { +; CHECK-LABEL: test_v16i16( +; CHECK-DAG: ld.param.v4.u16 {[[V_12_15:(%rs[0-9]+[, ]*){4}]]}, [test_v16i16_param_0+24]; +; CHECK-DAG: ld.param.v4.u16 {[[V_8_11:(%rs[0-9]+[, ]*){4}]]}, [test_v16i16_param_0+16]; +; CHECK-DAG: ld.param.v4.u16 {[[V_4_7:(%rs[0-9]+[, ]*){4}]]}, [test_v16i16_param_0+8]; +; CHECK-DAG: ld.param.v4.u16 {[[V_0_3:(%rs[0-9]+[, ]*){4}]]}, [test_v16i16_param_0]; +; CHECK-DAG: st.param.v4.b16 [func_retval0+0], {[[V_0_3]]} +; CHECK-DAG: st.param.v4.b16 [func_retval0+8], {[[V_4_7]]} +; CHECK-DAG: st.param.v4.b16 [func_retval0+16], {[[V_8_11]]} +; CHECK-DAG: st.param.v4.b16 [func_retval0+24], {[[V_12_15]]} +; CHECK: ret; + ret <16 x i16> %a +} diff --git a/llvm/test/CodeGen/NVPTX/vec8.ll b/llvm/test/CodeGen/NVPTX/vec8.ll index 03f5cfc6cb0..a86ba1e29d5 100644 --- a/llvm/test/CodeGen/NVPTX/vec8.ll +++ b/llvm/test/CodeGen/NVPTX/vec8.ll 
@@ -4,10 +4,15 @@ target triple = "nvptx-unknown-cuda" ; CHECK: .visible .func foo define void @foo(<8 x i8> %a, i8* %b) { - %t0 = extractelement <8 x i8> %a, i32 0 -; CHECK-DAG: ld.param.v4.u8 -; CHECK-DAG: ld.param.u32 - store i8 %t0, i8* %b +; CHECK-DAG: ld.param.v4.u8 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]], [[E2:%rs[0-9]+]], [[E3:%rs[0-9]+]]}, [foo_param_0] +; CHECK-DAG: ld.param.v4.u8 {[[E4:%rs[0-9]+]], [[E5:%rs[0-9]+]], [[E6:%rs[0-9]+]], [[E7:%rs[0-9]+]]}, [foo_param_0+4] +; CHECK-DAG: ld.param.u32 %[[B:r[0-9+]]], [foo_param_1] +; CHECK: add.s16 [[T:%rs[0-9+]]], [[E1]], [[E6]]; +; CHECK: st.u8 [%[[B]]], [[T]]; + %t0 = extractelement <8 x i8> %a, i32 1 + %t1 = extractelement <8 x i8> %a, i32 6 + %t = add i8 %t0, %t1 + store i8 %t, i8* %b ret void } diff --git a/llvm/test/CodeGen/NVPTX/vector-call.ll b/llvm/test/CodeGen/NVPTX/vector-call.ll index 968d1d4a5f5..bf7b931a575 100644 --- a/llvm/test/CodeGen/NVPTX/vector-call.ll +++ b/llvm/test/CodeGen/NVPTX/vector-call.ll @@ -4,9 +4,27 @@ target triple = "nvptx-unknown-cuda" declare void @bar(<4 x i32>) -; CHECK-LABEL: @foo +; CHECK-LABEL: .func foo( +; CHECK-DAG: ld.param.v4.u32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]], [[E2:%r[0-9]+]], [[E3:%r[0-9]+]]}, [foo_param_0]; +; CHECK: .param .align 16 .b8 param0[16]; +; CHECK-DAG: st.param.v4.b32 [param0+0], {[[E0]], [[E1]], [[E2]], [[E3]]}; +; CHECK: call.uni +; CHECK: ret; define void @foo(<4 x i32> %a) { -; CHECK: st.param.v4.b32 tail call void @bar(<4 x i32> %a) ret void } + +; CHECK-LABEL: .func foo3( +; CHECK-DAG: ld.param.v2.u32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]]}, [foo3_param_0]; +; CHECK-DAG: ld.param.u32 [[E2:%r[0-9]+]], [foo3_param_0+8]; +; CHECK: .param .align 16 .b8 param0[16]; +; CHECK-DAG: st.param.v2.b32 [param0+0], {[[E0]], [[E1]]}; +; CHECK-DAG: st.param.b32 [param0+8], [[E2]]; +; CHECK: call.uni +; CHECK: ret; +declare void @bar3(<3 x i32>) +define void @foo3(<3 x i32> %a) { + tail call void @bar3(<3 x i32> %a) + ret void +} |