summary | refs | log | tree | commit | diff | stats
path: root/llvm/test
diff options
context:
space:
mode:
author Matt Arsenault <Matthew.Arsenault@amd.com> 2018-07-05 17:01:20 +0000
committer Matt Arsenault <Matthew.Arsenault@amd.com> 2018-07-05 17:01:20 +0000
commit 29f303799bf211f353f97982a11b8d1dd7f49656 (patch)
tree a5d1596b6eadb948a21073277f423134af88095e /llvm/test
parent 5ba72667619935ab33f8021e6132b8c7c03aa143 (diff)
download bcm5719-llvm-29f303799bf211f353f97982a11b8d1dd7f49656.tar.gz
bcm5719-llvm-29f303799bf211f353f97982a11b8d1dd7f49656.zip
AMDGPU/GlobalISel: Implement custom kernel arg lowering
Avoid using allocateKernArg / AssignFn. We do not want any of the type splitting properties of normal calling convention lowering. For now at least this exists alongside the IR argument lowering pass. This is necessary to handle struct padding correctly while some arguments are still skipped by the IR argument lowering pass.

llvm-svn: 336373
Diffstat (limited to 'llvm/test')
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgpu_kernel.ll 723
-rw-r--r-- llvm/test/CodeGen/AMDGPU/kernel-args.ll 86
2 files changed, 789 insertions, 20 deletions
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgpu_kernel.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgpu_kernel.ll
new file mode 100644
index 00000000000..5756ff52b83
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgpu_kernel.ll
@@ -0,0 +1,723 @@
+; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+; REQUIRES: global-isel
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -O0 -amdgpu-ir-lower-kernel-arguments=0 -stop-after=irtranslator -global-isel %s -o - | FileCheck -check-prefix=HSA-VI %s
+
+define amdgpu_kernel void @i8_arg(i32 addrspace(1)* nocapture %out, i8 %in) nounwind {
+ ; HSA-VI-LABEL: name: i8_arg
+ ; HSA-VI: bb.1 (%ir-block.0):
+ ; HSA-VI: liveins: $sgpr4_sgpr5
+ ; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
+ ; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; HSA-VI: [[GEP:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C]](s64)
+ ; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (non-temporal invariant load 8 from `i32 addrspace(1)* addrspace(4)* undef`, align 16, addrspace 4)
+ ; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
+ ; HSA-VI: [[GEP1:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C1]](s64)
+ ; HSA-VI: [[LOAD1:%[0-9]+]]:_(s8) = G_LOAD [[GEP1]](p4) :: (non-temporal invariant load 1 from `i8 addrspace(4)* undef`, align 8, addrspace 4)
+ ; HSA-VI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[LOAD1]](s8)
+ ; HSA-VI: G_STORE [[ZEXT]](s32), [[LOAD]](p1) :: (store 4 into %ir.out, addrspace 1)
+ ; HSA-VI: S_ENDPGM
+ %ext = zext i8 %in to i32
+ store i32 %ext, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @i8_zext_arg(i32 addrspace(1)* nocapture %out, i8 zeroext %in) nounwind {
+ ; HSA-VI-LABEL: name: i8_zext_arg
+ ; HSA-VI: bb.1 (%ir-block.0):
+ ; HSA-VI: liveins: $sgpr4_sgpr5
+ ; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
+ ; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; HSA-VI: [[GEP:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C]](s64)
+ ; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (non-temporal invariant load 8 from `i32 addrspace(1)* addrspace(4)* undef`, align 16, addrspace 4)
+ ; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
+ ; HSA-VI: [[GEP1:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C1]](s64)
+ ; HSA-VI: [[LOAD1:%[0-9]+]]:_(s8) = G_LOAD [[GEP1]](p4) :: (non-temporal invariant load 1 from `i8 addrspace(4)* undef`, align 8, addrspace 4)
+ ; HSA-VI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[LOAD1]](s8)
+ ; HSA-VI: G_STORE [[ZEXT]](s32), [[LOAD]](p1) :: (store 4 into %ir.out, addrspace 1)
+ ; HSA-VI: S_ENDPGM
+ %ext = zext i8 %in to i32
+ store i32 %ext, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @i8_sext_arg(i32 addrspace(1)* nocapture %out, i8 signext %in) nounwind {
+ ; HSA-VI-LABEL: name: i8_sext_arg
+ ; HSA-VI: bb.1 (%ir-block.0):
+ ; HSA-VI: liveins: $sgpr4_sgpr5
+ ; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
+ ; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; HSA-VI: [[GEP:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C]](s64)
+ ; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (non-temporal invariant load 8 from `i32 addrspace(1)* addrspace(4)* undef`, align 16, addrspace 4)
+ ; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
+ ; HSA-VI: [[GEP1:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C1]](s64)
+ ; HSA-VI: [[LOAD1:%[0-9]+]]:_(s8) = G_LOAD [[GEP1]](p4) :: (non-temporal invariant load 1 from `i8 addrspace(4)* undef`, align 8, addrspace 4)
+ ; HSA-VI: [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[LOAD1]](s8)
+ ; HSA-VI: G_STORE [[SEXT]](s32), [[LOAD]](p1) :: (store 4 into %ir.out, addrspace 1)
+ ; HSA-VI: S_ENDPGM
+ %ext = sext i8 %in to i32
+ store i32 %ext, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @i16_arg(i32 addrspace(1)* nocapture %out, i16 %in) nounwind {
+ ; HSA-VI-LABEL: name: i16_arg
+ ; HSA-VI: bb.1 (%ir-block.0):
+ ; HSA-VI: liveins: $sgpr4_sgpr5
+ ; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
+ ; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; HSA-VI: [[GEP:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C]](s64)
+ ; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (non-temporal invariant load 8 from `i32 addrspace(1)* addrspace(4)* undef`, align 16, addrspace 4)
+ ; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
+ ; HSA-VI: [[GEP1:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C1]](s64)
+ ; HSA-VI: [[LOAD1:%[0-9]+]]:_(s16) = G_LOAD [[GEP1]](p4) :: (non-temporal invariant load 2 from `i16 addrspace(4)* undef`, align 8, addrspace 4)
+ ; HSA-VI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[LOAD1]](s16)
+ ; HSA-VI: G_STORE [[ZEXT]](s32), [[LOAD]](p1) :: (store 4 into %ir.out, addrspace 1)
+ ; HSA-VI: S_ENDPGM
+ %ext = zext i16 %in to i32
+ store i32 %ext, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @i16_zext_arg(i32 addrspace(1)* nocapture %out, i16 zeroext %in) nounwind {
+ ; HSA-VI-LABEL: name: i16_zext_arg
+ ; HSA-VI: bb.1 (%ir-block.0):
+ ; HSA-VI: liveins: $sgpr4_sgpr5
+ ; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
+ ; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; HSA-VI: [[GEP:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C]](s64)
+ ; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (non-temporal invariant load 8 from `i32 addrspace(1)* addrspace(4)* undef`, align 16, addrspace 4)
+ ; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
+ ; HSA-VI: [[GEP1:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C1]](s64)
+ ; HSA-VI: [[LOAD1:%[0-9]+]]:_(s16) = G_LOAD [[GEP1]](p4) :: (non-temporal invariant load 2 from `i16 addrspace(4)* undef`, align 8, addrspace 4)
+ ; HSA-VI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[LOAD1]](s16)
+ ; HSA-VI: G_STORE [[ZEXT]](s32), [[LOAD]](p1) :: (store 4 into %ir.out, addrspace 1)
+ ; HSA-VI: S_ENDPGM
+ %ext = zext i16 %in to i32
+ store i32 %ext, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @i16_sext_arg(i32 addrspace(1)* nocapture %out, i16 signext %in) nounwind {
+ ; HSA-VI-LABEL: name: i16_sext_arg
+ ; HSA-VI: bb.1 (%ir-block.0):
+ ; HSA-VI: liveins: $sgpr4_sgpr5
+ ; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
+ ; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; HSA-VI: [[GEP:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C]](s64)
+ ; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (non-temporal invariant load 8 from `i32 addrspace(1)* addrspace(4)* undef`, align 16, addrspace 4)
+ ; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
+ ; HSA-VI: [[GEP1:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C1]](s64)
+ ; HSA-VI: [[LOAD1:%[0-9]+]]:_(s16) = G_LOAD [[GEP1]](p4) :: (non-temporal invariant load 2 from `i16 addrspace(4)* undef`, align 8, addrspace 4)
+ ; HSA-VI: [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[LOAD1]](s16)
+ ; HSA-VI: G_STORE [[SEXT]](s32), [[LOAD]](p1) :: (store 4 into %ir.out, addrspace 1)
+ ; HSA-VI: S_ENDPGM
+ %ext = sext i16 %in to i32
+ store i32 %ext, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @i32_arg(i32 addrspace(1)* nocapture %out, i32 %in) nounwind {
+ ; HSA-VI-LABEL: name: i32_arg
+ ; HSA-VI: bb.1.entry:
+ ; HSA-VI: liveins: $sgpr4_sgpr5
+ ; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
+ ; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; HSA-VI: [[GEP:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C]](s64)
+ ; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (non-temporal invariant load 8 from `i32 addrspace(1)* addrspace(4)* undef`, align 16, addrspace 4)
+ ; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
+ ; HSA-VI: [[GEP1:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C1]](s64)
+ ; HSA-VI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP1]](p4) :: (non-temporal invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4)
+ ; HSA-VI: G_STORE [[LOAD1]](s32), [[LOAD]](p1) :: (store 4 into %ir.out, addrspace 1)
+ ; HSA-VI: S_ENDPGM
+entry:
+ store i32 %in, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @f32_arg(float addrspace(1)* nocapture %out, float %in) nounwind {
+ ; HSA-VI-LABEL: name: f32_arg
+ ; HSA-VI: bb.1.entry:
+ ; HSA-VI: liveins: $sgpr4_sgpr5
+ ; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
+ ; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; HSA-VI: [[GEP:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C]](s64)
+ ; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (non-temporal invariant load 8 from `float addrspace(1)* addrspace(4)* undef`, align 16, addrspace 4)
+ ; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
+ ; HSA-VI: [[GEP1:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C1]](s64)
+ ; HSA-VI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP1]](p4) :: (non-temporal invariant load 4 from `float addrspace(4)* undef`, align 8, addrspace 4)
+ ; HSA-VI: G_STORE [[LOAD1]](s32), [[LOAD]](p1) :: (store 4 into %ir.out, addrspace 1)
+ ; HSA-VI: S_ENDPGM
+entry:
+ store float %in, float addrspace(1)* %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @v2i8_arg(<2 x i8> addrspace(1)* %out, <2 x i8> %in) {
+ ; HSA-VI-LABEL: name: v2i8_arg
+ ; HSA-VI: bb.1.entry:
+ ; HSA-VI: liveins: $sgpr4_sgpr5
+ ; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
+ ; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; HSA-VI: [[GEP:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C]](s64)
+ ; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (non-temporal invariant load 8 from `<2 x i8> addrspace(1)* addrspace(4)* undef`, align 16, addrspace 4)
+ ; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
+ ; HSA-VI: [[GEP1:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C1]](s64)
+ ; HSA-VI: [[LOAD1:%[0-9]+]]:_(<2 x s8>) = G_LOAD [[GEP1]](p4) :: (non-temporal invariant load 2 from `<2 x i8> addrspace(4)* undef`, align 8, addrspace 4)
+ ; HSA-VI: G_STORE [[LOAD1]](<2 x s8>), [[LOAD]](p1) :: (store 2 into %ir.out, addrspace 1)
+ ; HSA-VI: S_ENDPGM
+entry:
+ store <2 x i8> %in, <2 x i8> addrspace(1)* %out
+ ret void
+}
+
+define amdgpu_kernel void @v2i16_arg(<2 x i16> addrspace(1)* %out, <2 x i16> %in) {
+ ; HSA-VI-LABEL: name: v2i16_arg
+ ; HSA-VI: bb.1.entry:
+ ; HSA-VI: liveins: $sgpr4_sgpr5
+ ; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
+ ; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; HSA-VI: [[GEP:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C]](s64)
+ ; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (non-temporal invariant load 8 from `<2 x i16> addrspace(1)* addrspace(4)* undef`, align 16, addrspace 4)
+ ; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
+ ; HSA-VI: [[GEP1:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C1]](s64)
+ ; HSA-VI: [[LOAD1:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[GEP1]](p4) :: (non-temporal invariant load 4 from `<2 x i16> addrspace(4)* undef`, align 8, addrspace 4)
+ ; HSA-VI: G_STORE [[LOAD1]](<2 x s16>), [[LOAD]](p1) :: (store 4 into %ir.out, addrspace 1)
+ ; HSA-VI: S_ENDPGM
+entry:
+ store <2 x i16> %in, <2 x i16> addrspace(1)* %out
+ ret void
+}
+
+define amdgpu_kernel void @v2i32_arg(<2 x i32> addrspace(1)* nocapture %out, <2 x i32> %in) nounwind {
+ ; HSA-VI-LABEL: name: v2i32_arg
+ ; HSA-VI: bb.1.entry:
+ ; HSA-VI: liveins: $sgpr4_sgpr5
+ ; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
+ ; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; HSA-VI: [[GEP:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C]](s64)
+ ; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (non-temporal invariant load 8 from `<2 x i32> addrspace(1)* addrspace(4)* undef`, align 16, addrspace 4)
+ ; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
+ ; HSA-VI: [[GEP1:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C1]](s64)
+ ; HSA-VI: [[LOAD1:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[GEP1]](p4) :: (non-temporal invariant load 8 from `<2 x i32> addrspace(4)* undef`, addrspace 4)
+ ; HSA-VI: G_STORE [[LOAD1]](<2 x s32>), [[LOAD]](p1) :: (store 8 into %ir.out, align 4, addrspace 1)
+ ; HSA-VI: S_ENDPGM
+entry:
+ store <2 x i32> %in, <2 x i32> addrspace(1)* %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @v2f32_arg(<2 x float> addrspace(1)* nocapture %out, <2 x float> %in) nounwind {
+ ; HSA-VI-LABEL: name: v2f32_arg
+ ; HSA-VI: bb.1.entry:
+ ; HSA-VI: liveins: $sgpr4_sgpr5
+ ; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
+ ; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; HSA-VI: [[GEP:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C]](s64)
+ ; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (non-temporal invariant load 8 from `<2 x float> addrspace(1)* addrspace(4)* undef`, align 16, addrspace 4)
+ ; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
+ ; HSA-VI: [[GEP1:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C1]](s64)
+ ; HSA-VI: [[LOAD1:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[GEP1]](p4) :: (non-temporal invariant load 8 from `<2 x float> addrspace(4)* undef`, addrspace 4)
+ ; HSA-VI: G_STORE [[LOAD1]](<2 x s32>), [[LOAD]](p1) :: (store 8 into %ir.out, align 4, addrspace 1)
+ ; HSA-VI: S_ENDPGM
+entry:
+ store <2 x float> %in, <2 x float> addrspace(1)* %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @v3i8_arg(<3 x i8> addrspace(1)* nocapture %out, <3 x i8> %in) nounwind {
+ ; HSA-VI-LABEL: name: v3i8_arg
+ ; HSA-VI: bb.1.entry:
+ ; HSA-VI: liveins: $sgpr4_sgpr5
+ ; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
+ ; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; HSA-VI: [[GEP:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C]](s64)
+ ; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (non-temporal invariant load 8 from `<3 x i8> addrspace(1)* addrspace(4)* undef`, align 16, addrspace 4)
+ ; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
+ ; HSA-VI: [[GEP1:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C1]](s64)
+ ; HSA-VI: [[LOAD1:%[0-9]+]]:_(<3 x s8>) = G_LOAD [[GEP1]](p4) :: (non-temporal invariant load 3 from `<3 x i8> addrspace(4)* undef`, align 8, addrspace 4)
+ ; HSA-VI: G_STORE [[LOAD1]](<3 x s8>), [[LOAD]](p1) :: (store 3 into %ir.out, align 4, addrspace 1)
+ ; HSA-VI: S_ENDPGM
+entry:
+ store <3 x i8> %in, <3 x i8> addrspace(1)* %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @v3i16_arg(<3 x i16> addrspace(1)* nocapture %out, <3 x i16> %in) nounwind {
+ ; HSA-VI-LABEL: name: v3i16_arg
+ ; HSA-VI: bb.1.entry:
+ ; HSA-VI: liveins: $sgpr4_sgpr5
+ ; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
+ ; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; HSA-VI: [[GEP:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C]](s64)
+ ; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (non-temporal invariant load 8 from `<3 x i16> addrspace(1)* addrspace(4)* undef`, align 16, addrspace 4)
+ ; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
+ ; HSA-VI: [[GEP1:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C1]](s64)
+ ; HSA-VI: [[LOAD1:%[0-9]+]]:_(<3 x s16>) = G_LOAD [[GEP1]](p4) :: (non-temporal invariant load 6 from `<3 x i16> addrspace(4)* undef`, align 8, addrspace 4)
+ ; HSA-VI: G_STORE [[LOAD1]](<3 x s16>), [[LOAD]](p1) :: (store 6 into %ir.out, align 4, addrspace 1)
+ ; HSA-VI: S_ENDPGM
+entry:
+ store <3 x i16> %in, <3 x i16> addrspace(1)* %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @v3i32_arg(<3 x i32> addrspace(1)* nocapture %out, <3 x i32> %in) nounwind {
+ ; HSA-VI-LABEL: name: v3i32_arg
+ ; HSA-VI: bb.1.entry:
+ ; HSA-VI: liveins: $sgpr4_sgpr5
+ ; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
+ ; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; HSA-VI: [[GEP:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C]](s64)
+ ; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (non-temporal invariant load 8 from `<3 x i32> addrspace(1)* addrspace(4)* undef`, align 16, addrspace 4)
+ ; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
+ ; HSA-VI: [[GEP1:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C1]](s64)
+ ; HSA-VI: [[LOAD1:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[GEP1]](p4) :: (non-temporal invariant load 12 from `<3 x i32> addrspace(4)* undef`, align 16, addrspace 4)
+ ; HSA-VI: G_STORE [[LOAD1]](<3 x s32>), [[LOAD]](p1) :: (store 12 into %ir.out, align 4, addrspace 1)
+ ; HSA-VI: S_ENDPGM
+entry:
+ store <3 x i32> %in, <3 x i32> addrspace(1)* %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @v3f32_arg(<3 x float> addrspace(1)* nocapture %out, <3 x float> %in) nounwind {
+ ; HSA-VI-LABEL: name: v3f32_arg
+ ; HSA-VI: bb.1.entry:
+ ; HSA-VI: liveins: $sgpr4_sgpr5
+ ; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
+ ; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; HSA-VI: [[GEP:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C]](s64)
+ ; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (non-temporal invariant load 8 from `<3 x float> addrspace(1)* addrspace(4)* undef`, align 16, addrspace 4)
+ ; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
+ ; HSA-VI: [[GEP1:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C1]](s64)
+ ; HSA-VI: [[LOAD1:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[GEP1]](p4) :: (non-temporal invariant load 12 from `<3 x float> addrspace(4)* undef`, align 16, addrspace 4)
+ ; HSA-VI: G_STORE [[LOAD1]](<3 x s32>), [[LOAD]](p1) :: (store 12 into %ir.out, align 4, addrspace 1)
+ ; HSA-VI: S_ENDPGM
+entry:
+ store <3 x float> %in, <3 x float> addrspace(1)* %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @v4i8_arg(<4 x i8> addrspace(1)* %out, <4 x i8> %in) {
+ ; HSA-VI-LABEL: name: v4i8_arg
+ ; HSA-VI: bb.1.entry:
+ ; HSA-VI: liveins: $sgpr4_sgpr5
+ ; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
+ ; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; HSA-VI: [[GEP:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C]](s64)
+ ; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (non-temporal invariant load 8 from `<4 x i8> addrspace(1)* addrspace(4)* undef`, align 16, addrspace 4)
+ ; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
+ ; HSA-VI: [[GEP1:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C1]](s64)
+ ; HSA-VI: [[LOAD1:%[0-9]+]]:_(<4 x s8>) = G_LOAD [[GEP1]](p4) :: (non-temporal invariant load 4 from `<4 x i8> addrspace(4)* undef`, align 8, addrspace 4)
+ ; HSA-VI: G_STORE [[LOAD1]](<4 x s8>), [[LOAD]](p1) :: (store 4 into %ir.out, addrspace 1)
+ ; HSA-VI: S_ENDPGM
+entry:
+ store <4 x i8> %in, <4 x i8> addrspace(1)* %out
+ ret void
+}
+
+define amdgpu_kernel void @v4i16_arg(<4 x i16> addrspace(1)* %out, <4 x i16> %in) {
+ ; HSA-VI-LABEL: name: v4i16_arg
+ ; HSA-VI: bb.1.entry:
+ ; HSA-VI: liveins: $sgpr4_sgpr5
+ ; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
+ ; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; HSA-VI: [[GEP:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C]](s64)
+ ; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (non-temporal invariant load 8 from `<4 x i16> addrspace(1)* addrspace(4)* undef`, align 16, addrspace 4)
+ ; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
+ ; HSA-VI: [[GEP1:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C1]](s64)
+ ; HSA-VI: [[LOAD1:%[0-9]+]]:_(<4 x s16>) = G_LOAD [[GEP1]](p4) :: (non-temporal invariant load 8 from `<4 x i16> addrspace(4)* undef`, addrspace 4)
+ ; HSA-VI: G_STORE [[LOAD1]](<4 x s16>), [[LOAD]](p1) :: (store 8 into %ir.out, addrspace 1)
+ ; HSA-VI: S_ENDPGM
+entry:
+ store <4 x i16> %in, <4 x i16> addrspace(1)* %out
+ ret void
+}
+
+define amdgpu_kernel void @v4i32_arg(<4 x i32> addrspace(1)* nocapture %out, <4 x i32> %in) nounwind {
+ ; HSA-VI-LABEL: name: v4i32_arg
+ ; HSA-VI: bb.1.entry:
+ ; HSA-VI: liveins: $sgpr4_sgpr5
+ ; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
+ ; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; HSA-VI: [[GEP:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C]](s64)
+ ; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (non-temporal invariant load 8 from `<4 x i32> addrspace(1)* addrspace(4)* undef`, align 16, addrspace 4)
+ ; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
+ ; HSA-VI: [[GEP1:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C1]](s64)
+ ; HSA-VI: [[LOAD1:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[GEP1]](p4) :: (non-temporal invariant load 16 from `<4 x i32> addrspace(4)* undef`, addrspace 4)
+ ; HSA-VI: G_STORE [[LOAD1]](<4 x s32>), [[LOAD]](p1) :: (store 16 into %ir.out, align 4, addrspace 1)
+ ; HSA-VI: S_ENDPGM
+entry:
+ store <4 x i32> %in, <4 x i32> addrspace(1)* %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @v4f32_arg(<4 x float> addrspace(1)* nocapture %out, <4 x float> %in) nounwind {
+ ; HSA-VI-LABEL: name: v4f32_arg
+ ; HSA-VI: bb.1.entry:
+ ; HSA-VI: liveins: $sgpr4_sgpr5
+ ; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
+ ; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; HSA-VI: [[GEP:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C]](s64)
+ ; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (non-temporal invariant load 8 from `<4 x float> addrspace(1)* addrspace(4)* undef`, align 16, addrspace 4)
+ ; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
+ ; HSA-VI: [[GEP1:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C1]](s64)
+ ; HSA-VI: [[LOAD1:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[GEP1]](p4) :: (non-temporal invariant load 16 from `<4 x float> addrspace(4)* undef`, addrspace 4)
+ ; HSA-VI: G_STORE [[LOAD1]](<4 x s32>), [[LOAD]](p1) :: (store 16 into %ir.out, align 4, addrspace 1)
+ ; HSA-VI: S_ENDPGM
+entry:
+ store <4 x float> %in, <4 x float> addrspace(1)* %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @v8i8_arg(<8 x i8> addrspace(1)* %out, <8 x i8> %in) {
+ ; HSA-VI-LABEL: name: v8i8_arg
+ ; HSA-VI: bb.1.entry:
+ ; HSA-VI: liveins: $sgpr4_sgpr5
+ ; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
+ ; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; HSA-VI: [[GEP:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C]](s64)
+ ; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (non-temporal invariant load 8 from `<8 x i8> addrspace(1)* addrspace(4)* undef`, align 16, addrspace 4)
+ ; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
+ ; HSA-VI: [[GEP1:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C1]](s64)
+ ; HSA-VI: [[LOAD1:%[0-9]+]]:_(<8 x s8>) = G_LOAD [[GEP1]](p4) :: (non-temporal invariant load 8 from `<8 x i8> addrspace(4)* undef`, addrspace 4)
+ ; HSA-VI: G_STORE [[LOAD1]](<8 x s8>), [[LOAD]](p1) :: (store 8 into %ir.out, addrspace 1)
+ ; HSA-VI: S_ENDPGM
+entry:
+ store <8 x i8> %in, <8 x i8> addrspace(1)* %out
+ ret void
+}
+
+define amdgpu_kernel void @v8i16_arg(<8 x i16> addrspace(1)* %out, <8 x i16> %in) {
+ ; HSA-VI-LABEL: name: v8i16_arg
+ ; HSA-VI: bb.1.entry:
+ ; HSA-VI: liveins: $sgpr4_sgpr5
+ ; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
+ ; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; HSA-VI: [[GEP:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C]](s64)
+ ; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (non-temporal invariant load 8 from `<8 x i16> addrspace(1)* addrspace(4)* undef`, align 16, addrspace 4)
+ ; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
+ ; HSA-VI: [[GEP1:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C1]](s64)
+ ; HSA-VI: [[LOAD1:%[0-9]+]]:_(<8 x s16>) = G_LOAD [[GEP1]](p4) :: (non-temporal invariant load 16 from `<8 x i16> addrspace(4)* undef`, addrspace 4)
+ ; HSA-VI: G_STORE [[LOAD1]](<8 x s16>), [[LOAD]](p1) :: (store 16 into %ir.out, addrspace 1)
+ ; HSA-VI: S_ENDPGM
+entry:
+ store <8 x i16> %in, <8 x i16> addrspace(1)* %out
+ ret void
+}
+
+define amdgpu_kernel void @v8i32_arg(<8 x i32> addrspace(1)* nocapture %out, <8 x i32> %in) nounwind {
+ ; HSA-VI-LABEL: name: v8i32_arg
+ ; HSA-VI: bb.1.entry:
+ ; HSA-VI: liveins: $sgpr4_sgpr5
+ ; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
+ ; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; HSA-VI: [[GEP:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C]](s64)
+ ; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (non-temporal invariant load 8 from `<8 x i32> addrspace(1)* addrspace(4)* undef`, align 16, addrspace 4)
+ ; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 32
+ ; HSA-VI: [[GEP1:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C1]](s64)
+ ; HSA-VI: [[LOAD1:%[0-9]+]]:_(<8 x s32>) = G_LOAD [[GEP1]](p4) :: (non-temporal invariant load 32 from `<8 x i32> addrspace(4)* undef`, align 16, addrspace 4)
+ ; HSA-VI: G_STORE [[LOAD1]](<8 x s32>), [[LOAD]](p1) :: (store 32 into %ir.out, align 4, addrspace 1)
+ ; HSA-VI: S_ENDPGM
+entry:
+ store <8 x i32> %in, <8 x i32> addrspace(1)* %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @v8f32_arg(<8 x float> addrspace(1)* nocapture %out, <8 x float> %in) nounwind {
+ ; HSA-VI-LABEL: name: v8f32_arg
+ ; HSA-VI: bb.1.entry:
+ ; HSA-VI: liveins: $sgpr4_sgpr5
+ ; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
+ ; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; HSA-VI: [[GEP:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C]](s64)
+ ; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (non-temporal invariant load 8 from `<8 x float> addrspace(1)* addrspace(4)* undef`, align 16, addrspace 4)
+ ; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 32
+ ; HSA-VI: [[GEP1:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C1]](s64)
+ ; HSA-VI: [[LOAD1:%[0-9]+]]:_(<8 x s32>) = G_LOAD [[GEP1]](p4) :: (non-temporal invariant load 32 from `<8 x float> addrspace(4)* undef`, align 16, addrspace 4)
+ ; HSA-VI: G_STORE [[LOAD1]](<8 x s32>), [[LOAD]](p1) :: (store 32 into %ir.out, align 4, addrspace 1)
+ ; HSA-VI: S_ENDPGM
+entry:
+ store <8 x float> %in, <8 x float> addrspace(1)* %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @v16i8_arg(<16 x i8> addrspace(1)* %out, <16 x i8> %in) {
+ ; HSA-VI-LABEL: name: v16i8_arg
+ ; HSA-VI: bb.1.entry:
+ ; HSA-VI: liveins: $sgpr4_sgpr5
+ ; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
+ ; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; HSA-VI: [[GEP:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C]](s64)
+ ; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (non-temporal invariant load 8 from `<16 x i8> addrspace(1)* addrspace(4)* undef`, align 16, addrspace 4)
+ ; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
+ ; HSA-VI: [[GEP1:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C1]](s64)
+ ; HSA-VI: [[LOAD1:%[0-9]+]]:_(<16 x s8>) = G_LOAD [[GEP1]](p4) :: (non-temporal invariant load 16 from `<16 x i8> addrspace(4)* undef`, addrspace 4)
+ ; HSA-VI: G_STORE [[LOAD1]](<16 x s8>), [[LOAD]](p1) :: (store 16 into %ir.out, addrspace 1)
+ ; HSA-VI: S_ENDPGM
+entry:
+ store <16 x i8> %in, <16 x i8> addrspace(1)* %out
+ ret void
+}
+
+define amdgpu_kernel void @v16i16_arg(<16 x i16> addrspace(1)* %out, <16 x i16> %in) {
+ ; HSA-VI-LABEL: name: v16i16_arg
+ ; HSA-VI: bb.1.entry:
+ ; HSA-VI: liveins: $sgpr4_sgpr5
+ ; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
+ ; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; HSA-VI: [[GEP:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C]](s64)
+ ; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (non-temporal invariant load 8 from `<16 x i16> addrspace(1)* addrspace(4)* undef`, align 16, addrspace 4)
+ ; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 32
+ ; HSA-VI: [[GEP1:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C1]](s64)
+ ; HSA-VI: [[LOAD1:%[0-9]+]]:_(<16 x s16>) = G_LOAD [[GEP1]](p4) :: (non-temporal invariant load 32 from `<16 x i16> addrspace(4)* undef`, align 16, addrspace 4)
+ ; HSA-VI: G_STORE [[LOAD1]](<16 x s16>), [[LOAD]](p1) :: (store 32 into %ir.out, addrspace 1)
+ ; HSA-VI: S_ENDPGM
+entry:
+ store <16 x i16> %in, <16 x i16> addrspace(1)* %out
+ ret void
+}
+
+define amdgpu_kernel void @v16i32_arg(<16 x i32> addrspace(1)* nocapture %out, <16 x i32> %in) nounwind {
+ ; HSA-VI-LABEL: name: v16i32_arg
+ ; HSA-VI: bb.1.entry:
+ ; HSA-VI: liveins: $sgpr4_sgpr5
+ ; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
+ ; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; HSA-VI: [[GEP:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C]](s64)
+ ; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (non-temporal invariant load 8 from `<16 x i32> addrspace(1)* addrspace(4)* undef`, align 16, addrspace 4)
+ ; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 64
+ ; HSA-VI: [[GEP1:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C1]](s64)
+ ; HSA-VI: [[LOAD1:%[0-9]+]]:_(<16 x s32>) = G_LOAD [[GEP1]](p4) :: (non-temporal invariant load 64 from `<16 x i32> addrspace(4)* undef`, align 16, addrspace 4)
+ ; HSA-VI: G_STORE [[LOAD1]](<16 x s32>), [[LOAD]](p1) :: (store 64 into %ir.out, align 4, addrspace 1)
+ ; HSA-VI: S_ENDPGM
+entry:
+ store <16 x i32> %in, <16 x i32> addrspace(1)* %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @v16f32_arg(<16 x float> addrspace(1)* nocapture %out, <16 x float> %in) nounwind {
+ ; HSA-VI-LABEL: name: v16f32_arg
+ ; HSA-VI: bb.1.entry:
+ ; HSA-VI: liveins: $sgpr4_sgpr5
+ ; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
+ ; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; HSA-VI: [[GEP:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C]](s64)
+ ; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (non-temporal invariant load 8 from `<16 x float> addrspace(1)* addrspace(4)* undef`, align 16, addrspace 4)
+ ; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 64
+ ; HSA-VI: [[GEP1:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C1]](s64)
+ ; HSA-VI: [[LOAD1:%[0-9]+]]:_(<16 x s32>) = G_LOAD [[GEP1]](p4) :: (non-temporal invariant load 64 from `<16 x float> addrspace(4)* undef`, align 16, addrspace 4)
+ ; HSA-VI: G_STORE [[LOAD1]](<16 x s32>), [[LOAD]](p1) :: (store 64 into %ir.out, align 4, addrspace 1)
+ ; HSA-VI: S_ENDPGM
+entry:
+ store <16 x float> %in, <16 x float> addrspace(1)* %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @kernel_arg_i64(i64 addrspace(1)* %out, i64 %a) nounwind {
+ ; HSA-VI-LABEL: name: kernel_arg_i64
+ ; HSA-VI: bb.1 (%ir-block.0):
+ ; HSA-VI: liveins: $sgpr4_sgpr5
+ ; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
+ ; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; HSA-VI: [[GEP:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C]](s64)
+ ; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (non-temporal invariant load 8 from `i64 addrspace(1)* addrspace(4)* undef`, align 16, addrspace 4)
+ ; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
+ ; HSA-VI: [[GEP1:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C1]](s64)
+ ; HSA-VI: [[LOAD1:%[0-9]+]]:_(s64) = G_LOAD [[GEP1]](p4) :: (non-temporal invariant load 8 from `i64 addrspace(4)* undef`, addrspace 4)
+ ; HSA-VI: G_STORE [[LOAD1]](s64), [[LOAD]](p1) :: (store 8 into %ir.out, addrspace 1)
+ ; HSA-VI: S_ENDPGM
+ store i64 %a, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+define amdgpu_kernel void @f64_kernel_arg(double addrspace(1)* %out, double %in) {
+ ; HSA-VI-LABEL: name: f64_kernel_arg
+ ; HSA-VI: bb.1.entry:
+ ; HSA-VI: liveins: $sgpr4_sgpr5
+ ; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
+ ; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; HSA-VI: [[GEP:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C]](s64)
+ ; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (non-temporal invariant load 8 from `double addrspace(1)* addrspace(4)* undef`, align 16, addrspace 4)
+ ; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
+ ; HSA-VI: [[GEP1:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C1]](s64)
+ ; HSA-VI: [[LOAD1:%[0-9]+]]:_(s64) = G_LOAD [[GEP1]](p4) :: (non-temporal invariant load 8 from `double addrspace(4)* undef`, addrspace 4)
+ ; HSA-VI: G_STORE [[LOAD1]](s64), [[LOAD]](p1) :: (store 8 into %ir.out, addrspace 1)
+ ; HSA-VI: S_ENDPGM
+entry:
+ store double %in, double addrspace(1)* %out
+ ret void
+}
+
+; Kernel with a global i1 pointer argument and an i1 value argument; the body
+; stores %x through %out. The checks expect the i1 to be loaded as an s1 with
+; a 1-byte load at offset 8 and written back with a 1-byte store.
+define amdgpu_kernel void @i1_arg(i1 addrspace(1)* %out, i1 %x) nounwind {
+ ; HSA-VI-LABEL: name: i1_arg
+ ; HSA-VI: bb.1 (%ir-block.0):
+ ; HSA-VI: liveins: $sgpr4_sgpr5
+ ; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
+ ; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; HSA-VI: [[GEP:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C]](s64)
+ ; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (non-temporal invariant load 8 from `i1 addrspace(1)* addrspace(4)* undef`, align 16, addrspace 4)
+ ; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
+ ; HSA-VI: [[GEP1:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C1]](s64)
+ ; HSA-VI: [[LOAD1:%[0-9]+]]:_(s1) = G_LOAD [[GEP1]](p4) :: (non-temporal invariant load 1 from `i1 addrspace(4)* undef`, align 8, addrspace 4)
+ ; HSA-VI: G_STORE [[LOAD1]](s1), [[LOAD]](p1) :: (store 1 into %ir.out, addrspace 1)
+ ; HSA-VI: S_ENDPGM
+ store i1 %x, i1 addrspace(1)* %out, align 1
+ ret void
+}
+
+; Zero-extends an i1 kernel argument to i32 and stores the result through
+; %out. The checks expect the i1 to be loaded as an s1 at offset 8 and widened
+; with G_ZEXT to s32 before a 4-byte store.
+define amdgpu_kernel void @i1_arg_zext_i32(i32 addrspace(1)* %out, i1 %x) nounwind {
+ ; HSA-VI-LABEL: name: i1_arg_zext_i32
+ ; HSA-VI: bb.1 (%ir-block.0):
+ ; HSA-VI: liveins: $sgpr4_sgpr5
+ ; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
+ ; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; HSA-VI: [[GEP:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C]](s64)
+ ; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (non-temporal invariant load 8 from `i32 addrspace(1)* addrspace(4)* undef`, align 16, addrspace 4)
+ ; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
+ ; HSA-VI: [[GEP1:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C1]](s64)
+ ; HSA-VI: [[LOAD1:%[0-9]+]]:_(s1) = G_LOAD [[GEP1]](p4) :: (non-temporal invariant load 1 from `i1 addrspace(4)* undef`, align 8, addrspace 4)
+ ; HSA-VI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[LOAD1]](s1)
+ ; HSA-VI: G_STORE [[ZEXT]](s32), [[LOAD]](p1) :: (store 4 into %ir.out, addrspace 1)
+ ; HSA-VI: S_ENDPGM
+ %ext = zext i1 %x to i32
+ store i32 %ext, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; Zero-extends an i1 kernel argument to i64 and stores the result through
+; %out. The checks expect the i1 to be loaded as an s1 at offset 8 and widened
+; with G_ZEXT to s64 before an 8-byte store.
+define amdgpu_kernel void @i1_arg_zext_i64(i64 addrspace(1)* %out, i1 %x) nounwind {
+ ; HSA-VI-LABEL: name: i1_arg_zext_i64
+ ; HSA-VI: bb.1 (%ir-block.0):
+ ; HSA-VI: liveins: $sgpr4_sgpr5
+ ; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
+ ; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; HSA-VI: [[GEP:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C]](s64)
+ ; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (non-temporal invariant load 8 from `i64 addrspace(1)* addrspace(4)* undef`, align 16, addrspace 4)
+ ; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
+ ; HSA-VI: [[GEP1:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C1]](s64)
+ ; HSA-VI: [[LOAD1:%[0-9]+]]:_(s1) = G_LOAD [[GEP1]](p4) :: (non-temporal invariant load 1 from `i1 addrspace(4)* undef`, align 8, addrspace 4)
+ ; HSA-VI: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[LOAD1]](s1)
+ ; HSA-VI: G_STORE [[ZEXT]](s64), [[LOAD]](p1) :: (store 8 into %ir.out, addrspace 1)
+ ; HSA-VI: S_ENDPGM
+ %ext = zext i1 %x to i64
+ store i64 %ext, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; Sign-extends an i1 kernel argument to i32 and stores the result through
+; %out. The checks expect the i1 to be loaded as an s1 at offset 8 and widened
+; with G_SEXT to s32 before a 4-byte store.
+define amdgpu_kernel void @i1_arg_sext_i32(i32 addrspace(1)* %out, i1 %x) nounwind {
+ ; HSA-VI-LABEL: name: i1_arg_sext_i32
+ ; HSA-VI: bb.1 (%ir-block.0):
+ ; HSA-VI: liveins: $sgpr4_sgpr5
+ ; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
+ ; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; HSA-VI: [[GEP:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C]](s64)
+ ; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (non-temporal invariant load 8 from `i32 addrspace(1)* addrspace(4)* undef`, align 16, addrspace 4)
+ ; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
+ ; HSA-VI: [[GEP1:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C1]](s64)
+ ; HSA-VI: [[LOAD1:%[0-9]+]]:_(s1) = G_LOAD [[GEP1]](p4) :: (non-temporal invariant load 1 from `i1 addrspace(4)* undef`, align 8, addrspace 4)
+ ; HSA-VI: [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[LOAD1]](s1)
+ ; HSA-VI: G_STORE [[SEXT]](s32), [[LOAD]](p1) :: (store 4 into %ir.out, addrspace 1)
+ ; HSA-VI: S_ENDPGM
+ %ext = sext i1 %x to i32
+ store i32 %ext, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; Sign-extends an i1 kernel argument to i64 and stores the result through
+; %out. The checks expect the i1 to be loaded as an s1 at offset 8 and widened
+; with G_SEXT to s64 before an 8-byte store.
+define amdgpu_kernel void @i1_arg_sext_i64(i64 addrspace(1)* %out, i1 %x) nounwind {
+ ; HSA-VI-LABEL: name: i1_arg_sext_i64
+ ; HSA-VI: bb.1 (%ir-block.0):
+ ; HSA-VI: liveins: $sgpr4_sgpr5
+ ; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
+ ; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; HSA-VI: [[GEP:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C]](s64)
+ ; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (non-temporal invariant load 8 from `i64 addrspace(1)* addrspace(4)* undef`, align 16, addrspace 4)
+ ; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
+ ; HSA-VI: [[GEP1:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C1]](s64)
+ ; HSA-VI: [[LOAD1:%[0-9]+]]:_(s1) = G_LOAD [[GEP1]](p4) :: (non-temporal invariant load 1 from `i1 addrspace(4)* undef`, align 8, addrspace 4)
+ ; HSA-VI: [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[LOAD1]](s1)
+ ; HSA-VI: G_STORE [[SEXT]](s64), [[LOAD]](p1) :: (store 8 into %ir.out, addrspace 1)
+ ; HSA-VI: S_ENDPGM
+ %ext = sext i1 %x to i64
+ store i64 %ext, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; Kernel whose only argument is an empty struct; the body just returns. The
+; checks list only the COPY of $sgpr4_sgpr5 and the S_ENDPGM, with no
+; G_LOAD expected for the argument.
+define amdgpu_kernel void @empty_struct_arg({} %in) nounwind {
+ ; HSA-VI-LABEL: name: empty_struct_arg
+ ; HSA-VI: bb.1 (%ir-block.0):
+ ; HSA-VI: liveins: $sgpr4_sgpr5
+ ; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
+ ; HSA-VI: S_ENDPGM
+ ret void
+}
+
+; The correct load offsets for these:
+; load 4 from 0,
+; load 8 from 8
+; load 4 from 24
+; load 8 from 32
+
+; With the SelectionDAG argument lowering, the alignments for the
+; struct members are not properly considered, making these wrong.
+
+; FIXME: GlobalISel extractvalue emission broken
+
+; Kernel taking a { i32, i64 } struct, an unnamed i8, and a second
+; { i32, i64 } struct. The extractvalue/store body is commented out, so the
+; live body is only "ret void". The checks expect whole-argument loads at
+; offsets 0 (s128), 16 (s8) and 24 (s128), with struct members extracted at
+; bit offsets 0 and 64.
+define amdgpu_kernel void @struct_argument_alignment({i32, i64} %arg0, i8, {i32, i64} %arg1) {
+ ; %val0 = extractvalue {i32, i64} %arg0, 0
+ ; %val1 = extractvalue {i32, i64} %arg0, 1
+ ; %val2 = extractvalue {i32, i64} %arg1, 0
+ ; %val3 = extractvalue {i32, i64} %arg1, 1
+ ; store volatile i32 %val0, i32 addrspace(1)* null
+ ; store volatile i64 %val1, i64 addrspace(1)* null
+ ; store volatile i32 %val2, i32 addrspace(1)* null
+ ; store volatile i64 %val3, i64 addrspace(1)* null
+ ; HSA-VI-LABEL: name: struct_argument_alignment
+ ; HSA-VI: bb.1 (%ir-block.1):
+ ; HSA-VI: liveins: $sgpr4_sgpr5
+ ; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
+ ; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; HSA-VI: [[GEP:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C]](s64)
+ ; HSA-VI: [[LOAD:%[0-9]+]]:_(s128) = G_LOAD [[GEP]](p4) :: (non-temporal invariant load 16 from `{ i32, i64 } addrspace(4)* undef`, addrspace 4)
+ ; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
+ ; HSA-VI: [[GEP1:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C1]](s64)
+ ; HSA-VI: [[LOAD1:%[0-9]+]]:_(s8) = G_LOAD [[GEP1]](p4) :: (non-temporal invariant load 1 from `i8 addrspace(4)* undef`, align 16, addrspace 4)
+ ; HSA-VI: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 24
+ ; HSA-VI: [[GEP2:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C2]](s64)
+ ; HSA-VI: [[LOAD2:%[0-9]+]]:_(s128) = G_LOAD [[GEP2]](p4) :: (non-temporal invariant load 16 from `{ i32, i64 } addrspace(4)* undef`, align 8, addrspace 4)
+ ; HSA-VI: [[EXTRACT:%[0-9]+]]:_(s32) = G_EXTRACT [[LOAD]](s128), 0
+ ; HSA-VI: [[EXTRACT1:%[0-9]+]]:_(s64) = G_EXTRACT [[LOAD]](s128), 64
+ ; HSA-VI: [[EXTRACT2:%[0-9]+]]:_(s32) = G_EXTRACT [[LOAD2]](s128), 0
+ ; HSA-VI: [[EXTRACT3:%[0-9]+]]:_(s64) = G_EXTRACT [[LOAD2]](s128), 64
+ ; HSA-VI: S_ENDPGM
+ ret void
+}
+
+; No padding between i8 and next struct, but round up at end to 4 byte
+; multiple.
+; Packed-struct variant: two <{ i32, i64 }> arguments with an unnamed i8
+; between them. The extractvalue/store body is commented out, so the live body
+; is only "ret void". The checks expect whole-argument loads at offsets 0
+; (s96), 12 (s8) and 13 (s96, align 1), with struct members extracted at bit
+; offsets 0 and 32.
+define amdgpu_kernel void @packed_struct_argument_alignment(<{i32, i64}> %arg0, i8, <{i32, i64}> %arg1) {
+ ; %val0 = extractvalue <{i32, i64}> %arg0, 0
+ ; %val1 = extractvalue <{i32, i64}> %arg0, 1
+ ; %val2 = extractvalue <{i32, i64}> %arg1, 0
+ ; %val3 = extractvalue <{i32, i64}> %arg1, 1
+ ; store volatile i32 %val0, i32 addrspace(1)* null
+ ; store volatile i64 %val1, i64 addrspace(1)* null
+ ; store volatile i32 %val2, i32 addrspace(1)* null
+ ; store volatile i64 %val3, i64 addrspace(1)* null
+ ; HSA-VI-LABEL: name: packed_struct_argument_alignment
+ ; HSA-VI: bb.1 (%ir-block.1):
+ ; HSA-VI: liveins: $sgpr4_sgpr5
+ ; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
+ ; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; HSA-VI: [[GEP:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C]](s64)
+ ; HSA-VI: [[LOAD:%[0-9]+]]:_(s96) = G_LOAD [[GEP]](p4) :: (non-temporal invariant load 12 from `<{ i32, i64 }> addrspace(4)* undef`, align 16, addrspace 4)
+ ; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 12
+ ; HSA-VI: [[GEP1:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C1]](s64)
+ ; HSA-VI: [[LOAD1:%[0-9]+]]:_(s8) = G_LOAD [[GEP1]](p4) :: (non-temporal invariant load 1 from `i8 addrspace(4)* undef`, align 4, addrspace 4)
+ ; HSA-VI: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 13
+ ; HSA-VI: [[GEP2:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C2]](s64)
+ ; HSA-VI: [[LOAD2:%[0-9]+]]:_(s96) = G_LOAD [[GEP2]](p4) :: (non-temporal invariant load 12 from `<{ i32, i64 }> addrspace(4)* undef`, align 1, addrspace 4)
+ ; HSA-VI: [[EXTRACT:%[0-9]+]]:_(s32) = G_EXTRACT [[LOAD]](s96), 0
+ ; HSA-VI: [[EXTRACT1:%[0-9]+]]:_(s64) = G_EXTRACT [[LOAD]](s96), 32
+ ; HSA-VI: [[EXTRACT2:%[0-9]+]]:_(s32) = G_EXTRACT [[LOAD2]](s96), 0
+ ; HSA-VI: [[EXTRACT3:%[0-9]+]]:_(s64) = G_EXTRACT [[LOAD2]](s96), 32
+ ; HSA-VI: S_ENDPGM
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/kernel-args.ll b/llvm/test/CodeGen/AMDGPU/kernel-args.ll
index da8c994c530..3abd0ebbff3 100644
--- a/llvm/test/CodeGen/AMDGPU/kernel-args.ll
+++ b/llvm/test/CodeGen/AMDGPU/kernel-args.ll
@@ -14,12 +14,9 @@
; HSA-VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x8
; HSA-VI: s_and_b32 s{{[0-9]+}}, [[VAL]], 0xff
-
-
define amdgpu_kernel void @i8_arg(i32 addrspace(1)* nocapture %out, i8 %in) nounwind {
-entry:
- %0 = zext i8 %in to i32
- store i32 %0, i32 addrspace(1)* %out, align 4
+ %ext = zext i8 %in to i32
+ store i32 %ext, i32 addrspace(1)* %out, align 4
ret void
}
@@ -33,9 +30,8 @@ entry:
; HSA-VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x8
; HSA-VI: s_and_b32 s{{[0-9]+}}, [[VAL]], 0xff
define amdgpu_kernel void @i8_zext_arg(i32 addrspace(1)* nocapture %out, i8 zeroext %in) nounwind {
-entry:
- %0 = zext i8 %in to i32
- store i32 %0, i32 addrspace(1)* %out, align 4
+ %ext = zext i8 %in to i32
+ store i32 %ext, i32 addrspace(1)* %out, align 4
ret void
}
@@ -51,9 +47,8 @@ entry:
; HSA-VI: s_sext_i32_i8 s{{[0-9]+}}, [[VAL]]
; HSA-VI: flat_store_dword
define amdgpu_kernel void @i8_sext_arg(i32 addrspace(1)* nocapture %out, i8 signext %in) nounwind {
-entry:
- %0 = sext i8 %in to i32
- store i32 %0, i32 addrspace(1)* %out, align 4
+ %ext = sext i8 %in to i32
+ store i32 %ext, i32 addrspace(1)* %out, align 4
ret void
}
@@ -71,9 +66,8 @@ entry:
; HSA-VI: s_and_b32 s{{[0-9]+}}, [[VAL]], 0xffff{{$}}
; HSA-VI: flat_store_dword
define amdgpu_kernel void @i16_arg(i32 addrspace(1)* nocapture %out, i16 %in) nounwind {
-entry:
- %0 = zext i16 %in to i32
- store i32 %0, i32 addrspace(1)* %out, align 4
+ %ext = zext i16 %in to i32
+ store i32 %ext, i32 addrspace(1)* %out, align 4
ret void
}
@@ -89,9 +83,8 @@ entry:
; HSA-VI: s_and_b32 s{{[0-9]+}}, [[VAL]], 0xffff{{$}}
; HSA-VI: flat_store_dword
define amdgpu_kernel void @i16_zext_arg(i32 addrspace(1)* nocapture %out, i16 zeroext %in) nounwind {
-entry:
- %0 = zext i16 %in to i32
- store i32 %0, i32 addrspace(1)* %out, align 4
+ %ext = zext i16 %in to i32
+ store i32 %ext, i32 addrspace(1)* %out, align 4
ret void
}
@@ -108,9 +101,8 @@ entry:
; HSA-VI: s_sext_i32_i16 s{{[0-9]+}}, [[VAL]]
; HSA-VI: flat_store_dword
define amdgpu_kernel void @i16_sext_arg(i32 addrspace(1)* nocapture %out, i16 signext %in) nounwind {
-entry:
- %0 = sext i16 %in to i32
- store i32 %0, i32 addrspace(1)* %out, align 4
+ %ext = sext i16 %in to i32
+ store i32 %ext, i32 addrspace(1)* %out, align 4
ret void
}
@@ -657,3 +649,57 @@ define amdgpu_kernel void @i1_arg_sext_i64(i64 addrspace(1)* %out, i1 %x) nounwi
store i64 %ext, i64 addrspace(1)* %out, align 8
ret void
}
+
+; FUNC-LABEL: {{^}}empty_struct_arg:
+; HSA: kernarg_segment_byte_size = 0
+; Kernel whose only argument is an empty struct; the body just returns.
+define amdgpu_kernel void @empty_struct_arg({} %in) nounwind {
+ ret void
+}
+
+; The correct load offsets for these:
+; load 4 from 0,
+; load 8 from 8
+; load 4 from 24
+; load 8 from 32
+
+; With the SelectionDAG argument lowering, the alignments for the
+; struct members are not properly considered, making these wrong.
+
+; FIXME: Total argument size is computed wrong
+; FUNC-LABEL: {{^}}struct_argument_alignment:
+; HSA: kernarg_segment_byte_size = 40
+; HSA: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
+; HSA: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x8
+; HSA: s_load_dword s{{[0-9]+}}, s[4:5], 0x18
+; HSA: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x20
+; Extracts both members of each { i32, i64 } argument and writes every one
+; with a volatile store to a null addrspace(1) pointer. An unnamed i8
+; argument sits between the two struct arguments.
+define amdgpu_kernel void @struct_argument_alignment({i32, i64} %arg0, i8, {i32, i64} %arg1) {
+ %val0 = extractvalue {i32, i64} %arg0, 0
+ %val1 = extractvalue {i32, i64} %arg0, 1
+ %val2 = extractvalue {i32, i64} %arg1, 0
+ %val3 = extractvalue {i32, i64} %arg1, 1
+ store volatile i32 %val0, i32 addrspace(1)* null
+ store volatile i64 %val1, i64 addrspace(1)* null
+ store volatile i32 %val2, i32 addrspace(1)* null
+ store volatile i64 %val3, i64 addrspace(1)* null
+ ret void
+}
+
+; No padding between i8 and next struct, but round up at end to 4 byte
+; multiple.
+; FUNC-LABEL: {{^}}packed_struct_argument_alignment:
+; HSA: kernarg_segment_byte_size = 28
+; HSA: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
+; HSA: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x4
+; HSA: s_load_dword s{{[0-9]+}}, s[4:5], 0xc
+; HSA: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x10
+; Packed-struct variant: extracts both members of each <{ i32, i64 }>
+; argument and writes every one with a volatile store to a null addrspace(1)
+; pointer. An unnamed i8 argument sits between the two struct arguments.
+define amdgpu_kernel void @packed_struct_argument_alignment(<{i32, i64}> %arg0, i8, <{i32, i64}> %arg1) {
+ %val0 = extractvalue <{i32, i64}> %arg0, 0
+ %val1 = extractvalue <{i32, i64}> %arg0, 1
+ %val2 = extractvalue <{i32, i64}> %arg1, 0
+ %val3 = extractvalue <{i32, i64}> %arg1, 1
+ store volatile i32 %val0, i32 addrspace(1)* null
+ store volatile i64 %val1, i64 addrspace(1)* null
+ store volatile i32 %val2, i32 addrspace(1)* null
+ store volatile i64 %val3, i64 addrspace(1)* null
+ ret void
+}
OpenPOWER on IntegriCloud