90 files changed, 1240 insertions, 1249 deletions
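The commit renumbers the AMDGPU address spaces so that constant memory becomes addrspace(4) and region (GDS) memory becomes addrspace(2); the mapping previously documented as the "future default" is now the only mapping. As a minimal illustration only (not part of the commit; the kernel name is made up, the pattern mirrors the smrd tests below), LLVM IR that reads constant memory now uses addrspace(4) where it previously used addrspace(2):

define amdgpu_kernel void @read_const(i32 addrspace(1)* %out, i32 addrspace(4)* %ptr) {
entry:
  ; addrspace(4) is constant memory under the new mapping (formerly addrspace(2)),
  ; so this uniform load is selected to a scalar s_load_dword.
  %gep = getelementptr i32, i32 addrspace(4)* %ptr, i64 1
  %val = load i32, i32 addrspace(4)* %gep
  ; addrspace(1) is still global memory.
  store i32 %val, i32 addrspace(1)* %out
  ret void
}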
diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst
index b3794b2b431..f71cf2f61de 100644
--- a/llvm/docs/AMDGPUUsage.rst
+++ b/llvm/docs/AMDGPUUsage.rst
@@ -270,27 +270,17 @@ LLVM Address Space number is used throughout LLVM (for example, in LLVM IR).
   .. table:: Address Space Mapping
      :name: amdgpu-address-space-mapping-table
 
-     ================== ================= =================
+     ================== =================
      LLVM Address Space Memory Space
-     ------------------ -----------------------------------
-     \                  Current Default   Future Default
-     ================== ================= =================
-     0                  Generic (Flat)    Generic (Flat)
-     1                  Global            Global
-     2                  Constant          Region (GDS)
-     3                  Local (group/LDS) Local (group/LDS)
-     4                  Region (GDS)      Constant
-     5                  Private (Scratch) Private (Scratch)
-     6                  Constant 32-bit   Constant 32-bit
-     ================== ================= =================
-
-Current Default
-  This is the current default address space mapping used for all languages.
-  This will shortly be deprecated.
-
-Future Default
-  This will shortly be the only address space mapping for all languages using
-  AMDGPU backend.
+     ================== =================
+     0                  Generic (Flat)
+     1                  Global
+     2                  Region (GDS)
+     3                  Local (group/LDS)
+     4                  Constant
+     5                  Private (Scratch)
+     6                  Constant 32-bit
+     ================== =================
 
 
 .. _amdgpu-memory-scopes:
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 9d5653344f3..1284939b447 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -83,22 +83,22 @@ defm int_amdgcn_workgroup_id : AMDGPUReadPreloadRegisterIntrinsic_xyz_named
 
 def int_amdgcn_dispatch_ptr :
   GCCBuiltin<"__builtin_amdgcn_dispatch_ptr">,
-  Intrinsic<[LLVMQualPointerType<llvm_i8_ty, 2>], [],
+  Intrinsic<[LLVMQualPointerType<llvm_i8_ty, 4>], [],
   [IntrNoMem, IntrSpeculatable]>;
 
 def int_amdgcn_queue_ptr :
   GCCBuiltin<"__builtin_amdgcn_queue_ptr">,
-  Intrinsic<[LLVMQualPointerType<llvm_i8_ty, 2>], [],
+  Intrinsic<[LLVMQualPointerType<llvm_i8_ty, 4>], [],
   [IntrNoMem, IntrSpeculatable]>;
 
 def int_amdgcn_kernarg_segment_ptr :
   GCCBuiltin<"__builtin_amdgcn_kernarg_segment_ptr">,
-  Intrinsic<[LLVMQualPointerType<llvm_i8_ty, 2>], [],
+  Intrinsic<[LLVMQualPointerType<llvm_i8_ty, 4>], [],
   [IntrNoMem, IntrSpeculatable]>;
 
 def int_amdgcn_implicitarg_ptr :
   GCCBuiltin<"__builtin_amdgcn_implicitarg_ptr">,
-  Intrinsic<[LLVMQualPointerType<llvm_i8_ty, 2>], [],
+  Intrinsic<[LLVMQualPointerType<llvm_i8_ty, 4>], [],
   [IntrNoMem, IntrSpeculatable]>;
 
 def int_amdgcn_groupstaticsize :
@@ -111,7 +111,7 @@ def int_amdgcn_dispatch_id :
 
 def int_amdgcn_implicit_buffer_ptr :
   GCCBuiltin<"__builtin_amdgcn_implicit_buffer_ptr">,
-  Intrinsic<[LLVMQualPointerType<llvm_i8_ty, 2>], [],
+  Intrinsic<[LLVMQualPointerType<llvm_i8_ty, 4>], [],
   [IntrNoMem, IntrSpeculatable]>;
 
 // Set EXEC to the 64-bit value given.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index 0b590c3c122..9c68de65655 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -222,7 +222,7 @@ struct AMDGPUAS {
   MAX_COMMON_ADDRESS = 5,
 
   GLOBAL_ADDRESS = 1,         ///< Address space for global memory (RAT0, VTX0).
-  CONSTANT_ADDRESS = 2,       ///< Address space for constant memory (VTX2)
+  CONSTANT_ADDRESS = 4,       ///< Address space for constant memory (VTX2)
   LOCAL_ADDRESS = 3,          ///< Address space for local memory.
   CONSTANT_ADDRESS_32BIT = 6, ///< Address space for 32-bit constant memory
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAliasAnalysis.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAliasAnalysis.cpp
index fa52bbb9def..ef4b69d09d9 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAliasAnalysis.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAliasAnalysis.cpp
@@ -61,7 +61,7 @@ AMDGPUAAResult::ASAliasRulesTy::ASAliasRulesTy(AMDGPUAS AS_, Triple::ArchType Ar
     /* Region   */ {NoAlias , NoAlias , NoAlias , NoAlias , MayAlias, MayAlias}
   };
   static const AliasResult ASAliasRulesGenIsZero[6][6] = {
-  /*             Flat       Global    Constant  Group     Region    Private */
+  /*             Flat       Global    Region    Group     Constant  Private */
     /* Flat     */ {MayAlias, MayAlias, MayAlias, MayAlias, MayAlias, MayAlias},
     /* Global   */ {MayAlias, MayAlias, NoAlias , NoAlias , NoAlias , NoAlias},
     /* Constant */ {MayAlias, NoAlias , MayAlias, NoAlias , NoAlias,  NoAlias},
@@ -72,9 +72,9 @@ AMDGPUAAResult::ASAliasRulesTy::ASAliasRulesTy(AMDGPUAS AS_, Triple::ArchType Ar
   assert(AS.MAX_COMMON_ADDRESS <= 5);
   if (AS.FLAT_ADDRESS == 0) {
     assert(AS.GLOBAL_ADDRESS == 1 &&
-           AS.REGION_ADDRESS == 4 &&
+           AS.REGION_ADDRESS == 2 &&
            AS.LOCAL_ADDRESS == 3 &&
-           AS.CONSTANT_ADDRESS == 2 &&
+           AS.CONSTANT_ADDRESS == 4 &&
            AS.PRIVATE_ADDRESS == 5);
     ASAliasRules = &ASAliasRulesGenIsZero;
   } else {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
index 5a913873193..58e8b687eee 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
@@ -116,7 +116,7 @@ bool AMDGPUCallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder,
   if (Info->hasKernargSegmentPtr()) {
     unsigned InputPtrReg = Info->addKernargSegmentPtr(*TRI);
-    const LLT P2 = LLT::pointer(2, 64);
+    const LLT P2 = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
     unsigned VReg = MRI.createGenericVirtualRegister(P2);
     MRI.addLiveIn(InputPtrReg, VReg);
     MIRBuilder.getMBB().addLiveIn(InputPtrReg);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index 04faf6e37eb..e2448522c67 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -12,6 +12,7 @@
 /// \todo This should be generated by TableGen.
 //===----------------------------------------------------------------------===//
 
+#include "AMDGPU.h"
 #include "AMDGPULegalizerInfo.h"
 #include "llvm/CodeGen/TargetOpcodes.h"
 #include "llvm/CodeGen/ValueTypes.h"
@@ -29,8 +30,8 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo() {
   const LLT V2S16 = LLT::vector(2, 16);
   const LLT S32 = LLT::scalar(32);
   const LLT S64 = LLT::scalar(64);
-  const LLT P1 = LLT::pointer(1, 64);
-  const LLT P2 = LLT::pointer(2, 64);
+  const LLT P1 = LLT::pointer(AMDGPUAS::GLOBAL_ADDRESS, 64);
+  const LLT P2 = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
 
   setAction({G_ADD, S32}, Legal);
   setAction({G_AND, S32}, Legal);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 1af1e10dac9..2556451340e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -266,7 +266,7 @@ static StringRef computeDataLayout(const Triple &TT) {
   // 32-bit private, local, and region pointers. 64-bit global, constant and
   // flat.
- return "e-p:64:64-p1:64:64-p2:64:64-p3:32:32-p4:32:32-p5:32:32-p6:32:32" + return "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32" "-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128" "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-A5"; } diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index c3e96f97103..38b72c3321f 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -929,7 +929,7 @@ AMDGPUAS getAMDGPUAS(Triple T) { AMDGPUAS AS; AS.FLAT_ADDRESS = 0; AS.PRIVATE_ADDRESS = 5; - AS.REGION_ADDRESS = 4; + AS.REGION_ADDRESS = 2; return AS; } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-smrd.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-smrd.mir index 4f4655e2efa..422dcbf3b77 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-smrd.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-smrd.mir @@ -5,7 +5,7 @@ # REQUIRES: global-isel --- | - define amdgpu_kernel void @smrd_imm(i32 addrspace(2)* %const0) { ret void } + define amdgpu_kernel void @smrd_imm(i32 addrspace(4)* %const0) { ret void } ... --- @@ -91,50 +91,50 @@ body: | bb.0: liveins: $sgpr0_sgpr1 - %0:sgpr(p2) = COPY $sgpr0_sgpr1 + %0:sgpr(p4) = COPY $sgpr0_sgpr1 %1:sgpr(s64) = G_CONSTANT i64 4 - %2:sgpr(p2) = G_GEP %0, %1 + %2:sgpr(p4) = G_GEP %0, %1 %3:sgpr(s32) = G_LOAD %2 :: (load 4 from %ir.const0) $sgpr0 = COPY %3 %4:sgpr(s64) = G_CONSTANT i64 1020 - %5:sgpr(p2) = G_GEP %0, %4 + %5:sgpr(p4) = G_GEP %0, %4 %6:sgpr(s32) = G_LOAD %5 :: (load 4 from %ir.const0) $sgpr0 = COPY %6 %7:sgpr(s64) = G_CONSTANT i64 1024 - %8:sgpr(p2) = G_GEP %0, %7 + %8:sgpr(p4) = G_GEP %0, %7 %9:sgpr(s32) = G_LOAD %8 :: (load 4 from %ir.const0) $sgpr0 = COPY %9 %10:sgpr(s64) = G_CONSTANT i64 1048572 - %11:sgpr(p2) = G_GEP %0, %10 + %11:sgpr(p4) = G_GEP %0, %10 %12:sgpr(s32) = G_LOAD %11 :: (load 4 from %ir.const0) $sgpr0 = COPY %12 %13:sgpr(s64) = G_CONSTANT i64 1048576 - %14:sgpr(p2) = G_GEP %0, %13 + %14:sgpr(p4) = G_GEP %0, %13 %15:sgpr(s32) = G_LOAD %14 :: (load 4 from %ir.const0) $sgpr0 = COPY %15 %16:sgpr(s64) = G_CONSTANT i64 17179869180 - %17:sgpr(p2) = G_GEP %0, %16 + %17:sgpr(p4) = G_GEP %0, %16 %18:sgpr(s32) = G_LOAD %17 :: (load 4 from %ir.const0) $sgpr0 = COPY %18 %19:sgpr(s64) = G_CONSTANT i64 17179869184 - %20:sgpr(p2) = G_GEP %0, %19 + %20:sgpr(p4) = G_GEP %0, %19 %21:sgpr(s32) = G_LOAD %20 :: (load 4 from %ir.const0) $sgpr0 = COPY %21 %22:sgpr(s64) = G_CONSTANT i64 4294967292 - %23:sgpr(p2) = G_GEP %0, %22 + %23:sgpr(p4) = G_GEP %0, %22 %24:sgpr(s32) = G_LOAD %23 :: (load 4 from %ir.const0) $sgpr0 = COPY %24 %25:sgpr(s64) = G_CONSTANT i64 4294967296 - %26:sgpr(p2) = G_GEP %0, %25 + %26:sgpr(p4) = G_GEP %0, %25 %27:sgpr(s32) = G_LOAD %26 :: (load 4 from %ir.const0) $sgpr0 = COPY %27 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgpu_vs.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgpu_vs.ll index e59e3f3d5c9..00efe521516 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgpu_vs.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgpu_vs.ll @@ -18,28 +18,28 @@ define amdgpu_vs void @test_f32(float %arg0) { } ; CHECK-LABEL: name: test_ptr2_byval -; CHECK: [[S01:%[0-9]+]]:_(p2) = COPY $sgpr0_sgpr1 +; CHECK: [[S01:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 ; CHECK: G_LOAD [[S01]] -define amdgpu_vs void @test_ptr2_byval(i32 addrspace(2)* byval %arg0) { - %tmp0 = load volatile i32, i32 addrspace(2)* 
%arg0 +define amdgpu_vs void @test_ptr2_byval(i32 addrspace(4)* byval %arg0) { + %tmp0 = load volatile i32, i32 addrspace(4)* %arg0 ret void } ; CHECK-LABEL: name: test_ptr2_inreg -; CHECK: [[S01:%[0-9]+]]:_(p2) = COPY $sgpr0_sgpr1 +; CHECK: [[S01:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 ; CHECK: G_LOAD [[S01]] -define amdgpu_vs void @test_ptr2_inreg(i32 addrspace(2)* inreg %arg0) { - %tmp0 = load volatile i32, i32 addrspace(2)* %arg0 +define amdgpu_vs void @test_ptr2_inreg(i32 addrspace(4)* inreg %arg0) { + %tmp0 = load volatile i32, i32 addrspace(4)* %arg0 ret void } ; CHECK-LABEL: name: test_sgpr_alignment0 ; CHECK: [[S0:%[0-9]+]]:_(s32) = COPY $sgpr0 -; CHECK: [[S23:%[0-9]+]]:_(p2) = COPY $sgpr2_sgpr3 +; CHECK: [[S23:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3 ; CHECK: G_LOAD [[S23]] ; CHECK: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.exp), %{{[0-9]+}}(s32), %{{[0-9]+}}(s32), [[S0]] -define amdgpu_vs void @test_sgpr_alignment0(float inreg %arg0, i32 addrspace(2)* inreg %arg1) { - %tmp0 = load volatile i32, i32 addrspace(2)* %arg1 +define amdgpu_vs void @test_sgpr_alignment0(float inreg %arg0, i32 addrspace(4)* inreg %arg1) { + %tmp0 = load volatile i32, i32 addrspace(4)* %arg1 call void @llvm.amdgcn.exp.f32(i32 32, i32 15, float %arg0, float undef, float undef, float undef, i1 false, i1 false) #0 ret void } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect.mir index 44676305602..5e59d8287f0 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect.mir @@ -3,7 +3,7 @@ # REQUIRES: global-isel --- | - define amdgpu_kernel void @load_constant(i32 addrspace(2)* %ptr0) { ret void } + define amdgpu_kernel void @load_constant(i32 addrspace(4)* %ptr0) { ret void } define amdgpu_kernel void @load_global_uniform(i32 addrspace(1)* %ptr1) { %tmp0 = load i32, i32 addrspace(1)* %ptr1 ret void @@ -30,7 +30,7 @@ legalized: true body: | bb.0: liveins: $sgpr0_sgpr1 - %0:_(p2) = COPY $sgpr0_sgpr1 + %0:_(p4) = COPY $sgpr0_sgpr1 %1:_(s32) = G_LOAD %0 :: (load 4 from %ir.ptr0) ... 
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/smrd.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/smrd.ll index 6629188131d..2097d5119f5 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/smrd.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/smrd.ll @@ -9,10 +9,10 @@ ; GCN-LABEL: {{^}}smrd0: ; SICI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x1 ; encoding: [0x01 ; VI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x4 -define amdgpu_kernel void @smrd0(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) { +define amdgpu_kernel void @smrd0(i32 addrspace(1)* %out, i32 addrspace(4)* %ptr) { entry: - %0 = getelementptr i32, i32 addrspace(2)* %ptr, i64 1 - %1 = load i32, i32 addrspace(2)* %0 + %0 = getelementptr i32, i32 addrspace(4)* %ptr, i64 1 + %1 = load i32, i32 addrspace(4)* %0 store i32 %1, i32 addrspace(1)* %out ret void } @@ -21,10 +21,10 @@ entry: ; GCN-LABEL: {{^}}smrd1: ; SICI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xff ; encoding: [0xff,0x{{[0-9]+[137]}} ; VI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x3fc -define amdgpu_kernel void @smrd1(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) { +define amdgpu_kernel void @smrd1(i32 addrspace(1)* %out, i32 addrspace(4)* %ptr) { entry: - %0 = getelementptr i32, i32 addrspace(2)* %ptr, i64 255 - %1 = load i32, i32 addrspace(2)* %0 + %0 = getelementptr i32, i32 addrspace(4)* %ptr, i64 255 + %1 = load i32, i32 addrspace(4)* %0 store i32 %1, i32 addrspace(1)* %out ret void } @@ -36,10 +36,10 @@ entry: ; CI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x100 ; VI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x400 ; GCN: s_endpgm -define amdgpu_kernel void @smrd2(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) { +define amdgpu_kernel void @smrd2(i32 addrspace(1)* %out, i32 addrspace(4)* %ptr) { entry: - %0 = getelementptr i32, i32 addrspace(2)* %ptr, i64 256 - %1 = load i32, i32 addrspace(2)* %0 + %0 = getelementptr i32, i32 addrspace(4)* %ptr, i64 256 + %1 = load i32, i32 addrspace(4)* %0 store i32 %1, i32 addrspace(1)* %out ret void } @@ -51,10 +51,10 @@ entry: ; XSI: s_load_dwordx2 s[{{[0-9]:[0-9]}}], s[{{[0-9]:[0-9]}}], 0xb ; encoding: [0x0b ; TODO: Add VI checks ; XGCN: s_endpgm -define amdgpu_kernel void @smrd3(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) { +define amdgpu_kernel void @smrd3(i32 addrspace(1)* %out, i32 addrspace(4)* %ptr) { entry: - %0 = getelementptr i32, i32 addrspace(2)* %ptr, i64 4294967296 ; 2 ^ 32 - %1 = load i32, i32 addrspace(2)* %0 + %0 = getelementptr i32, i32 addrspace(4)* %ptr, i64 4294967296 ; 2 ^ 32 + %1 = load i32, i32 addrspace(4)* %0 store i32 %1, i32 addrspace(1)* %out ret void } @@ -65,10 +65,10 @@ entry: ; SI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], [[OFFSET]] ; CI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x3ffff ; VI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xffffc -define amdgpu_kernel void @smrd4(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) { +define amdgpu_kernel void @smrd4(i32 addrspace(1)* %out, i32 addrspace(4)* %ptr) { entry: - %0 = getelementptr i32, i32 addrspace(2)* %ptr, i64 262143 - %1 = load i32, i32 addrspace(2)* %0 + %0 = getelementptr i32, i32 addrspace(4)* %ptr, i64 262143 + %1 = load i32, i32 addrspace(4)* %0 store i32 %1, i32 addrspace(1)* %out ret void } @@ -79,10 +79,10 @@ entry: ; SIVI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], [[OFFSET]] ; CI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x40000 ; GCN: s_endpgm -define amdgpu_kernel void @smrd5(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) { +define amdgpu_kernel void @smrd5(i32 addrspace(1)* %out, i32 
addrspace(4)* %ptr) { entry: - %0 = getelementptr i32, i32 addrspace(2)* %ptr, i64 262144 - %1 = load i32, i32 addrspace(2)* %0 + %0 = getelementptr i32, i32 addrspace(4)* %ptr, i64 262144 + %1 = load i32, i32 addrspace(4)* %0 store i32 %1, i32 addrspace(1)* %out ret void } diff --git a/llvm/test/CodeGen/AMDGPU/add.v2i16.ll b/llvm/test/CodeGen/AMDGPU/add.v2i16.ll index a6c2901bd42..252e19e73c3 100644 --- a/llvm/test/CodeGen/AMDGPU/add.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/add.v2i16.ll @@ -27,9 +27,9 @@ define amdgpu_kernel void @v_test_add_v2i16(<2 x i16> addrspace(1)* %out, <2 x i ; VI: s_add_i32 ; VI: s_add_i32 -define amdgpu_kernel void @s_test_add_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %in0, <2 x i16> addrspace(2)* %in1) #1 { - %a = load <2 x i16>, <2 x i16> addrspace(2)* %in0 - %b = load <2 x i16>, <2 x i16> addrspace(2)* %in1 +define amdgpu_kernel void @s_test_add_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %in0, <2 x i16> addrspace(4)* %in1) #1 { + %a = load <2 x i16>, <2 x i16> addrspace(4)* %in0 + %b = load <2 x i16>, <2 x i16> addrspace(4)* %in1 %add = add <2 x i16> %a, %b store <2 x i16> %add, <2 x i16> addrspace(1)* %out ret void @@ -41,8 +41,8 @@ define amdgpu_kernel void @s_test_add_v2i16(<2 x i16> addrspace(1)* %out, <2 x i ; VI: s_add_i32 ; VI: s_add_i32 -define amdgpu_kernel void @s_test_add_self_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %in0) #1 { - %a = load <2 x i16>, <2 x i16> addrspace(2)* %in0 +define amdgpu_kernel void @s_test_add_self_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %in0) #1 { + %a = load <2 x i16>, <2 x i16> addrspace(4)* %in0 %add = add <2 x i16> %a, %a store <2 x i16> %add, <2 x i16> addrspace(1)* %out ret void diff --git a/llvm/test/CodeGen/AMDGPU/addrspacecast.ll b/llvm/test/CodeGen/AMDGPU/addrspacecast.ll index 6353308aa3d..95bbe958e93 100644 --- a/llvm/test/CodeGen/AMDGPU/addrspacecast.ll +++ b/llvm/test/CodeGen/AMDGPU/addrspacecast.ll @@ -100,8 +100,8 @@ define amdgpu_kernel void @use_global_to_flat_addrspacecast(i32 addrspace(1)* %p ; HSA-DAG: v_mov_b32_e32 v[[VPTRLO:[0-9]+]], s[[PTRLO]] ; HSA-DAG: v_mov_b32_e32 v[[VPTRHI:[0-9]+]], s[[PTRHI]] ; HSA: flat_load_dword v{{[0-9]+}}, v{{\[}}[[VPTRLO]]:[[VPTRHI]]{{\]}} -define amdgpu_kernel void @use_constant_to_flat_addrspacecast(i32 addrspace(2)* %ptr) #0 { - %stof = addrspacecast i32 addrspace(2)* %ptr to i32* +define amdgpu_kernel void @use_constant_to_flat_addrspacecast(i32 addrspace(4)* %ptr) #0 { + %stof = addrspacecast i32 addrspace(4)* %ptr to i32* %ld = load volatile i32, i32* %stof ret void } @@ -160,8 +160,8 @@ define amdgpu_kernel void @use_flat_to_global_addrspacecast(i32* %ptr) #0 { ; HSA: s_load_dwordx2 s{{\[}}[[PTRLO:[0-9]+]]:[[PTRHI:[0-9]+]]{{\]}}, s[4:5], 0x0 ; HSA: s_load_dword s{{[0-9]+}}, s{{\[}}[[PTRLO]]:[[PTRHI]]{{\]}}, 0x0 define amdgpu_kernel void @use_flat_to_constant_addrspacecast(i32* %ptr) #0 { - %ftos = addrspacecast i32* %ptr to i32 addrspace(2)* - load volatile i32, i32 addrspace(2)* %ftos + %ftos = addrspacecast i32* %ptr to i32 addrspace(4)* + load volatile i32, i32 addrspace(4)* %ftos ret void } diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.ll index ef742f56fae..5d0be382178 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.ll @@ -4,9 +4,9 @@ ; This test just checks that the compiler doesn't crash. 
; FUNC-LABEL: {{^}}v32i8_to_v8i32: -define amdgpu_ps float @v32i8_to_v8i32(<32 x i8> addrspace(2)* inreg) #0 { +define amdgpu_ps float @v32i8_to_v8i32(<32 x i8> addrspace(4)* inreg) #0 { entry: - %1 = load <32 x i8>, <32 x i8> addrspace(2)* %0 + %1 = load <32 x i8>, <32 x i8> addrspace(4)* %0 %2 = bitcast <32 x i8> %1 to <8 x i32> %3 = extractelement <8 x i32> %2, i32 1 %4 = icmp ne i32 %3, 0 diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu.private-memory.ll b/llvm/test/CodeGen/AMDGPU/amdgpu.private-memory.ll index e48da47a027..f37ba76fc84 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu.private-memory.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu.private-memory.ll @@ -48,12 +48,12 @@ ; SI-ALLOCA: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen ; encoding: [0x00,0x10,0x70,0xe0 -; HSAOPT: [[DISPATCH_PTR:%[0-9]+]] = call noalias nonnull dereferenceable(64) i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() -; HSAOPT: [[CAST_DISPATCH_PTR:%[0-9]+]] = bitcast i8 addrspace(2)* [[DISPATCH_PTR]] to i32 addrspace(2)* -; HSAOPT: [[GEP0:%[0-9]+]] = getelementptr inbounds i32, i32 addrspace(2)* [[CAST_DISPATCH_PTR]], i64 1 -; HSAOPT: [[LDXY:%[0-9]+]] = load i32, i32 addrspace(2)* [[GEP0]], align 4, !invariant.load !0 -; HSAOPT: [[GEP1:%[0-9]+]] = getelementptr inbounds i32, i32 addrspace(2)* [[CAST_DISPATCH_PTR]], i64 2 -; HSAOPT: [[LDZU:%[0-9]+]] = load i32, i32 addrspace(2)* [[GEP1]], align 4, !range !1, !invariant.load !0 +; HSAOPT: [[DISPATCH_PTR:%[0-9]+]] = call noalias nonnull dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() +; HSAOPT: [[CAST_DISPATCH_PTR:%[0-9]+]] = bitcast i8 addrspace(4)* [[DISPATCH_PTR]] to i32 addrspace(4)* +; HSAOPT: [[GEP0:%[0-9]+]] = getelementptr inbounds i32, i32 addrspace(4)* [[CAST_DISPATCH_PTR]], i64 1 +; HSAOPT: [[LDXY:%[0-9]+]] = load i32, i32 addrspace(4)* [[GEP0]], align 4, !invariant.load !0 +; HSAOPT: [[GEP1:%[0-9]+]] = getelementptr inbounds i32, i32 addrspace(4)* [[CAST_DISPATCH_PTR]], i64 2 +; HSAOPT: [[LDZU:%[0-9]+]] = load i32, i32 addrspace(4)* [[GEP1]], align 4, !range !1, !invariant.load !0 ; HSAOPT: [[EXTRACTY:%[0-9]+]] = lshr i32 [[LDXY]], 16 ; HSAOPT: [[WORKITEM_ID_X:%[0-9]+]] = call i32 @llvm.amdgcn.workitem.id.x(), !range !2 diff --git a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll index 51cd6c43e03..b097484b0bd 100644 --- a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll +++ b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll @@ -8,10 +8,10 @@ declare i32 @llvm.amdgcn.workitem.id.x() #0 declare i32 @llvm.amdgcn.workitem.id.y() #0 declare i32 @llvm.amdgcn.workitem.id.z() #0 -declare i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() #0 -declare i8 addrspace(2)* @llvm.amdgcn.queue.ptr() #0 -declare i8 addrspace(2)* @llvm.amdgcn.kernarg.segment.ptr() #0 -declare i8 addrspace(2)* @llvm.amdgcn.implicitarg.ptr() #0 +declare i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() #0 +declare i8 addrspace(4)* @llvm.amdgcn.queue.ptr() #0 +declare i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() #0 +declare i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr() #0 declare i64 @llvm.amdgcn.dispatch.id() #0 ; HSA: define void @use_workitem_id_x() #1 { @@ -58,15 +58,15 @@ define void @use_workgroup_id_z() #1 { ; HSA: define void @use_dispatch_ptr() #7 { define void @use_dispatch_ptr() #1 { - %dispatch.ptr = call i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() - store volatile i8 addrspace(2)* %dispatch.ptr, i8 addrspace(2)* addrspace(1)* undef 
+ %dispatch.ptr = call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() + store volatile i8 addrspace(4)* %dispatch.ptr, i8 addrspace(4)* addrspace(1)* undef ret void } ; HSA: define void @use_queue_ptr() #8 { define void @use_queue_ptr() #1 { - %queue.ptr = call i8 addrspace(2)* @llvm.amdgcn.queue.ptr() - store volatile i8 addrspace(2)* %queue.ptr, i8 addrspace(2)* addrspace(1)* undef + %queue.ptr = call i8 addrspace(4)* @llvm.amdgcn.queue.ptr() + store volatile i8 addrspace(4)* %queue.ptr, i8 addrspace(4)* addrspace(1)* undef ret void } @@ -186,22 +186,22 @@ define void @call_recursive_use_workitem_id_y() #1 { ; HSA: define void @use_group_to_flat_addrspacecast(i32 addrspace(3)* %ptr) #8 { define void @use_group_to_flat_addrspacecast(i32 addrspace(3)* %ptr) #1 { - %stof = addrspacecast i32 addrspace(3)* %ptr to i32 addrspace(4)* - store volatile i32 0, i32 addrspace(4)* %stof + %stof = addrspacecast i32 addrspace(3)* %ptr to i32 addrspace(2)* + store volatile i32 0, i32 addrspace(2)* %stof ret void } ; HSA: define void @use_group_to_flat_addrspacecast_gfx9(i32 addrspace(3)* %ptr) #12 { define void @use_group_to_flat_addrspacecast_gfx9(i32 addrspace(3)* %ptr) #2 { - %stof = addrspacecast i32 addrspace(3)* %ptr to i32 addrspace(4)* - store volatile i32 0, i32 addrspace(4)* %stof + %stof = addrspacecast i32 addrspace(3)* %ptr to i32 addrspace(2)* + store volatile i32 0, i32 addrspace(2)* %stof ret void } ; HSA: define void @use_group_to_flat_addrspacecast_queue_ptr_gfx9(i32 addrspace(3)* %ptr) #13 { define void @use_group_to_flat_addrspacecast_queue_ptr_gfx9(i32 addrspace(3)* %ptr) #2 { - %stof = addrspacecast i32 addrspace(3)* %ptr to i32 addrspace(4)* - store volatile i32 0, i32 addrspace(4)* %stof + %stof = addrspacecast i32 addrspace(3)* %ptr to i32 addrspace(2)* + store volatile i32 0, i32 addrspace(2)* %stof call void @func_indirect_use_queue_ptr() ret void } @@ -226,8 +226,8 @@ define void @indirect_use_group_to_flat_addrspacecast_queue_ptr_gfx9() #1 { ; HSA: define void @use_kernarg_segment_ptr() #14 { define void @use_kernarg_segment_ptr() #1 { - %kernarg.segment.ptr = call i8 addrspace(2)* @llvm.amdgcn.kernarg.segment.ptr() - store volatile i8 addrspace(2)* %kernarg.segment.ptr, i8 addrspace(2)* addrspace(1)* undef + %kernarg.segment.ptr = call i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() + store volatile i8 addrspace(4)* %kernarg.segment.ptr, i8 addrspace(4)* addrspace(1)* undef ret void } @@ -239,15 +239,15 @@ define void @func_indirect_use_kernarg_segment_ptr() #1 { ; HSA: define amdgpu_kernel void @kern_use_implicitarg_ptr() #15 { define amdgpu_kernel void @kern_use_implicitarg_ptr() #1 { - %implicitarg.ptr = call i8 addrspace(2)* @llvm.amdgcn.implicitarg.ptr() - store volatile i8 addrspace(2)* %implicitarg.ptr, i8 addrspace(2)* addrspace(1)* undef + %implicitarg.ptr = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr() + store volatile i8 addrspace(4)* %implicitarg.ptr, i8 addrspace(4)* addrspace(1)* undef ret void } ; HSA: define void @use_implicitarg_ptr() #15 { define void @use_implicitarg_ptr() #1 { - %implicitarg.ptr = call i8 addrspace(2)* @llvm.amdgcn.implicitarg.ptr() - store volatile i8 addrspace(2)* %implicitarg.ptr, i8 addrspace(2)* addrspace(1)* undef + %implicitarg.ptr = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr() + store volatile i8 addrspace(4)* %implicitarg.ptr, i8 addrspace(4)* addrspace(1)* undef ret void } diff --git a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll index 
266df4debe4..5a9d72d36be 100644 --- a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll +++ b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll @@ -8,9 +8,9 @@ declare i32 @llvm.amdgcn.workitem.id.x() #0 declare i32 @llvm.amdgcn.workitem.id.y() #0 declare i32 @llvm.amdgcn.workitem.id.z() #0 -declare i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() #0 -declare i8 addrspace(2)* @llvm.amdgcn.queue.ptr() #0 -declare i8 addrspace(2)* @llvm.amdgcn.kernarg.segment.ptr() #0 +declare i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() #0 +declare i8 addrspace(4)* @llvm.amdgcn.queue.ptr() #0 +declare i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() #0 ; HSA: define amdgpu_kernel void @use_tgid_x(i32 addrspace(1)* %ptr) #1 { define amdgpu_kernel void @use_tgid_x(i32 addrspace(1)* %ptr) #1 { @@ -149,27 +149,27 @@ define amdgpu_kernel void @use_all_workitems(i32 addrspace(1)* %ptr) #1 { ; HSA: define amdgpu_kernel void @use_dispatch_ptr(i32 addrspace(1)* %ptr) #10 { define amdgpu_kernel void @use_dispatch_ptr(i32 addrspace(1)* %ptr) #1 { - %dispatch.ptr = call i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() - %bc = bitcast i8 addrspace(2)* %dispatch.ptr to i32 addrspace(2)* - %val = load i32, i32 addrspace(2)* %bc + %dispatch.ptr = call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() + %bc = bitcast i8 addrspace(4)* %dispatch.ptr to i32 addrspace(4)* + %val = load i32, i32 addrspace(4)* %bc store i32 %val, i32 addrspace(1)* %ptr ret void } ; HSA: define amdgpu_kernel void @use_queue_ptr(i32 addrspace(1)* %ptr) #11 { define amdgpu_kernel void @use_queue_ptr(i32 addrspace(1)* %ptr) #1 { - %dispatch.ptr = call i8 addrspace(2)* @llvm.amdgcn.queue.ptr() - %bc = bitcast i8 addrspace(2)* %dispatch.ptr to i32 addrspace(2)* - %val = load i32, i32 addrspace(2)* %bc + %dispatch.ptr = call i8 addrspace(4)* @llvm.amdgcn.queue.ptr() + %bc = bitcast i8 addrspace(4)* %dispatch.ptr to i32 addrspace(4)* + %val = load i32, i32 addrspace(4)* %bc store i32 %val, i32 addrspace(1)* %ptr ret void } ; HSA: define amdgpu_kernel void @use_kernarg_segment_ptr(i32 addrspace(1)* %ptr) #12 { define amdgpu_kernel void @use_kernarg_segment_ptr(i32 addrspace(1)* %ptr) #1 { - %dispatch.ptr = call i8 addrspace(2)* @llvm.amdgcn.kernarg.segment.ptr() - %bc = bitcast i8 addrspace(2)* %dispatch.ptr to i32 addrspace(2)* - %val = load i32, i32 addrspace(2)* %bc + %dispatch.ptr = call i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() + %bc = bitcast i8 addrspace(4)* %dispatch.ptr to i32 addrspace(4)* + %val = load i32, i32 addrspace(4)* %bc store i32 %val, i32 addrspace(1)* %ptr ret void } @@ -210,9 +210,9 @@ define amdgpu_kernel void @use_global_to_flat_addrspacecast(i32 addrspace(1)* %p ret void } -; HSA: define amdgpu_kernel void @use_constant_to_flat_addrspacecast(i32 addrspace(2)* %ptr) #1 { -define amdgpu_kernel void @use_constant_to_flat_addrspacecast(i32 addrspace(2)* %ptr) #1 { - %stof = addrspacecast i32 addrspace(2)* %ptr to i32* +; HSA: define amdgpu_kernel void @use_constant_to_flat_addrspacecast(i32 addrspace(4)* %ptr) #1 { +define amdgpu_kernel void @use_constant_to_flat_addrspacecast(i32 addrspace(4)* %ptr) #1 { + %stof = addrspacecast i32 addrspace(4)* %ptr to i32* %ld = load volatile i32, i32* %stof ret void } @@ -226,8 +226,8 @@ define amdgpu_kernel void @use_flat_to_global_addrspacecast(i32* %ptr) #1 { ; HSA: define amdgpu_kernel void @use_flat_to_constant_addrspacecast(i32* %ptr) #1 { define amdgpu_kernel void @use_flat_to_constant_addrspacecast(i32* %ptr) #1 { - %ftos = addrspacecast i32* %ptr to i32 addrspace(2)* - %ld 
= load volatile i32, i32 addrspace(2)* %ftos + %ftos = addrspacecast i32* %ptr to i32 addrspace(4)* + %ld = load volatile i32, i32 addrspace(4)* %ftos ret void } diff --git a/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll b/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll index ba632f97cda..c1ea13421dc 100644 --- a/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll +++ b/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll @@ -358,7 +358,7 @@ bb0: br i1 %cmp0, label %bb2, label %bb1 bb1: - %val = load volatile i32, i32 addrspace(2)* undef + %val = load volatile i32, i32 addrspace(4)* undef %cmp1 = icmp eq i32 %val, 3 br i1 %cmp1, label %bb3, label %bb2 diff --git a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll index 5ad8d92e71c..2d3e35a6d28 100644 --- a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll +++ b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll @@ -345,7 +345,7 @@ define amdgpu_kernel void @test_call_external_void_func_v4i32() #0 { ; GCN: s_waitcnt ; GCN-NEXT: s_swappc_b64 define amdgpu_kernel void @test_call_external_void_func_v8i32() #0 { - %ptr = load <8 x i32> addrspace(1)*, <8 x i32> addrspace(1)* addrspace(2)* undef + %ptr = load <8 x i32> addrspace(1)*, <8 x i32> addrspace(1)* addrspace(4)* undef %val = load <8 x i32>, <8 x i32> addrspace(1)* %ptr call void @external_void_func_v8i32(<8 x i32> %val) ret void @@ -359,7 +359,7 @@ define amdgpu_kernel void @test_call_external_void_func_v8i32() #0 { ; GCN: s_waitcnt ; GCN-NEXT: s_swappc_b64 define amdgpu_kernel void @test_call_external_void_func_v16i32() #0 { - %ptr = load <16 x i32> addrspace(1)*, <16 x i32> addrspace(1)* addrspace(2)* undef + %ptr = load <16 x i32> addrspace(1)*, <16 x i32> addrspace(1)* addrspace(4)* undef %val = load <16 x i32>, <16 x i32> addrspace(1)* %ptr call void @external_void_func_v16i32(<16 x i32> %val) ret void @@ -377,7 +377,7 @@ define amdgpu_kernel void @test_call_external_void_func_v16i32() #0 { ; GCN: s_waitcnt ; GCN-NEXT: s_swappc_b64 define amdgpu_kernel void @test_call_external_void_func_v32i32() #0 { - %ptr = load <32 x i32> addrspace(1)*, <32 x i32> addrspace(1)* addrspace(2)* undef + %ptr = load <32 x i32> addrspace(1)*, <32 x i32> addrspace(1)* addrspace(4)* undef %val = load <32 x i32>, <32 x i32> addrspace(1)* %ptr call void @external_void_func_v32i32(<32 x i32> %val) ret void @@ -405,7 +405,7 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32() #0 { ; GCN: s_swappc_b64 ; GCN-NEXT: s_endpgm define amdgpu_kernel void @test_call_external_void_func_v32i32_i32(i32) #0 { - %ptr0 = load <32 x i32> addrspace(1)*, <32 x i32> addrspace(1)* addrspace(2)* undef + %ptr0 = load <32 x i32> addrspace(1)*, <32 x i32> addrspace(1)* addrspace(4)* undef %val0 = load <32 x i32>, <32 x i32> addrspace(1)* %ptr0 %val1 = load i32, i32 addrspace(1)* undef call void @external_void_func_v32i32_i32(<32 x i32> %val0, i32 %val1) @@ -430,7 +430,7 @@ define amdgpu_kernel void @test_call_external_i32_func_i32_imm(i32 addrspace(1)* ; GCN: s_waitcnt vmcnt(0) ; GCN-NEXT: s_swappc_b64 define amdgpu_kernel void @test_call_external_void_func_struct_i8_i32() #0 { - %ptr0 = load { i8, i32 } addrspace(1)*, { i8, i32 } addrspace(1)* addrspace(2)* undef + %ptr0 = load { i8, i32 } addrspace(1)*, { i8, i32 } addrspace(1)* addrspace(4)* undef %val = load { i8, i32 }, { i8, i32 } addrspace(1)* %ptr0 call void @external_void_func_struct_i8_i32({ i8, i32 } %val) ret void @@ -516,7 +516,7 @@ define amdgpu_kernel void @test_call_external_void_func_sret_struct_i8_i32_byval ; GCN-LABEL: 
{{^}}test_call_external_void_func_v16i8: define amdgpu_kernel void @test_call_external_void_func_v16i8() #0 { - %ptr = load <16 x i8> addrspace(1)*, <16 x i8> addrspace(1)* addrspace(2)* undef + %ptr = load <16 x i8> addrspace(1)*, <16 x i8> addrspace(1)* addrspace(4)* undef %val = load <16 x i8>, <16 x i8> addrspace(1)* %ptr call void @external_void_func_v16i8(<16 x i8> %val) ret void diff --git a/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs.ll b/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs.ll index 989d1b7fb18..6a45dee857c 100644 --- a/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs.ll +++ b/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs.ll @@ -4,9 +4,9 @@ ; GCN-LABEL: {{^}}use_dispatch_ptr: ; GCN: s_load_dword s{{[0-9]+}}, s[6:7], 0x0 define void @use_dispatch_ptr() #1 { - %dispatch_ptr = call noalias i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() #0 - %header_ptr = bitcast i8 addrspace(2)* %dispatch_ptr to i32 addrspace(2)* - %value = load volatile i32, i32 addrspace(2)* %header_ptr + %dispatch_ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() #0 + %header_ptr = bitcast i8 addrspace(4)* %dispatch_ptr to i32 addrspace(4)* + %value = load volatile i32, i32 addrspace(4)* %header_ptr ret void } @@ -21,9 +21,9 @@ define amdgpu_kernel void @kern_indirect_use_dispatch_ptr(i32) #1 { ; GCN-LABEL: {{^}}use_queue_ptr: ; GCN: s_load_dword s{{[0-9]+}}, s[6:7], 0x0 define void @use_queue_ptr() #1 { - %queue_ptr = call noalias i8 addrspace(2)* @llvm.amdgcn.queue.ptr() #0 - %header_ptr = bitcast i8 addrspace(2)* %queue_ptr to i32 addrspace(2)* - %value = load volatile i32, i32 addrspace(2)* %header_ptr + %queue_ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.queue.ptr() #0 + %header_ptr = bitcast i8 addrspace(4)* %queue_ptr to i32 addrspace(4)* + %value = load volatile i32, i32 addrspace(4)* %header_ptr ret void } @@ -62,9 +62,9 @@ define amdgpu_kernel void @kern_indirect_use_queue_ptr_addrspacecast(i32) #1 { ; GCN-LABEL: {{^}}use_kernarg_segment_ptr: ; GCN: s_load_dword s{{[0-9]+}}, s[6:7], 0x0 define void @use_kernarg_segment_ptr() #1 { - %kernarg_segment_ptr = call noalias i8 addrspace(2)* @llvm.amdgcn.kernarg.segment.ptr() #0 - %header_ptr = bitcast i8 addrspace(2)* %kernarg_segment_ptr to i32 addrspace(2)* - %value = load volatile i32, i32 addrspace(2)* %header_ptr + %kernarg_segment_ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() #0 + %header_ptr = bitcast i8 addrspace(4)* %kernarg_segment_ptr to i32 addrspace(4)* + %value = load volatile i32, i32 addrspace(4)* %header_ptr ret void } @@ -435,17 +435,17 @@ define void @use_every_sgpr_input() #1 { %alloca = alloca i32, align 4, addrspace(5) store volatile i32 0, i32 addrspace(5)* %alloca - %dispatch_ptr = call noalias i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() #0 - %dispatch_ptr.bc = bitcast i8 addrspace(2)* %dispatch_ptr to i32 addrspace(2)* - %val0 = load volatile i32, i32 addrspace(2)* %dispatch_ptr.bc + %dispatch_ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() #0 + %dispatch_ptr.bc = bitcast i8 addrspace(4)* %dispatch_ptr to i32 addrspace(4)* + %val0 = load volatile i32, i32 addrspace(4)* %dispatch_ptr.bc - %queue_ptr = call noalias i8 addrspace(2)* @llvm.amdgcn.queue.ptr() #0 - %queue_ptr.bc = bitcast i8 addrspace(2)* %queue_ptr to i32 addrspace(2)* - %val1 = load volatile i32, i32 addrspace(2)* %queue_ptr.bc + %queue_ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.queue.ptr() #0 + %queue_ptr.bc = bitcast i8 addrspace(4)* %queue_ptr to i32 addrspace(4)* + %val1 = 
load volatile i32, i32 addrspace(4)* %queue_ptr.bc - %kernarg_segment_ptr = call noalias i8 addrspace(2)* @llvm.amdgcn.kernarg.segment.ptr() #0 - %kernarg_segment_ptr.bc = bitcast i8 addrspace(2)* %kernarg_segment_ptr to i32 addrspace(2)* - %val2 = load volatile i32, i32 addrspace(2)* %kernarg_segment_ptr.bc + %kernarg_segment_ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() #0 + %kernarg_segment_ptr.bc = bitcast i8 addrspace(4)* %kernarg_segment_ptr to i32 addrspace(4)* + %val2 = load volatile i32, i32 addrspace(4)* %kernarg_segment_ptr.bc %val3 = call i64 @llvm.amdgcn.dispatch.id() call void asm sideeffect "; use $0", "s"(i64 %val3) @@ -515,17 +515,17 @@ define void @func_use_every_sgpr_input_call_use_workgroup_id_xyz() #1 { %alloca = alloca i32, align 4, addrspace(5) store volatile i32 0, i32 addrspace(5)* %alloca - %dispatch_ptr = call noalias i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() #0 - %dispatch_ptr.bc = bitcast i8 addrspace(2)* %dispatch_ptr to i32 addrspace(2)* - %val0 = load volatile i32, i32 addrspace(2)* %dispatch_ptr.bc + %dispatch_ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() #0 + %dispatch_ptr.bc = bitcast i8 addrspace(4)* %dispatch_ptr to i32 addrspace(4)* + %val0 = load volatile i32, i32 addrspace(4)* %dispatch_ptr.bc - %queue_ptr = call noalias i8 addrspace(2)* @llvm.amdgcn.queue.ptr() #0 - %queue_ptr.bc = bitcast i8 addrspace(2)* %queue_ptr to i32 addrspace(2)* - %val1 = load volatile i32, i32 addrspace(2)* %queue_ptr.bc + %queue_ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.queue.ptr() #0 + %queue_ptr.bc = bitcast i8 addrspace(4)* %queue_ptr to i32 addrspace(4)* + %val1 = load volatile i32, i32 addrspace(4)* %queue_ptr.bc - %kernarg_segment_ptr = call noalias i8 addrspace(2)* @llvm.amdgcn.kernarg.segment.ptr() #0 - %kernarg_segment_ptr.bc = bitcast i8 addrspace(2)* %kernarg_segment_ptr to i32 addrspace(2)* - %val2 = load volatile i32, i32 addrspace(2)* %kernarg_segment_ptr.bc + %kernarg_segment_ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() #0 + %kernarg_segment_ptr.bc = bitcast i8 addrspace(4)* %kernarg_segment_ptr to i32 addrspace(4)* + %val2 = load volatile i32, i32 addrspace(4)* %kernarg_segment_ptr.bc %val3 = call i64 @llvm.amdgcn.dispatch.id() call void asm sideeffect "; use $0", "s"(i64 %val3) @@ -573,17 +573,17 @@ define void @func_use_every_sgpr_input_call_use_workgroup_id_xyz_spill() #1 { store volatile i32 0, i32 addrspace(5)* %alloca - %dispatch_ptr = call noalias i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() #0 - %dispatch_ptr.bc = bitcast i8 addrspace(2)* %dispatch_ptr to i32 addrspace(2)* - %val0 = load volatile i32, i32 addrspace(2)* %dispatch_ptr.bc + %dispatch_ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() #0 + %dispatch_ptr.bc = bitcast i8 addrspace(4)* %dispatch_ptr to i32 addrspace(4)* + %val0 = load volatile i32, i32 addrspace(4)* %dispatch_ptr.bc - %queue_ptr = call noalias i8 addrspace(2)* @llvm.amdgcn.queue.ptr() #0 - %queue_ptr.bc = bitcast i8 addrspace(2)* %queue_ptr to i32 addrspace(2)* - %val1 = load volatile i32, i32 addrspace(2)* %queue_ptr.bc + %queue_ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.queue.ptr() #0 + %queue_ptr.bc = bitcast i8 addrspace(4)* %queue_ptr to i32 addrspace(4)* + %val1 = load volatile i32, i32 addrspace(4)* %queue_ptr.bc - %kernarg_segment_ptr = call noalias i8 addrspace(2)* @llvm.amdgcn.kernarg.segment.ptr() #0 - %kernarg_segment_ptr.bc = bitcast i8 addrspace(2)* %kernarg_segment_ptr to i32 addrspace(2)* - %val2 = load 
volatile i32, i32 addrspace(2)* %kernarg_segment_ptr.bc + %kernarg_segment_ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() #0 + %kernarg_segment_ptr.bc = bitcast i8 addrspace(4)* %kernarg_segment_ptr to i32 addrspace(4)* + %val2 = load volatile i32, i32 addrspace(4)* %kernarg_segment_ptr.bc %val3 = call i64 @llvm.amdgcn.dispatch.id() call void asm sideeffect "; use $0", "s"(i64 %val3) @@ -603,10 +603,10 @@ define void @func_use_every_sgpr_input_call_use_workgroup_id_xyz_spill() #1 { declare i32 @llvm.amdgcn.workgroup.id.x() #0 declare i32 @llvm.amdgcn.workgroup.id.y() #0 declare i32 @llvm.amdgcn.workgroup.id.z() #0 -declare noalias i8 addrspace(2)* @llvm.amdgcn.queue.ptr() #0 -declare noalias i8 addrspace(2)* @llvm.amdgcn.kernarg.segment.ptr() #0 +declare noalias i8 addrspace(4)* @llvm.amdgcn.queue.ptr() #0 +declare noalias i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() #0 declare i64 @llvm.amdgcn.dispatch.id() #0 -declare noalias i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() #0 +declare noalias i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() #0 attributes #0 = { nounwind readnone speculatable } attributes #1 = { nounwind noinline } diff --git a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-flat.ll b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-flat.ll index 714d433c698..f2addc16246 100644 --- a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-flat.ll +++ b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-flat.ll @@ -87,12 +87,12 @@ define amdgpu_kernel void @test_sink_noop_addrspacecast_flat_to_constant_i32(i32 entry: %out.gep = getelementptr i32, i32* %out, i64 999999 %in.gep = getelementptr i32, i32* %in, i64 7 - %cast = addrspacecast i32* %in.gep to i32 addrspace(2)* + %cast = addrspacecast i32* %in.gep to i32 addrspace(4)* %tmp0 = icmp eq i32 %cond, 0 br i1 %tmp0, label %endif, label %if if: - %tmp1 = load i32, i32 addrspace(2)* %cast + %tmp1 = load i32, i32 addrspace(4)* %cast br label %endif endif: diff --git a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes.ll b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes.ll index 2769f5ff658..4419216aeed 100644 --- a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes.ll +++ b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes.ll @@ -268,23 +268,23 @@ done: } ; OPT-LABEL: @test_sink_constant_small_offset_i32 -; OPT-NOT: getelementptr i32, i32 addrspace(2)* +; OPT-NOT: getelementptr i32, i32 addrspace(4)* ; OPT: br i1 ; GCN-LABEL: {{^}}test_sink_constant_small_offset_i32: ; GCN: s_and_saveexec_b64 ; SI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0x7{{$}} ; GCN: s_or_b64 exec, exec -define amdgpu_kernel void @test_sink_constant_small_offset_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) { +define amdgpu_kernel void @test_sink_constant_small_offset_i32(i32 addrspace(1)* %out, i32 addrspace(4)* %in) { entry: %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999 - %in.gep = getelementptr i32, i32 addrspace(2)* %in, i64 7 + %in.gep = getelementptr i32, i32 addrspace(4)* %in, i64 7 %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0 %tmp0 = icmp eq i32 %tid, 0 br i1 %tmp0, label %endif, label %if if: - %tmp1 = load i32, i32 addrspace(2)* %in.gep + %tmp1 = load i32, i32 addrspace(4)* %in.gep br label %endif endif: @@ -297,23 +297,23 @@ done: } ; OPT-LABEL: @test_sink_constant_max_8_bit_offset_i32 -; OPT-NOT: getelementptr i32, i32 addrspace(2)* +; OPT-NOT: getelementptr i32, i32 addrspace(4)* ; OPT: br i1 ; GCN-LABEL: {{^}}test_sink_constant_max_8_bit_offset_i32: ; GCN: s_and_saveexec_b64 ; SI: s_load_dword s{{[0-9]+}}, 
{{s\[[0-9]+:[0-9]+\]}}, 0xff{{$}} ; GCN: s_or_b64 exec, exec -define amdgpu_kernel void @test_sink_constant_max_8_bit_offset_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) { +define amdgpu_kernel void @test_sink_constant_max_8_bit_offset_i32(i32 addrspace(1)* %out, i32 addrspace(4)* %in) { entry: %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999 - %in.gep = getelementptr i32, i32 addrspace(2)* %in, i64 255 + %in.gep = getelementptr i32, i32 addrspace(4)* %in, i64 255 %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0 %tmp0 = icmp eq i32 %tid, 0 br i1 %tmp0, label %endif, label %if if: - %tmp1 = load i32, i32 addrspace(2)* %in.gep + %tmp1 = load i32, i32 addrspace(4)* %in.gep br label %endif endif: @@ -326,9 +326,9 @@ done: } ; OPT-LABEL: @test_sink_constant_max_8_bit_offset_p1_i32 -; OPT-SI: getelementptr i32, i32 addrspace(2)* -; OPT-CI-NOT: getelementptr i32, i32 addrspace(2)* -; OPT-VI-NOT: getelementptr i32, i32 addrspace(2)* +; OPT-SI: getelementptr i32, i32 addrspace(4)* +; OPT-CI-NOT: getelementptr i32, i32 addrspace(4)* +; OPT-VI-NOT: getelementptr i32, i32 addrspace(4)* ; OPT: br i1 ; GCN-LABEL: {{^}}test_sink_constant_max_8_bit_offset_p1_i32: @@ -337,16 +337,16 @@ done: ; SI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, [[OFFSET]]{{$}} ; GCN: s_or_b64 exec, exec -define amdgpu_kernel void @test_sink_constant_max_8_bit_offset_p1_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) { +define amdgpu_kernel void @test_sink_constant_max_8_bit_offset_p1_i32(i32 addrspace(1)* %out, i32 addrspace(4)* %in) { entry: %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999 - %in.gep = getelementptr i32, i32 addrspace(2)* %in, i64 256 + %in.gep = getelementptr i32, i32 addrspace(4)* %in, i64 256 %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0 %tmp0 = icmp eq i32 %tid, 0 br i1 %tmp0, label %endif, label %if if: - %tmp1 = load i32, i32 addrspace(2)* %in.gep + %tmp1 = load i32, i32 addrspace(4)* %in.gep br label %endif endif: @@ -359,8 +359,8 @@ done: } ; OPT-LABEL: @test_sink_constant_max_32_bit_offset_i32 -; OPT-SI: getelementptr i32, i32 addrspace(2)* -; OPT-CI-NOT: getelementptr i32, i32 addrspace(2)* +; OPT-SI: getelementptr i32, i32 addrspace(4)* +; OPT-CI-NOT: getelementptr i32, i32 addrspace(4)* ; OPT: br i1 ; GCN-LABEL: {{^}}test_sink_constant_max_32_bit_offset_i32: @@ -369,16 +369,16 @@ done: ; GCN: s_addc_u32 s{{[0-9]+}}, s{{[0-9]+}}, 3{{$}} ; SI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0x0{{$}} ; GCN: s_or_b64 exec, exec -define amdgpu_kernel void @test_sink_constant_max_32_bit_offset_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) { +define amdgpu_kernel void @test_sink_constant_max_32_bit_offset_i32(i32 addrspace(1)* %out, i32 addrspace(4)* %in) { entry: %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999 - %in.gep = getelementptr i32, i32 addrspace(2)* %in, i64 4294967295 + %in.gep = getelementptr i32, i32 addrspace(4)* %in, i64 4294967295 %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0 %tmp0 = icmp eq i32 %tid, 0 br i1 %tmp0, label %endif, label %if if: - %tmp1 = load i32, i32 addrspace(2)* %in.gep + %tmp1 = load i32, i32 addrspace(4)* %in.gep br label %endif endif: @@ -391,7 +391,7 @@ done: } ; OPT-LABEL: @test_sink_constant_max_32_bit_offset_p1_i32 -; OPT: getelementptr i32, i32 addrspace(2)* +; OPT: getelementptr i32, i32 addrspace(4)* ; OPT: br i1 ; GCN-LABEL: {{^}}test_sink_constant_max_32_bit_offset_p1_i32: @@ -400,16 +400,16 @@ done: ; GCN: s_addc_u32 ; SI: s_load_dword s{{[0-9]+}}, 
{{s\[[0-9]+:[0-9]+\]}}, 0x0{{$}} ; GCN: s_or_b64 exec, exec -define amdgpu_kernel void @test_sink_constant_max_32_bit_offset_p1_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) { +define amdgpu_kernel void @test_sink_constant_max_32_bit_offset_p1_i32(i32 addrspace(1)* %out, i32 addrspace(4)* %in) { entry: %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999 - %in.gep = getelementptr i32, i32 addrspace(2)* %in, i64 17179869181 + %in.gep = getelementptr i32, i32 addrspace(4)* %in, i64 17179869181 %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0 %tmp0 = icmp eq i32 %tid, 0 br i1 %tmp0, label %endif, label %if if: - %tmp1 = load i32, i32 addrspace(2)* %in.gep + %tmp1 = load i32, i32 addrspace(4)* %in.gep br label %endif endif: @@ -430,16 +430,16 @@ done: ; VI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0xffffc{{$}} ; GCN: s_or_b64 exec, exec -define amdgpu_kernel void @test_sink_constant_max_20_bit_byte_offset_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) { +define amdgpu_kernel void @test_sink_constant_max_20_bit_byte_offset_i32(i32 addrspace(1)* %out, i32 addrspace(4)* %in) { entry: %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999 - %in.gep = getelementptr i32, i32 addrspace(2)* %in, i64 262143 + %in.gep = getelementptr i32, i32 addrspace(4)* %in, i64 262143 %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0 %tmp0 = icmp eq i32 %tid, 0 br i1 %tmp0, label %endif, label %if if: - %tmp1 = load i32, i32 addrspace(2)* %in.gep + %tmp1 = load i32, i32 addrspace(4)* %in.gep br label %endif endif: @@ -452,9 +452,9 @@ done: } ; OPT-LABEL: @test_sink_constant_max_20_bit_byte_offset_p1_i32 -; OPT-SI: getelementptr i32, i32 addrspace(2)* -; OPT-CI-NOT: getelementptr i32, i32 addrspace(2)* -; OPT-VI: getelementptr i32, i32 addrspace(2)* +; OPT-SI: getelementptr i32, i32 addrspace(4)* +; OPT-CI-NOT: getelementptr i32, i32 addrspace(4)* +; OPT-VI: getelementptr i32, i32 addrspace(4)* ; OPT: br i1 ; GCN-LABEL: {{^}}test_sink_constant_max_20_bit_byte_offset_p1_i32: @@ -468,16 +468,16 @@ done: ; VI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, [[OFFSET]]{{$}} ; GCN: s_or_b64 exec, exec -define amdgpu_kernel void @test_sink_constant_max_20_bit_byte_offset_p1_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) { +define amdgpu_kernel void @test_sink_constant_max_20_bit_byte_offset_p1_i32(i32 addrspace(1)* %out, i32 addrspace(4)* %in) { entry: %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999 - %in.gep = getelementptr i32, i32 addrspace(2)* %in, i64 262144 + %in.gep = getelementptr i32, i32 addrspace(4)* %in, i64 262144 %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0 %tmp0 = icmp eq i32 %tid, 0 br i1 %tmp0, label %endif, label %if if: - %tmp1 = load i32, i32 addrspace(2)* %in.gep + %tmp1 = load i32, i32 addrspace(4)* %in.gep br label %endif endif: @@ -524,17 +524,17 @@ bb34: ; OPT: br i1 %tmp0, ; OPT: if: ; OPT: getelementptr i8, {{.*}} 4095 -define amdgpu_kernel void @test_sink_constant_small_max_mubuf_offset_load_i32_align_1(i32 addrspace(1)* %out, i8 addrspace(2)* %in) { +define amdgpu_kernel void @test_sink_constant_small_max_mubuf_offset_load_i32_align_1(i32 addrspace(1)* %out, i8 addrspace(4)* %in) { entry: %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 1024 - %in.gep = getelementptr i8, i8 addrspace(2)* %in, i64 4095 + %in.gep = getelementptr i8, i8 addrspace(4)* %in, i64 4095 %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0 %tmp0 = icmp eq i32 %tid, 0 br i1 %tmp0, label %endif, label %if if: - %bitcast = 
bitcast i8 addrspace(2)* %in.gep to i32 addrspace(2)* - %tmp1 = load i32, i32 addrspace(2)* %bitcast, align 1 + %bitcast = bitcast i8 addrspace(4)* %in.gep to i32 addrspace(4)* + %tmp1 = load i32, i32 addrspace(4)* %bitcast, align 1 br label %endif endif: diff --git a/llvm/test/CodeGen/AMDGPU/early-if-convert-cost.ll b/llvm/test/CodeGen/AMDGPU/early-if-convert-cost.ll index 74404989f8c..626a6e2c5b8 100644 --- a/llvm/test/CodeGen/AMDGPU/early-if-convert-cost.ll +++ b/llvm/test/CodeGen/AMDGPU/early-if-convert-cost.ll @@ -32,9 +32,9 @@ endif: ; GCN: v_add_f64 ; GCN: v_cndmask_b32_e32 ; GCN: v_cndmask_b32_e32 -define amdgpu_kernel void @test_vccnz_sgpr_ifcvt_triangle64(double addrspace(1)* %out, double addrspace(2)* %in) #0 { +define amdgpu_kernel void @test_vccnz_sgpr_ifcvt_triangle64(double addrspace(1)* %out, double addrspace(4)* %in) #0 { entry: - %v = load double, double addrspace(2)* %in + %v = load double, double addrspace(4)* %in %cc = fcmp oeq double %v, 1.000000e+00 br i1 %cc, label %if, label %endif diff --git a/llvm/test/CodeGen/AMDGPU/early-if-convert.ll b/llvm/test/CodeGen/AMDGPU/early-if-convert.ll index d129ca5c140..6061f53e959 100644 --- a/llvm/test/CodeGen/AMDGPU/early-if-convert.ll +++ b/llvm/test/CodeGen/AMDGPU/early-if-convert.ll @@ -187,9 +187,9 @@ endif: ; GCN: [[ENDIF]]: ; GCN: buffer_store_dword -define amdgpu_kernel void @test_vccnz_sgpr_ifcvt_triangle(i32 addrspace(1)* %out, i32 addrspace(2)* %in, float %cnd) #0 { +define amdgpu_kernel void @test_vccnz_sgpr_ifcvt_triangle(i32 addrspace(1)* %out, i32 addrspace(4)* %in, float %cnd) #0 { entry: - %v = load i32, i32 addrspace(2)* %in + %v = load i32, i32 addrspace(4)* %in %cc = fcmp oeq float %cnd, 1.000000e+00 br i1 %cc, label %if, label %endif @@ -206,9 +206,9 @@ endif: ; GCN-LABEL: {{^}}test_vccnz_ifcvt_triangle_constant_load: ; GCN: v_cndmask_b32 -define amdgpu_kernel void @test_vccnz_ifcvt_triangle_constant_load(float addrspace(1)* %out, float addrspace(2)* %in) #0 { +define amdgpu_kernel void @test_vccnz_ifcvt_triangle_constant_load(float addrspace(1)* %out, float addrspace(4)* %in) #0 { entry: - %v = load float, float addrspace(2)* %in + %v = load float, float addrspace(4)* %in %cc = fcmp oeq float %v, 1.000000e+00 br i1 %cc, label %if, label %endif @@ -248,9 +248,9 @@ endif: ; GCN: s_add_i32 [[ADD:s[0-9]+]], [[VAL]], [[VAL]] ; GCN: s_cmp_lg_u32 s{{[0-9]+}}, 1 ; GCN-NEXT: s_cselect_b32 [[SELECT:s[0-9]+]], [[ADD]], [[VAL]] -define amdgpu_kernel void @test_scc1_sgpr_ifcvt_triangle(i32 addrspace(2)* %in, i32 %cond) #0 { +define amdgpu_kernel void @test_scc1_sgpr_ifcvt_triangle(i32 addrspace(4)* %in, i32 %cond) #0 { entry: - %v = load i32, i32 addrspace(2)* %in + %v = load i32, i32 addrspace(4)* %in %cc = icmp eq i32 %cond, 1 br i1 %cc, label %if, label %endif @@ -295,9 +295,9 @@ endif: ; GCN: s_addc_u32 ; GCN: s_cmp_lg_u32 s{{[0-9]+}}, 1 ; GCN-NEXT: s_cselect_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} -define amdgpu_kernel void @test_scc1_sgpr_ifcvt_triangle64(i64 addrspace(2)* %in, i32 %cond) #0 { +define amdgpu_kernel void @test_scc1_sgpr_ifcvt_triangle64(i64 addrspace(4)* %in, i32 %cond) #0 { entry: - %v = load i64, i64 addrspace(2)* %in + %v = load i64, i64 addrspace(4)* %in %cc = icmp eq i32 %cond, 1 br i1 %cc, label %if, label %endif @@ -320,9 +320,9 @@ endif: ; GCN: s_cmp_lg_u32 s{{[0-9]+}}, 1 ; GCN-NEXT: s_cselect_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} ; GCN-NEXT: s_cselect_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 
s{{\[[0-9]+:[0-9]+\]}} -define amdgpu_kernel void @test_scc1_sgpr_ifcvt_triangle96(<3 x i32> addrspace(2)* %in, i32 %cond) #0 { +define amdgpu_kernel void @test_scc1_sgpr_ifcvt_triangle96(<3 x i32> addrspace(4)* %in, i32 %cond) #0 { entry: - %v = load <3 x i32>, <3 x i32> addrspace(2)* %in + %v = load <3 x i32>, <3 x i32> addrspace(4)* %in %cc = icmp eq i32 %cond, 1 br i1 %cc, label %if, label %endif @@ -345,9 +345,9 @@ endif: ; GCN: s_cmp_lg_u32 s{{[0-9]+}}, 1 ; GCN-NEXT: s_cselect_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} ; GCN-NEXT: s_cselect_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} -define amdgpu_kernel void @test_scc1_sgpr_ifcvt_triangle128(<4 x i32> addrspace(2)* %in, i32 %cond) #0 { +define amdgpu_kernel void @test_scc1_sgpr_ifcvt_triangle128(<4 x i32> addrspace(4)* %in, i32 %cond) #0 { entry: - %v = load <4 x i32>, <4 x i32> addrspace(2)* %in + %v = load <4 x i32>, <4 x i32> addrspace(4)* %in %cc = icmp eq i32 %cond, 1 br i1 %cc, label %if, label %endif diff --git a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll index 1e19ddf2b0a..e1fe6b22d6d 100644 --- a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll +++ b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll @@ -8,8 +8,8 @@ ; GCN-DAG: v_mov_b32_e32 [[VELT1:v[0-9]+]], [[ELT1]] ; GCN-DAG: buffer_store_short [[VELT0]] ; GCN-DAG: buffer_store_short [[VELT1]] -define amdgpu_kernel void @extract_vector_elt_v2f16(half addrspace(1)* %out, <2 x half> addrspace(2)* %vec.ptr) #0 { - %vec = load <2 x half>, <2 x half> addrspace(2)* %vec.ptr +define amdgpu_kernel void @extract_vector_elt_v2f16(half addrspace(1)* %out, <2 x half> addrspace(4)* %vec.ptr) #0 { + %vec = load <2 x half>, <2 x half> addrspace(4)* %vec.ptr %p0 = extractelement <2 x half> %vec, i32 0 %p1 = extractelement <2 x half> %vec, i32 1 %out1 = getelementptr half, half addrspace(1)* %out, i32 10 @@ -26,8 +26,8 @@ define amdgpu_kernel void @extract_vector_elt_v2f16(half addrspace(1)* %out, <2 ; GCN: v_mov_b32_e32 [[VELT1:v[0-9]+]], [[ELT1]] ; GCN: buffer_store_short [[VELT1]] ; GCN: ScratchSize: 0 -define amdgpu_kernel void @extract_vector_elt_v2f16_dynamic_sgpr(half addrspace(1)* %out, <2 x half> addrspace(2)* %vec.ptr, i32 %idx) #0 { - %vec = load <2 x half>, <2 x half> addrspace(2)* %vec.ptr +define amdgpu_kernel void @extract_vector_elt_v2f16_dynamic_sgpr(half addrspace(1)* %out, <2 x half> addrspace(4)* %vec.ptr, i32 %idx) #0 { + %vec = load <2 x half>, <2 x half> addrspace(4)* %vec.ptr %elt = extractelement <2 x half> %vec, i32 %idx store half %elt, half addrspace(1)* %out, align 2 ret void @@ -45,12 +45,12 @@ define amdgpu_kernel void @extract_vector_elt_v2f16_dynamic_sgpr(half addrspace( ; SI: buffer_store_short [[ELT]] ; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[ELT]] ; GCN: ScratchSize: 0{{$}} -define amdgpu_kernel void @extract_vector_elt_v2f16_dynamic_vgpr(half addrspace(1)* %out, <2 x half> addrspace(2)* %vec.ptr, i32 addrspace(1)* %idx.ptr) #0 { +define amdgpu_kernel void @extract_vector_elt_v2f16_dynamic_vgpr(half addrspace(1)* %out, <2 x half> addrspace(4)* %vec.ptr, i32 addrspace(1)* %idx.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %gep = getelementptr inbounds i32, i32 addrspace(1)* %idx.ptr, i64 %tid.ext %out.gep = getelementptr inbounds half, half addrspace(1)* %out, i64 %tid.ext - %vec = load <2 x half>, <2 x half> addrspace(2)* %vec.ptr + %vec = load <2 x half>, <2 x half> addrspace(4)* 
%vec.ptr %idx = load i32, i32 addrspace(1)* %gep %elt = extractelement <2 x half> %vec, i32 %idx store half %elt, half addrspace(1)* %out.gep, align 2 diff --git a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i16.ll b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i16.ll index 06a0e2c7b65..39fc014859a 100644 --- a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i16.ll +++ b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i16.ll @@ -9,8 +9,8 @@ ; GCN-DAG: v_mov_b32_e32 [[VELT1:v[0-9]+]], [[ELT1]] ; GCN-DAG: buffer_store_short [[VELT0]] ; GCN-DAG: buffer_store_short [[VELT1]] -define amdgpu_kernel void @extract_vector_elt_v2i16(i16 addrspace(1)* %out, <2 x i16> addrspace(2)* %vec.ptr) #0 { - %vec = load <2 x i16>, <2 x i16> addrspace(2)* %vec.ptr +define amdgpu_kernel void @extract_vector_elt_v2i16(i16 addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr) #0 { + %vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr %p0 = extractelement <2 x i16> %vec, i32 0 %p1 = extractelement <2 x i16> %vec, i32 1 %out1 = getelementptr i16, i16 addrspace(1)* %out, i32 10 @@ -27,8 +27,8 @@ define amdgpu_kernel void @extract_vector_elt_v2i16(i16 addrspace(1)* %out, <2 x ; GCN: v_mov_b32_e32 [[VELT1:v[0-9]+]], [[ELT1]] ; GCN: buffer_store_short [[VELT1]] ; GCN: ScratchSize: 0 -define amdgpu_kernel void @extract_vector_elt_v2i16_dynamic_sgpr(i16 addrspace(1)* %out, <2 x i16> addrspace(2)* %vec.ptr, i32 %idx) #0 { - %vec = load <2 x i16>, <2 x i16> addrspace(2)* %vec.ptr +define amdgpu_kernel void @extract_vector_elt_v2i16_dynamic_sgpr(i16 addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr, i32 %idx) #0 { + %vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr %elt = extractelement <2 x i16> %vec, i32 %idx store i16 %elt, i16 addrspace(1)* %out, align 2 ret void @@ -45,13 +45,13 @@ define amdgpu_kernel void @extract_vector_elt_v2i16_dynamic_sgpr(i16 addrspace(1 ; SI: buffer_store_short [[ELT]] ; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[ELT]] ; GCN: ScratchSize: 0{{$}} -define amdgpu_kernel void @extract_vector_elt_v2i16_dynamic_vgpr(i16 addrspace(1)* %out, <2 x i16> addrspace(2)* %vec.ptr, i32 addrspace(1)* %idx.ptr) #0 { +define amdgpu_kernel void @extract_vector_elt_v2i16_dynamic_vgpr(i16 addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr, i32 addrspace(1)* %idx.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %gep = getelementptr inbounds i32, i32 addrspace(1)* %idx.ptr, i64 %tid.ext %out.gep = getelementptr inbounds i16, i16 addrspace(1)* %out, i64 %tid.ext %idx = load volatile i32, i32 addrspace(1)* %gep - %vec = load <2 x i16>, <2 x i16> addrspace(2)* %vec.ptr + %vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr %elt = extractelement <2 x i16> %vec, i32 %idx store i16 %elt, i16 addrspace(1)* %out.gep, align 2 ret void diff --git a/llvm/test/CodeGen/AMDGPU/fence-barrier.ll b/llvm/test/CodeGen/AMDGPU/fence-barrier.ll index 8c6c17e5e57..f10e4381a3f 100644 --- a/llvm/test/CodeGen/AMDGPU/fence-barrier.ll +++ b/llvm/test/CodeGen/AMDGPU/fence-barrier.ll @@ -1,8 +1,8 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa-amdgiz -mcpu=gfx803 -enable-si-insert-waitcnts=1 -verify-machineinstrs < %s | FileCheck --check-prefix=GCN %s ; RUN: llvm-as -data-layout=A5 < %s | llc -mtriple=amdgcn-amd-amdhsa-amdgiz -mcpu=gfx803 -enable-si-insert-waitcnts=1 -verify-machineinstrs | FileCheck --check-prefix=GCN %s -declare i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() -declare i8 addrspace(2)* @llvm.amdgcn.implicitarg.ptr() +declare i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() +declare i8 
addrspace(4)* @llvm.amdgcn.implicitarg.ptr() declare i32 @llvm.amdgcn.workitem.id.x() declare i32 @llvm.amdgcn.workgroup.id.x() declare void @llvm.amdgcn.s.barrier() @@ -34,19 +34,19 @@ define amdgpu_kernel void @test_local(i32 addrspace(1)*) { fence syncscope("workgroup") acquire %8 = load i32, i32 addrspace(3)* getelementptr inbounds ([1 x i32], [1 x i32] addrspace(3)* @test_local.temp, i64 0, i64 0), align 4 %9 = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(5)* %2, align 4 - %10 = call i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() + %10 = call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() %11 = call i32 @llvm.amdgcn.workitem.id.x() %12 = call i32 @llvm.amdgcn.workgroup.id.x() - %13 = getelementptr inbounds i8, i8 addrspace(2)* %10, i64 4 - %14 = bitcast i8 addrspace(2)* %13 to i16 addrspace(2)* - %15 = load i16, i16 addrspace(2)* %14, align 4 + %13 = getelementptr inbounds i8, i8 addrspace(4)* %10, i64 4 + %14 = bitcast i8 addrspace(4)* %13 to i16 addrspace(4)* + %15 = load i16, i16 addrspace(4)* %14, align 4 %16 = zext i16 %15 to i32 %17 = mul i32 %12, %16 %18 = add i32 %17, %11 - %19 = call i8 addrspace(2)* @llvm.amdgcn.implicitarg.ptr() + %19 = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr() %20 = zext i32 %18 to i64 - %21 = bitcast i8 addrspace(2)* %19 to i64 addrspace(2)* - %22 = load i64, i64 addrspace(2)* %21, align 8 + %21 = bitcast i8 addrspace(4)* %19 to i64 addrspace(4)* + %22 = load i64, i64 addrspace(4)* %21, align 8 %23 = add i64 %22, %20 %24 = getelementptr inbounds i32, i32 addrspace(1)* %9, i64 %23 store i32 %8, i32 addrspace(1)* %24, align 4 @@ -68,56 +68,56 @@ define amdgpu_kernel void @test_global(i32 addrspace(1)*) { ; <label>:4: ; preds = %58, %1 %5 = load i32, i32 addrspace(5)* %3, align 4 %6 = sext i32 %5 to i64 - %7 = call i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() + %7 = call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() %8 = call i32 @llvm.amdgcn.workitem.id.x() %9 = call i32 @llvm.amdgcn.workgroup.id.x() - %10 = getelementptr inbounds i8, i8 addrspace(2)* %7, i64 4 - %11 = bitcast i8 addrspace(2)* %10 to i16 addrspace(2)* - %12 = load i16, i16 addrspace(2)* %11, align 4 + %10 = getelementptr inbounds i8, i8 addrspace(4)* %7, i64 4 + %11 = bitcast i8 addrspace(4)* %10 to i16 addrspace(4)* + %12 = load i16, i16 addrspace(4)* %11, align 4 %13 = zext i16 %12 to i32 %14 = mul i32 %9, %13 %15 = add i32 %14, %8 - %16 = call i8 addrspace(2)* @llvm.amdgcn.implicitarg.ptr() + %16 = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr() %17 = zext i32 %15 to i64 - %18 = bitcast i8 addrspace(2)* %16 to i64 addrspace(2)* - %19 = load i64, i64 addrspace(2)* %18, align 8 + %18 = bitcast i8 addrspace(4)* %16 to i64 addrspace(4)* + %19 = load i64, i64 addrspace(4)* %18, align 8 %20 = add i64 %19, %17 %21 = icmp ult i64 %6, %20 br i1 %21, label %22, label %61 ; <label>:22: ; preds = %4 - %23 = call i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() + %23 = call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() %24 = call i32 @llvm.amdgcn.workitem.id.x() %25 = call i32 @llvm.amdgcn.workgroup.id.x() - %26 = getelementptr inbounds i8, i8 addrspace(2)* %23, i64 4 - %27 = bitcast i8 addrspace(2)* %26 to i16 addrspace(2)* - %28 = load i16, i16 addrspace(2)* %27, align 4 + %26 = getelementptr inbounds i8, i8 addrspace(4)* %23, i64 4 + %27 = bitcast i8 addrspace(4)* %26 to i16 addrspace(4)* + %28 = load i16, i16 addrspace(4)* %27, align 4 %29 = zext i16 %28 to i32 %30 = mul i32 %25, %29 %31 = add i32 %30, %24 - %32 = call i8 addrspace(2)* @llvm.amdgcn.implicitarg.ptr() + %32 = call i8 
addrspace(4)* @llvm.amdgcn.implicitarg.ptr() %33 = zext i32 %31 to i64 - %34 = bitcast i8 addrspace(2)* %32 to i64 addrspace(2)* - %35 = load i64, i64 addrspace(2)* %34, align 8 + %34 = bitcast i8 addrspace(4)* %32 to i64 addrspace(4)* + %35 = load i64, i64 addrspace(4)* %34, align 8 %36 = add i64 %35, %33 %37 = add i64 %36, 2184 %38 = trunc i64 %37 to i32 %39 = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(5)* %2, align 4 %40 = load i32, i32 addrspace(5)* %3, align 4 %41 = sext i32 %40 to i64 - %42 = call i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() + %42 = call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() %43 = call i32 @llvm.amdgcn.workitem.id.x() %44 = call i32 @llvm.amdgcn.workgroup.id.x() - %45 = getelementptr inbounds i8, i8 addrspace(2)* %42, i64 4 - %46 = bitcast i8 addrspace(2)* %45 to i16 addrspace(2)* - %47 = load i16, i16 addrspace(2)* %46, align 4 + %45 = getelementptr inbounds i8, i8 addrspace(4)* %42, i64 4 + %46 = bitcast i8 addrspace(4)* %45 to i16 addrspace(4)* + %47 = load i16, i16 addrspace(4)* %46, align 4 %48 = zext i16 %47 to i32 %49 = mul i32 %44, %48 %50 = add i32 %49, %43 - %51 = call i8 addrspace(2)* @llvm.amdgcn.implicitarg.ptr() + %51 = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr() %52 = zext i32 %50 to i64 - %53 = bitcast i8 addrspace(2)* %51 to i64 addrspace(2)* - %54 = load i64, i64 addrspace(2)* %53, align 8 + %53 = bitcast i8 addrspace(4)* %51 to i64 addrspace(4)* + %54 = load i64, i64 addrspace(4)* %53, align 8 %55 = add i64 %54, %52 %56 = add i64 %41, %55 %57 = getelementptr inbounds i32, i32 addrspace(1)* %39, i64 %56 @@ -147,19 +147,19 @@ define amdgpu_kernel void @test_global_local(i32 addrspace(1)*) { %2 = alloca i32 addrspace(1)*, align 4, addrspace(5) store i32 addrspace(1)* %0, i32 addrspace(1)* addrspace(5)* %2, align 4 %3 = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(5)* %2, align 4 - %4 = call i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() + %4 = call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() %5 = call i32 @llvm.amdgcn.workitem.id.x() %6 = call i32 @llvm.amdgcn.workgroup.id.x() - %7 = getelementptr inbounds i8, i8 addrspace(2)* %4, i64 4 - %8 = bitcast i8 addrspace(2)* %7 to i16 addrspace(2)* - %9 = load i16, i16 addrspace(2)* %8, align 4 + %7 = getelementptr inbounds i8, i8 addrspace(4)* %4, i64 4 + %8 = bitcast i8 addrspace(4)* %7 to i16 addrspace(4)* + %9 = load i16, i16 addrspace(4)* %8, align 4 %10 = zext i16 %9 to i32 %11 = mul i32 %6, %10 %12 = add i32 %11, %5 - %13 = call i8 addrspace(2)* @llvm.amdgcn.implicitarg.ptr() + %13 = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr() %14 = zext i32 %12 to i64 - %15 = bitcast i8 addrspace(2)* %13 to i64 addrspace(2)* - %16 = load i64, i64 addrspace(2)* %15, align 8 + %15 = bitcast i8 addrspace(4)* %13 to i64 addrspace(4)* + %16 = load i64, i64 addrspace(4)* %15, align 8 %17 = add i64 %16, %14 %18 = getelementptr inbounds i32, i32 addrspace(1)* %3, i64 %17 store i32 1, i32 addrspace(1)* %18, align 4 @@ -178,19 +178,19 @@ define amdgpu_kernel void @test_global_local(i32 addrspace(1)*) { fence syncscope("workgroup") acquire %24 = load i32, i32 addrspace(3)* getelementptr inbounds ([1 x i32], [1 x i32] addrspace(3)* @test_global_local.temp, i64 0, i64 0), align 4 %25 = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(5)* %2, align 4 - %26 = call i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() + %26 = call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() %27 = call i32 @llvm.amdgcn.workitem.id.x() %28 = call i32 @llvm.amdgcn.workgroup.id.x() - %29 = getelementptr inbounds i8, i8 
addrspace(2)* %26, i64 4 - %30 = bitcast i8 addrspace(2)* %29 to i16 addrspace(2)* - %31 = load i16, i16 addrspace(2)* %30, align 4 + %29 = getelementptr inbounds i8, i8 addrspace(4)* %26, i64 4 + %30 = bitcast i8 addrspace(4)* %29 to i16 addrspace(4)* + %31 = load i16, i16 addrspace(4)* %30, align 4 %32 = zext i16 %31 to i32 %33 = mul i32 %28, %32 %34 = add i32 %33, %27 - %35 = call i8 addrspace(2)* @llvm.amdgcn.implicitarg.ptr() + %35 = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr() %36 = zext i32 %34 to i64 - %37 = bitcast i8 addrspace(2)* %35 to i64 addrspace(2)* - %38 = load i64, i64 addrspace(2)* %37, align 8 + %37 = bitcast i8 addrspace(4)* %35 to i64 addrspace(4)* + %38 = load i64, i64 addrspace(4)* %37, align 8 %39 = add i64 %38, %36 %40 = getelementptr inbounds i32, i32 addrspace(1)* %25, i64 %39 store i32 %24, i32 addrspace(1)* %40, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/function-returns.ll b/llvm/test/CodeGen/AMDGPU/function-returns.ll index 4a24f5e285b..758def5b044 100644 --- a/llvm/test/CodeGen/AMDGPU/function-returns.ll +++ b/llvm/test/CodeGen/AMDGPU/function-returns.ll @@ -164,7 +164,7 @@ define <5 x i32> @v5i32_func_void() #0 { ; GCN: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 define <8 x i32> @v8i32_func_void() #0 { - %ptr = load volatile <8 x i32> addrspace(1)*, <8 x i32> addrspace(1)* addrspace(2)* undef + %ptr = load volatile <8 x i32> addrspace(1)*, <8 x i32> addrspace(1)* addrspace(4)* undef %val = load <8 x i32>, <8 x i32> addrspace(1)* %ptr ret <8 x i32> %val } @@ -177,7 +177,7 @@ define <8 x i32> @v8i32_func_void() #0 { ; GCN: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 define <16 x i32> @v16i32_func_void() #0 { - %ptr = load volatile <16 x i32> addrspace(1)*, <16 x i32> addrspace(1)* addrspace(2)* undef + %ptr = load volatile <16 x i32> addrspace(1)*, <16 x i32> addrspace(1)* addrspace(4)* undef %val = load <16 x i32>, <16 x i32> addrspace(1)* %ptr ret <16 x i32> %val } @@ -194,7 +194,7 @@ define <16 x i32> @v16i32_func_void() #0 { ; GCN: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 define <32 x i32> @v32i32_func_void() #0 { - %ptr = load volatile <32 x i32> addrspace(1)*, <32 x i32> addrspace(1)* addrspace(2)* undef + %ptr = load volatile <32 x i32> addrspace(1)*, <32 x i32> addrspace(1)* addrspace(4)* undef %val = load <32 x i32>, <32 x i32> addrspace(1)* %ptr ret <32 x i32> %val } @@ -214,7 +214,7 @@ define <2 x i64> @v2i64_func_void() #0 { ; GCN: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 define <3 x i64> @v3i64_func_void() #0 { - %ptr = load volatile <3 x i64> addrspace(1)*, <3 x i64> addrspace(1)* addrspace(2)* undef + %ptr = load volatile <3 x i64> addrspace(1)*, <3 x i64> addrspace(1)* addrspace(4)* undef %val = load <3 x i64>, <3 x i64> addrspace(1)* %ptr ret <3 x i64> %val } @@ -225,7 +225,7 @@ define <3 x i64> @v3i64_func_void() #0 { ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 define <4 x i64> @v4i64_func_void() #0 { - %ptr = load volatile <4 x i64> addrspace(1)*, <4 x i64> addrspace(1)* addrspace(2)* undef + %ptr = load volatile <4 x i64> addrspace(1)*, <4 x i64> addrspace(1)* addrspace(4)* undef %val = load <4 x i64>, <4 x i64> addrspace(1)* %ptr ret <4 x i64> %val } @@ -237,7 +237,7 @@ define <4 x i64> @v4i64_func_void() #0 { ; GCN: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 define <5 x i64> @v5i64_func_void() #0 { - %ptr = load volatile <5 x i64> addrspace(1)*, <5 x i64> addrspace(1)* addrspace(2)* undef + %ptr = load volatile <5 x i64> addrspace(1)*, <5 x i64> addrspace(1)* addrspace(4)* undef %val = load <5 x i64>, <5 x i64> 
addrspace(1)* %ptr ret <5 x i64> %val } @@ -250,7 +250,7 @@ define <5 x i64> @v5i64_func_void() #0 { ; GCN: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 define <8 x i64> @v8i64_func_void() #0 { - %ptr = load volatile <8 x i64> addrspace(1)*, <8 x i64> addrspace(1)* addrspace(2)* undef + %ptr = load volatile <8 x i64> addrspace(1)*, <8 x i64> addrspace(1)* addrspace(4)* undef %val = load <8 x i64>, <8 x i64> addrspace(1)* %ptr ret <8 x i64> %val } @@ -267,7 +267,7 @@ define <8 x i64> @v8i64_func_void() #0 { ; GCN: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 define <16 x i64> @v16i64_func_void() #0 { - %ptr = load volatile <16 x i64> addrspace(1)*, <16 x i64> addrspace(1)* addrspace(2)* undef + %ptr = load volatile <16 x i64> addrspace(1)*, <16 x i64> addrspace(1)* addrspace(4)* undef %val = load <16 x i64>, <16 x i64> addrspace(1)* %ptr ret <16 x i64> %val } @@ -309,7 +309,7 @@ define <4 x i16> @v4i16_func_void() #0 { ; GFX9: v_lshrrev_b32_e32 v1, 16, v0 ; GCN: s_setpc_b64 define <5 x i16> @v5i16_func_void() #0 { - %ptr = load volatile <5 x i16> addrspace(1)*, <5 x i16> addrspace(1)* addrspace(2)* undef + %ptr = load volatile <5 x i16> addrspace(1)*, <5 x i16> addrspace(1)* addrspace(4)* undef %val = load <5 x i16>, <5 x i16> addrspace(1)* %ptr ret <5 x i16> %val } @@ -319,7 +319,7 @@ define <5 x i16> @v5i16_func_void() #0 { ; GFX9: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 define <8 x i16> @v8i16_func_void() #0 { - %ptr = load volatile <8 x i16> addrspace(1)*, <8 x i16> addrspace(1)* addrspace(2)* undef + %ptr = load volatile <8 x i16> addrspace(1)*, <8 x i16> addrspace(1)* addrspace(4)* undef %val = load <8 x i16>, <8 x i16> addrspace(1)* %ptr ret <8 x i16> %val } @@ -330,7 +330,7 @@ define <8 x i16> @v8i16_func_void() #0 { ; GFX9: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 define <16 x i16> @v16i16_func_void() #0 { - %ptr = load volatile <16 x i16> addrspace(1)*, <16 x i16> addrspace(1)* addrspace(2)* undef + %ptr = load volatile <16 x i16> addrspace(1)*, <16 x i16> addrspace(1)* addrspace(4)* undef %val = load <16 x i16>, <16 x i16> addrspace(1)* %ptr ret <16 x i16> %val } @@ -342,7 +342,7 @@ define <16 x i16> @v16i16_func_void() #0 { ; GCN-DAG: v14 ; GCN-DAG: v15 define <16 x i8> @v16i8_func_void() #0 { - %ptr = load volatile <16 x i8> addrspace(1)*, <16 x i8> addrspace(1)* addrspace(2)* undef + %ptr = load volatile <16 x i8> addrspace(1)*, <16 x i8> addrspace(1)* addrspace(4)* undef %val = load <16 x i8>, <16 x i8> addrspace(1)* %ptr ret <16 x i8> %val } @@ -356,7 +356,7 @@ define <16 x i8> @v16i8_func_void() #0 { ; GFX89-DAG: v_lshrrev_b16_e32 v1, 8, v0 ; GCN: s_setpc_b64 define <4 x i8> @v4i8_func_void() #0 { - %ptr = load volatile <4 x i8> addrspace(1)*, <4 x i8> addrspace(1)* addrspace(2)* undef + %ptr = load volatile <4 x i8> addrspace(1)*, <4 x i8> addrspace(1)* addrspace(4)* undef %val = load <4 x i8>, <4 x i8> addrspace(1)* %ptr ret <4 x i8> %val } @@ -427,7 +427,7 @@ define void @void_func_sret_struct_i8_i32({ i8, i32 } addrspace(5)* sret %arg0) ; GFX9: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 define <33 x i32> @v33i32_func_void() #0 { - %ptr = load volatile <33 x i32> addrspace(1)*, <33 x i32> addrspace(1)* addrspace(2)* undef + %ptr = load volatile <33 x i32> addrspace(1)*, <33 x i32> addrspace(1)* addrspace(4)* undef %val = load <33 x i32>, <33 x i32> addrspace(1)* %ptr ret <33 x i32> %val } @@ -469,7 +469,7 @@ define <33 x i32> @v33i32_func_void() #0 { ; GFX9: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 define { <32 x i32>, i32 } @struct_v32i32_i32_func_void() #0 { - 
%ptr = load volatile { <32 x i32>, i32 } addrspace(1)*, { <32 x i32>, i32 } addrspace(1)* addrspace(2)* undef + %ptr = load volatile { <32 x i32>, i32 } addrspace(1)*, { <32 x i32>, i32 } addrspace(1)* addrspace(4)* undef %val = load { <32 x i32>, i32 }, { <32 x i32>, i32 } addrspace(1)* %ptr ret { <32 x i32>, i32 }%val } @@ -511,7 +511,7 @@ define { <32 x i32>, i32 } @struct_v32i32_i32_func_void() #0 { ; GFX9: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 define { i32, <32 x i32> } @struct_i32_v32i32_func_void() #0 { - %ptr = load volatile { i32, <32 x i32> } addrspace(1)*, { i32, <32 x i32> } addrspace(1)* addrspace(2)* undef + %ptr = load volatile { i32, <32 x i32> } addrspace(1)*, { i32, <32 x i32> } addrspace(1)* addrspace(4)* undef %val = load { i32, <32 x i32> }, { i32, <32 x i32> } addrspace(1)* %ptr ret { i32, <32 x i32> }%val } diff --git a/llvm/test/CodeGen/AMDGPU/global-constant.ll b/llvm/test/CodeGen/AMDGPU/global-constant.ll index 1898c8fb63e..3ebc8e1e7df 100644 --- a/llvm/test/CodeGen/AMDGPU/global-constant.ll +++ b/llvm/test/CodeGen/AMDGPU/global-constant.ll @@ -1,9 +1,9 @@ ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=NOHSA %s ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=HSA %s -@private1 = private unnamed_addr addrspace(2) constant [4 x float] [float 0.0, float 1.0, float 2.0, float 3.0] -@private2 = private unnamed_addr addrspace(2) constant [4 x float] [float 4.0, float 5.0, float 6.0, float 7.0] -@available_externally = available_externally addrspace(2) global [256 x i32] zeroinitializer +@private1 = private unnamed_addr addrspace(4) constant [4 x float] [float 0.0, float 1.0, float 2.0, float 3.0] +@private2 = private unnamed_addr addrspace(4) constant [4 x float] [float 4.0, float 5.0, float 6.0, float 7.0] +@available_externally = available_externally addrspace(4) global [256 x i32] zeroinitializer ; GCN-LABEL: {{^}}private_test: ; GCN: s_getpc_b64 s{{\[}}[[PC0_LO:[0-9]+]]:[[PC0_HI:[0-9]+]]{{\]}} @@ -27,11 +27,11 @@ ; HSA: s_addc_u32 s{{[0-9]+}}, s[[PC1_HI]], private2@rel32@hi+4 define amdgpu_kernel void @private_test(i32 %index, float addrspace(1)* %out) { - %ptr = getelementptr [4 x float], [4 x float] addrspace(2) * @private1, i32 0, i32 %index - %val = load float, float addrspace(2)* %ptr + %ptr = getelementptr [4 x float], [4 x float] addrspace(4) * @private1, i32 0, i32 %index + %val = load float, float addrspace(4)* %ptr store volatile float %val, float addrspace(1)* %out - %ptr2 = getelementptr [4 x float], [4 x float] addrspace(2) * @private2, i32 0, i32 %index - %val2 = load float, float addrspace(2)* %ptr2 + %ptr2 = getelementptr [4 x float], [4 x float] addrspace(4) * @private2, i32 0, i32 %index + %val2 = load float, float addrspace(4)* %ptr2 store volatile float %val2, float addrspace(1)* %out ret void } @@ -41,8 +41,8 @@ define amdgpu_kernel void @private_test(i32 %index, float addrspace(1)* %out) { ; HSA: s_add_u32 s{{[0-9]+}}, s[[PC0_LO]], available_externally@gotpcrel32@lo+4 ; HSA: s_addc_u32 s{{[0-9]+}}, s[[PC0_HI]], available_externally@gotpcrel32@hi+4 define amdgpu_kernel void @available_externally_test(i32 addrspace(1)* %out) { - %ptr = getelementptr [256 x i32], [256 x i32] addrspace(2)* @available_externally, i32 0, i32 1 - %val = load i32, i32 addrspace(2)* %ptr + %ptr = getelementptr [256 x i32], [256 x i32] addrspace(4)* @available_externally, i32 0, i32 1 + %val = load i32, i32 addrspace(4)* %ptr store i32 %val, i32 
addrspace(1)* %out ret void } diff --git a/llvm/test/CodeGen/AMDGPU/gv-const-addrspace.ll b/llvm/test/CodeGen/AMDGPU/gv-const-addrspace.ll index 0903542bac4..03d99baaafa 100644 --- a/llvm/test/CodeGen/AMDGPU/gv-const-addrspace.ll +++ b/llvm/test/CodeGen/AMDGPU/gv-const-addrspace.ll @@ -4,9 +4,9 @@ ; RUN: llc -march=r600 -mcpu=cayman < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s -@b = internal addrspace(2) constant [1 x i16] [ i16 7 ], align 2 +@b = internal addrspace(4) constant [1 x i16] [ i16 7 ], align 2 -@float_gv = internal unnamed_addr addrspace(2) constant [5 x float] [float 0.0, float 1.0, float 2.0, float 3.0, float 4.0], align 4 +@float_gv = internal unnamed_addr addrspace(4) constant [5 x float] [float 0.0, float 1.0, float 2.0, float 3.0, float 4.0], align 4 ; FUNC-LABEL: {{^}}float: ; GCN: s_load_dword @@ -17,13 +17,13 @@ ; EG-NOT: MOV define amdgpu_kernel void @float(float addrspace(1)* %out, i32 %index) { entry: - %0 = getelementptr inbounds [5 x float], [5 x float] addrspace(2)* @float_gv, i32 0, i32 %index - %1 = load float, float addrspace(2)* %0 + %0 = getelementptr inbounds [5 x float], [5 x float] addrspace(4)* @float_gv, i32 0, i32 %index + %1 = load float, float addrspace(4)* %0 store float %1, float addrspace(1)* %out ret void } -@i32_gv = internal unnamed_addr addrspace(2) constant [5 x i32] [i32 0, i32 1, i32 2, i32 3, i32 4], align 4 +@i32_gv = internal unnamed_addr addrspace(4) constant [5 x i32] [i32 0, i32 1, i32 2, i32 3, i32 4], align 4 ; FUNC-LABEL: {{^}}i32: @@ -35,8 +35,8 @@ entry: ; EG-NOT: MOV define amdgpu_kernel void @i32(i32 addrspace(1)* %out, i32 %index) { entry: - %0 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(2)* @i32_gv, i32 0, i32 %index - %1 = load i32, i32 addrspace(2)* %0 + %0 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(4)* @i32_gv, i32 0, i32 %index + %1 = load i32, i32 addrspace(4)* %0 store i32 %1, i32 addrspace(1)* %out ret void } @@ -44,7 +44,7 @@ entry: %struct.foo = type { float, [5 x i32] } -@struct_foo_gv = internal unnamed_addr addrspace(2) constant [1 x %struct.foo] [ %struct.foo { float 16.0, [5 x i32] [i32 0, i32 1, i32 2, i32 3, i32 4] } ] +@struct_foo_gv = internal unnamed_addr addrspace(4) constant [1 x %struct.foo] [ %struct.foo { float 16.0, [5 x i32] [i32 0, i32 1, i32 2, i32 3, i32 4] } ] ; FUNC-LABEL: {{^}}struct_foo_gv_load: ; GCN: s_load_dword @@ -54,13 +54,13 @@ entry: ; EG-NOT: MOVA_INT ; EG-NOT: MOV define amdgpu_kernel void @struct_foo_gv_load(i32 addrspace(1)* %out, i32 %index) { - %gep = getelementptr inbounds [1 x %struct.foo], [1 x %struct.foo] addrspace(2)* @struct_foo_gv, i32 0, i32 0, i32 1, i32 %index - %load = load i32, i32 addrspace(2)* %gep, align 4 + %gep = getelementptr inbounds [1 x %struct.foo], [1 x %struct.foo] addrspace(4)* @struct_foo_gv, i32 0, i32 0, i32 1, i32 %index + %load = load i32, i32 addrspace(4)* %gep, align 4 store i32 %load, i32 addrspace(1)* %out, align 4 ret void } -@array_v1_gv = internal addrspace(2) constant [4 x <1 x i32>] [ <1 x i32> <i32 1>, +@array_v1_gv = internal addrspace(4) constant [4 x <1 x i32>] [ <1 x i32> <i32 1>, <1 x i32> <i32 2>, <1 x i32> <i32 3>, <1 x i32> <i32 4> ] @@ -73,8 +73,8 @@ define amdgpu_kernel void @struct_foo_gv_load(i32 addrspace(1)* %out, i32 %index ; EG-NOT: MOVA_INT ; EG-NOT: MOV define amdgpu_kernel void @array_v1_gv_load(<1 x i32> addrspace(1)* %out, i32 %index) { - %gep = getelementptr inbounds [4 x <1 x i32>], [4 x <1 x i32>] addrspace(2)* @array_v1_gv, i32 0, i32 %index - %load = load <1 x i32>, <1 x i32> 
addrspace(2)* %gep, align 4 + %gep = getelementptr inbounds [4 x <1 x i32>], [4 x <1 x i32>] addrspace(4)* @array_v1_gv, i32 0, i32 %index + %load = load <1 x i32>, <1 x i32> addrspace(4)* %gep, align 4 store <1 x i32> %load, <1 x i32> addrspace(1)* %out, align 4 ret void } @@ -90,8 +90,8 @@ entry: br i1 %0, label %if, label %else if: - %1 = getelementptr inbounds [5 x float], [5 x float] addrspace(2)* @float_gv, i32 0, i32 %index - %2 = load float, float addrspace(2)* %1 + %1 = getelementptr inbounds [5 x float], [5 x float] addrspace(4)* @float_gv, i32 0, i32 %index + %2 = load float, float addrspace(4)* %1 store float %2, float addrspace(1)* %out br label %endif diff --git a/llvm/test/CodeGen/AMDGPU/hsa-func-align.ll b/llvm/test/CodeGen/AMDGPU/hsa-func-align.ll index a00f5e2669d..ff17a9e530b 100644 --- a/llvm/test/CodeGen/AMDGPU/hsa-func-align.ll +++ b/llvm/test/CodeGen/AMDGPU/hsa-func-align.ll @@ -10,9 +10,9 @@ ; HSA: .globl simple_align16 ; HSA: .p2align 5 -define void @simple_align16(i32 addrspace(1)* addrspace(2)* %ptr.out) align 32 { +define void @simple_align16(i32 addrspace(1)* addrspace(4)* %ptr.out) align 32 { entry: - %out = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(2)* %ptr.out + %out = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* %ptr.out store i32 0, i32 addrspace(1)* %out ret void } diff --git a/llvm/test/CodeGen/AMDGPU/hsa-func.ll b/llvm/test/CodeGen/AMDGPU/hsa-func.ll index 0bf1c6a23c3..d117cf59ee1 100644 --- a/llvm/test/CodeGen/AMDGPU/hsa-func.ll +++ b/llvm/test/CodeGen/AMDGPU/hsa-func.ll @@ -51,9 +51,9 @@ ; HSA: .size simple, .Lfunc_end0-simple ; HSA: ; Function info: ; HSA-NOT: COMPUTE_PGM_RSRC2 -define void @simple(i32 addrspace(1)* addrspace(2)* %ptr.out) { +define void @simple(i32 addrspace(1)* addrspace(4)* %ptr.out) { entry: - %out = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(2)* %ptr.out + %out = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* %ptr.out store i32 0, i32 addrspace(1)* %out ret void } @@ -61,9 +61,9 @@ entry: ; Ignore explicit alignment that is too low. 
; HSA: .globl simple_align2 ; HSA: .p2align 2 -define void @simple_align2(i32 addrspace(1)* addrspace(2)* %ptr.out) align 2 { +define void @simple_align2(i32 addrspace(1)* addrspace(4)* %ptr.out) align 2 { entry: - %out = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(2)* %ptr.out + %out = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* %ptr.out store i32 0, i32 addrspace(1)* %out ret void } diff --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-from-llvm-ir-full.ll b/llvm/test/CodeGen/AMDGPU/hsa-metadata-from-llvm-ir-full.ll index 967dc7502bd..b24e022f124 100644 --- a/llvm/test/CodeGen/AMDGPU/hsa-metadata-from-llvm-ir-full.ll +++ b/llvm/test/CodeGen/AMDGPU/hsa-metadata-from-llvm-ir-full.ll @@ -581,7 +581,7 @@ define amdgpu_kernel void @test_multi_arg(i32 %a, <2 x i16> %b, <3 x i8> %c) ; CHECK-NEXT: ValueType: I8 ; CHECK-NEXT: AddrSpaceQual: Global define amdgpu_kernel void @test_addr_space(i32 addrspace(1)* %g, - i32 addrspace(2)* %c, + i32 addrspace(4)* %c, i32 addrspace(3)* %l) !kernel_arg_addr_space !50 !kernel_arg_access_qual !23 !kernel_arg_type !51 !kernel_arg_base_type !51 !kernel_arg_type_qual !25 { diff --git a/llvm/test/CodeGen/AMDGPU/image-schedule.ll b/llvm/test/CodeGen/AMDGPU/image-schedule.ll index 856ba04a791..6f8060f1d55 100644 --- a/llvm/test/CodeGen/AMDGPU/image-schedule.ll +++ b/llvm/test/CodeGen/AMDGPU/image-schedule.ll @@ -20,21 +20,21 @@ define dllexport amdgpu_cs void @_amdgpu_cs_main(i32 inreg %arg, i32 inreg %arg1 %.0.vec.insert = insertelement <2 x i32> undef, i32 %arg2, i32 0 %.4.vec.insert = shufflevector <2 x i32> %.0.vec.insert, <2 x i32> %tmp6, <2 x i32> <i32 0, i32 3> %tmp7 = bitcast <2 x i32> %.4.vec.insert to i64 - %tmp8 = inttoptr i64 %tmp7 to [4294967295 x i8] addrspace(2)* + %tmp8 = inttoptr i64 %tmp7 to [4294967295 x i8] addrspace(4)* %tmp9 = add <3 x i32> %arg3, %arg5 - %tmp10 = getelementptr [4294967295 x i8], [4294967295 x i8] addrspace(2)* %tmp8, i64 0, i64 32 - %tmp11 = bitcast i8 addrspace(2)* %tmp10 to <8 x i32> addrspace(2)*, !amdgpu.uniform !0 - %tmp12 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp11, align 16 + %tmp10 = getelementptr [4294967295 x i8], [4294967295 x i8] addrspace(4)* %tmp8, i64 0, i64 32 + %tmp11 = bitcast i8 addrspace(4)* %tmp10 to <8 x i32> addrspace(4)*, !amdgpu.uniform !0 + %tmp12 = load <8 x i32>, <8 x i32> addrspace(4)* %tmp11, align 16 %tmp13 = shufflevector <3 x i32> %tmp9, <3 x i32> undef, <2 x i32> <i32 0, i32 1> %tmp14 = call <4 x float> @llvm.amdgcn.image.load.v4f32.v2i32.v8i32(<2 x i32> %tmp13, <8 x i32> %tmp12, i32 15, i1 false, i1 false, i1 false, i1 false) #0 - %tmp15 = inttoptr i64 %tmp7 to <8 x i32> addrspace(2)* - %tmp16 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp15, align 16 + %tmp15 = inttoptr i64 %tmp7 to <8 x i32> addrspace(4)* + %tmp16 = load <8 x i32>, <8 x i32> addrspace(4)* %tmp15, align 16 call void @llvm.amdgcn.image.store.v4f32.v2i32.v8i32(<4 x float> %tmp14, <2 x i32> %tmp13, <8 x i32> %tmp16, i32 15, i1 false, i1 false, i1 false, i1 false) #0 - %tmp17 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp15, align 16 + %tmp17 = load <8 x i32>, <8 x i32> addrspace(4)* %tmp15, align 16 %tmp18 = call <4 x float> @llvm.amdgcn.image.load.v4f32.v2i32.v8i32(<2 x i32> %tmp13, <8 x i32> %tmp17, i32 15, i1 false, i1 false, i1 false, i1 false) #0 - %tmp19 = getelementptr [4294967295 x i8], [4294967295 x i8] addrspace(2)* %tmp8, i64 0, i64 64 - %tmp20 = bitcast i8 addrspace(2)* %tmp19 to <8 x i32> addrspace(2)*, !amdgpu.uniform !0 - %tmp21 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp20, align 16 + 
%tmp19 = getelementptr [4294967295 x i8], [4294967295 x i8] addrspace(4)* %tmp8, i64 0, i64 64 + %tmp20 = bitcast i8 addrspace(4)* %tmp19 to <8 x i32> addrspace(4)*, !amdgpu.uniform !0 + %tmp21 = load <8 x i32>, <8 x i32> addrspace(4)* %tmp20, align 16 call void @llvm.amdgcn.image.store.v4f32.v2i32.v8i32(<4 x float> %tmp18, <2 x i32> %tmp13, <8 x i32> %tmp21, i32 15, i1 false, i1 false, i1 false, i1 false) #0 ret void } diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll index de973eb0ba7..e8f25fe3bf1 100644 --- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll @@ -10,8 +10,8 @@ ; GFX9-NOT: lshr ; GFX9: s_pack_lh_b32_b16 s{{[0-9]+}}, 0x3e7, [[VEC]] -define amdgpu_kernel void @s_insertelement_v2i16_0(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %vec.ptr) #0 { - %vec = load <2 x i16>, <2 x i16> addrspace(2)* %vec.ptr +define amdgpu_kernel void @s_insertelement_v2i16_0(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr) #0 { + %vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr %vecins = insertelement <2 x i16> %vec, i16 999, i32 0 store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out ret void @@ -28,8 +28,8 @@ define amdgpu_kernel void @s_insertelement_v2i16_0(<2 x i16> addrspace(1)* %out, ; GFX9-NOT: [[ELT0]] ; GFX9-NOT: [[VEC]] ; GFX9: s_pack_lh_b32_b16 s{{[0-9]+}}, [[ELT0]], [[VEC]] -define amdgpu_kernel void @s_insertelement_v2i16_0_reg(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %vec.ptr, i16 %elt) #0 { - %vec = load <2 x i16>, <2 x i16> addrspace(2)* %vec.ptr +define amdgpu_kernel void @s_insertelement_v2i16_0_reg(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr, i16 %elt) #0 { + %vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr %vecins = insertelement <2 x i16> %vec, i16 %elt, i32 0 store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out ret void @@ -48,8 +48,8 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reg(<2 x i16> addrspace(1)* % ; GFX9: s_lshr_b32 [[ELT1:s[0-9]+]], [[VEC]], 16 ; GFX9-DAG: s_pack_ll_b32_b16 s{{[0-9]+}}, [[ELT0]], [[ELT1]] ; GFX9-DAG: ; use [[ELT1]] -define amdgpu_kernel void @s_insertelement_v2i16_0_multi_use_hi_reg(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %vec.ptr, i16 %elt) #0 { - %vec = load <2 x i16>, <2 x i16> addrspace(2)* %vec.ptr +define amdgpu_kernel void @s_insertelement_v2i16_0_multi_use_hi_reg(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr, i16 %elt) #0 { + %vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr %elt1 = extractelement <2 x i16> %vec, i32 1 %vecins = insertelement <2 x i16> %vec, i16 %elt, i32 0 store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out @@ -68,8 +68,8 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_multi_use_hi_reg(<2 x i16> ad ; GFX9-NOT: [[ELT0]] ; GFX9-NOT: [[VEC]] ; GFX9: s_pack_hh_b32_b16 s{{[0-9]+}}, [[ELT_ARG]], [[VEC]] -define amdgpu_kernel void @s_insertelement_v2i16_0_reghi(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %vec.ptr, i32 %elt.arg) #0 { - %vec = load <2 x i16>, <2 x i16> addrspace(2)* %vec.ptr +define amdgpu_kernel void @s_insertelement_v2i16_0_reghi(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr, i32 %elt.arg) #0 { + %vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr %elt.hi = lshr i32 %elt.arg, 16 %elt = trunc i32 %elt.hi to i16 %vecins = insertelement <2 x i16> %vec, i16 %elt, i32 0 @@ -88,8 +88,8 @@ define amdgpu_kernel void 
@s_insertelement_v2i16_0_reghi(<2 x i16> addrspace(1)* ; GFX9: s_lshr_b32 [[ELT1:s[0-9]+]], [[ELT_ARG]], 16 ; GFX9: s_pack_lh_b32_b16 s{{[0-9]+}}, [[ELT1]], [[VEC]] ; GFX9: ; use [[ELT1]] -define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_multi_use_1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %vec.ptr, i32 %elt.arg) #0 { - %vec = load <2 x i16>, <2 x i16> addrspace(2)* %vec.ptr +define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_multi_use_1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr, i32 %elt.arg) #0 { + %vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr %elt.hi = lshr i32 %elt.arg, 16 %elt = trunc i32 %elt.hi to i16 %vecins = insertelement <2 x i16> %vec, i16 %elt, i32 0 @@ -113,8 +113,8 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_multi_use_1(<2 x i16> a ; GFX9: s_pack_ll_b32_b16 s{{[0-9]+}}, [[ELT_HI]], [[VEC_HI]] ; GFX9: ; use [[ELT_HI]] ; GFX9: ; use [[VEC_HI]] -define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_both_multi_use_1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %vec.ptr, i32 %elt.arg) #0 { - %vec = load <2 x i16>, <2 x i16> addrspace(2)* %vec.ptr +define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_both_multi_use_1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr, i32 %elt.arg) #0 { + %vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr %elt.hi = lshr i32 %elt.arg, 16 %elt = trunc i32 %elt.hi to i16 %vec.hi = extractelement <2 x i16> %vec, i32 1 @@ -137,8 +137,8 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_both_multi_use_1(<2 x i ; CIVI: s_or_b32 [[INS:s[0-9]+]], [[ELT0]], 0x3e70000 ; GFX9: s_pack_ll_b32_b16 s{{[0-9]+}}, [[VEC]], 0x3e7 -define amdgpu_kernel void @s_insertelement_v2i16_1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %vec.ptr) #0 { - %vec = load <2 x i16>, <2 x i16> addrspace(2)* %vec.ptr +define amdgpu_kernel void @s_insertelement_v2i16_1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr) #0 { + %vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr %vecins = insertelement <2 x i16> %vec, i16 999, i32 1 store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out ret void @@ -153,8 +153,8 @@ define amdgpu_kernel void @s_insertelement_v2i16_1(<2 x i16> addrspace(1)* %out, ; GCN-NOT: shlr ; GFX9: s_pack_ll_b32_b16 s{{[0-9]+}}, [[VEC]], [[ELT1]] -define amdgpu_kernel void @s_insertelement_v2i16_1_reg(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %vec.ptr, i16 %elt) #0 { - %vec = load <2 x i16>, <2 x i16> addrspace(2)* %vec.ptr +define amdgpu_kernel void @s_insertelement_v2i16_1_reg(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr, i16 %elt) #0 { + %vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr %vecins = insertelement <2 x i16> %vec, i16 %elt, i32 1 store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out ret void @@ -167,8 +167,8 @@ define amdgpu_kernel void @s_insertelement_v2i16_1_reg(<2 x i16> addrspace(1)* % ; GFX9: s_lshr_b32 [[ELT1:s[0-9]+]], [[VEC]], 16 ; GFX9: s_pack_ll_b32_b16 s{{[0-9]+}}, 0x4500, [[ELT1]] -define amdgpu_kernel void @s_insertelement_v2f16_0(<2 x half> addrspace(1)* %out, <2 x half> addrspace(2)* %vec.ptr) #0 { - %vec = load <2 x half>, <2 x half> addrspace(2)* %vec.ptr +define amdgpu_kernel void @s_insertelement_v2f16_0(<2 x half> addrspace(1)* %out, <2 x half> addrspace(4)* %vec.ptr) #0 { + %vec = load <2 x half>, <2 x half> addrspace(4)* %vec.ptr %vecins = insertelement <2 x half> %vec, half 5.000000e+00, i32 0 store <2 x half> %vecins, <2 x half> addrspace(1)* %out ret void 
@@ -182,8 +182,8 @@ define amdgpu_kernel void @s_insertelement_v2f16_0(<2 x half> addrspace(1)* %out ; CIVI: s_or_b32 [[INS:s[0-9]+]], [[ELT0]], 0x45000000 ; GFX9: s_pack_ll_b32_b16 s{{[0-9]+}}, [[VEC]], 0x4500 -define amdgpu_kernel void @s_insertelement_v2f16_1(<2 x half> addrspace(1)* %out, <2 x half> addrspace(2)* %vec.ptr) #0 { - %vec = load <2 x half>, <2 x half> addrspace(2)* %vec.ptr +define amdgpu_kernel void @s_insertelement_v2f16_1(<2 x half> addrspace(1)* %out, <2 x half> addrspace(4)* %vec.ptr) #0 { + %vec = load <2 x half>, <2 x half> addrspace(4)* %vec.ptr %vecins = insertelement <2 x half> %vec, half 5.000000e+00, i32 1 store <2 x half> %vecins, <2 x half> addrspace(1)* %out ret void @@ -399,9 +399,9 @@ define amdgpu_kernel void @v_insertelement_v2f16_1_inlineimm(<2 x half> addrspac ; GCN-DAG: s_lshl_b32 [[MASK:s[0-9]+]], 0xffff, [[SCALED_IDX]] ; GCN: v_bfi_b32 [[RESULT:v[0-9]+]], [[MASK]], [[K]], [[VVEC]] ; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -define amdgpu_kernel void @s_insertelement_v2i16_dynamic(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %vec.ptr, i32 addrspace(2)* %idx.ptr) #0 { - %idx = load volatile i32, i32 addrspace(2)* %idx.ptr - %vec = load <2 x i16>, <2 x i16> addrspace(2)* %vec.ptr +define amdgpu_kernel void @s_insertelement_v2i16_dynamic(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr, i32 addrspace(4)* %idx.ptr) #0 { + %idx = load volatile i32, i32 addrspace(4)* %idx.ptr + %vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr %vecins = insertelement <2 x i16> %vec, i16 999, i32 %idx store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out ret void diff --git a/llvm/test/CodeGen/AMDGPU/invariant-load-no-alias-store.ll b/llvm/test/CodeGen/AMDGPU/invariant-load-no-alias-store.ll index eea26192ed3..d39ee12abde 100644 --- a/llvm/test/CodeGen/AMDGPU/invariant-load-no-alias-store.ll +++ b/llvm/test/CodeGen/AMDGPU/invariant-load-no-alias-store.ll @@ -22,8 +22,8 @@ define amdgpu_kernel void @test_merge_store_constant_i16_invariant_global_pointe ; GCN: s_load_dwordx2 s{{\[}}[[SPTR_LO:[0-9]+]]:[[SPTR_HI:[0-9]+]]{{\]}} ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0x1c8007b ; GCN: buffer_store_dword [[K]], off, s{{\[}}[[SPTR_LO]]: -define amdgpu_kernel void @test_merge_store_constant_i16_invariant_constant_pointer_load(i16 addrspace(1)* addrspace(2)* dereferenceable(4096) nonnull %in) #0 { - %ptr = load i16 addrspace(1)*, i16 addrspace(1)* addrspace(2)* %in, !invariant.load !0 +define amdgpu_kernel void @test_merge_store_constant_i16_invariant_constant_pointer_load(i16 addrspace(1)* addrspace(4)* dereferenceable(4096) nonnull %in) #0 { + %ptr = load i16 addrspace(1)*, i16 addrspace(1)* addrspace(4)* %in, !invariant.load !0 %ptr.1 = getelementptr i16, i16 addrspace(1)* %ptr, i64 1 store i16 123, i16 addrspace(1)* %ptr, align 4 store i16 456, i16 addrspace(1)* %ptr.1 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.SI.load.dword.ll b/llvm/test/CodeGen/AMDGPU/llvm.SI.load.dword.ll index 564d2b32964..1d370aba6da 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.SI.load.dword.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.SI.load.dword.ll @@ -14,10 +14,10 @@ ; CHECK: s_movk_i32 [[K:s[0-9]+]], 0x4d2 ; encoding ; CHECK: buffer_load_dword {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, [[K]] idxen offen offset:65535 glc slc -define amdgpu_vs void @main([17 x <4 x i32>] addrspace(2)* byval %arg, [32 x <4 x i32>] addrspace(2)* byval %arg1, [16 x <32 x i8>] addrspace(2)* byval %arg2, [2 x <4 x i32>] addrspace(2)* byval %arg3, [17 x <4 x i32>] 
addrspace(2)* inreg %arg4, [17 x <4 x i32>] addrspace(2)* inreg %arg5, i32 %arg6, i32 %arg7, i32 %arg8, i32 %arg9) { +define amdgpu_vs void @main([17 x <4 x i32>] addrspace(4)* byval %arg, [32 x <4 x i32>] addrspace(4)* byval %arg1, [16 x <32 x i8>] addrspace(4)* byval %arg2, [2 x <4 x i32>] addrspace(4)* byval %arg3, [17 x <4 x i32>] addrspace(4)* inreg %arg4, [17 x <4 x i32>] addrspace(4)* inreg %arg5, i32 %arg6, i32 %arg7, i32 %arg8, i32 %arg9) { main_body: - %tmp = getelementptr [2 x <4 x i32>], [2 x <4 x i32>] addrspace(2)* %arg3, i64 0, i32 1 - %tmp10 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp, !tbaa !0 + %tmp = getelementptr [2 x <4 x i32>], [2 x <4 x i32>] addrspace(4)* %arg3, i64 0, i32 1 + %tmp10 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp, !tbaa !0 %tmp11 = shl i32 %arg6, 2 %tmp12 = call i32 @llvm.SI.buffer.load.dword.i32.i32(<4 x i32> %tmp10, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 0) %tmp13 = bitcast i32 %tmp12 to float diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dispatch.ptr.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dispatch.ptr.ll index 92208e7fe17..b6f9f951d9b 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dispatch.ptr.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dispatch.ptr.ll @@ -7,13 +7,13 @@ ; GCN: enable_sgpr_dispatch_ptr = 1 ; GCN: s_load_dword s{{[0-9]+}}, s[4:5], 0x0 define amdgpu_kernel void @test(i32 addrspace(1)* %out) { - %dispatch_ptr = call noalias i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() #0 - %header_ptr = bitcast i8 addrspace(2)* %dispatch_ptr to i32 addrspace(2)* - %value = load i32, i32 addrspace(2)* %header_ptr + %dispatch_ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() #0 + %header_ptr = bitcast i8 addrspace(4)* %dispatch_ptr to i32 addrspace(4)* + %value = load i32, i32 addrspace(4)* %header_ptr store i32 %value, i32 addrspace(1)* %out ret void } -declare noalias i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() #0 +declare noalias i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() #0 attributes #0 = { readnone } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.implicit.buffer.ptr.hsa.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.implicit.buffer.ptr.hsa.ll index 437ce7f373d..a084fa08b80 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.implicit.buffer.ptr.hsa.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.implicit.buffer.ptr.hsa.ll @@ -2,23 +2,23 @@ ; ERROR: in function test_kernel{{.*}}: non-hsa intrinsic with hsa target define amdgpu_kernel void @test_kernel(i32 addrspace(1)* %out) #1 { - %implicit_buffer_ptr = call i8 addrspace(2)* @llvm.amdgcn.implicit.buffer.ptr() - %header_ptr = bitcast i8 addrspace(2)* %implicit_buffer_ptr to i32 addrspace(2)* - %value = load i32, i32 addrspace(2)* %header_ptr + %implicit_buffer_ptr = call i8 addrspace(4)* @llvm.amdgcn.implicit.buffer.ptr() + %header_ptr = bitcast i8 addrspace(4)* %implicit_buffer_ptr to i32 addrspace(4)* + %value = load i32, i32 addrspace(4)* %header_ptr store i32 %value, i32 addrspace(1)* %out ret void } ; ERROR: in function test_func{{.*}}: non-hsa intrinsic with hsa target define void @test_func(i32 addrspace(1)* %out) #1 { - %implicit_buffer_ptr = call i8 addrspace(2)* @llvm.amdgcn.implicit.buffer.ptr() - %header_ptr = bitcast i8 addrspace(2)* %implicit_buffer_ptr to i32 addrspace(2)* - %value = load i32, i32 addrspace(2)* %header_ptr + %implicit_buffer_ptr = call i8 addrspace(4)* @llvm.amdgcn.implicit.buffer.ptr() + %header_ptr = bitcast i8 addrspace(4)* %implicit_buffer_ptr to i32 addrspace(4)* + %value = load i32, i32 addrspace(4)* %header_ptr store i32 
%value, i32 addrspace(1)* %out ret void } -declare i8 addrspace(2)* @llvm.amdgcn.implicit.buffer.ptr() #0 +declare i8 addrspace(4)* @llvm.amdgcn.implicit.buffer.ptr() #0 attributes #0 = { nounwind readnone speculatable } attributes #1 = { nounwind } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.implicit.buffer.ptr.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.implicit.buffer.ptr.ll index d6dd6ffa723..3a69ef673b8 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.implicit.buffer.ptr.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.implicit.buffer.ptr.ll @@ -10,9 +10,9 @@ define amdgpu_ps i32 @test_ps() #1 { %alloca = alloca i32, addrspace(5) store volatile i32 0, i32 addrspace(5)* %alloca - %implicit_buffer_ptr = call i8 addrspace(2)* @llvm.amdgcn.implicit.buffer.ptr() - %buffer_ptr = bitcast i8 addrspace(2)* %implicit_buffer_ptr to i32 addrspace(2)* - %value = load volatile i32, i32 addrspace(2)* %buffer_ptr + %implicit_buffer_ptr = call i8 addrspace(4)* @llvm.amdgcn.implicit.buffer.ptr() + %buffer_ptr = bitcast i8 addrspace(4)* %implicit_buffer_ptr to i32 addrspace(4)* + %value = load volatile i32, i32 addrspace(4)* %buffer_ptr ret i32 %value } @@ -23,13 +23,13 @@ define amdgpu_ps i32 @test_ps() #1 { define amdgpu_cs i32 @test_cs() #1 { %alloca = alloca i32, addrspace(5) store volatile i32 0, i32 addrspace(5)* %alloca - %implicit_buffer_ptr = call i8 addrspace(2)* @llvm.amdgcn.implicit.buffer.ptr() - %buffer_ptr = bitcast i8 addrspace(2)* %implicit_buffer_ptr to i32 addrspace(2)* - %value = load volatile i32, i32 addrspace(2)* %buffer_ptr + %implicit_buffer_ptr = call i8 addrspace(4)* @llvm.amdgcn.implicit.buffer.ptr() + %buffer_ptr = bitcast i8 addrspace(4)* %implicit_buffer_ptr to i32 addrspace(4)* + %value = load volatile i32, i32 addrspace(4)* %buffer_ptr ret i32 %value } -declare i8 addrspace(2)* @llvm.amdgcn.implicit.buffer.ptr() #0 +declare i8 addrspace(4)* @llvm.amdgcn.implicit.buffer.ptr() #0 attributes #0 = { nounwind readnone speculatable } attributes #1 = { nounwind } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.implicitarg.ptr.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.implicitarg.ptr.ll index e071e6749af..9036a0b9d8c 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.implicitarg.ptr.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.implicitarg.ptr.ll @@ -11,9 +11,9 @@ ; HSA: s_load_dword s0, s[4:5], 0x0 define amdgpu_kernel void @kernel_implicitarg_ptr_empty() #0 { - %implicitarg.ptr = call i8 addrspace(2)* @llvm.amdgcn.implicitarg.ptr() - %cast = bitcast i8 addrspace(2)* %implicitarg.ptr to i32 addrspace(2)* - %load = load volatile i32, i32 addrspace(2)* %cast + %implicitarg.ptr = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr() + %cast = bitcast i8 addrspace(4)* %implicitarg.ptr to i32 addrspace(4)* + %load = load volatile i32, i32 addrspace(4)* %cast ret void } @@ -26,9 +26,9 @@ define amdgpu_kernel void @kernel_implicitarg_ptr_empty() #0 { ; HSA: s_load_dword s0, s[4:5], 0x1c define amdgpu_kernel void @kernel_implicitarg_ptr([112 x i8]) #0 { - %implicitarg.ptr = call i8 addrspace(2)* @llvm.amdgcn.implicitarg.ptr() - %cast = bitcast i8 addrspace(2)* %implicitarg.ptr to i32 addrspace(2)* - %load = load volatile i32, i32 addrspace(2)* %cast + %implicitarg.ptr = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr() + %cast = bitcast i8 addrspace(4)* %implicitarg.ptr to i32 addrspace(4)* + %load = load volatile i32, i32 addrspace(4)* %cast ret void } @@ -38,9 +38,9 @@ define amdgpu_kernel void @kernel_implicitarg_ptr([112 x i8]) #0 { ; GCN-NEXT: s_waitcnt ; GCN-NEXT: 
s_setpc_b64 define void @func_implicitarg_ptr() #1 { - %implicitarg.ptr = call i8 addrspace(2)* @llvm.amdgcn.implicitarg.ptr() - %cast = bitcast i8 addrspace(2)* %implicitarg.ptr to i32 addrspace(2)* - %load = load volatile i32, i32 addrspace(2)* %cast + %implicitarg.ptr = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr() + %cast = bitcast i8 addrspace(4)* %implicitarg.ptr to i32 addrspace(4)* + %load = load volatile i32, i32 addrspace(4)* %cast ret void } @@ -86,12 +86,12 @@ define void @func_call_implicitarg_ptr_func() #1 { ; GCN: s_load_dword s{{[0-9]+}}, s[6:7], 0x0{{$}} ; GCN: s_load_dword s{{[0-9]+}}, s[8:9], 0x0{{$}} define void @func_kernarg_implicitarg_ptr() #1 { - %kernarg.segment.ptr = call i8 addrspace(2)* @llvm.amdgcn.kernarg.segment.ptr() - %implicitarg.ptr = call i8 addrspace(2)* @llvm.amdgcn.implicitarg.ptr() - %cast.kernarg.segment.ptr = bitcast i8 addrspace(2)* %kernarg.segment.ptr to i32 addrspace(2)* - %cast.implicitarg = bitcast i8 addrspace(2)* %implicitarg.ptr to i32 addrspace(2)* - %load0 = load volatile i32, i32 addrspace(2)* %cast.kernarg.segment.ptr - %load1 = load volatile i32, i32 addrspace(2)* %cast.implicitarg + %kernarg.segment.ptr = call i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() + %implicitarg.ptr = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr() + %cast.kernarg.segment.ptr = bitcast i8 addrspace(4)* %kernarg.segment.ptr to i32 addrspace(4)* + %cast.implicitarg = bitcast i8 addrspace(4)* %implicitarg.ptr to i32 addrspace(4)* + %load0 = load volatile i32, i32 addrspace(4)* %cast.kernarg.segment.ptr + %load1 = load volatile i32, i32 addrspace(4)* %cast.implicitarg ret void } @@ -106,8 +106,8 @@ define amdgpu_kernel void @kernel_call_kernarg_implicitarg_ptr_func([112 x i8]) ret void } -declare i8 addrspace(2)* @llvm.amdgcn.implicitarg.ptr() #2 -declare i8 addrspace(2)* @llvm.amdgcn.kernarg.segment.ptr() #2 +declare i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr() #2 +declare i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() #2 attributes #0 = { nounwind noinline } attributes #1 = { nounwind noinline } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.kernarg.segment.ptr.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.kernarg.segment.ptr.ll index 70e6b408ca2..df14bbce415 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.kernarg.segment.ptr.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.kernarg.segment.ptr.ll @@ -11,10 +11,10 @@ ; OS-UNKNOWN: s_load_dword s{{[0-9]+}}, s[0:1], 0xa define amdgpu_kernel void @test(i32 addrspace(1)* %out) #1 { - %kernarg.segment.ptr = call noalias i8 addrspace(2)* @llvm.amdgcn.kernarg.segment.ptr() - %header.ptr = bitcast i8 addrspace(2)* %kernarg.segment.ptr to i32 addrspace(2)* - %gep = getelementptr i32, i32 addrspace(2)* %header.ptr, i64 10 - %value = load i32, i32 addrspace(2)* %gep + %kernarg.segment.ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() + %header.ptr = bitcast i8 addrspace(4)* %kernarg.segment.ptr to i32 addrspace(4)* + %gep = getelementptr i32, i32 addrspace(4)* %header.ptr, i64 10 + %value = load i32, i32 addrspace(4)* %gep store i32 %value, i32 addrspace(1)* %out ret void } @@ -23,10 +23,10 @@ define amdgpu_kernel void @test(i32 addrspace(1)* %out) #1 { ; 10 + 9 (36 prepended implicit bytes) + 2(out pointer) = 21 = 0x15 ; OS-UNKNOWN: s_load_dword s{{[0-9]+}}, s[0:1], 0x15 define amdgpu_kernel void @test_implicit(i32 addrspace(1)* %out) #1 { - %implicitarg.ptr = call noalias i8 addrspace(2)* @llvm.amdgcn.implicitarg.ptr() - %header.ptr = bitcast i8 addrspace(2)* %implicitarg.ptr 
to i32 addrspace(2)* - %gep = getelementptr i32, i32 addrspace(2)* %header.ptr, i64 10 - %value = load i32, i32 addrspace(2)* %gep + %implicitarg.ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr() + %header.ptr = bitcast i8 addrspace(4)* %implicitarg.ptr to i32 addrspace(4)* + %gep = getelementptr i32, i32 addrspace(4)* %header.ptr, i64 10 + %value = load i32, i32 addrspace(4)* %gep store i32 %value, i32 addrspace(1)* %out ret void } @@ -42,9 +42,9 @@ define amdgpu_kernel void @test_implicit(i32 addrspace(1)* %out) #1 { ; MESA: buffer_store_dword [[V_VAL]] ; HSA: flat_store_dword v[{{[0-9]+:[0-9]+}}], [[V_VAL]] define amdgpu_kernel void @test_implicit_alignment(i32 addrspace(1)* %out, <2 x i8> %in) #1 { - %implicitarg.ptr = call noalias i8 addrspace(2)* @llvm.amdgcn.implicitarg.ptr() - %arg.ptr = bitcast i8 addrspace(2)* %implicitarg.ptr to i32 addrspace(2)* - %val = load i32, i32 addrspace(2)* %arg.ptr + %implicitarg.ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr() + %arg.ptr = bitcast i8 addrspace(4)* %implicitarg.ptr to i32 addrspace(4)* + %val = load i32, i32 addrspace(4)* %arg.ptr store i32 %val, i32 addrspace(1)* %out ret void } @@ -53,16 +53,16 @@ define amdgpu_kernel void @test_implicit_alignment(i32 addrspace(1)* %out, <2 x ; HSA: enable_sgpr_kernarg_segment_ptr = 1 ; HSA: s_load_dword s{{[0-9]+}}, s[4:5] define amdgpu_kernel void @test_no_kernargs() #1 { - %kernarg.segment.ptr = call noalias i8 addrspace(2)* @llvm.amdgcn.kernarg.segment.ptr() - %header.ptr = bitcast i8 addrspace(2)* %kernarg.segment.ptr to i32 addrspace(2)* - %gep = getelementptr i32, i32 addrspace(2)* %header.ptr, i64 10 - %value = load i32, i32 addrspace(2)* %gep + %kernarg.segment.ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() + %header.ptr = bitcast i8 addrspace(4)* %kernarg.segment.ptr to i32 addrspace(4)* + %gep = getelementptr i32, i32 addrspace(4)* %header.ptr, i64 10 + %value = load i32, i32 addrspace(4)* %gep store volatile i32 %value, i32 addrspace(1)* undef ret void } -declare i8 addrspace(2)* @llvm.amdgcn.kernarg.segment.ptr() #0 -declare i8 addrspace(2)* @llvm.amdgcn.implicitarg.ptr() #0 +declare i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() #0 +declare i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr() #0 attributes #0 = { nounwind readnone } attributes #1 = { nounwind } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.queue.ptr.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.queue.ptr.ll index 9200fe7c67b..f8c60451ac7 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.queue.ptr.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.queue.ptr.ll @@ -7,13 +7,13 @@ ; GCN: enable_sgpr_queue_ptr = 1 ; GCN: s_load_dword s{{[0-9]+}}, s[4:5], 0x0 define amdgpu_kernel void @test(i32 addrspace(1)* %out) { - %queue_ptr = call noalias i8 addrspace(2)* @llvm.amdgcn.queue.ptr() #0 - %header_ptr = bitcast i8 addrspace(2)* %queue_ptr to i32 addrspace(2)* - %value = load i32, i32 addrspace(2)* %header_ptr + %queue_ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.queue.ptr() #0 + %header_ptr = bitcast i8 addrspace(4)* %queue_ptr to i32 addrspace(4)* + %value = load i32, i32 addrspace(4)* %header_ptr store i32 %value, i32 addrspace(1)* %out ret void } -declare noalias i8 addrspace(2)* @llvm.amdgcn.queue.ptr() #0 +declare noalias i8 addrspace(4)* @llvm.amdgcn.queue.ptr() #0 attributes #0 = { nounwind readnone } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.memcpy.ll b/llvm/test/CodeGen/AMDGPU/llvm.memcpy.ll index 77eb4900ea5..cb3f8c5d17b 100644 --- 
a/llvm/test/CodeGen/AMDGPU/llvm.memcpy.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.memcpy.ll @@ -3,7 +3,7 @@ declare void @llvm.memcpy.p3i8.p3i8.i32(i8 addrspace(3)* nocapture, i8 addrspace(3)* nocapture, i32, i1) nounwind declare void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* nocapture, i8 addrspace(1)* nocapture, i64, i1) nounwind -declare void @llvm.memcpy.p1i8.p2i8.i64(i8 addrspace(1)* nocapture, i8 addrspace(2)* nocapture, i64, i1) nounwind +declare void @llvm.memcpy.p1i8.p2i8.i64(i8 addrspace(1)* nocapture, i8 addrspace(4)* nocapture, i64, i1) nounwind ; FUNC-LABEL: {{^}}test_small_memcpy_i64_lds_to_lds_align1: @@ -328,8 +328,8 @@ define amdgpu_kernel void @test_small_memcpy_i64_global_to_global_align16(i64 ad } ; Test shouldConvertConstantLoadToIntImm -@hello.align4 = private unnamed_addr addrspace(2) constant [16 x i8] c"constant string\00", align 4 -@hello.align1 = private unnamed_addr addrspace(2) constant [16 x i8] c"constant string\00", align 1 +@hello.align4 = private unnamed_addr addrspace(4) constant [16 x i8] c"constant string\00", align 4 +@hello.align1 = private unnamed_addr addrspace(4) constant [16 x i8] c"constant string\00", align 1 ; FUNC-LABEL: {{^}}test_memcpy_const_string_align4: ; SI: s_getpc_b64 @@ -341,8 +341,8 @@ define amdgpu_kernel void @test_small_memcpy_i64_global_to_global_align16(i64 ad ; SI-DAG: buffer_store_dwordx4 ; SI-DAG: buffer_store_dwordx4 define amdgpu_kernel void @test_memcpy_const_string_align4(i8 addrspace(1)* noalias %out) nounwind { - %str = bitcast [16 x i8] addrspace(2)* @hello.align4 to i8 addrspace(2)* - call void @llvm.memcpy.p1i8.p2i8.i64(i8 addrspace(1)* align 4 %out, i8 addrspace(2)* align 4 %str, i64 32, i1 false) + %str = bitcast [16 x i8] addrspace(4)* @hello.align4 to i8 addrspace(4)* + call void @llvm.memcpy.p1i8.p2i8.i64(i8 addrspace(1)* align 4 %out, i8 addrspace(4)* align 4 %str, i64 32, i1 false) ret void } @@ -366,7 +366,7 @@ define amdgpu_kernel void @test_memcpy_const_string_align4(i8 addrspace(1)* noal ; SI: buffer_store_byte ; SI: buffer_store_byte define amdgpu_kernel void @test_memcpy_const_string_align1(i8 addrspace(1)* noalias %out) nounwind { - %str = bitcast [16 x i8] addrspace(2)* @hello.align1 to i8 addrspace(2)* - call void @llvm.memcpy.p1i8.p2i8.i64(i8 addrspace(1)* %out, i8 addrspace(2)* %str, i64 32, i1 false) + %str = bitcast [16 x i8] addrspace(4)* @hello.align1 to i8 addrspace(4)* + call void @llvm.memcpy.p1i8.p2i8.i64(i8 addrspace(1)* %out, i8 addrspace(4)* %str, i64 32, i1 false) ret void } diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-f64.ll b/llvm/test/CodeGen/AMDGPU/load-constant-f64.ll index 0050d1a4f87..56e21bc9bd6 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-f64.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-f64.ll @@ -6,8 +6,8 @@ ; GCN: s_load_dwordx2 s[{{[0-9]+:[0-9]+}}] ; GCN-NOHSA: buffer_store_dwordx2 ; GCN-HSA: flat_store_dwordx2 -define amdgpu_kernel void @constant_load_f64(double addrspace(1)* %out, double addrspace(2)* %in) #0 { - %ld = load double, double addrspace(2)* %in +define amdgpu_kernel void @constant_load_f64(double addrspace(1)* %out, double addrspace(4)* %in) #0 { + %ld = load double, double addrspace(4)* %in store double %ld, double addrspace(1)* %out ret void } diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll index 789b5e9734d..61a2aca3891 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll @@ -9,57 +9,57 @@ ; EG: VTX_READ_8 ; EG: AND_INT -define 
amdgpu_kernel void @constant_load_i1(i1 addrspace(1)* %out, i1 addrspace(2)* nocapture %in) #0 { - %load = load i1, i1 addrspace(2)* %in +define amdgpu_kernel void @constant_load_i1(i1 addrspace(1)* %out, i1 addrspace(4)* nocapture %in) #0 { + %load = load i1, i1 addrspace(4)* %in store i1 %load, i1 addrspace(1)* %out ret void } ; FUNC-LABEL: {{^}}constant_load_v2i1: -define amdgpu_kernel void @constant_load_v2i1(<2 x i1> addrspace(1)* %out, <2 x i1> addrspace(2)* nocapture %in) #0 { - %load = load <2 x i1>, <2 x i1> addrspace(2)* %in +define amdgpu_kernel void @constant_load_v2i1(<2 x i1> addrspace(1)* %out, <2 x i1> addrspace(4)* nocapture %in) #0 { + %load = load <2 x i1>, <2 x i1> addrspace(4)* %in store <2 x i1> %load, <2 x i1> addrspace(1)* %out ret void } ; FUNC-LABEL: {{^}}constant_load_v3i1: -define amdgpu_kernel void @constant_load_v3i1(<3 x i1> addrspace(1)* %out, <3 x i1> addrspace(2)* nocapture %in) #0 { - %load = load <3 x i1>, <3 x i1> addrspace(2)* %in +define amdgpu_kernel void @constant_load_v3i1(<3 x i1> addrspace(1)* %out, <3 x i1> addrspace(4)* nocapture %in) #0 { + %load = load <3 x i1>, <3 x i1> addrspace(4)* %in store <3 x i1> %load, <3 x i1> addrspace(1)* %out ret void } ; FUNC-LABEL: {{^}}constant_load_v4i1: -define amdgpu_kernel void @constant_load_v4i1(<4 x i1> addrspace(1)* %out, <4 x i1> addrspace(2)* nocapture %in) #0 { - %load = load <4 x i1>, <4 x i1> addrspace(2)* %in +define amdgpu_kernel void @constant_load_v4i1(<4 x i1> addrspace(1)* %out, <4 x i1> addrspace(4)* nocapture %in) #0 { + %load = load <4 x i1>, <4 x i1> addrspace(4)* %in store <4 x i1> %load, <4 x i1> addrspace(1)* %out ret void } ; FUNC-LABEL: {{^}}constant_load_v8i1: -define amdgpu_kernel void @constant_load_v8i1(<8 x i1> addrspace(1)* %out, <8 x i1> addrspace(2)* nocapture %in) #0 { - %load = load <8 x i1>, <8 x i1> addrspace(2)* %in +define amdgpu_kernel void @constant_load_v8i1(<8 x i1> addrspace(1)* %out, <8 x i1> addrspace(4)* nocapture %in) #0 { + %load = load <8 x i1>, <8 x i1> addrspace(4)* %in store <8 x i1> %load, <8 x i1> addrspace(1)* %out ret void } ; FUNC-LABEL: {{^}}constant_load_v16i1: -define amdgpu_kernel void @constant_load_v16i1(<16 x i1> addrspace(1)* %out, <16 x i1> addrspace(2)* nocapture %in) #0 { - %load = load <16 x i1>, <16 x i1> addrspace(2)* %in +define amdgpu_kernel void @constant_load_v16i1(<16 x i1> addrspace(1)* %out, <16 x i1> addrspace(4)* nocapture %in) #0 { + %load = load <16 x i1>, <16 x i1> addrspace(4)* %in store <16 x i1> %load, <16 x i1> addrspace(1)* %out ret void } ; FUNC-LABEL: {{^}}constant_load_v32i1: -define amdgpu_kernel void @constant_load_v32i1(<32 x i1> addrspace(1)* %out, <32 x i1> addrspace(2)* nocapture %in) #0 { - %load = load <32 x i1>, <32 x i1> addrspace(2)* %in +define amdgpu_kernel void @constant_load_v32i1(<32 x i1> addrspace(1)* %out, <32 x i1> addrspace(4)* nocapture %in) #0 { + %load = load <32 x i1>, <32 x i1> addrspace(4)* %in store <32 x i1> %load, <32 x i1> addrspace(1)* %out ret void } ; FUNC-LABEL: {{^}}constant_load_v64i1: -define amdgpu_kernel void @constant_load_v64i1(<64 x i1> addrspace(1)* %out, <64 x i1> addrspace(2)* nocapture %in) #0 { - %load = load <64 x i1>, <64 x i1> addrspace(2)* %in +define amdgpu_kernel void @constant_load_v64i1(<64 x i1> addrspace(1)* %out, <64 x i1> addrspace(4)* nocapture %in) #0 { + %load = load <64 x i1>, <64 x i1> addrspace(4)* %in store <64 x i1> %load, <64 x i1> addrspace(1)* %out ret void } @@ -67,8 +67,8 @@ define amdgpu_kernel void @constant_load_v64i1(<64 x i1> addrspace(1)* 
%out, <64 ; FUNC-LABEL: {{^}}constant_zextload_i1_to_i32: ; GCN: buffer_load_ubyte ; GCN: buffer_store_dword -define amdgpu_kernel void @constant_zextload_i1_to_i32(i32 addrspace(1)* %out, i1 addrspace(2)* nocapture %in) #0 { - %a = load i1, i1 addrspace(2)* %in +define amdgpu_kernel void @constant_zextload_i1_to_i32(i32 addrspace(1)* %out, i1 addrspace(4)* nocapture %in) #0 { + %a = load i1, i1 addrspace(4)* %in %ext = zext i1 %a to i32 store i32 %ext, i32 addrspace(1)* %out ret void @@ -81,136 +81,136 @@ define amdgpu_kernel void @constant_zextload_i1_to_i32(i32 addrspace(1)* %out, i ; EG: VTX_READ_8 ; EG: BFE_INT -define amdgpu_kernel void @constant_sextload_i1_to_i32(i32 addrspace(1)* %out, i1 addrspace(2)* nocapture %in) #0 { - %a = load i1, i1 addrspace(2)* %in +define amdgpu_kernel void @constant_sextload_i1_to_i32(i32 addrspace(1)* %out, i1 addrspace(4)* nocapture %in) #0 { + %a = load i1, i1 addrspace(4)* %in %ext = sext i1 %a to i32 store i32 %ext, i32 addrspace(1)* %out ret void } ; FUNC-LABEL: {{^}}constant_zextload_v1i1_to_v1i32: -define amdgpu_kernel void @constant_zextload_v1i1_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i1> addrspace(2)* nocapture %in) #0 { - %load = load <1 x i1>, <1 x i1> addrspace(2)* %in +define amdgpu_kernel void @constant_zextload_v1i1_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i1> addrspace(4)* nocapture %in) #0 { + %load = load <1 x i1>, <1 x i1> addrspace(4)* %in %ext = zext <1 x i1> %load to <1 x i32> store <1 x i32> %ext, <1 x i32> addrspace(1)* %out ret void } ; FUNC-LABEL: {{^}}constant_sextload_v1i1_to_v1i32: -define amdgpu_kernel void @constant_sextload_v1i1_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i1> addrspace(2)* nocapture %in) #0 { - %load = load <1 x i1>, <1 x i1> addrspace(2)* %in +define amdgpu_kernel void @constant_sextload_v1i1_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i1> addrspace(4)* nocapture %in) #0 { + %load = load <1 x i1>, <1 x i1> addrspace(4)* %in %ext = sext <1 x i1> %load to <1 x i32> store <1 x i32> %ext, <1 x i32> addrspace(1)* %out ret void } ; FUNC-LABEL: {{^}}constant_zextload_v2i1_to_v2i32: -define amdgpu_kernel void @constant_zextload_v2i1_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i1> addrspace(2)* nocapture %in) #0 { - %load = load <2 x i1>, <2 x i1> addrspace(2)* %in +define amdgpu_kernel void @constant_zextload_v2i1_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i1> addrspace(4)* nocapture %in) #0 { + %load = load <2 x i1>, <2 x i1> addrspace(4)* %in %ext = zext <2 x i1> %load to <2 x i32> store <2 x i32> %ext, <2 x i32> addrspace(1)* %out ret void } ; FUNC-LABEL: {{^}}constant_sextload_v2i1_to_v2i32: -define amdgpu_kernel void @constant_sextload_v2i1_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i1> addrspace(2)* nocapture %in) #0 { - %load = load <2 x i1>, <2 x i1> addrspace(2)* %in +define amdgpu_kernel void @constant_sextload_v2i1_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i1> addrspace(4)* nocapture %in) #0 { + %load = load <2 x i1>, <2 x i1> addrspace(4)* %in %ext = sext <2 x i1> %load to <2 x i32> store <2 x i32> %ext, <2 x i32> addrspace(1)* %out ret void } ; FUNC-LABEL: {{^}}constant_zextload_v3i1_to_v3i32: -define amdgpu_kernel void @constant_zextload_v3i1_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i1> addrspace(2)* nocapture %in) #0 { - %load = load <3 x i1>, <3 x i1> addrspace(2)* %in +define amdgpu_kernel void @constant_zextload_v3i1_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i1> addrspace(4)* nocapture %in) #0 { + %load = load <3 x i1>, <3 x i1> addrspace(4)* %in %ext = zext <3 x i1> %load to 
<3 x i32> store <3 x i32> %ext, <3 x i32> addrspace(1)* %out ret void } ; FUNC-LABEL: {{^}}constant_sextload_v3i1_to_v3i32: -define amdgpu_kernel void @constant_sextload_v3i1_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i1> addrspace(2)* nocapture %in) #0 { - %load = load <3 x i1>, <3 x i1> addrspace(2)* %in +define amdgpu_kernel void @constant_sextload_v3i1_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i1> addrspace(4)* nocapture %in) #0 { + %load = load <3 x i1>, <3 x i1> addrspace(4)* %in %ext = sext <3 x i1> %load to <3 x i32> store <3 x i32> %ext, <3 x i32> addrspace(1)* %out ret void } ; FUNC-LABEL: {{^}}constant_zextload_v4i1_to_v4i32: -define amdgpu_kernel void @constant_zextload_v4i1_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i1> addrspace(2)* nocapture %in) #0 { - %load = load <4 x i1>, <4 x i1> addrspace(2)* %in +define amdgpu_kernel void @constant_zextload_v4i1_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i1> addrspace(4)* nocapture %in) #0 { + %load = load <4 x i1>, <4 x i1> addrspace(4)* %in %ext = zext <4 x i1> %load to <4 x i32> store <4 x i32> %ext, <4 x i32> addrspace(1)* %out ret void } ; FUNC-LABEL: {{^}}constant_sextload_v4i1_to_v4i32: -define amdgpu_kernel void @constant_sextload_v4i1_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i1> addrspace(2)* nocapture %in) #0 { - %load = load <4 x i1>, <4 x i1> addrspace(2)* %in +define amdgpu_kernel void @constant_sextload_v4i1_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i1> addrspace(4)* nocapture %in) #0 { + %load = load <4 x i1>, <4 x i1> addrspace(4)* %in %ext = sext <4 x i1> %load to <4 x i32> store <4 x i32> %ext, <4 x i32> addrspace(1)* %out ret void } ; FUNC-LABEL: {{^}}constant_zextload_v8i1_to_v8i32: -define amdgpu_kernel void @constant_zextload_v8i1_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i1> addrspace(2)* nocapture %in) #0 { - %load = load <8 x i1>, <8 x i1> addrspace(2)* %in +define amdgpu_kernel void @constant_zextload_v8i1_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i1> addrspace(4)* nocapture %in) #0 { + %load = load <8 x i1>, <8 x i1> addrspace(4)* %in %ext = zext <8 x i1> %load to <8 x i32> store <8 x i32> %ext, <8 x i32> addrspace(1)* %out ret void } ; FUNC-LABEL: {{^}}constant_sextload_v8i1_to_v8i32: -define amdgpu_kernel void @constant_sextload_v8i1_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i1> addrspace(2)* nocapture %in) #0 { - %load = load <8 x i1>, <8 x i1> addrspace(2)* %in +define amdgpu_kernel void @constant_sextload_v8i1_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i1> addrspace(4)* nocapture %in) #0 { + %load = load <8 x i1>, <8 x i1> addrspace(4)* %in %ext = sext <8 x i1> %load to <8 x i32> store <8 x i32> %ext, <8 x i32> addrspace(1)* %out ret void } ; FUNC-LABEL: {{^}}constant_zextload_v16i1_to_v16i32: -define amdgpu_kernel void @constant_zextload_v16i1_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i1> addrspace(2)* nocapture %in) #0 { - %load = load <16 x i1>, <16 x i1> addrspace(2)* %in +define amdgpu_kernel void @constant_zextload_v16i1_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i1> addrspace(4)* nocapture %in) #0 { + %load = load <16 x i1>, <16 x i1> addrspace(4)* %in %ext = zext <16 x i1> %load to <16 x i32> store <16 x i32> %ext, <16 x i32> addrspace(1)* %out ret void } ; FUNC-LABEL: {{^}}constant_sextload_v16i1_to_v16i32: -define amdgpu_kernel void @constant_sextload_v16i1_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i1> addrspace(2)* nocapture %in) #0 { - %load = load <16 x i1>, <16 x i1> addrspace(2)* %in +define amdgpu_kernel void @constant_sextload_v16i1_to_v16i32(<16 x i32> 
addrspace(1)* %out, <16 x i1> addrspace(4)* nocapture %in) #0 { + %load = load <16 x i1>, <16 x i1> addrspace(4)* %in %ext = sext <16 x i1> %load to <16 x i32> store <16 x i32> %ext, <16 x i32> addrspace(1)* %out ret void } ; FUNC-LABEL: {{^}}constant_zextload_v32i1_to_v32i32: -define amdgpu_kernel void @constant_zextload_v32i1_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i1> addrspace(2)* nocapture %in) #0 { - %load = load <32 x i1>, <32 x i1> addrspace(2)* %in +define amdgpu_kernel void @constant_zextload_v32i1_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i1> addrspace(4)* nocapture %in) #0 { + %load = load <32 x i1>, <32 x i1> addrspace(4)* %in %ext = zext <32 x i1> %load to <32 x i32> store <32 x i32> %ext, <32 x i32> addrspace(1)* %out ret void } ; FUNC-LABEL: {{^}}constant_sextload_v32i1_to_v32i32: -define amdgpu_kernel void @constant_sextload_v32i1_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i1> addrspace(2)* nocapture %in) #0 { - %load = load <32 x i1>, <32 x i1> addrspace(2)* %in +define amdgpu_kernel void @constant_sextload_v32i1_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i1> addrspace(4)* nocapture %in) #0 { + %load = load <32 x i1>, <32 x i1> addrspace(4)* %in %ext = sext <32 x i1> %load to <32 x i32> store <32 x i32> %ext, <32 x i32> addrspace(1)* %out ret void } ; FUNC-LABEL: {{^}}constant_zextload_v64i1_to_v64i32: -define amdgpu_kernel void @constant_zextload_v64i1_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i1> addrspace(2)* nocapture %in) #0 { - %load = load <64 x i1>, <64 x i1> addrspace(2)* %in +define amdgpu_kernel void @constant_zextload_v64i1_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i1> addrspace(4)* nocapture %in) #0 { + %load = load <64 x i1>, <64 x i1> addrspace(4)* %in %ext = zext <64 x i1> %load to <64 x i32> store <64 x i32> %ext, <64 x i32> addrspace(1)* %out ret void } ; FUNC-LABEL: {{^}}constant_sextload_v64i1_to_v64i32: -define amdgpu_kernel void @constant_sextload_v64i1_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i1> addrspace(2)* nocapture %in) #0 { - %load = load <64 x i1>, <64 x i1> addrspace(2)* %in +define amdgpu_kernel void @constant_sextload_v64i1_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i1> addrspace(4)* nocapture %in) #0 { + %load = load <64 x i1>, <64 x i1> addrspace(4)* %in %ext = sext <64 x i1> %load to <64 x i32> store <64 x i32> %ext, <64 x i32> addrspace(1)* %out ret void @@ -221,8 +221,8 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i32(<64 x i32> addrspac ; GCN-DAG: v_mov_b32_e32 {{v[0-9]+}}, 0{{$}} ; GCN-DAG: v_and_b32_e32 {{v[0-9]+}}, 1, [[LOAD]] ; GCN: buffer_store_dwordx2 -define amdgpu_kernel void @constant_zextload_i1_to_i64(i64 addrspace(1)* %out, i1 addrspace(2)* nocapture %in) #0 { - %a = load i1, i1 addrspace(2)* %in +define amdgpu_kernel void @constant_zextload_i1_to_i64(i64 addrspace(1)* %out, i1 addrspace(4)* nocapture %in) #0 { + %a = load i1, i1 addrspace(4)* %in %ext = zext i1 %a to i64 store i64 %ext, i64 addrspace(1)* %out ret void @@ -233,136 +233,136 @@ define amdgpu_kernel void @constant_zextload_i1_to_i64(i64 addrspace(1)* %out, i ; GCN: v_bfe_i32 [[BFE:v[0-9]+]], {{v[0-9]+}}, 0, 1{{$}} ; GCN: v_ashrrev_i32_e32 v{{[0-9]+}}, 31, [[BFE]] ; GCN: buffer_store_dwordx2 -define amdgpu_kernel void @constant_sextload_i1_to_i64(i64 addrspace(1)* %out, i1 addrspace(2)* nocapture %in) #0 { - %a = load i1, i1 addrspace(2)* %in +define amdgpu_kernel void @constant_sextload_i1_to_i64(i64 addrspace(1)* %out, i1 addrspace(4)* nocapture %in) #0 { + %a = load i1, i1 addrspace(4)* %in %ext = sext 
i1 %a to i64 store i64 %ext, i64 addrspace(1)* %out ret void } ; FUNC-LABEL: {{^}}constant_zextload_v1i1_to_v1i64: -define amdgpu_kernel void @constant_zextload_v1i1_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i1> addrspace(2)* nocapture %in) #0 { - %load = load <1 x i1>, <1 x i1> addrspace(2)* %in +define amdgpu_kernel void @constant_zextload_v1i1_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i1> addrspace(4)* nocapture %in) #0 { + %load = load <1 x i1>, <1 x i1> addrspace(4)* %in %ext = zext <1 x i1> %load to <1 x i64> store <1 x i64> %ext, <1 x i64> addrspace(1)* %out ret void } ; FUNC-LABEL: {{^}}constant_sextload_v1i1_to_v1i64: -define amdgpu_kernel void @constant_sextload_v1i1_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i1> addrspace(2)* nocapture %in) #0 { - %load = load <1 x i1>, <1 x i1> addrspace(2)* %in +define amdgpu_kernel void @constant_sextload_v1i1_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i1> addrspace(4)* nocapture %in) #0 { + %load = load <1 x i1>, <1 x i1> addrspace(4)* %in %ext = sext <1 x i1> %load to <1 x i64> store <1 x i64> %ext, <1 x i64> addrspace(1)* %out ret void } ; FUNC-LABEL: {{^}}constant_zextload_v2i1_to_v2i64: -define amdgpu_kernel void @constant_zextload_v2i1_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i1> addrspace(2)* nocapture %in) #0 { - %load = load <2 x i1>, <2 x i1> addrspace(2)* %in +define amdgpu_kernel void @constant_zextload_v2i1_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i1> addrspace(4)* nocapture %in) #0 { + %load = load <2 x i1>, <2 x i1> addrspace(4)* %in %ext = zext <2 x i1> %load to <2 x i64> store <2 x i64> %ext, <2 x i64> addrspace(1)* %out ret void } ; FUNC-LABEL: {{^}}constant_sextload_v2i1_to_v2i64: -define amdgpu_kernel void @constant_sextload_v2i1_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i1> addrspace(2)* nocapture %in) #0 { - %load = load <2 x i1>, <2 x i1> addrspace(2)* %in +define amdgpu_kernel void @constant_sextload_v2i1_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i1> addrspace(4)* nocapture %in) #0 { + %load = load <2 x i1>, <2 x i1> addrspace(4)* %in %ext = sext <2 x i1> %load to <2 x i64> store <2 x i64> %ext, <2 x i64> addrspace(1)* %out ret void } ; FUNC-LABEL: {{^}}constant_zextload_v3i1_to_v3i64: -define amdgpu_kernel void @constant_zextload_v3i1_to_v3i64(<3 x i64> addrspace(1)* %out, <3 x i1> addrspace(2)* nocapture %in) #0 { - %load = load <3 x i1>, <3 x i1> addrspace(2)* %in +define amdgpu_kernel void @constant_zextload_v3i1_to_v3i64(<3 x i64> addrspace(1)* %out, <3 x i1> addrspace(4)* nocapture %in) #0 { + %load = load <3 x i1>, <3 x i1> addrspace(4)* %in %ext = zext <3 x i1> %load to <3 x i64> store <3 x i64> %ext, <3 x i64> addrspace(1)* %out ret void } ; FUNC-LABEL: {{^}}constant_sextload_v3i1_to_v3i64: -define amdgpu_kernel void @constant_sextload_v3i1_to_v3i64(<3 x i64> addrspace(1)* %out, <3 x i1> addrspace(2)* nocapture %in) #0 { - %load = load <3 x i1>, <3 x i1> addrspace(2)* %in +define amdgpu_kernel void @constant_sextload_v3i1_to_v3i64(<3 x i64> addrspace(1)* %out, <3 x i1> addrspace(4)* nocapture %in) #0 { + %load = load <3 x i1>, <3 x i1> addrspace(4)* %in %ext = sext <3 x i1> %load to <3 x i64> store <3 x i64> %ext, <3 x i64> addrspace(1)* %out ret void } ; FUNC-LABEL: {{^}}constant_zextload_v4i1_to_v4i64: -define amdgpu_kernel void @constant_zextload_v4i1_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i1> addrspace(2)* nocapture %in) #0 { - %load = load <4 x i1>, <4 x i1> addrspace(2)* %in +define amdgpu_kernel void @constant_zextload_v4i1_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i1> 
addrspace(4)* nocapture %in) #0 { + %load = load <4 x i1>, <4 x i1> addrspace(4)* %in %ext = zext <4 x i1> %load to <4 x i64> store <4 x i64> %ext, <4 x i64> addrspace(1)* %out ret void } ; FUNC-LABEL: {{^}}constant_sextload_v4i1_to_v4i64: -define amdgpu_kernel void @constant_sextload_v4i1_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i1> addrspace(2)* nocapture %in) #0 { - %load = load <4 x i1>, <4 x i1> addrspace(2)* %in +define amdgpu_kernel void @constant_sextload_v4i1_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i1> addrspace(4)* nocapture %in) #0 { + %load = load <4 x i1>, <4 x i1> addrspace(4)* %in %ext = sext <4 x i1> %load to <4 x i64> store <4 x i64> %ext, <4 x i64> addrspace(1)* %out ret void } ; FUNC-LABEL: {{^}}constant_zextload_v8i1_to_v8i64: -define amdgpu_kernel void @constant_zextload_v8i1_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i1> addrspace(2)* nocapture %in) #0 { - %load = load <8 x i1>, <8 x i1> addrspace(2)* %in +define amdgpu_kernel void @constant_zextload_v8i1_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i1> addrspace(4)* nocapture %in) #0 { + %load = load <8 x i1>, <8 x i1> addrspace(4)* %in %ext = zext <8 x i1> %load to <8 x i64> store <8 x i64> %ext, <8 x i64> addrspace(1)* %out ret void } ; FUNC-LABEL: {{^}}constant_sextload_v8i1_to_v8i64: -define amdgpu_kernel void @constant_sextload_v8i1_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i1> addrspace(2)* nocapture %in) #0 { - %load = load <8 x i1>, <8 x i1> addrspace(2)* %in +define amdgpu_kernel void @constant_sextload_v8i1_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i1> addrspace(4)* nocapture %in) #0 { + %load = load <8 x i1>, <8 x i1> addrspace(4)* %in %ext = sext <8 x i1> %load to <8 x i64> store <8 x i64> %ext, <8 x i64> addrspace(1)* %out ret void } ; FUNC-LABEL: {{^}}constant_zextload_v16i1_to_v16i64: -define amdgpu_kernel void @constant_zextload_v16i1_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i1> addrspace(2)* nocapture %in) #0 { - %load = load <16 x i1>, <16 x i1> addrspace(2)* %in +define amdgpu_kernel void @constant_zextload_v16i1_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i1> addrspace(4)* nocapture %in) #0 { + %load = load <16 x i1>, <16 x i1> addrspace(4)* %in %ext = zext <16 x i1> %load to <16 x i64> store <16 x i64> %ext, <16 x i64> addrspace(1)* %out ret void } ; FUNC-LABEL: {{^}}constant_sextload_v16i1_to_v16i64: -define amdgpu_kernel void @constant_sextload_v16i1_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i1> addrspace(2)* nocapture %in) #0 { - %load = load <16 x i1>, <16 x i1> addrspace(2)* %in +define amdgpu_kernel void @constant_sextload_v16i1_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i1> addrspace(4)* nocapture %in) #0 { + %load = load <16 x i1>, <16 x i1> addrspace(4)* %in %ext = sext <16 x i1> %load to <16 x i64> store <16 x i64> %ext, <16 x i64> addrspace(1)* %out ret void } ; FUNC-LABEL: {{^}}constant_zextload_v32i1_to_v32i64: -define amdgpu_kernel void @constant_zextload_v32i1_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i1> addrspace(2)* nocapture %in) #0 { - %load = load <32 x i1>, <32 x i1> addrspace(2)* %in +define amdgpu_kernel void @constant_zextload_v32i1_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i1> addrspace(4)* nocapture %in) #0 { + %load = load <32 x i1>, <32 x i1> addrspace(4)* %in %ext = zext <32 x i1> %load to <32 x i64> store <32 x i64> %ext, <32 x i64> addrspace(1)* %out ret void } ; FUNC-LABEL: {{^}}constant_sextload_v32i1_to_v32i64: -define amdgpu_kernel void @constant_sextload_v32i1_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i1> 
addrspace(2)* nocapture %in) #0 { - %load = load <32 x i1>, <32 x i1> addrspace(2)* %in +define amdgpu_kernel void @constant_sextload_v32i1_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i1> addrspace(4)* nocapture %in) #0 { + %load = load <32 x i1>, <32 x i1> addrspace(4)* %in %ext = sext <32 x i1> %load to <32 x i64> store <32 x i64> %ext, <32 x i64> addrspace(1)* %out ret void } ; FUNC-LABEL: {{^}}constant_zextload_v64i1_to_v64i64: -define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i1> addrspace(2)* nocapture %in) #0 { - %load = load <64 x i1>, <64 x i1> addrspace(2)* %in +define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i1> addrspace(4)* nocapture %in) #0 { + %load = load <64 x i1>, <64 x i1> addrspace(4)* %in %ext = zext <64 x i1> %load to <64 x i64> store <64 x i64> %ext, <64 x i64> addrspace(1)* %out ret void } ; FUNC-LABEL: {{^}}constant_sextload_v64i1_to_v64i64: -define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i1> addrspace(2)* nocapture %in) #0 { - %load = load <64 x i1>, <64 x i1> addrspace(2)* %in +define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i1> addrspace(4)* nocapture %in) #0 { + %load = load <64 x i1>, <64 x i1> addrspace(4)* %in %ext = sext <64 x i1> %load to <64 x i64> store <64 x i64> %ext, <64 x i64> addrspace(1)* %out ret void diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll index 72fde04ba39..68ff90e32e6 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll @@ -8,9 +8,9 @@ ; GCN-HSA: flat_load_ushort ; EG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 -define amdgpu_kernel void @constant_load_i16(i16 addrspace(1)* %out, i16 addrspace(2)* %in) { +define amdgpu_kernel void @constant_load_i16(i16 addrspace(1)* %out, i16 addrspace(4)* %in) { entry: - %ld = load i16, i16 addrspace(2)* %in + %ld = load i16, i16 addrspace(4)* %in store i16 %ld, i16 addrspace(1)* %out ret void } @@ -19,9 +19,9 @@ entry: ; GCN: s_load_dword s ; EG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 -define amdgpu_kernel void @constant_load_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %in) { +define amdgpu_kernel void @constant_load_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %in) { entry: - %ld = load <2 x i16>, <2 x i16> addrspace(2)* %in + %ld = load <2 x i16>, <2 x i16> addrspace(4)* %in store <2 x i16> %ld, <2 x i16> addrspace(1)* %out ret void } @@ -31,9 +31,9 @@ entry: ; EG-DAG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 ; EG-DAG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 4, #1 -define amdgpu_kernel void @constant_load_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> addrspace(2)* %in) { +define amdgpu_kernel void @constant_load_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> addrspace(4)* %in) { entry: - %ld = load <3 x i16>, <3 x i16> addrspace(2)* %in + %ld = load <3 x i16>, <3 x i16> addrspace(4)* %in store <3 x i16> %ld, <3 x i16> addrspace(1)* %out ret void } @@ -42,9 +42,9 @@ entry: ; GCN: s_load_dwordx2 ; EG: VTX_READ_64 T{{[0-9]+}}.XY, T{{[0-9]+}}.X, 0, #1 -define amdgpu_kernel void @constant_load_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(2)* %in) { +define amdgpu_kernel void @constant_load_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(4)* %in) { entry: - %ld = load <4 x i16>, <4 x i16> addrspace(2)* %in + %ld = load 
<4 x i16>, <4 x i16> addrspace(4)* %in store <4 x i16> %ld, <4 x i16> addrspace(1)* %out ret void } @@ -53,9 +53,9 @@ entry: ; GCN: s_load_dwordx4 ; EG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1 -define amdgpu_kernel void @constant_load_v8i16(<8 x i16> addrspace(1)* %out, <8 x i16> addrspace(2)* %in) { +define amdgpu_kernel void @constant_load_v8i16(<8 x i16> addrspace(1)* %out, <8 x i16> addrspace(4)* %in) { entry: - %ld = load <8 x i16>, <8 x i16> addrspace(2)* %in + %ld = load <8 x i16>, <8 x i16> addrspace(4)* %in store <8 x i16> %ld, <8 x i16> addrspace(1)* %out ret void } @@ -65,9 +65,9 @@ entry: ; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1 ; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 16, #1 -define amdgpu_kernel void @constant_load_v16i16(<16 x i16> addrspace(1)* %out, <16 x i16> addrspace(2)* %in) { +define amdgpu_kernel void @constant_load_v16i16(<16 x i16> addrspace(1)* %out, <16 x i16> addrspace(4)* %in) { entry: - %ld = load <16 x i16>, <16 x i16> addrspace(2)* %in + %ld = load <16 x i16>, <16 x i16> addrspace(4)* %in store <16 x i16> %ld, <16 x i16> addrspace(1)* %out ret void } @@ -80,8 +80,8 @@ entry: ; GCN-HSA: flat_store_dword ; EG: VTX_READ_16 T{{[0-9]+\.X, T[0-9]+\.X}}, 0, #1 -define amdgpu_kernel void @constant_zextload_i16_to_i32(i32 addrspace(1)* %out, i16 addrspace(2)* %in) #0 { - %a = load i16, i16 addrspace(2)* %in +define amdgpu_kernel void @constant_zextload_i16_to_i32(i32 addrspace(1)* %out, i16 addrspace(4)* %in) #0 { + %a = load i16, i16 addrspace(4)* %in %ext = zext i16 %a to i32 store i32 %ext, i32 addrspace(1)* %out ret void @@ -97,8 +97,8 @@ define amdgpu_kernel void @constant_zextload_i16_to_i32(i32 addrspace(1)* %out, ; EG: VTX_READ_16 [[DST:T[0-9]\.[XYZW]]], [[DST]], 0, #1 ; EG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST]], 0.0, literal ; EG: 16 -define amdgpu_kernel void @constant_sextload_i16_to_i32(i32 addrspace(1)* %out, i16 addrspace(2)* %in) #0 { - %a = load i16, i16 addrspace(2)* %in +define amdgpu_kernel void @constant_sextload_i16_to_i32(i32 addrspace(1)* %out, i16 addrspace(4)* %in) #0 { + %a = load i16, i16 addrspace(4)* %in %ext = sext i16 %a to i32 store i32 %ext, i32 addrspace(1)* %out ret void @@ -109,8 +109,8 @@ define amdgpu_kernel void @constant_sextload_i16_to_i32(i32 addrspace(1)* %out, ; GCN-HSA: flat_load_ushort ; EG: VTX_READ_16 T{{[0-9]+\.X, T[0-9]+\.X}}, 0, #1 -define amdgpu_kernel void @constant_zextload_v1i16_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i16> addrspace(2)* %in) #0 { - %load = load <1 x i16>, <1 x i16> addrspace(2)* %in +define amdgpu_kernel void @constant_zextload_v1i16_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i16> addrspace(4)* %in) #0 { + %load = load <1 x i16>, <1 x i16> addrspace(4)* %in %ext = zext <1 x i16> %load to <1 x i32> store <1 x i32> %ext, <1 x i32> addrspace(1)* %out ret void @@ -123,8 +123,8 @@ define amdgpu_kernel void @constant_zextload_v1i16_to_v1i32(<1 x i32> addrspace( ; EG: VTX_READ_16 [[DST:T[0-9]\.[XYZW]]], [[DST]], 0, #1 ; EG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST]], 0.0, literal ; EG: 16 -define amdgpu_kernel void @constant_sextload_v1i16_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i16> addrspace(2)* %in) #0 { - %load = load <1 x i16>, <1 x i16> addrspace(2)* %in +define amdgpu_kernel void @constant_sextload_v1i16_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i16> addrspace(4)* %in) #0 { + %load = load <1 x i16>, <1 x i16> addrspace(4)* %in %ext = sext <1 x i16> %load to <1 x i32> store <1 x i32> %ext, <1 x i32> addrspace(1)* %out ret void @@ 
-140,8 +140,8 @@ define amdgpu_kernel void @constant_sextload_v1i16_to_v1i32(<1 x i32> addrspace( ; EG: BFE_UINT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST]], literal ; EG: 16 ; EG: 16 -define amdgpu_kernel void @constant_zextload_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(2)* %in) #0 { - %load = load <2 x i16>, <2 x i16> addrspace(2)* %in +define amdgpu_kernel void @constant_zextload_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(4)* %in) #0 { + %load = load <2 x i16>, <2 x i16> addrspace(4)* %in %ext = zext <2 x i16> %load to <2 x i32> store <2 x i32> %ext, <2 x i32> addrspace(1)* %out ret void @@ -160,8 +160,8 @@ define amdgpu_kernel void @constant_zextload_v2i16_to_v2i32(<2 x i32> addrspace( ; EG-DAG: BFE_INT {{[* ]*}}[[ST]].Y, {{PV\.[XYZW]}}, 0.0, literal ; EG-DAG: 16 ; EG-DAG: 16 -define amdgpu_kernel void @constant_sextload_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(2)* %in) #0 { - %load = load <2 x i16>, <2 x i16> addrspace(2)* %in +define amdgpu_kernel void @constant_sextload_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(4)* %in) #0 { + %load = load <2 x i16>, <2 x i16> addrspace(4)* %in %ext = sext <2 x i16> %load to <2 x i32> store <2 x i32> %ext, <2 x i32> addrspace(1)* %out ret void @@ -183,9 +183,9 @@ define amdgpu_kernel void @constant_sextload_v2i16_to_v2i32(<2 x i32> addrspace( ; EG-DAG: AND_INT {{[* ]*}}[[ST_HI]].X, {{T[0-9]\.[XYZW]}}, literal ; EG-DAG: 65535 ; EG-DAG: 65535 -define amdgpu_kernel void @constant_zextload_v3i16_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i16> addrspace(2)* %in) { +define amdgpu_kernel void @constant_zextload_v3i16_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i16> addrspace(4)* %in) { entry: - %ld = load <3 x i16>, <3 x i16> addrspace(2)* %in + %ld = load <3 x i16>, <3 x i16> addrspace(4)* %in %ext = zext <3 x i16> %ld to <3 x i32> store <3 x i32> %ext, <3 x i32> addrspace(1)* %out ret void @@ -204,9 +204,9 @@ entry: ; EG-DAG: BFE_INT {{[* ]*}}[[ST_HI]].X, {{T[0-9]\.[XYZW]}}, 0.0, literal ; EG-DAG: 16 ; EG-DAG: 16 -define amdgpu_kernel void @constant_sextload_v3i16_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i16> addrspace(2)* %in) { +define amdgpu_kernel void @constant_sextload_v3i16_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i16> addrspace(4)* %in) { entry: - %ld = load <3 x i16>, <3 x i16> addrspace(2)* %in + %ld = load <3 x i16>, <3 x i16> addrspace(4)* %in %ext = sext <3 x i16> %ld to <3 x i32> store <3 x i32> %ext, <3 x i32> addrspace(1)* %out ret void @@ -229,8 +229,8 @@ entry: ; EG-DAG: AND_INT {{[* ]*}}[[ST]].Z, {{T[0-9]\.[XYZW]}}, literal ; EG-DAG: 65535 ; EG-DAG: 65535 -define amdgpu_kernel void @constant_zextload_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(2)* %in) #0 { - %load = load <4 x i16>, <4 x i16> addrspace(2)* %in +define amdgpu_kernel void @constant_zextload_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(4)* %in) #0 { + %load = load <4 x i16>, <4 x i16> addrspace(4)* %in %ext = zext <4 x i16> %load to <4 x i32> store <4 x i32> %ext, <4 x i32> addrspace(1)* %out ret void @@ -254,8 +254,8 @@ define amdgpu_kernel void @constant_zextload_v4i16_to_v4i32(<4 x i32> addrspace( ; EG-DAG: 16 ; EG-DAG: 16 ; EG-DAG: 16 -define amdgpu_kernel void @constant_sextload_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(2)* %in) #0 { - %load = load <4 x i16>, <4 x i16> addrspace(2)* %in +define amdgpu_kernel void @constant_sextload_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(4)* %in) #0 { + 
%load = load <4 x i16>, <4 x i16> addrspace(4)* %in %ext = sext <4 x i16> %load to <4 x i32> store <4 x i32> %ext, <4 x i32> addrspace(1)* %out ret void @@ -288,8 +288,8 @@ define amdgpu_kernel void @constant_sextload_v4i16_to_v4i32(<4 x i32> addrspace( ; EG-DAG: 65535 ; EG-DAG: 65535 ; EG-DAG: 65535 -define amdgpu_kernel void @constant_zextload_v8i16_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i16> addrspace(2)* %in) #0 { - %load = load <8 x i16>, <8 x i16> addrspace(2)* %in +define amdgpu_kernel void @constant_zextload_v8i16_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i16> addrspace(4)* %in) #0 { + %load = load <8 x i16>, <8 x i16> addrspace(4)* %in %ext = zext <8 x i16> %load to <8 x i32> store <8 x i32> %ext, <8 x i32> addrspace(1)* %out ret void @@ -322,8 +322,8 @@ define amdgpu_kernel void @constant_zextload_v8i16_to_v8i32(<8 x i32> addrspace( ; EG-DAG: 16 ; EG-DAG: 16 ; EG-DAG: 16 -define amdgpu_kernel void @constant_sextload_v8i16_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i16> addrspace(2)* %in) #0 { - %load = load <8 x i16>, <8 x i16> addrspace(2)* %in +define amdgpu_kernel void @constant_sextload_v8i16_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i16> addrspace(4)* %in) #0 { + %load = load <8 x i16>, <8 x i16> addrspace(4)* %in %ext = sext <8 x i16> %load to <8 x i32> store <8 x i32> %ext, <8 x i32> addrspace(1)* %out ret void @@ -337,8 +337,8 @@ define amdgpu_kernel void @constant_sextload_v8i16_to_v8i32(<8 x i32> addrspace( ; v16i16 is naturally 32 byte aligned ; EG-DAG: VTX_READ_128 [[DST_HI:T[0-9]+\.XYZW]], {{T[0-9]+.[XYZW]}}, 0, #1 ; EG-DAG: VTX_READ_128 [[DST_LO:T[0-9]+\.XYZW]], {{T[0-9]+.[XYZW]}}, 16, #1 -define amdgpu_kernel void @constant_zextload_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i16> addrspace(2)* %in) #0 { - %load = load <16 x i16>, <16 x i16> addrspace(2)* %in +define amdgpu_kernel void @constant_zextload_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i16> addrspace(4)* %in) #0 { + %load = load <16 x i16>, <16 x i16> addrspace(4)* %in %ext = zext <16 x i16> %load to <16 x i32> store <16 x i32> %ext, <16 x i32> addrspace(1)* %out ret void @@ -352,8 +352,8 @@ define amdgpu_kernel void @constant_zextload_v16i16_to_v16i32(<16 x i32> addrspa ; v16i16 is naturally 32 byte aligned ; EG-DAG: VTX_READ_128 [[DST_HI:T[0-9]+\.XYZW]], {{T[0-9]+\.[XYZW]}}, 0, #1 ; EG-DAG: VTX_READ_128 [[DST_LO:T[0-9]+\.XYZW]], {{T[0-9]+\.[XYZW]}}, 16, #1 -define amdgpu_kernel void @constant_sextload_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i16> addrspace(2)* %in) #0 { - %load = load <16 x i16>, <16 x i16> addrspace(2)* %in +define amdgpu_kernel void @constant_sextload_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i16> addrspace(4)* %in) #0 { + %load = load <16 x i16>, <16 x i16> addrspace(4)* %in %ext = sext <16 x i16> %load to <16 x i32> store <16 x i32> %ext, <16 x i32> addrspace(1)* %out ret void @@ -369,8 +369,8 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i32(<16 x i32> addrspa ; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+\.[XYZW]}}, 16, #1 ; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+\.[XYZW]}}, 32, #1 ; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+\.[XYZW]}}, 48, #1 -define amdgpu_kernel void @constant_zextload_v32i16_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i16> addrspace(2)* %in) #0 { - %load = load <32 x i16>, <32 x i16> addrspace(2)* %in +define amdgpu_kernel void @constant_zextload_v32i16_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i16> addrspace(4)* %in) #0 { + %load = load <32 x i16>, <32 x i16> 
addrspace(4)* %in %ext = zext <32 x i16> %load to <32 x i32> store <32 x i32> %ext, <32 x i32> addrspace(1)* %out ret void @@ -385,8 +385,8 @@ define amdgpu_kernel void @constant_zextload_v32i16_to_v32i32(<32 x i32> addrspa ; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+\.[XYZW]}}, 16, #1 ; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+\.[XYZW]}}, 32, #1 ; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+\.[XYZW]}}, 48, #1 -define amdgpu_kernel void @constant_sextload_v32i16_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i16> addrspace(2)* %in) #0 { - %load = load <32 x i16>, <32 x i16> addrspace(2)* %in +define amdgpu_kernel void @constant_sextload_v32i16_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i16> addrspace(4)* %in) #0 { + %load = load <32 x i16>, <32 x i16> addrspace(4)* %in %ext = sext <32 x i16> %load to <32 x i32> store <32 x i32> %ext, <32 x i32> addrspace(1)* %out ret void @@ -404,8 +404,8 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i32(<32 x i32> addrspa ; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+\.[XYZW]}}, 80, #1 ; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+\.[XYZW]}}, 96, #1 ; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+\.[XYZW]}}, 112, #1 -define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i16> addrspace(2)* %in) #0 { - %load = load <64 x i16>, <64 x i16> addrspace(2)* %in +define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i16> addrspace(4)* %in) #0 { + %load = load <64 x i16>, <64 x i16> addrspace(4)* %in %ext = zext <64 x i16> %load to <64 x i32> store <64 x i32> %ext, <64 x i32> addrspace(1)* %out ret void @@ -421,8 +421,8 @@ define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(<64 x i32> addrspa ; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+\.[XYZW]}}, 80, #1 ; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+\.[XYZW]}}, 96, #1 ; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+\.[XYZW]}}, 112, #1 -define amdgpu_kernel void @constant_sextload_v64i16_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i16> addrspace(2)* %in) #0 { - %load = load <64 x i16>, <64 x i16> addrspace(2)* %in +define amdgpu_kernel void @constant_sextload_v64i16_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i16> addrspace(4)* %in) #0 { + %load = load <64 x i16>, <64 x i16> addrspace(4)* %in %ext = sext <64 x i16> %load to <64 x i32> store <64 x i32> %ext, <64 x i32> addrspace(1)* %out ret void @@ -438,8 +438,8 @@ define amdgpu_kernel void @constant_sextload_v64i16_to_v64i32(<64 x i32> addrspa ; EG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 ; EG: MOV {{.*}}, 0.0 -define amdgpu_kernel void @constant_zextload_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(2)* %in) #0 { - %a = load i16, i16 addrspace(2)* %in +define amdgpu_kernel void @constant_zextload_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(4)* %in) #0 { + %a = load i16, i16 addrspace(4)* %in %ext = zext i16 %a to i64 store i64 %ext, i64 addrspace(1)* %out ret void @@ -464,8 +464,8 @@ define amdgpu_kernel void @constant_zextload_i16_to_i64(i64 addrspace(1)* %out, ; EG: ASHR {{\**}} {{T[0-9]\.[XYZW]}}, {{.*}}, literal ; TODO: These could be expanded earlier using ASHR 15 ; EG: 31 -define amdgpu_kernel void @constant_sextload_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(2)* %in) #0 { - %a = load i16, i16 addrspace(2)* %in +define amdgpu_kernel void @constant_sextload_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(4)* %in) #0 { + %a = load i16, i16 addrspace(4)* %in %ext = 
sext i16 %a to i64 store i64 %ext, i64 addrspace(1)* %out ret void @@ -475,8 +475,8 @@ define amdgpu_kernel void @constant_sextload_i16_to_i64(i64 addrspace(1)* %out, ; EG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 ; EG: MOV {{.*}}, 0.0 -define amdgpu_kernel void @constant_zextload_v1i16_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i16> addrspace(2)* %in) #0 { - %load = load <1 x i16>, <1 x i16> addrspace(2)* %in +define amdgpu_kernel void @constant_zextload_v1i16_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i16> addrspace(4)* %in) #0 { + %load = load <1 x i16>, <1 x i16> addrspace(4)* %in %ext = zext <1 x i16> %load to <1 x i64> store <1 x i64> %ext, <1 x i64> addrspace(1)* %out ret void @@ -488,8 +488,8 @@ define amdgpu_kernel void @constant_zextload_v1i16_to_v1i64(<1 x i64> addrspace( ; EG: ASHR {{\**}} {{T[0-9]\.[XYZW]}}, {{.*}}, literal ; TODO: These could be expanded earlier using ASHR 15 ; EG: 31 -define amdgpu_kernel void @constant_sextload_v1i16_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i16> addrspace(2)* %in) #0 { - %load = load <1 x i16>, <1 x i16> addrspace(2)* %in +define amdgpu_kernel void @constant_sextload_v1i16_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i16> addrspace(4)* %in) #0 { + %load = load <1 x i16>, <1 x i16> addrspace(4)* %in %ext = sext <1 x i16> %load to <1 x i64> store <1 x i64> %ext, <1 x i64> addrspace(1)* %out ret void @@ -498,8 +498,8 @@ define amdgpu_kernel void @constant_sextload_v1i16_to_v1i64(<1 x i64> addrspace( ; FUNC-LABEL: {{^}}constant_zextload_v2i16_to_v2i64: ; EG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 -define amdgpu_kernel void @constant_zextload_v2i16_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(2)* %in) #0 { - %load = load <2 x i16>, <2 x i16> addrspace(2)* %in +define amdgpu_kernel void @constant_zextload_v2i16_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(4)* %in) #0 { + %load = load <2 x i16>, <2 x i16> addrspace(4)* %in %ext = zext <2 x i16> %load to <2 x i64> store <2 x i64> %ext, <2 x i64> addrspace(1)* %out ret void @@ -508,8 +508,8 @@ define amdgpu_kernel void @constant_zextload_v2i16_to_v2i64(<2 x i64> addrspace( ; FUNC-LABEL: {{^}}constant_sextload_v2i16_to_v2i64: ; EG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 -define amdgpu_kernel void @constant_sextload_v2i16_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(2)* %in) #0 { - %load = load <2 x i16>, <2 x i16> addrspace(2)* %in +define amdgpu_kernel void @constant_sextload_v2i16_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(4)* %in) #0 { + %load = load <2 x i16>, <2 x i16> addrspace(4)* %in %ext = sext <2 x i16> %load to <2 x i64> store <2 x i64> %ext, <2 x i64> addrspace(1)* %out ret void @@ -518,8 +518,8 @@ define amdgpu_kernel void @constant_sextload_v2i16_to_v2i64(<2 x i64> addrspace( ; FUNC-LABEL: {{^}}constant_zextload_v4i16_to_v4i64: ; EG: VTX_READ_64 T{{[0-9]+}}.XY, T{{[0-9]+}}.X, 0, #1 -define amdgpu_kernel void @constant_zextload_v4i16_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i16> addrspace(2)* %in) #0 { - %load = load <4 x i16>, <4 x i16> addrspace(2)* %in +define amdgpu_kernel void @constant_zextload_v4i16_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i16> addrspace(4)* %in) #0 { + %load = load <4 x i16>, <4 x i16> addrspace(4)* %in %ext = zext <4 x i16> %load to <4 x i64> store <4 x i64> %ext, <4 x i64> addrspace(1)* %out ret void @@ -528,8 +528,8 @@ define amdgpu_kernel void @constant_zextload_v4i16_to_v4i64(<4 x i64> addrspace( ; FUNC-LABEL: {{^}}constant_sextload_v4i16_to_v4i64: ; EG: VTX_READ_64 
T{{[0-9]+}}.XY, T{{[0-9]+}}.X, 0, #1 -define amdgpu_kernel void @constant_sextload_v4i16_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i16> addrspace(2)* %in) #0 { - %load = load <4 x i16>, <4 x i16> addrspace(2)* %in +define amdgpu_kernel void @constant_sextload_v4i16_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i16> addrspace(4)* %in) #0 { + %load = load <4 x i16>, <4 x i16> addrspace(4)* %in %ext = sext <4 x i16> %load to <4 x i64> store <4 x i64> %ext, <4 x i64> addrspace(1)* %out ret void @@ -538,8 +538,8 @@ define amdgpu_kernel void @constant_sextload_v4i16_to_v4i64(<4 x i64> addrspace( ; FUNC-LABEL: {{^}}constant_zextload_v8i16_to_v8i64: ; EG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1 -define amdgpu_kernel void @constant_zextload_v8i16_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i16> addrspace(2)* %in) #0 { - %load = load <8 x i16>, <8 x i16> addrspace(2)* %in +define amdgpu_kernel void @constant_zextload_v8i16_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i16> addrspace(4)* %in) #0 { + %load = load <8 x i16>, <8 x i16> addrspace(4)* %in %ext = zext <8 x i16> %load to <8 x i64> store <8 x i64> %ext, <8 x i64> addrspace(1)* %out ret void @@ -548,8 +548,8 @@ define amdgpu_kernel void @constant_zextload_v8i16_to_v8i64(<8 x i64> addrspace( ; FUNC-LABEL: {{^}}constant_sextload_v8i16_to_v8i64: ; EG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1 -define amdgpu_kernel void @constant_sextload_v8i16_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i16> addrspace(2)* %in) #0 { - %load = load <8 x i16>, <8 x i16> addrspace(2)* %in +define amdgpu_kernel void @constant_sextload_v8i16_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i16> addrspace(4)* %in) #0 { + %load = load <8 x i16>, <8 x i16> addrspace(4)* %in %ext = sext <8 x i16> %load to <8 x i64> store <8 x i64> %ext, <8 x i64> addrspace(1)* %out ret void @@ -559,8 +559,8 @@ define amdgpu_kernel void @constant_sextload_v8i16_to_v8i64(<8 x i64> addrspace( ; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1 ; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 16, #1 -define amdgpu_kernel void @constant_zextload_v16i16_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i16> addrspace(2)* %in) #0 { - %load = load <16 x i16>, <16 x i16> addrspace(2)* %in +define amdgpu_kernel void @constant_zextload_v16i16_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i16> addrspace(4)* %in) #0 { + %load = load <16 x i16>, <16 x i16> addrspace(4)* %in %ext = zext <16 x i16> %load to <16 x i64> store <16 x i64> %ext, <16 x i64> addrspace(1)* %out ret void @@ -570,8 +570,8 @@ define amdgpu_kernel void @constant_zextload_v16i16_to_v16i64(<16 x i64> addrspa ; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1 ; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 16, #1 -define amdgpu_kernel void @constant_sextload_v16i16_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i16> addrspace(2)* %in) #0 { - %load = load <16 x i16>, <16 x i16> addrspace(2)* %in +define amdgpu_kernel void @constant_sextload_v16i16_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i16> addrspace(4)* %in) #0 { + %load = load <16 x i16>, <16 x i16> addrspace(4)* %in %ext = sext <16 x i16> %load to <16 x i64> store <16 x i64> %ext, <16 x i64> addrspace(1)* %out ret void @@ -583,8 +583,8 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i64(<16 x i64> addrspa ; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 16, #1 ; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 32, #1 ; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 48, #1 -define amdgpu_kernel 
void @constant_zextload_v32i16_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i16> addrspace(2)* %in) #0 { - %load = load <32 x i16>, <32 x i16> addrspace(2)* %in +define amdgpu_kernel void @constant_zextload_v32i16_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i16> addrspace(4)* %in) #0 { + %load = load <32 x i16>, <32 x i16> addrspace(4)* %in %ext = zext <32 x i16> %load to <32 x i64> store <32 x i64> %ext, <32 x i64> addrspace(1)* %out ret void @@ -596,8 +596,8 @@ define amdgpu_kernel void @constant_zextload_v32i16_to_v32i64(<32 x i64> addrspa ; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 16, #1 ; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 32, #1 ; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 48, #1 -define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i16> addrspace(2)* %in) #0 { - %load = load <32 x i16>, <32 x i16> addrspace(2)* %in +define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i16> addrspace(4)* %in) #0 { + %load = load <32 x i16>, <32 x i16> addrspace(4)* %in %ext = sext <32 x i16> %load to <32 x i64> store <32 x i64> %ext, <32 x i64> addrspace(1)* %out ret void @@ -606,16 +606,16 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(<32 x i64> addrspa ; These trigger undefined register machine verifier errors ; ; XFUNC-LABEL: {{^}}constant_zextload_v64i16_to_v64i64: -; define amdgpu_kernel void @constant_zextload_v64i16_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i16> addrspace(2)* %in) #0 { -; %load = load <64 x i16>, <64 x i16> addrspace(2)* %in +; define amdgpu_kernel void @constant_zextload_v64i16_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i16> addrspace(4)* %in) #0 { +; %load = load <64 x i16>, <64 x i16> addrspace(4)* %in ; %ext = zext <64 x i16> %load to <64 x i64> ; store <64 x i64> %ext, <64 x i64> addrspace(1)* %out ; ret void ; } ; ; XFUNC-LABEL: {{^}}constant_sextload_v64i16_to_v64i64: -; define amdgpu_kernel void @constant_sextload_v64i16_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i16> addrspace(2)* %in) #0 { -; %load = load <64 x i16>, <64 x i16> addrspace(2)* %in +; define amdgpu_kernel void @constant_sextload_v64i16_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i16> addrspace(4)* %in) #0 { +; %load = load <64 x i16>, <64 x i16> addrspace(4)* %in ; %ext = sext <64 x i16> %load to <64 x i64> ; store <64 x i64> %ext, <64 x i64> addrspace(1)* %out ; ret void diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll index 7370d45ca6b..1aa66728136 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll @@ -7,9 +7,9 @@ ; GCN: s_load_dword s{{[0-9]+}} ; EG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0 -define amdgpu_kernel void @constant_load_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_load_i32(i32 addrspace(1)* %out, i32 addrspace(4)* %in) #0 { entry: - %ld = load i32, i32 addrspace(2)* %in + %ld = load i32, i32 addrspace(4)* %in store i32 %ld, i32 addrspace(1)* %out ret void } @@ -18,9 +18,9 @@ entry: ; GCN: s_load_dwordx2 ; EG: VTX_READ_64 -define amdgpu_kernel void @constant_load_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_load_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(4)* %in) #0 { entry: - %ld = load <2 x i32>, <2 x i32> addrspace(2)* %in + %ld = load <2 x i32>, <2 x i32> addrspace(4)* %in 
store <2 x i32> %ld, <2 x i32> addrspace(1)* %out ret void } @@ -29,9 +29,9 @@ entry: ; GCN: s_load_dwordx4 ; EG: VTX_READ_128 -define amdgpu_kernel void @constant_load_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_load_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> addrspace(4)* %in) #0 { entry: - %ld = load <3 x i32>, <3 x i32> addrspace(2)* %in + %ld = load <3 x i32>, <3 x i32> addrspace(4)* %in store <3 x i32> %ld, <3 x i32> addrspace(1)* %out ret void } @@ -40,9 +40,9 @@ entry: ; GCN: s_load_dwordx4 ; EG: VTX_READ_128 -define amdgpu_kernel void @constant_load_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_load_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(4)* %in) #0 { entry: - %ld = load <4 x i32>, <4 x i32> addrspace(2)* %in + %ld = load <4 x i32>, <4 x i32> addrspace(4)* %in store <4 x i32> %ld, <4 x i32> addrspace(1)* %out ret void } @@ -52,9 +52,9 @@ entry: ; EG: VTX_READ_128 ; EG: VTX_READ_128 -define amdgpu_kernel void @constant_load_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_load_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> addrspace(4)* %in) #0 { entry: - %ld = load <8 x i32>, <8 x i32> addrspace(2)* %in + %ld = load <8 x i32>, <8 x i32> addrspace(4)* %in store <8 x i32> %ld, <8 x i32> addrspace(1)* %out ret void } @@ -66,9 +66,9 @@ entry: ; EG: VTX_READ_128 ; EG: VTX_READ_128 ; EG: VTX_READ_128 -define amdgpu_kernel void @constant_load_v16i32(<16 x i32> addrspace(1)* %out, <16 x i32> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_load_v16i32(<16 x i32> addrspace(1)* %out, <16 x i32> addrspace(4)* %in) #0 { entry: - %ld = load <16 x i32>, <16 x i32> addrspace(2)* %in + %ld = load <16 x i32>, <16 x i32> addrspace(4)* %in store <16 x i32> %ld, <16 x i32> addrspace(1)* %out ret void } @@ -81,8 +81,8 @@ entry: ; EG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.XY ; EG: CF_END ; EG: VTX_READ_32 -define amdgpu_kernel void @constant_zextload_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(2)* %in) #0 { - %ld = load i32, i32 addrspace(2)* %in +define amdgpu_kernel void @constant_zextload_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(4)* %in) #0 { + %ld = load i32, i32 addrspace(4)* %in %ext = zext i32 %ld to i64 store i64 %ext, i64 addrspace(1)* %out ret void @@ -98,8 +98,8 @@ define amdgpu_kernel void @constant_zextload_i32_to_i64(i64 addrspace(1)* %out, ; EG: VTX_READ_32 ; EG: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, T{{[0-9]\.[XYZW]}}, literal. 
; EG: 31 -define amdgpu_kernel void @constant_sextload_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(2)* %in) #0 { - %ld = load i32, i32 addrspace(2)* %in +define amdgpu_kernel void @constant_sextload_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(4)* %in) #0 { + %ld = load i32, i32 addrspace(4)* %in %ext = sext i32 %ld to i64 store i64 %ext, i64 addrspace(1)* %out ret void @@ -108,8 +108,8 @@ define amdgpu_kernel void @constant_sextload_i32_to_i64(i64 addrspace(1)* %out, ; FUNC-LABEL: {{^}}constant_zextload_v1i32_to_v1i64: ; GCN: s_load_dword ; GCN: store_dwordx2 -define amdgpu_kernel void @constant_zextload_v1i32_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i32> addrspace(2)* %in) #0 { - %ld = load <1 x i32>, <1 x i32> addrspace(2)* %in +define amdgpu_kernel void @constant_zextload_v1i32_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i32> addrspace(4)* %in) #0 { + %ld = load <1 x i32>, <1 x i32> addrspace(4)* %in %ext = zext <1 x i32> %ld to <1 x i64> store <1 x i64> %ext, <1 x i64> addrspace(1)* %out ret void @@ -119,8 +119,8 @@ define amdgpu_kernel void @constant_zextload_v1i32_to_v1i64(<1 x i64> addrspace( ; GCN: s_load_dword s[[LO:[0-9]+]] ; GCN: s_ashr_i32 s[[HI:[0-9]+]], s[[LO]], 31 ; GCN: store_dwordx2 -define amdgpu_kernel void @constant_sextload_v1i32_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i32> addrspace(2)* %in) #0 { - %ld = load <1 x i32>, <1 x i32> addrspace(2)* %in +define amdgpu_kernel void @constant_sextload_v1i32_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i32> addrspace(4)* %in) #0 { + %ld = load <1 x i32>, <1 x i32> addrspace(4)* %in %ext = sext <1 x i32> %ld to <1 x i64> store <1 x i64> %ext, <1 x i64> addrspace(1)* %out ret void @@ -129,8 +129,8 @@ define amdgpu_kernel void @constant_sextload_v1i32_to_v1i64(<1 x i64> addrspace( ; FUNC-LABEL: {{^}}constant_zextload_v2i32_to_v2i64: ; GCN: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x0{{$}} ; GCN: store_dwordx4 -define amdgpu_kernel void @constant_zextload_v2i32_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i32> addrspace(2)* %in) #0 { - %ld = load <2 x i32>, <2 x i32> addrspace(2)* %in +define amdgpu_kernel void @constant_zextload_v2i32_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i32> addrspace(4)* %in) #0 { + %ld = load <2 x i32>, <2 x i32> addrspace(4)* %in %ext = zext <2 x i32> %ld to <2 x i64> store <2 x i64> %ext, <2 x i64> addrspace(1)* %out ret void @@ -143,8 +143,8 @@ define amdgpu_kernel void @constant_zextload_v2i32_to_v2i64(<2 x i64> addrspace( ; GCN-DAG: s_ashr_i32 ; GCN: store_dwordx4 -define amdgpu_kernel void @constant_sextload_v2i32_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i32> addrspace(2)* %in) #0 { - %ld = load <2 x i32>, <2 x i32> addrspace(2)* %in +define amdgpu_kernel void @constant_sextload_v2i32_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i32> addrspace(4)* %in) #0 { + %ld = load <2 x i32>, <2 x i32> addrspace(4)* %in %ext = sext <2 x i32> %ld to <2 x i64> store <2 x i64> %ext, <2 x i64> addrspace(1)* %out ret void @@ -155,8 +155,8 @@ define amdgpu_kernel void @constant_sextload_v2i32_to_v2i64(<2 x i64> addrspace( ; GCN: store_dwordx4 ; GCN: store_dwordx4 -define amdgpu_kernel void @constant_zextload_v4i32_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i32> addrspace(2)* %in) #0 { - %ld = load <4 x i32>, <4 x i32> addrspace(2)* %in +define amdgpu_kernel void @constant_zextload_v4i32_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i32> addrspace(4)* %in) #0 { + %ld = load <4 x i32>, <4 x i32> addrspace(4)* %in %ext = zext <4 x i32> %ld to <4 x i64> store <4 x i64> %ext, <4 x 
i64> addrspace(1)* %out ret void @@ -172,8 +172,8 @@ define amdgpu_kernel void @constant_zextload_v4i32_to_v4i64(<4 x i64> addrspace( ; GCN: store_dwordx4 ; GCN: store_dwordx4 -define amdgpu_kernel void @constant_sextload_v4i32_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i32> addrspace(2)* %in) #0 { - %ld = load <4 x i32>, <4 x i32> addrspace(2)* %in +define amdgpu_kernel void @constant_sextload_v4i32_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i32> addrspace(4)* %in) #0 { + %ld = load <4 x i32>, <4 x i32> addrspace(4)* %in %ext = sext <4 x i32> %ld to <4 x i64> store <4 x i64> %ext, <4 x i64> addrspace(1)* %out ret void @@ -191,8 +191,8 @@ define amdgpu_kernel void @constant_sextload_v4i32_to_v4i64(<4 x i64> addrspace( ; GCN-HSA-DAG: flat_store_dwordx4 ; GCN-SA-DAG: flat_store_dwordx4 ; GCN-HSA-DAG: flat_store_dwordx4 -define amdgpu_kernel void @constant_zextload_v8i32_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i32> addrspace(2)* %in) #0 { - %ld = load <8 x i32>, <8 x i32> addrspace(2)* %in +define amdgpu_kernel void @constant_zextload_v8i32_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i32> addrspace(4)* %in) #0 { + %ld = load <8 x i32>, <8 x i32> addrspace(4)* %in %ext = zext <8 x i32> %ld to <8 x i64> store <8 x i64> %ext, <8 x i64> addrspace(1)* %out ret void @@ -219,8 +219,8 @@ define amdgpu_kernel void @constant_zextload_v8i32_to_v8i64(<8 x i64> addrspace( ; GCN-HSA-DAG: flat_store_dwordx4 ; GCN-HSA-DAG: flat_store_dwordx4 ; GCN-HSA-DAG: flat_store_dwordx4 -define amdgpu_kernel void @constant_sextload_v8i32_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i32> addrspace(2)* %in) #0 { - %ld = load <8 x i32>, <8 x i32> addrspace(2)* %in +define amdgpu_kernel void @constant_sextload_v8i32_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i32> addrspace(4)* %in) #0 { + %ld = load <8 x i32>, <8 x i32> addrspace(4)* %in %ext = sext <8 x i32> %ld to <8 x i64> store <8 x i64> %ext, <8 x i64> addrspace(1)* %out ret void @@ -240,8 +240,8 @@ define amdgpu_kernel void @constant_sextload_v8i32_to_v8i64(<8 x i64> addrspace( ; GCN: store_dwordx4 ; GCN: store_dwordx4 ; GCN: store_dwordx4 -define amdgpu_kernel void @constant_sextload_v16i32_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i32> addrspace(2)* %in) #0 { - %ld = load <16 x i32>, <16 x i32> addrspace(2)* %in +define amdgpu_kernel void @constant_sextload_v16i32_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i32> addrspace(4)* %in) #0 { + %ld = load <16 x i32>, <16 x i32> addrspace(4)* %in %ext = sext <16 x i32> %ld to <16 x i64> store <16 x i64> %ext, <16 x i64> addrspace(1)* %out ret void @@ -267,8 +267,8 @@ define amdgpu_kernel void @constant_sextload_v16i32_to_v16i64(<16 x i64> addrspa ; GCN-HSA: flat_store_dwordx4 ; GCN-HSA: flat_store_dwordx4 ; GCN-HSA: flat_store_dwordx4 -define amdgpu_kernel void @constant_zextload_v16i32_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i32> addrspace(2)* %in) #0 { - %ld = load <16 x i32>, <16 x i32> addrspace(2)* %in +define amdgpu_kernel void @constant_zextload_v16i32_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i32> addrspace(4)* %in) #0 { + %ld = load <16 x i32>, <16 x i32> addrspace(4)* %in %ext = zext <16 x i32> %ld to <16 x i64> store <16 x i64> %ext, <16 x i64> addrspace(1)* %out ret void @@ -319,8 +319,8 @@ define amdgpu_kernel void @constant_zextload_v16i32_to_v16i64(<16 x i64> addrspa ; GCN-HSA-DAG: flat_store_dwordx4 ; GCN-HSA-DAG: flat_store_dwordx4 -define amdgpu_kernel void @constant_sextload_v32i32_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i32> addrspace(2)* %in) #0 { - %ld = load <32 x 
i32>, <32 x i32> addrspace(2)* %in +define amdgpu_kernel void @constant_sextload_v32i32_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i32> addrspace(4)* %in) #0 { + %ld = load <32 x i32>, <32 x i32> addrspace(4)* %in %ext = sext <32 x i32> %ld to <32 x i64> store <32 x i64> %ext, <32 x i64> addrspace(1)* %out ret void @@ -370,8 +370,8 @@ define amdgpu_kernel void @constant_sextload_v32i32_to_v32i64(<32 x i64> addrspa ; GCN-HSA-DAG: flat_store_dwordx4 ; GCN-HSA-DAG: flat_store_dwordx4 ; GCN-HSA-DAG: flat_store_dwordx4 -define amdgpu_kernel void @constant_zextload_v32i32_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i32> addrspace(2)* %in) #0 { - %ld = load <32 x i32>, <32 x i32> addrspace(2)* %in +define amdgpu_kernel void @constant_zextload_v32i32_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i32> addrspace(4)* %in) #0 { + %ld = load <32 x i32>, <32 x i32> addrspace(4)* %in %ext = zext <32 x i32> %ld to <32 x i64> store <32 x i64> %ext, <32 x i64> addrspace(1)* %out ret void diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i64.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i64.ll index 14e50ea4c3c..6a493539d78 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-i64.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i64.ll @@ -7,8 +7,8 @@ ; FUNC-LABEL: {{^}}constant_load_i64: ; GCN: s_load_dwordx2 {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0x0{{$}} ; EG: VTX_READ_64 -define amdgpu_kernel void @constant_load_i64(i64 addrspace(1)* %out, i64 addrspace(2)* %in) #0 { - %ld = load i64, i64 addrspace(2)* %in +define amdgpu_kernel void @constant_load_i64(i64 addrspace(1)* %out, i64 addrspace(4)* %in) #0 { + %ld = load i64, i64 addrspace(4)* %in store i64 %ld, i64 addrspace(1)* %out ret void } @@ -17,9 +17,9 @@ define amdgpu_kernel void @constant_load_i64(i64 addrspace(1)* %out, i64 addrspa ; GCN: s_load_dwordx4 ; EG: VTX_READ_128 -define amdgpu_kernel void @constant_load_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_load_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(4)* %in) #0 { entry: - %ld = load <2 x i64>, <2 x i64> addrspace(2)* %in + %ld = load <2 x i64>, <2 x i64> addrspace(4)* %in store <2 x i64> %ld, <2 x i64> addrspace(1)* %out ret void } @@ -29,9 +29,9 @@ entry: ; EG-DAG: VTX_READ_128 ; EG-DAG: VTX_READ_128 -define amdgpu_kernel void @constant_load_v3i64(<3 x i64> addrspace(1)* %out, <3 x i64> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_load_v3i64(<3 x i64> addrspace(1)* %out, <3 x i64> addrspace(4)* %in) #0 { entry: - %ld = load <3 x i64>, <3 x i64> addrspace(2)* %in + %ld = load <3 x i64>, <3 x i64> addrspace(4)* %in store <3 x i64> %ld, <3 x i64> addrspace(1)* %out ret void } @@ -41,9 +41,9 @@ entry: ; EG: VTX_READ_128 ; EG: VTX_READ_128 -define amdgpu_kernel void @constant_load_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_load_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(4)* %in) #0 { entry: - %ld = load <4 x i64>, <4 x i64> addrspace(2)* %in + %ld = load <4 x i64>, <4 x i64> addrspace(4)* %in store <4 x i64> %ld, <4 x i64> addrspace(1)* %out ret void } @@ -55,9 +55,9 @@ entry: ; EG: VTX_READ_128 ; EG: VTX_READ_128 ; EG: VTX_READ_128 -define amdgpu_kernel void @constant_load_v8i64(<8 x i64> addrspace(1)* %out, <8 x i64> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_load_v8i64(<8 x i64> addrspace(1)* %out, <8 x i64> addrspace(4)* %in) #0 { entry: - %ld = load <8 x i64>, <8 x i64> addrspace(2)* %in + %ld = 
load <8 x i64>, <8 x i64> addrspace(4)* %in store <8 x i64> %ld, <8 x i64> addrspace(1)* %out ret void } @@ -74,9 +74,9 @@ entry: ; EG: VTX_READ_128 ; EG: VTX_READ_128 ; EG: VTX_READ_128 -define amdgpu_kernel void @constant_load_v16i64(<16 x i64> addrspace(1)* %out, <16 x i64> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_load_v16i64(<16 x i64> addrspace(1)* %out, <16 x i64> addrspace(4)* %in) #0 { entry: - %ld = load <16 x i64>, <16 x i64> addrspace(2)* %in + %ld = load <16 x i64>, <16 x i64> addrspace(4)* %in store <16 x i64> %ld, <16 x i64> addrspace(1)* %out ret void } diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll index bdfc3caf9d0..c879924c41c 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll @@ -10,9 +10,9 @@ ; EG: VTX_READ_8 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 ; TODO: NOT AND -define amdgpu_kernel void @constant_load_i8(i8 addrspace(1)* %out, i8 addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_load_i8(i8 addrspace(1)* %out, i8 addrspace(4)* %in) #0 { entry: - %ld = load i8, i8 addrspace(2)* %in + %ld = load i8, i8 addrspace(4)* %in store i8 %ld, i8 addrspace(1)* %out ret void } @@ -22,9 +22,9 @@ entry: ; GCN-HSA: flat_load_ushort v ; EG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 -define amdgpu_kernel void @constant_load_v2i8(<2 x i8> addrspace(1)* %out, <2 x i8> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_load_v2i8(<2 x i8> addrspace(1)* %out, <2 x i8> addrspace(4)* %in) #0 { entry: - %ld = load <2 x i8>, <2 x i8> addrspace(2)* %in + %ld = load <2 x i8>, <2 x i8> addrspace(4)* %in store <2 x i8> %ld, <2 x i8> addrspace(1)* %out ret void } @@ -33,9 +33,9 @@ entry: ; GCN: s_load_dword s ; EG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 -define amdgpu_kernel void @constant_load_v3i8(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_load_v3i8(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(4)* %in) #0 { entry: - %ld = load <3 x i8>, <3 x i8> addrspace(2)* %in + %ld = load <3 x i8>, <3 x i8> addrspace(4)* %in store <3 x i8> %ld, <3 x i8> addrspace(1)* %out ret void } @@ -44,9 +44,9 @@ entry: ; GCN: s_load_dword s ; EG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 -define amdgpu_kernel void @constant_load_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_load_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(4)* %in) #0 { entry: - %ld = load <4 x i8>, <4 x i8> addrspace(2)* %in + %ld = load <4 x i8>, <4 x i8> addrspace(4)* %in store <4 x i8> %ld, <4 x i8> addrspace(1)* %out ret void } @@ -55,9 +55,9 @@ entry: ; GCN: s_load_dwordx2 ; EG: VTX_READ_64 T{{[0-9]+}}.XY, T{{[0-9]+}}.X, 0, #1 -define amdgpu_kernel void @constant_load_v8i8(<8 x i8> addrspace(1)* %out, <8 x i8> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_load_v8i8(<8 x i8> addrspace(1)* %out, <8 x i8> addrspace(4)* %in) #0 { entry: - %ld = load <8 x i8>, <8 x i8> addrspace(2)* %in + %ld = load <8 x i8>, <8 x i8> addrspace(4)* %in store <8 x i8> %ld, <8 x i8> addrspace(1)* %out ret void } @@ -66,9 +66,9 @@ entry: ; GCN: s_load_dwordx4 ; EG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1 -define amdgpu_kernel void @constant_load_v16i8(<16 x i8> addrspace(1)* %out, <16 x i8> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_load_v16i8(<16 x i8> addrspace(1)* %out, <16 x i8> addrspace(4)* %in) #0 { entry: - %ld = load <16 x 
i8>, <16 x i8> addrspace(2)* %in + %ld = load <16 x i8>, <16 x i8> addrspace(4)* %in store <16 x i8> %ld, <16 x i8> addrspace(1)* %out ret void } @@ -78,8 +78,8 @@ entry: ; GCN-HSA: flat_load_ubyte ; EG: VTX_READ_8 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 -define amdgpu_kernel void @constant_zextload_i8_to_i32(i32 addrspace(1)* %out, i8 addrspace(2)* %in) #0 { - %a = load i8, i8 addrspace(2)* %in +define amdgpu_kernel void @constant_zextload_i8_to_i32(i32 addrspace(1)* %out, i8 addrspace(4)* %in) #0 { + %a = load i8, i8 addrspace(4)* %in %ext = zext i8 %a to i32 store i32 %ext, i32 addrspace(1)* %out ret void @@ -92,8 +92,8 @@ define amdgpu_kernel void @constant_zextload_i8_to_i32(i32 addrspace(1)* %out, i ; EG: VTX_READ_8 [[DST:T[0-9]+\.X]], T{{[0-9]+}}.X, 0, #1 ; EG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST]], 0.0, literal ; EG: 8 -define amdgpu_kernel void @constant_sextload_i8_to_i32(i32 addrspace(1)* %out, i8 addrspace(2)* %in) #0 { - %ld = load i8, i8 addrspace(2)* %in +define amdgpu_kernel void @constant_sextload_i8_to_i32(i32 addrspace(1)* %out, i8 addrspace(4)* %in) #0 { + %ld = load i8, i8 addrspace(4)* %in %ext = sext i8 %ld to i32 store i32 %ext, i32 addrspace(1)* %out ret void @@ -102,8 +102,8 @@ define amdgpu_kernel void @constant_sextload_i8_to_i32(i32 addrspace(1)* %out, i ; FUNC-LABEL: {{^}}constant_zextload_v1i8_to_v1i32: ; EG: VTX_READ_8 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 -define amdgpu_kernel void @constant_zextload_v1i8_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i8> addrspace(2)* %in) #0 { - %load = load <1 x i8>, <1 x i8> addrspace(2)* %in +define amdgpu_kernel void @constant_zextload_v1i8_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i8> addrspace(4)* %in) #0 { + %load = load <1 x i8>, <1 x i8> addrspace(4)* %in %ext = zext <1 x i8> %load to <1 x i32> store <1 x i32> %ext, <1 x i32> addrspace(1)* %out ret void @@ -114,8 +114,8 @@ define amdgpu_kernel void @constant_zextload_v1i8_to_v1i32(<1 x i32> addrspace(1 ; EG: VTX_READ_8 [[DST:T[0-9]+\.X]], T{{[0-9]+}}.X, 0, #1 ; EG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST]], 0.0, literal ; EG: 8 -define amdgpu_kernel void @constant_sextload_v1i8_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i8> addrspace(2)* %in) #0 { - %load = load <1 x i8>, <1 x i8> addrspace(2)* %in +define amdgpu_kernel void @constant_sextload_v1i8_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i8> addrspace(4)* %in) #0 { + %load = load <1 x i8>, <1 x i8> addrspace(4)* %in %ext = sext <1 x i8> %load to <1 x i32> store <1 x i32> %ext, <1 x i32> addrspace(1)* %out ret void @@ -129,8 +129,8 @@ define amdgpu_kernel void @constant_sextload_v1i8_to_v1i32(<1 x i32> addrspace(1 ; TODO: This should use DST, but for some there are redundant MOVs ; EG: BFE_UINT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, {{.*}}literal ; EG: 8 -define amdgpu_kernel void @constant_zextload_v2i8_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i8> addrspace(2)* %in) #0 { - %load = load <2 x i8>, <2 x i8> addrspace(2)* %in +define amdgpu_kernel void @constant_zextload_v2i8_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i8> addrspace(4)* %in) #0 { + %load = load <2 x i8>, <2 x i8> addrspace(4)* %in %ext = zext <2 x i8> %load to <2 x i32> store <2 x i32> %ext, <2 x i32> addrspace(1)* %out ret void @@ -150,8 +150,8 @@ define amdgpu_kernel void @constant_zextload_v2i8_to_v2i32(<2 x i32> addrspace(1 ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal ; EG-DAG: 8 ; EG-DAG: 8 -define amdgpu_kernel void @constant_sextload_v2i8_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i8> addrspace(2)* %in) #0 { - 
%load = load <2 x i8>, <2 x i8> addrspace(2)* %in +define amdgpu_kernel void @constant_sextload_v2i8_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i8> addrspace(4)* %in) #0 { + %load = load <2 x i8>, <2 x i8> addrspace(4)* %in %ext = sext <2 x i8> %load to <2 x i32> store <2 x i32> %ext, <2 x i32> addrspace(1)* %out ret void @@ -170,9 +170,9 @@ define amdgpu_kernel void @constant_sextload_v2i8_to_v2i32(<2 x i32> addrspace(1 ; EG-DAG: BFE_UINT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, {{.*}}literal ; EG-DAG: 8 ; EG-DAG: 8 -define amdgpu_kernel void @constant_zextload_v3i8_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i8> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_zextload_v3i8_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i8> addrspace(4)* %in) #0 { entry: - %ld = load <3 x i8>, <3 x i8> addrspace(2)* %in + %ld = load <3 x i8>, <3 x i8> addrspace(4)* %in %ext = zext <3 x i8> %ld to <3 x i32> store <3 x i32> %ext, <3 x i32> addrspace(1)* %out ret void @@ -193,9 +193,9 @@ entry: ; EG-DAG: 8 ; EG-DAG: 8 ; EG-DAG: 8 -define amdgpu_kernel void @constant_sextload_v3i8_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i8> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_sextload_v3i8_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i8> addrspace(4)* %in) #0 { entry: - %ld = load <3 x i8>, <3 x i8> addrspace(2)* %in + %ld = load <3 x i8>, <3 x i8> addrspace(4)* %in %ext = sext <3 x i8> %ld to <3 x i32> store <3 x i32> %ext, <3 x i32> addrspace(1)* %out ret void @@ -214,8 +214,8 @@ entry: ; EG-DAG: 8 ; EG-DAG: 8 ; EG-DAG: 8 -define amdgpu_kernel void @constant_zextload_v4i8_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i8> addrspace(2)* %in) #0 { - %load = load <4 x i8>, <4 x i8> addrspace(2)* %in +define amdgpu_kernel void @constant_zextload_v4i8_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i8> addrspace(4)* %in) #0 { + %load = load <4 x i8>, <4 x i8> addrspace(4)* %in %ext = zext <4 x i8> %load to <4 x i32> store <4 x i32> %ext, <4 x i32> addrspace(1)* %out ret void @@ -236,8 +236,8 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i32(<4 x i32> addrspace(1 ; EG-DAG: 8 ; EG-DAG: 8 ; EG-DAG: 8 -define amdgpu_kernel void @constant_sextload_v4i8_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i8> addrspace(2)* %in) #0 { - %load = load <4 x i8>, <4 x i8> addrspace(2)* %in +define amdgpu_kernel void @constant_sextload_v4i8_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i8> addrspace(4)* %in) #0 { + %load = load <4 x i8>, <4 x i8> addrspace(4)* %in %ext = sext <4 x i8> %load to <4 x i32> store <4 x i32> %ext, <4 x i32> addrspace(1)* %out ret void @@ -264,8 +264,8 @@ define amdgpu_kernel void @constant_sextload_v4i8_to_v4i32(<4 x i32> addrspace(1 ; EG-DAG: 8 ; EG-DAG: 8 ; EG-DAG: 8 -define amdgpu_kernel void @constant_zextload_v8i8_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i8> addrspace(2)* %in) #0 { - %load = load <8 x i8>, <8 x i8> addrspace(2)* %in +define amdgpu_kernel void @constant_zextload_v8i8_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i8> addrspace(4)* %in) #0 { + %load = load <8 x i8>, <8 x i8> addrspace(4)* %in %ext = zext <8 x i8> %load to <8 x i32> store <8 x i32> %ext, <8 x i32> addrspace(1)* %out ret void @@ -294,8 +294,8 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i32(<8 x i32> addrspace(1 ; EG-DAG: 8 ; EG-DAG: 8 ; EG-DAG: 8 -define amdgpu_kernel void @constant_sextload_v8i8_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i8> addrspace(2)* %in) #0 { - %load = load <8 x i8>, <8 x i8> addrspace(2)* %in +define amdgpu_kernel void @constant_sextload_v8i8_to_v8i32(<8 x i32> 
addrspace(1)* %out, <8 x i8> addrspace(4)* %in) #0 { + %load = load <8 x i8>, <8 x i8> addrspace(4)* %in %ext = sext <8 x i8> %load to <8 x i32> store <8 x i32> %ext, <8 x i32> addrspace(1)* %out ret void @@ -335,8 +335,8 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i32(<8 x i32> addrspace(1 ; EG-DAG: 8 ; EG-DAG: 8 ; EG-DAG: 8 -define amdgpu_kernel void @constant_zextload_v16i8_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i8> addrspace(2)* %in) #0 { - %load = load <16 x i8>, <16 x i8> addrspace(2)* %in +define amdgpu_kernel void @constant_zextload_v16i8_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i8> addrspace(4)* %in) #0 { + %load = load <16 x i8>, <16 x i8> addrspace(4)* %in %ext = zext <16 x i8> %load to <16 x i32> store <16 x i32> %ext, <16 x i32> addrspace(1)* %out ret void @@ -378,8 +378,8 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i32(<16 x i32> addrspac ; EG-DAG: 8 ; EG-DAG: 8 ; EG-DAG: 8 -define amdgpu_kernel void @constant_sextload_v16i8_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i8> addrspace(2)* %in) #0 { - %load = load <16 x i8>, <16 x i8> addrspace(2)* %in +define amdgpu_kernel void @constant_sextload_v16i8_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i8> addrspace(4)* %in) #0 { + %load = load <16 x i8>, <16 x i8> addrspace(4)* %in %ext = sext <16 x i8> %load to <16 x i32> store <16 x i32> %ext, <16 x i32> addrspace(1)* %out ret void @@ -450,8 +450,8 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i32(<16 x i32> addrspac ; EG-DAG: 8 ; EG-DAG: 8 ; EG-DAG: 8 -define amdgpu_kernel void @constant_zextload_v32i8_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i8> addrspace(2)* %in) #0 { - %load = load <32 x i8>, <32 x i8> addrspace(2)* %in +define amdgpu_kernel void @constant_zextload_v32i8_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i8> addrspace(4)* %in) #0 { + %load = load <32 x i8>, <32 x i8> addrspace(4)* %in %ext = zext <32 x i8> %load to <32 x i32> store <32 x i32> %ext, <32 x i32> addrspace(1)* %out ret void @@ -526,8 +526,8 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i32(<32 x i32> addrspac ; EG-DAG: 8 ; EG-DAG: 8 ; EG-DAG: 8 -define amdgpu_kernel void @constant_sextload_v32i8_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i8> addrspace(2)* %in) #0 { - %load = load <32 x i8>, <32 x i8> addrspace(2)* %in +define amdgpu_kernel void @constant_sextload_v32i8_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i8> addrspace(4)* %in) #0 { + %load = load <32 x i8>, <32 x i8> addrspace(4)* %in %ext = sext <32 x i8> %load to <32 x i32> store <32 x i32> %ext, <32 x i32> addrspace(1)* %out ret void @@ -539,8 +539,8 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i32(<32 x i32> addrspac ; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, T{{[0-9]+}}.X, 16, #1 ; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, T{{[0-9]+}}.X, 32, #1 ; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, T{{[0-9]+}}.X, 48, #1 -define amdgpu_kernel void @constant_zextload_v64i8_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i8> addrspace(2)* %in) #0 { - %load = load <64 x i8>, <64 x i8> addrspace(2)* %in +define amdgpu_kernel void @constant_zextload_v64i8_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i8> addrspace(4)* %in) #0 { + %load = load <64 x i8>, <64 x i8> addrspace(4)* %in %ext = zext <64 x i8> %load to <64 x i32> store <64 x i32> %ext, <64 x i32> addrspace(1)* %out ret void @@ -552,8 +552,8 @@ define amdgpu_kernel void @constant_zextload_v64i8_to_v64i32(<64 x i32> addrspac ; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, T{{[0-9]+}}.X, 16, #1 ; EG-DAG: 
VTX_READ_128 {{T[0-9]+\.XYZW}}, T{{[0-9]+}}.X, 32, #1 ; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, T{{[0-9]+}}.X, 48, #1 -define amdgpu_kernel void @constant_sextload_v64i8_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i8> addrspace(2)* %in) #0 { - %load = load <64 x i8>, <64 x i8> addrspace(2)* %in +define amdgpu_kernel void @constant_sextload_v64i8_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i8> addrspace(4)* %in) #0 { + %load = load <64 x i8>, <64 x i8> addrspace(4)* %in %ext = sext <64 x i8> %load to <64 x i32> store <64 x i32> %ext, <64 x i32> addrspace(1)* %out ret void @@ -570,8 +570,8 @@ define amdgpu_kernel void @constant_sextload_v64i8_to_v64i32(<64 x i32> addrspac ; EG: VTX_READ_8 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 ; EG: MOV {{.*}}, 0.0 -define amdgpu_kernel void @constant_zextload_i8_to_i64(i64 addrspace(1)* %out, i8 addrspace(2)* %in) #0 { - %a = load i8, i8 addrspace(2)* %in +define amdgpu_kernel void @constant_zextload_i8_to_i64(i64 addrspace(1)* %out, i8 addrspace(4)* %in) #0 { + %a = load i8, i8 addrspace(4)* %in %ext = zext i8 %a to i64 store i64 %ext, i64 addrspace(1)* %out ret void @@ -589,8 +589,8 @@ define amdgpu_kernel void @constant_zextload_i8_to_i64(i64 addrspace(1)* %out, i ; EG: ASHR {{\**}} {{T[0-9]\.[XYZW]}}, {{.*}}, literal ; TODO: Why not 7 ? ; EG: 31 -define amdgpu_kernel void @constant_sextload_i8_to_i64(i64 addrspace(1)* %out, i8 addrspace(2)* %in) #0 { - %a = load i8, i8 addrspace(2)* %in +define amdgpu_kernel void @constant_sextload_i8_to_i64(i64 addrspace(1)* %out, i8 addrspace(4)* %in) #0 { + %a = load i8, i8 addrspace(4)* %in %ext = sext i8 %a to i64 store i64 %ext, i64 addrspace(1)* %out ret void @@ -600,8 +600,8 @@ define amdgpu_kernel void @constant_sextload_i8_to_i64(i64 addrspace(1)* %out, i ; EG: VTX_READ_8 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 ; EG: MOV {{.*}}, 0.0 -define amdgpu_kernel void @constant_zextload_v1i8_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i8> addrspace(2)* %in) #0 { - %load = load <1 x i8>, <1 x i8> addrspace(2)* %in +define amdgpu_kernel void @constant_zextload_v1i8_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i8> addrspace(4)* %in) #0 { + %load = load <1 x i8>, <1 x i8> addrspace(4)* %in %ext = zext <1 x i8> %load to <1 x i64> store <1 x i64> %ext, <1 x i64> addrspace(1)* %out ret void @@ -613,8 +613,8 @@ define amdgpu_kernel void @constant_zextload_v1i8_to_v1i64(<1 x i64> addrspace(1 ; EG: ASHR {{\**}} {{T[0-9]\.[XYZW]}}, {{.*}}, literal ; TODO: Why not 7 ? 
; EG: 31 -define amdgpu_kernel void @constant_sextload_v1i8_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i8> addrspace(2)* %in) #0 { - %load = load <1 x i8>, <1 x i8> addrspace(2)* %in +define amdgpu_kernel void @constant_sextload_v1i8_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i8> addrspace(4)* %in) #0 { + %load = load <1 x i8>, <1 x i8> addrspace(4)* %in %ext = sext <1 x i8> %load to <1 x i64> store <1 x i64> %ext, <1 x i64> addrspace(1)* %out ret void @@ -623,8 +623,8 @@ define amdgpu_kernel void @constant_sextload_v1i8_to_v1i64(<1 x i64> addrspace(1 ; FUNC-LABEL: {{^}}constant_zextload_v2i8_to_v2i64: ; EG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 -define amdgpu_kernel void @constant_zextload_v2i8_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i8> addrspace(2)* %in) #0 { - %load = load <2 x i8>, <2 x i8> addrspace(2)* %in +define amdgpu_kernel void @constant_zextload_v2i8_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i8> addrspace(4)* %in) #0 { + %load = load <2 x i8>, <2 x i8> addrspace(4)* %in %ext = zext <2 x i8> %load to <2 x i64> store <2 x i64> %ext, <2 x i64> addrspace(1)* %out ret void @@ -633,8 +633,8 @@ define amdgpu_kernel void @constant_zextload_v2i8_to_v2i64(<2 x i64> addrspace(1 ; FUNC-LABEL: {{^}}constant_sextload_v2i8_to_v2i64: ; EG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 -define amdgpu_kernel void @constant_sextload_v2i8_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i8> addrspace(2)* %in) #0 { - %load = load <2 x i8>, <2 x i8> addrspace(2)* %in +define amdgpu_kernel void @constant_sextload_v2i8_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i8> addrspace(4)* %in) #0 { + %load = load <2 x i8>, <2 x i8> addrspace(4)* %in %ext = sext <2 x i8> %load to <2 x i64> store <2 x i64> %ext, <2 x i64> addrspace(1)* %out ret void @@ -643,8 +643,8 @@ define amdgpu_kernel void @constant_sextload_v2i8_to_v2i64(<2 x i64> addrspace(1 ; FUNC-LABEL: {{^}}constant_zextload_v4i8_to_v4i64: ; EG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 -define amdgpu_kernel void @constant_zextload_v4i8_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i8> addrspace(2)* %in) #0 { - %load = load <4 x i8>, <4 x i8> addrspace(2)* %in +define amdgpu_kernel void @constant_zextload_v4i8_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i8> addrspace(4)* %in) #0 { + %load = load <4 x i8>, <4 x i8> addrspace(4)* %in %ext = zext <4 x i8> %load to <4 x i64> store <4 x i64> %ext, <4 x i64> addrspace(1)* %out ret void @@ -653,8 +653,8 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i64(<4 x i64> addrspace(1 ; FUNC-LABEL: {{^}}constant_sextload_v4i8_to_v4i64: ; EG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 -define amdgpu_kernel void @constant_sextload_v4i8_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i8> addrspace(2)* %in) #0 { - %load = load <4 x i8>, <4 x i8> addrspace(2)* %in +define amdgpu_kernel void @constant_sextload_v4i8_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i8> addrspace(4)* %in) #0 { + %load = load <4 x i8>, <4 x i8> addrspace(4)* %in %ext = sext <4 x i8> %load to <4 x i64> store <4 x i64> %ext, <4 x i64> addrspace(1)* %out ret void @@ -663,8 +663,8 @@ define amdgpu_kernel void @constant_sextload_v4i8_to_v4i64(<4 x i64> addrspace(1 ; FUNC-LABEL: {{^}}constant_zextload_v8i8_to_v8i64: ; EG: VTX_READ_64 T{{[0-9]+}}.XY, T{{[0-9]+}}.X, 0, #1 -define amdgpu_kernel void @constant_zextload_v8i8_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i8> addrspace(2)* %in) #0 { - %load = load <8 x i8>, <8 x i8> addrspace(2)* %in +define amdgpu_kernel void @constant_zextload_v8i8_to_v8i64(<8 x i64> addrspace(1)* 
%out, <8 x i8> addrspace(4)* %in) #0 { + %load = load <8 x i8>, <8 x i8> addrspace(4)* %in %ext = zext <8 x i8> %load to <8 x i64> store <8 x i64> %ext, <8 x i64> addrspace(1)* %out ret void @@ -673,8 +673,8 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i64(<8 x i64> addrspace(1 ; FUNC-LABEL: {{^}}constant_sextload_v8i8_to_v8i64: ; EG: VTX_READ_64 T{{[0-9]+}}.XY, T{{[0-9]+}}.X, 0, #1 -define amdgpu_kernel void @constant_sextload_v8i8_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i8> addrspace(2)* %in) #0 { - %load = load <8 x i8>, <8 x i8> addrspace(2)* %in +define amdgpu_kernel void @constant_sextload_v8i8_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i8> addrspace(4)* %in) #0 { + %load = load <8 x i8>, <8 x i8> addrspace(4)* %in %ext = sext <8 x i8> %load to <8 x i64> store <8 x i64> %ext, <8 x i64> addrspace(1)* %out ret void @@ -683,8 +683,8 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i64(<8 x i64> addrspace(1 ; FUNC-LABEL: {{^}}constant_zextload_v16i8_to_v16i64: ; EG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1 -define amdgpu_kernel void @constant_zextload_v16i8_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i8> addrspace(2)* %in) #0 { - %load = load <16 x i8>, <16 x i8> addrspace(2)* %in +define amdgpu_kernel void @constant_zextload_v16i8_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i8> addrspace(4)* %in) #0 { + %load = load <16 x i8>, <16 x i8> addrspace(4)* %in %ext = zext <16 x i8> %load to <16 x i64> store <16 x i64> %ext, <16 x i64> addrspace(1)* %out ret void @@ -693,8 +693,8 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i64(<16 x i64> addrspac ; FUNC-LABEL: {{^}}constant_sextload_v16i8_to_v16i64: ; EG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1 -define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i8> addrspace(2)* %in) #0 { - %load = load <16 x i8>, <16 x i8> addrspace(2)* %in +define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i8> addrspace(4)* %in) #0 { + %load = load <16 x i8>, <16 x i8> addrspace(4)* %in %ext = sext <16 x i8> %load to <16 x i64> store <16 x i64> %ext, <16 x i64> addrspace(1)* %out ret void @@ -704,8 +704,8 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(<16 x i64> addrspac ; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1 ; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 16, #1 -define amdgpu_kernel void @constant_zextload_v32i8_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i8> addrspace(2)* %in) #0 { - %load = load <32 x i8>, <32 x i8> addrspace(2)* %in +define amdgpu_kernel void @constant_zextload_v32i8_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i8> addrspace(4)* %in) #0 { + %load = load <32 x i8>, <32 x i8> addrspace(4)* %in %ext = zext <32 x i8> %load to <32 x i64> store <32 x i64> %ext, <32 x i64> addrspace(1)* %out ret void @@ -715,24 +715,24 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i64(<32 x i64> addrspac ; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1 ; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 16, #1 -define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i8> addrspace(2)* %in) #0 { - %load = load <32 x i8>, <32 x i8> addrspace(2)* %in +define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i8> addrspace(4)* %in) #0 { + %load = load <32 x i8>, <32 x i8> addrspace(4)* %in %ext = sext <32 x i8> %load to <32 x i64> store <32 x i64> %ext, 
<32 x i64> addrspace(1)* %out ret void } ; XFUNC-LABEL: {{^}}constant_zextload_v64i8_to_v64i64: -; define amdgpu_kernel void @constant_zextload_v64i8_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i8> addrspace(2)* %in) #0 { -; %load = load <64 x i8>, <64 x i8> addrspace(2)* %in +; define amdgpu_kernel void @constant_zextload_v64i8_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i8> addrspace(4)* %in) #0 { +; %load = load <64 x i8>, <64 x i8> addrspace(4)* %in ; %ext = zext <64 x i8> %load to <64 x i64> ; store <64 x i64> %ext, <64 x i64> addrspace(1)* %out ; ret void ; } ; XFUNC-LABEL: {{^}}constant_sextload_v64i8_to_v64i64: -; define amdgpu_kernel void @constant_sextload_v64i8_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i8> addrspace(2)* %in) #0 { -; %load = load <64 x i8>, <64 x i8> addrspace(2)* %in +; define amdgpu_kernel void @constant_sextload_v64i8_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i8> addrspace(4)* %in) #0 { +; %load = load <64 x i8>, <64 x i8> addrspace(4)* %in ; %ext = sext <64 x i8> %load to <64 x i64> ; store <64 x i64> %ext, <64 x i64> addrspace(1)* %out ; ret void @@ -744,8 +744,8 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(<32 x i64> addrspac ; GCN-HSA: flat_load_ubyte v[[VAL:[0-9]+]], ; GCN-HSA: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, v[[VAL]] -define amdgpu_kernel void @constant_zextload_i8_to_i16(i16 addrspace(1)* %out, i8 addrspace(2)* %in) #0 { - %a = load i8, i8 addrspace(2)* %in +define amdgpu_kernel void @constant_zextload_i8_to_i16(i16 addrspace(1)* %out, i8 addrspace(4)* %in) #0 { + %a = load i8, i8 addrspace(4)* %in %ext = zext i8 %a to i16 store i16 %ext, i16 addrspace(1)* %out ret void @@ -759,16 +759,16 @@ define amdgpu_kernel void @constant_zextload_i8_to_i16(i16 addrspace(1)* %out, i ; GCN-HSA: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, v[[VAL]] ; EG: VTX_READ_8 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 -define amdgpu_kernel void @constant_sextload_i8_to_i16(i16 addrspace(1)* %out, i8 addrspace(2)* %in) #0 { - %a = load i8, i8 addrspace(2)* %in +define amdgpu_kernel void @constant_sextload_i8_to_i16(i16 addrspace(1)* %out, i8 addrspace(4)* %in) #0 { + %a = load i8, i8 addrspace(4)* %in %ext = sext i8 %a to i16 store i16 %ext, i16 addrspace(1)* %out ret void } ; FUNC-LABEL: {{^}}constant_zextload_v1i8_to_v1i16: -define amdgpu_kernel void @constant_zextload_v1i8_to_v1i16(<1 x i16> addrspace(1)* %out, <1 x i8> addrspace(2)* %in) #0 { - %load = load <1 x i8>, <1 x i8> addrspace(2)* %in +define amdgpu_kernel void @constant_zextload_v1i8_to_v1i16(<1 x i16> addrspace(1)* %out, <1 x i8> addrspace(4)* %in) #0 { + %load = load <1 x i8>, <1 x i8> addrspace(4)* %in %ext = zext <1 x i8> %load to <1 x i16> store <1 x i16> %ext, <1 x i16> addrspace(1)* %out ret void @@ -778,8 +778,8 @@ define amdgpu_kernel void @constant_zextload_v1i8_to_v1i16(<1 x i16> addrspace(1 ; EG: VTX_READ_8 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal -define amdgpu_kernel void @constant_sextload_v1i8_to_v1i16(<1 x i16> addrspace(1)* %out, <1 x i8> addrspace(2)* %in) #0 { - %load = load <1 x i8>, <1 x i8> addrspace(2)* %in +define amdgpu_kernel void @constant_sextload_v1i8_to_v1i16(<1 x i16> addrspace(1)* %out, <1 x i8> addrspace(4)* %in) #0 { + %load = load <1 x i8>, <1 x i8> addrspace(4)* %in %ext = sext <1 x i8> %load to <1 x i16> store <1 x i16> %ext, <1 x i16> addrspace(1)* %out ret void @@ -788,8 +788,8 @@ define amdgpu_kernel void @constant_sextload_v1i8_to_v1i16(<1 x i16> addrspace(1 ; FUNC-LABEL: 
{{^}}constant_zextload_v2i8_to_v2i16: ; EG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 -define amdgpu_kernel void @constant_zextload_v2i8_to_v2i16(<2 x i16> addrspace(1)* %out, <2 x i8> addrspace(2)* %in) #0 { - %load = load <2 x i8>, <2 x i8> addrspace(2)* %in +define amdgpu_kernel void @constant_zextload_v2i8_to_v2i16(<2 x i16> addrspace(1)* %out, <2 x i8> addrspace(4)* %in) #0 { + %load = load <2 x i8>, <2 x i8> addrspace(4)* %in %ext = zext <2 x i8> %load to <2 x i16> store <2 x i16> %ext, <2 x i16> addrspace(1)* %out ret void @@ -800,8 +800,8 @@ define amdgpu_kernel void @constant_zextload_v2i8_to_v2i16(<2 x i16> addrspace(1 ; EG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal -define amdgpu_kernel void @constant_sextload_v2i8_to_v2i16(<2 x i16> addrspace(1)* %out, <2 x i8> addrspace(2)* %in) #0 { - %load = load <2 x i8>, <2 x i8> addrspace(2)* %in +define amdgpu_kernel void @constant_sextload_v2i8_to_v2i16(<2 x i16> addrspace(1)* %out, <2 x i8> addrspace(4)* %in) #0 { + %load = load <2 x i8>, <2 x i8> addrspace(4)* %in %ext = sext <2 x i8> %load to <2 x i16> store <2 x i16> %ext, <2 x i16> addrspace(1)* %out ret void @@ -810,8 +810,8 @@ define amdgpu_kernel void @constant_sextload_v2i8_to_v2i16(<2 x i16> addrspace(1 ; FUNC-LABEL: {{^}}constant_zextload_v4i8_to_v4i16: ; EG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 -define amdgpu_kernel void @constant_zextload_v4i8_to_v4i16(<4 x i16> addrspace(1)* %out, <4 x i8> addrspace(2)* %in) #0 { - %load = load <4 x i8>, <4 x i8> addrspace(2)* %in +define amdgpu_kernel void @constant_zextload_v4i8_to_v4i16(<4 x i16> addrspace(1)* %out, <4 x i8> addrspace(4)* %in) #0 { + %load = load <4 x i8>, <4 x i8> addrspace(4)* %in %ext = zext <4 x i8> %load to <4 x i16> store <4 x i16> %ext, <4 x i16> addrspace(1)* %out ret void @@ -824,8 +824,8 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i16(<4 x i16> addrspace(1 ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal -define amdgpu_kernel void @constant_sextload_v4i8_to_v4i16(<4 x i16> addrspace(1)* %out, <4 x i8> addrspace(2)* %in) #0 { - %load = load <4 x i8>, <4 x i8> addrspace(2)* %in +define amdgpu_kernel void @constant_sextload_v4i8_to_v4i16(<4 x i16> addrspace(1)* %out, <4 x i8> addrspace(4)* %in) #0 { + %load = load <4 x i8>, <4 x i8> addrspace(4)* %in %ext = sext <4 x i8> %load to <4 x i16> store <4 x i16> %ext, <4 x i16> addrspace(1)* %out ret void @@ -834,8 +834,8 @@ define amdgpu_kernel void @constant_sextload_v4i8_to_v4i16(<4 x i16> addrspace(1 ; FUNC-LABEL: {{^}}constant_zextload_v8i8_to_v8i16: ; EG: VTX_READ_64 T{{[0-9]+}}.XY, T{{[0-9]+}}.X, 0, #1 -define amdgpu_kernel void @constant_zextload_v8i8_to_v8i16(<8 x i16> addrspace(1)* %out, <8 x i8> addrspace(2)* %in) #0 { - %load = load <8 x i8>, <8 x i8> addrspace(2)* %in +define amdgpu_kernel void @constant_zextload_v8i8_to_v8i16(<8 x i16> addrspace(1)* %out, <8 x i8> addrspace(4)* %in) #0 { + %load = load <8 x i8>, <8 x i8> addrspace(4)* %in %ext = zext <8 x i8> %load to <8 x i16> store <8 x i16> %ext, <8 x i16> addrspace(1)* %out ret void @@ -853,8 +853,8 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i16(<8 x i16> addrspace(1 ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal ; EG-DAG: BFE_INT {{[* 
]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal -define amdgpu_kernel void @constant_sextload_v8i8_to_v8i16(<8 x i16> addrspace(1)* %out, <8 x i8> addrspace(2)* %in) #0 { - %load = load <8 x i8>, <8 x i8> addrspace(2)* %in +define amdgpu_kernel void @constant_sextload_v8i8_to_v8i16(<8 x i16> addrspace(1)* %out, <8 x i8> addrspace(4)* %in) #0 { + %load = load <8 x i8>, <8 x i8> addrspace(4)* %in %ext = sext <8 x i8> %load to <8 x i16> store <8 x i16> %ext, <8 x i16> addrspace(1)* %out ret void @@ -863,8 +863,8 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i16(<8 x i16> addrspace(1 ; FUNC-LABEL: {{^}}constant_zextload_v16i8_to_v16i16: ; EG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1 -define amdgpu_kernel void @constant_zextload_v16i8_to_v16i16(<16 x i16> addrspace(1)* %out, <16 x i8> addrspace(2)* %in) #0 { - %load = load <16 x i8>, <16 x i8> addrspace(2)* %in +define amdgpu_kernel void @constant_zextload_v16i8_to_v16i16(<16 x i16> addrspace(1)* %out, <16 x i8> addrspace(4)* %in) #0 { + %load = load <16 x i8>, <16 x i8> addrspace(4)* %in %ext = zext <16 x i8> %load to <16 x i16> store <16 x i16> %ext, <16 x i16> addrspace(1)* %out ret void @@ -889,8 +889,8 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i16(<16 x i16> addrspac ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal -define amdgpu_kernel void @constant_sextload_v16i8_to_v16i16(<16 x i16> addrspace(1)* %out, <16 x i8> addrspace(2)* %in) #0 { - %load = load <16 x i8>, <16 x i8> addrspace(2)* %in +define amdgpu_kernel void @constant_sextload_v16i8_to_v16i16(<16 x i16> addrspace(1)* %out, <16 x i8> addrspace(4)* %in) #0 { + %load = load <16 x i8>, <16 x i8> addrspace(4)* %in %ext = sext <16 x i8> %load to <16 x i16> store <16 x i16> %ext, <16 x i16> addrspace(1)* %out ret void @@ -900,8 +900,8 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i16(<16 x i16> addrspac ; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1 ; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 16, #1 -define amdgpu_kernel void @constant_zextload_v32i8_to_v32i16(<32 x i16> addrspace(1)* %out, <32 x i8> addrspace(2)* %in) #0 { - %load = load <32 x i8>, <32 x i8> addrspace(2)* %in +define amdgpu_kernel void @constant_zextload_v32i8_to_v32i16(<32 x i16> addrspace(1)* %out, <32 x i8> addrspace(4)* %in) #0 { + %load = load <32 x i8>, <32 x i8> addrspace(4)* %in %ext = zext <32 x i8> %load to <32 x i16> store <32 x i16> %ext, <32 x i16> addrspace(1)* %out ret void @@ -943,24 +943,24 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i16(<32 x i16> addrspac ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal -define amdgpu_kernel void @constant_sextload_v32i8_to_v32i16(<32 x i16> addrspace(1)* %out, <32 x i8> addrspace(2)* %in) #0 { - %load = load <32 x i8>, <32 x i8> addrspace(2)* %in +define amdgpu_kernel void @constant_sextload_v32i8_to_v32i16(<32 x i16> addrspace(1)* %out, <32 x i8> addrspace(4)* %in) #0 { + %load = load <32 x i8>, <32 x i8> addrspace(4)* %in %ext = sext <32 x i8> %load to <32 x i16> store <32 x i16> %ext, <32 x i16> addrspace(1)* %out ret void } ; XFUNC-LABEL: {{^}}constant_zextload_v64i8_to_v64i16: -; define amdgpu_kernel void @constant_zextload_v64i8_to_v64i16(<64 x i16> addrspace(1)* %out, 
<64 x i8> addrspace(2)* %in) #0 { -; %load = load <64 x i8>, <64 x i8> addrspace(2)* %in +; define amdgpu_kernel void @constant_zextload_v64i8_to_v64i16(<64 x i16> addrspace(1)* %out, <64 x i8> addrspace(4)* %in) #0 { +; %load = load <64 x i8>, <64 x i8> addrspace(4)* %in ; %ext = zext <64 x i8> %load to <64 x i16> ; store <64 x i16> %ext, <64 x i16> addrspace(1)* %out ; ret void ; } ; XFUNC-LABEL: {{^}}constant_sextload_v64i8_to_v64i16: -; define amdgpu_kernel void @constant_sextload_v64i8_to_v64i16(<64 x i16> addrspace(1)* %out, <64 x i8> addrspace(2)* %in) #0 { -; %load = load <64 x i8>, <64 x i8> addrspace(2)* %in +; define amdgpu_kernel void @constant_sextload_v64i8_to_v64i16(<64 x i16> addrspace(1)* %out, <64 x i8> addrspace(4)* %in) #0 { +; %load = load <64 x i8>, <64 x i8> addrspace(4)* %in ; %ext = sext <64 x i8> %load to <64 x i16> ; store <64 x i16> %ext, <64 x i16> addrspace(1)* %out ; ret void diff --git a/llvm/test/CodeGen/AMDGPU/load-hi16.ll b/llvm/test/CodeGen/AMDGPU/load-hi16.ll index 6ca009bf7f1..94b4133dc2f 100644 --- a/llvm/test/CodeGen/AMDGPU/load-hi16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-hi16.ll @@ -473,10 +473,10 @@ entry: ; GFX9-NEXT: s_setpc_b64 ; VI: flat_load_ushort -define void @load_constant_hi_v2i16_reglo_vreg(i16 addrspace(2)* %in, i16 %reg) #0 { +define void @load_constant_hi_v2i16_reglo_vreg(i16 addrspace(4)* %in, i16 %reg) #0 { entry: - %gep = getelementptr inbounds i16, i16 addrspace(2)* %in, i64 -2047 - %load = load i16, i16 addrspace(2)* %gep + %gep = getelementptr inbounds i16, i16 addrspace(4)* %in, i64 -2047 + %load = load i16, i16 addrspace(4)* %gep %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0 %build1 = insertelement <2 x i16> %build0, i16 %load, i32 1 store <2 x i16> %build1, <2 x i16> addrspace(1)* undef @@ -492,10 +492,10 @@ entry: ; GFX9-NEXT: s_setpc_b64 ; VI: flat_load_ushort -define void @load_constant_hi_v2f16_reglo_vreg(half addrspace(2)* %in, half %reg) #0 { +define void @load_constant_hi_v2f16_reglo_vreg(half addrspace(4)* %in, half %reg) #0 { entry: - %gep = getelementptr inbounds half, half addrspace(2)* %in, i64 -2047 - %load = load half, half addrspace(2)* %gep + %gep = getelementptr inbounds half, half addrspace(4)* %in, i64 -2047 + %load = load half, half addrspace(4)* %gep %build0 = insertelement <2 x half> undef, half %reg, i32 0 %build1 = insertelement <2 x half> %build0, half %load, i32 1 store <2 x half> %build1, <2 x half> addrspace(1)* undef @@ -625,11 +625,11 @@ entry: ; GFX9-NEXT: s_waitcnt ; GFX9-NEXT: v_mov_b32_e32 v0, v2 ; GFX9-NEXT: s_setpc_b64 -define <2 x i16> @load_constant_v2i16_split(i16 addrspace(2)* %in) #0 { +define <2 x i16> @load_constant_v2i16_split(i16 addrspace(4)* %in) #0 { entry: - %gep = getelementptr inbounds i16, i16 addrspace(2)* %in, i64 1 - %load0 = load volatile i16, i16 addrspace(2)* %in - %load1 = load volatile i16, i16 addrspace(2)* %gep + %gep = getelementptr inbounds i16, i16 addrspace(4)* %in, i64 1 + %load0 = load volatile i16, i16 addrspace(4)* %in + %load1 = load volatile i16, i16 addrspace(4)* %gep %build0 = insertelement <2 x i16> undef, i16 %load0, i32 0 %build1 = insertelement <2 x i16> %build0, i16 %load1, i32 1 ret <2 x i16> %build1 diff --git a/llvm/test/CodeGen/AMDGPU/load-lo16.ll b/llvm/test/CodeGen/AMDGPU/load-lo16.ll index 226a55b8f24..eec9144cda2 100644 --- a/llvm/test/CodeGen/AMDGPU/load-lo16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-lo16.ll @@ -559,11 +559,11 @@ entry: ; GFX9-NEXT: s_setpc_b64 ; VI: flat_load_ushort -define void 
@load_constant_lo_v2i16_reglo_vreg(i16 addrspace(2)* %in, i32 %reg) #0 { +define void @load_constant_lo_v2i16_reglo_vreg(i16 addrspace(4)* %in, i32 %reg) #0 { entry: %reg.bc = bitcast i32 %reg to <2 x i16> - %gep = getelementptr inbounds i16, i16 addrspace(2)* %in, i64 -2047 - %load = load i16, i16 addrspace(2)* %gep + %gep = getelementptr inbounds i16, i16 addrspace(4)* %in, i64 -2047 + %load = load i16, i16 addrspace(4)* %gep %build1 = insertelement <2 x i16> %reg.bc, i16 %load, i32 0 store <2 x i16> %build1, <2 x i16> addrspace(1)* undef ret void @@ -578,11 +578,11 @@ entry: ; GFX9-NEXT: s_setpc_b64 ; VI: flat_load_ushort -define void @load_constant_lo_v2f16_reglo_vreg(half addrspace(2)* %in, i32 %reg) #0 { +define void @load_constant_lo_v2f16_reglo_vreg(half addrspace(4)* %in, i32 %reg) #0 { entry: %reg.bc = bitcast i32 %reg to <2 x half> - %gep = getelementptr inbounds half, half addrspace(2)* %in, i64 -2047 - %load = load half, half addrspace(2)* %gep + %gep = getelementptr inbounds half, half addrspace(4)* %in, i64 -2047 + %load = load half, half addrspace(4)* %gep %build1 = insertelement <2 x half> %reg.bc, half %load, i32 0 store <2 x half> %build1, <2 x half> addrspace(1)* undef ret void diff --git a/llvm/test/CodeGen/AMDGPU/mad24-get-global-id.ll b/llvm/test/CodeGen/AMDGPU/mad24-get-global-id.ll index 176d1d25f19..88e6a3b7fbc 100644 --- a/llvm/test/CodeGen/AMDGPU/mad24-get-global-id.ll +++ b/llvm/test/CodeGen/AMDGPU/mad24-get-global-id.ll @@ -5,17 +5,17 @@ declare i32 @llvm.amdgcn.workgroup.id.x() #0 declare i32 @llvm.amdgcn.workitem.id.x() #0 -declare i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() #0 +declare i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() #0 ; GCN-LABEL: {{^}}get_global_id_0: ; GCN: s_and_b32 [[WGSIZEX:s[0-9]+]], {{s[0-9]+}}, 0xffff ; GCN: v_mov_b32_e32 [[VWGSIZEX:v[0-9]+]], [[WGSIZEX]] ; GCN: v_mad_u32_u24 v{{[0-9]+}}, s8, [[VWGSIZEX]], v0 define amdgpu_kernel void @get_global_id_0(i32 addrspace(1)* %out) #1 { - %dispatch.ptr = call i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() - %cast.dispatch.ptr = bitcast i8 addrspace(2)* %dispatch.ptr to i32 addrspace(2)* - %gep = getelementptr inbounds i32, i32 addrspace(2)* %cast.dispatch.ptr, i64 1 - %workgroup.size.xy = load i32, i32 addrspace(2)* %gep, align 4, !invariant.load !0 + %dispatch.ptr = call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() + %cast.dispatch.ptr = bitcast i8 addrspace(4)* %dispatch.ptr to i32 addrspace(4)* + %gep = getelementptr inbounds i32, i32 addrspace(4)* %cast.dispatch.ptr, i64 1 + %workgroup.size.xy = load i32, i32 addrspace(4)* %gep, align 4, !invariant.load !0 %workgroup.size.x = and i32 %workgroup.size.xy, 65535 %workitem.id.x = call i32 @llvm.amdgcn.workitem.id.x(), !range !1 diff --git a/llvm/test/CodeGen/AMDGPU/missing-store.ll b/llvm/test/CodeGen/AMDGPU/missing-store.ll index 9dd9c640f4b..9b00507a9c8 100644 --- a/llvm/test/CodeGen/AMDGPU/missing-store.ll +++ b/llvm/test/CodeGen/AMDGPU/missing-store.ll @@ -1,6 +1,6 @@ ; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=SI %s -@ptr_load = addrspace(3) global i32 addrspace(2)* undef, align 8 +@ptr_load = addrspace(3) global i32 addrspace(4)* undef, align 8 ; Make sure when the load from %ptr2 is folded the chain isn't lost, ; resulting in losing the store to gptr @@ -16,11 +16,11 @@ ; SI: buffer_store_dword ; SI: s_endpgm define amdgpu_kernel void @missing_store_reduced(i32 addrspace(1)* %out, i32 addrspace(1)* %gptr) #0 { - %ptr0 = load i32 addrspace(2)*, i32 addrspace(2)* 
addrspace(3)* @ptr_load, align 8 - %ptr2 = getelementptr inbounds i32, i32 addrspace(2)* %ptr0, i64 2 + %ptr0 = load i32 addrspace(4)*, i32 addrspace(4)* addrspace(3)* @ptr_load, align 8 + %ptr2 = getelementptr inbounds i32, i32 addrspace(4)* %ptr0, i64 2 store i32 99, i32 addrspace(1)* %gptr, align 4 - %tmp2 = load i32, i32 addrspace(2)* %ptr2, align 4 + %tmp2 = load i32, i32 addrspace(4)* %ptr2, align 4 store i32 %tmp2, i32 addrspace(1)* %out, align 4 ret void diff --git a/llvm/test/CodeGen/AMDGPU/mubuf-shader-vgpr.ll b/llvm/test/CodeGen/AMDGPU/mubuf-shader-vgpr.ll index 7d26ab88c70..d6fad3695d1 100644 --- a/llvm/test/CodeGen/AMDGPU/mubuf-shader-vgpr.ll +++ b/llvm/test/CodeGen/AMDGPU/mubuf-shader-vgpr.ll @@ -5,40 +5,40 @@ ; CHECK-LABEL: {{^}}test_none: ; CHECK: buffer_load_format_x v0, off, {{s\[[0-9]+:[0-9]+\]}}, 0{{$}} -define amdgpu_vs float @test_none(<4 x i32> addrspace(2)* inreg %base, i32 %i) { +define amdgpu_vs float @test_none(<4 x i32> addrspace(4)* inreg %base, i32 %i) { main_body: - %ptr = getelementptr <4 x i32>, <4 x i32> addrspace(2)* %base, i32 %i - %tmp2 = load <4 x i32>, <4 x i32> addrspace(2)* %ptr, align 32 + %ptr = getelementptr <4 x i32>, <4 x i32> addrspace(4)* %base, i32 %i + %tmp2 = load <4 x i32>, <4 x i32> addrspace(4)* %ptr, align 32 %tmp7 = call float @llvm.amdgcn.buffer.load.format.f32(<4 x i32> %tmp2, i32 0, i32 0, i1 0, i1 0) ret float %tmp7 } ; CHECK-LABEL: {{^}}test_idxen: ; CHECK: buffer_load_format_x v0, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 idxen{{$}} -define amdgpu_vs float @test_idxen(<4 x i32> addrspace(2)* inreg %base, i32 %i) { +define amdgpu_vs float @test_idxen(<4 x i32> addrspace(4)* inreg %base, i32 %i) { main_body: - %ptr = getelementptr <4 x i32>, <4 x i32> addrspace(2)* %base, i32 %i - %tmp2 = load <4 x i32>, <4 x i32> addrspace(2)* %ptr, align 32 + %ptr = getelementptr <4 x i32>, <4 x i32> addrspace(4)* %base, i32 %i + %tmp2 = load <4 x i32>, <4 x i32> addrspace(4)* %ptr, align 32 %tmp7 = call float @llvm.amdgcn.buffer.load.format.f32(<4 x i32> %tmp2, i32 undef, i32 0, i1 0, i1 0) ret float %tmp7 } ; CHECK-LABEL: {{^}}test_offen: ; CHECK: buffer_load_format_x v0, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen{{$}} -define amdgpu_vs float @test_offen(<4 x i32> addrspace(2)* inreg %base, i32 %i) { +define amdgpu_vs float @test_offen(<4 x i32> addrspace(4)* inreg %base, i32 %i) { main_body: - %ptr = getelementptr <4 x i32>, <4 x i32> addrspace(2)* %base, i32 %i - %tmp2 = load <4 x i32>, <4 x i32> addrspace(2)* %ptr, align 32 + %ptr = getelementptr <4 x i32>, <4 x i32> addrspace(4)* %base, i32 %i + %tmp2 = load <4 x i32>, <4 x i32> addrspace(4)* %ptr, align 32 %tmp7 = call float @llvm.amdgcn.buffer.load.format.f32(<4 x i32> %tmp2, i32 0, i32 undef, i1 0, i1 0) ret float %tmp7 } ; CHECK-LABEL: {{^}}test_both: ; CHECK: buffer_load_format_x v0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 idxen offen{{$}} -define amdgpu_vs float @test_both(<4 x i32> addrspace(2)* inreg %base, i32 %i) { +define amdgpu_vs float @test_both(<4 x i32> addrspace(4)* inreg %base, i32 %i) { main_body: - %ptr = getelementptr <4 x i32>, <4 x i32> addrspace(2)* %base, i32 %i - %tmp2 = load <4 x i32>, <4 x i32> addrspace(2)* %ptr, align 32 + %ptr = getelementptr <4 x i32>, <4 x i32> addrspace(4)* %base, i32 %i + %tmp2 = load <4 x i32>, <4 x i32> addrspace(4)* %ptr, align 32 %tmp7 = call float @llvm.amdgcn.buffer.load.format.f32(<4 x i32> %tmp2, i32 undef, i32 undef, i1 0, i1 0) ret float %tmp7 } diff --git a/llvm/test/CodeGen/AMDGPU/mubuf.ll 
b/llvm/test/CodeGen/AMDGPU/mubuf.ll index 97666492e37..d5c22d22cca 100644 --- a/llvm/test/CodeGen/AMDGPU/mubuf.ll +++ b/llvm/test/CodeGen/AMDGPU/mubuf.ll @@ -55,10 +55,10 @@ entry: ; CHECK-LABEL: {{^}}soffset_max_imm: ; CHECK: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 64 offen glc -define amdgpu_gs void @soffset_max_imm([6 x <4 x i32>] addrspace(2)* byval, [17 x <4 x i32>] addrspace(2)* byval, [16 x <4 x i32>] addrspace(2)* byval, [32 x <8 x i32>] addrspace(2)* byval, i32 inreg, i32 inreg, i32, i32, i32, i32, i32, i32, i32, i32) { +define amdgpu_gs void @soffset_max_imm([6 x <4 x i32>] addrspace(4)* byval, [17 x <4 x i32>] addrspace(4)* byval, [16 x <4 x i32>] addrspace(4)* byval, [32 x <8 x i32>] addrspace(4)* byval, i32 inreg, i32 inreg, i32, i32, i32, i32, i32, i32, i32, i32) { main_body: - %tmp0 = getelementptr [6 x <4 x i32>], [6 x <4 x i32>] addrspace(2)* %0, i32 0, i32 0 - %tmp1 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp0 + %tmp0 = getelementptr [6 x <4 x i32>], [6 x <4 x i32>] addrspace(4)* %0, i32 0, i32 0 + %tmp1 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp0 %tmp2 = shl i32 %6, 2 %tmp3 = call i32 @llvm.SI.buffer.load.dword.i32.i32(<4 x i32> %tmp1, i32 %tmp2, i32 64, i32 0, i32 1, i32 0, i32 1, i32 0, i32 0) %tmp4 = add i32 %6, 16 @@ -74,10 +74,10 @@ main_body: ; CHECK-LABEL: {{^}}soffset_no_fold: ; CHECK: s_movk_i32 [[SOFFSET:s[0-9]+]], 0x41 ; CHECK: buffer_load_dword v{{[0-9+]}}, v{{[0-9+]}}, s[{{[0-9]+}}:{{[0-9]+}}], [[SOFFSET]] offen glc -define amdgpu_gs void @soffset_no_fold([6 x <4 x i32>] addrspace(2)* byval, [17 x <4 x i32>] addrspace(2)* byval, [16 x <4 x i32>] addrspace(2)* byval, [32 x <8 x i32>] addrspace(2)* byval, i32 inreg, i32 inreg, i32, i32, i32, i32, i32, i32, i32, i32) { +define amdgpu_gs void @soffset_no_fold([6 x <4 x i32>] addrspace(4)* byval, [17 x <4 x i32>] addrspace(4)* byval, [16 x <4 x i32>] addrspace(4)* byval, [32 x <8 x i32>] addrspace(4)* byval, i32 inreg, i32 inreg, i32, i32, i32, i32, i32, i32, i32, i32) { main_body: - %tmp0 = getelementptr [6 x <4 x i32>], [6 x <4 x i32>] addrspace(2)* %0, i32 0, i32 0 - %tmp1 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp0 + %tmp0 = getelementptr [6 x <4 x i32>], [6 x <4 x i32>] addrspace(4)* %0, i32 0, i32 0 + %tmp1 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp0 %tmp2 = shl i32 %6, 2 %tmp3 = call i32 @llvm.SI.buffer.load.dword.i32.i32(<4 x i32> %tmp1, i32 %tmp2, i32 65, i32 0, i32 1, i32 0, i32 1, i32 0, i32 0) %tmp4 = add i32 %6, 16 diff --git a/llvm/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll b/llvm/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll index b00fff7dc18..67a47dae0c7 100644 --- a/llvm/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll +++ b/llvm/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll @@ -642,12 +642,12 @@ uniform.multi.exit.region: br i1 %uniform.cond0, label %uniform.if, label %uniform.ret1 uniform.if: - %sgpr0 = load volatile i32, i32 addrspace(2)* undef + %sgpr0 = load volatile i32, i32 addrspace(4)* undef %uniform.cond1 = icmp slt i32 %sgpr0, 1 br i1 %uniform.cond1, label %uniform.then, label %uniform.endif uniform.then: - %sgpr1 = load volatile i32, i32 addrspace(2)* undef + %sgpr1 = load volatile i32, i32 addrspace(4)* undef %uniform.cond2 = icmp sge i32 %sgpr1, 4 store volatile i32 33, i32 addrspace(1)* undef br i1 %uniform.cond2, label %uniform.ret0, label %uniform.endif diff --git a/llvm/test/CodeGen/AMDGPU/no-initializer-constant-addrspace.ll b/llvm/test/CodeGen/AMDGPU/no-initializer-constant-addrspace.ll index 
1c5d21c16e0..88372873c6e 100644 --- a/llvm/test/CodeGen/AMDGPU/no-initializer-constant-addrspace.ll +++ b/llvm/test/CodeGen/AMDGPU/no-initializer-constant-addrspace.ll @@ -6,21 +6,21 @@ ; EG: R_AMDGPU_ABS32 extern_const_addrspace ; CHECK-DAG: Name: extern_const_addrspace -@extern_const_addrspace = external unnamed_addr addrspace(2) constant [5 x i32], align 4 +@extern_const_addrspace = external unnamed_addr addrspace(4) constant [5 x i32], align 4 ; CHECK-DAG: Name: load_extern_const_init define amdgpu_kernel void @load_extern_const_init(i32 addrspace(1)* %out) nounwind { - %val = load i32, i32 addrspace(2)* getelementptr ([5 x i32], [5 x i32] addrspace(2)* @extern_const_addrspace, i64 0, i64 3), align 4 + %val = load i32, i32 addrspace(4)* getelementptr ([5 x i32], [5 x i32] addrspace(4)* @extern_const_addrspace, i64 0, i64 3), align 4 store i32 %val, i32 addrspace(1)* %out, align 4 ret void } ; CHECK-DAG: Name: undef_const_addrspace -@undef_const_addrspace = unnamed_addr addrspace(2) constant [5 x i32] undef, align 4 +@undef_const_addrspace = unnamed_addr addrspace(4) constant [5 x i32] undef, align 4 ; CHECK-DAG: Name: undef_const_addrspace define amdgpu_kernel void @load_undef_const_init(i32 addrspace(1)* %out) nounwind { - %val = load i32, i32 addrspace(2)* getelementptr ([5 x i32], [5 x i32] addrspace(2)* @undef_const_addrspace, i64 0, i64 3), align 4 + %val = load i32, i32 addrspace(4)* getelementptr ([5 x i32], [5 x i32] addrspace(4)* @undef_const_addrspace, i64 0, i64 3), align 4 store i32 %val, i32 addrspace(1)* %out, align 4 ret void } diff --git a/llvm/test/CodeGen/AMDGPU/no-shrink-extloads.ll b/llvm/test/CodeGen/AMDGPU/no-shrink-extloads.ll index 500e4cb3cc7..48b9c13c752 100644 --- a/llvm/test/CodeGen/AMDGPU/no-shrink-extloads.ll +++ b/llvm/test/CodeGen/AMDGPU/no-shrink-extloads.ll @@ -194,9 +194,9 @@ define amdgpu_kernel void @truncate_buffer_load_i64_to_i8(i8 addrspace(1)* %out, ; SI: s_load_dword [[LOAD:s[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0x0 ; SI: s_waitcnt lgkmcnt(0) ; SI: s_and_b32 s{{[0-9]+}}, [[LOAD]], 0xffff -define amdgpu_kernel void @smrd_mask_i32_to_i16(i32 addrspace(1)* %out, i32 addrspace(2)* %in) { +define amdgpu_kernel void @smrd_mask_i32_to_i16(i32 addrspace(1)* %out, i32 addrspace(4)* %in) { entry: - %val = load i32, i32 addrspace(2)* %in + %val = load i32, i32 addrspace(4)* %in %mask = and i32 %val, 65535 store i32 %mask, i32 addrspace(1)* %out ret void diff --git a/llvm/test/CodeGen/AMDGPU/nullptr.ll b/llvm/test/CodeGen/AMDGPU/nullptr.ll index b7811064fb2..1013349e781 100644 --- a/llvm/test/CodeGen/AMDGPU/nullptr.ll +++ b/llvm/test/CodeGen/AMDGPU/nullptr.ll @@ -1,7 +1,7 @@ ;RUN: llc < %s -march=amdgcn -mtriple=amdgcn---amdgiz -verify-machineinstrs | FileCheck -check-prefixes=CHECK,GCN %s ;RUN: llc < %s -march=r600 -mtriple=r600---amdgiz -verify-machineinstrs | FileCheck -check-prefixes=CHECK,R600 %s -%struct.S = type { i32 addrspace(5)*, i32 addrspace(1)*, i32 addrspace(2)*, i32 addrspace(3)*, i32*, i32 addrspace(4)*} +%struct.S = type { i32 addrspace(5)*, i32 addrspace(1)*, i32 addrspace(4)*, i32 addrspace(3)*, i32*, i32 addrspace(2)*} ; CHECK-LABEL: nullptr_priv: ; CHECK-NEXT: .long 0 @@ -15,7 +15,7 @@ ; CHECK-LABEL: nullptr_const: ; GCN-NEXT: .quad 0 ; R600-NEXT: .long 0 -@nullptr_const = global i32 addrspace(2)* addrspacecast (i32* null to i32 addrspace(2)*) +@nullptr_const = global i32 addrspace(4)* addrspacecast (i32* null to i32 addrspace(4)*) ; CHECK-LABEL: nullptr_local: ; CHECK-NEXT: .long -1 @@ -23,7 +23,7 @@ ; CHECK-LABEL: nullptr_region: ; 
CHECK-NEXT: .long -1 -@nullptr_region = global i32 addrspace(4)* addrspacecast (i32* null to i32 addrspace(4)*) +@nullptr_region = global i32 addrspace(2)* addrspacecast (i32* null to i32 addrspace(2)*) ; CHECK-LABEL: nullptr6: ; R600-NEXT: .long 0 @@ -113,7 +113,7 @@ @structWithPointers = addrspace(1) global %struct.S { i32 addrspace(5)* addrspacecast (i32* null to i32 addrspace(5)*), i32 addrspace(1)* addrspacecast (i32* null to i32 addrspace(1)*), - i32 addrspace(2)* addrspacecast (i32* null to i32 addrspace(2)*), + i32 addrspace(4)* addrspacecast (i32* null to i32 addrspace(4)*), i32 addrspace(3)* addrspacecast (i32* null to i32 addrspace(3)*), i32* null, - i32 addrspace(4)* addrspacecast (i32* null to i32 addrspace(4)*)}, align 4 + i32 addrspace(2)* addrspacecast (i32* null to i32 addrspace(2)*)}, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/pack.v2f16.ll b/llvm/test/CodeGen/AMDGPU/pack.v2f16.ll index c50d3f7010a..3eab7d569ef 100644 --- a/llvm/test/CodeGen/AMDGPU/pack.v2f16.ll +++ b/llvm/test/CodeGen/AMDGPU/pack.v2f16.ll @@ -8,9 +8,9 @@ ; GFX9: s_load_dword [[VAL1:s[0-9]+]] ; GFX9: s_pack_ll_b32_b16 [[PACKED:s[0-9]+]], [[VAL0]], [[VAL1]] ; GFX9: ; use [[PACKED]] -define amdgpu_kernel void @s_pack_v2f16(i32 addrspace(2)* %in0, i32 addrspace(2)* %in1) #0 { - %val0 = load volatile i32, i32 addrspace(2)* %in0 - %val1 = load volatile i32, i32 addrspace(2)* %in1 +define amdgpu_kernel void @s_pack_v2f16(i32 addrspace(4)* %in0, i32 addrspace(4)* %in1) #0 { + %val0 = load volatile i32, i32 addrspace(4)* %in0 + %val1 = load volatile i32, i32 addrspace(4)* %in1 %lo.i = trunc i32 %val0 to i16 %hi.i = trunc i32 %val1 to i16 %lo = bitcast i16 %lo.i to half @@ -27,8 +27,8 @@ define amdgpu_kernel void @s_pack_v2f16(i32 addrspace(2)* %in0, i32 addrspace(2) ; GFX9: s_load_dword [[VAL1:s[0-9]+]] ; GFX9: s_pack_ll_b32_b16 [[PACKED:s[0-9]+]], 0x1234, [[VAL1]] ; GFX9: ; use [[PACKED]] -define amdgpu_kernel void @s_pack_v2f16_imm_lo(i32 addrspace(2)* %in1) #0 { - %val1 = load i32, i32 addrspace(2)* %in1 +define amdgpu_kernel void @s_pack_v2f16_imm_lo(i32 addrspace(4)* %in1) #0 { + %val1 = load i32, i32 addrspace(4)* %in1 %hi.i = trunc i32 %val1 to i16 %hi = bitcast i16 %hi.i to half %vec.0 = insertelement <2 x half> undef, half 0xH1234, i32 0 @@ -43,8 +43,8 @@ define amdgpu_kernel void @s_pack_v2f16_imm_lo(i32 addrspace(2)* %in1) #0 { ; GFX9: s_load_dword [[VAL0:s[0-9]+]] ; GFX9: s_pack_ll_b32_b16 [[PACKED:s[0-9]+]], [[VAL0]], 0x1234 ; GFX9: ; use [[PACKED]] -define amdgpu_kernel void @s_pack_v2f16_imm_hi(i32 addrspace(2)* %in0) #0 { - %val0 = load i32, i32 addrspace(2)* %in0 +define amdgpu_kernel void @s_pack_v2f16_imm_hi(i32 addrspace(4)* %in0) #0 { + %val0 = load i32, i32 addrspace(4)* %in0 %lo.i = trunc i32 %val0 to i16 %lo = bitcast i16 %lo.i to half %vec.0 = insertelement <2 x half> undef, half %lo, i32 0 diff --git a/llvm/test/CodeGen/AMDGPU/pack.v2i16.ll b/llvm/test/CodeGen/AMDGPU/pack.v2i16.ll index 343b94b06bd..571ce98a053 100644 --- a/llvm/test/CodeGen/AMDGPU/pack.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/pack.v2i16.ll @@ -8,9 +8,9 @@ ; GFX9: s_load_dword [[VAL1:s[0-9]+]] ; GFX9: s_pack_ll_b32_b16 [[PACKED:s[0-9]+]], [[VAL0]], [[VAL1]] ; GFX9: ; use [[PACKED]] -define amdgpu_kernel void @s_pack_v2i16(i32 addrspace(2)* %in0, i32 addrspace(2)* %in1) #0 { - %val0 = load volatile i32, i32 addrspace(2)* %in0 - %val1 = load volatile i32, i32 addrspace(2)* %in1 +define amdgpu_kernel void @s_pack_v2i16(i32 addrspace(4)* %in0, i32 addrspace(4)* %in1) #0 { + %val0 = load volatile i32, i32 addrspace(4)* %in0 
+ %val1 = load volatile i32, i32 addrspace(4)* %in1 %lo = trunc i32 %val0 to i16 %hi = trunc i32 %val1 to i16 %vec.0 = insertelement <2 x i16> undef, i16 %lo, i32 0 @@ -25,8 +25,8 @@ define amdgpu_kernel void @s_pack_v2i16(i32 addrspace(2)* %in0, i32 addrspace(2) ; GFX9: s_load_dword [[VAL1:s[0-9]+]] ; GFX9: s_pack_ll_b32_b16 [[PACKED:s[0-9]+]], 0x1c8, [[VAL1]] ; GFX9: ; use [[PACKED]] -define amdgpu_kernel void @s_pack_v2i16_imm_lo(i32 addrspace(2)* %in1) #0 { - %val1 = load i32, i32 addrspace(2)* %in1 +define amdgpu_kernel void @s_pack_v2i16_imm_lo(i32 addrspace(4)* %in1) #0 { + %val1 = load i32, i32 addrspace(4)* %in1 %hi = trunc i32 %val1 to i16 %vec.0 = insertelement <2 x i16> undef, i16 456, i32 0 %vec.1 = insertelement <2 x i16> %vec.0, i16 %hi, i32 1 @@ -40,8 +40,8 @@ define amdgpu_kernel void @s_pack_v2i16_imm_lo(i32 addrspace(2)* %in1) #0 { ; GFX9: s_load_dword [[VAL0:s[0-9]+]] ; GFX9: s_pack_ll_b32_b16 [[PACKED:s[0-9]+]], [[VAL0]], 0x1c8 ; GFX9: ; use [[PACKED]] -define amdgpu_kernel void @s_pack_v2i16_imm_hi(i32 addrspace(2)* %in0) #0 { - %val0 = load i32, i32 addrspace(2)* %in0 +define amdgpu_kernel void @s_pack_v2i16_imm_hi(i32 addrspace(4)* %in0) #0 { + %val0 = load i32, i32 addrspace(4)* %in0 %lo = trunc i32 %val0 to i16 %vec.0 = insertelement <2 x i16> undef, i16 %lo, i32 0 %vec.1 = insertelement <2 x i16> %vec.0, i16 456, i32 1 diff --git a/llvm/test/CodeGen/AMDGPU/r600-constant-array-fixup.ll b/llvm/test/CodeGen/AMDGPU/r600-constant-array-fixup.ll index d5bf9c88b78..9bab3bc74c1 100644 --- a/llvm/test/CodeGen/AMDGPU/r600-constant-array-fixup.ll +++ b/llvm/test/CodeGen/AMDGPU/r600-constant-array-fixup.ll @@ -1,6 +1,6 @@ ; RUN: llc -filetype=obj -march=r600 -mcpu=cypress -verify-machineinstrs < %s | llvm-readobj -relocations -symbols | FileCheck %s -@arr = internal unnamed_addr addrspace(2) constant [4 x i32] [i32 4, i32 5, i32 6, i32 7], align 4 +@arr = internal unnamed_addr addrspace(4) constant [4 x i32] [i32 4, i32 5, i32 6, i32 7], align 4 ; CHECK: Relocations [ ; CHECK: Section (3) .rel.text { @@ -19,8 +19,8 @@ ; CHECK: } define amdgpu_kernel void @test_constant_array_fixup(i32 addrspace(1)* nocapture %out, i32 %idx) #0 { entry: - %arrayidx = getelementptr inbounds [4 x i32], [4 x i32] addrspace(2)* @arr, i32 0, i32 %idx - %val = load i32, i32 addrspace(2)* %arrayidx + %arrayidx = getelementptr inbounds [4 x i32], [4 x i32] addrspace(4)* @arr, i32 0, i32 %idx + %val = load i32, i32 addrspace(4)* %arrayidx store i32 %val, i32 addrspace(1)* %out, align 4 ret void } diff --git a/llvm/test/CodeGen/AMDGPU/readcyclecounter.ll b/llvm/test/CodeGen/AMDGPU/readcyclecounter.ll index d7b353cd25d..28d84b33a71 100644 --- a/llvm/test/CodeGen/AMDGPU/readcyclecounter.ll +++ b/llvm/test/CodeGen/AMDGPU/readcyclecounter.ll @@ -28,9 +28,9 @@ define amdgpu_kernel void @test_readcyclecounter(i64 addrspace(1)* %out) #0 { ; SI-DAG: s_memtime ; VI-DAG: s_memrealtime ; GCN-DAG: s_load_dword -define amdgpu_cs i32 @test_readcyclecounter_smem(i64 addrspace(2)* inreg %in) #0 { +define amdgpu_cs i32 @test_readcyclecounter_smem(i64 addrspace(4)* inreg %in) #0 { %cycle0 = call i64 @llvm.readcyclecounter() - %in.v = load i64, i64 addrspace(2)* %in + %in.v = load i64, i64 addrspace(4)* %in %r.64 = add i64 %cycle0, %in.v %r.32 = trunc i64 %r.64 to i32 ret i32 %r.32 diff --git a/llvm/test/CodeGen/AMDGPU/ret.ll b/llvm/test/CodeGen/AMDGPU/ret.ll index 2d673a9b0cd..e9007869b9f 100644 --- a/llvm/test/CodeGen/AMDGPU/ret.ll +++ b/llvm/test/CodeGen/AMDGPU/ret.ll @@ -7,7 +7,7 @@ ; GCN: s_waitcnt expcnt(0) ; GCN: 
v_add_f32_e32 v0, 1.0, v0 ; GCN-NOT: s_endpgm -define amdgpu_vs { float, float } @vgpr([9 x <16 x i8>] addrspace(2)* byval %arg, i32 inreg %arg1, i32 inreg %arg2, float %arg3) #0 { +define amdgpu_vs { float, float } @vgpr([9 x <16 x i8>] addrspace(4)* byval %arg, i32 inreg %arg1, i32 inreg %arg2, float %arg3) #0 { bb: call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %arg3, float %arg3, float %arg3, float %arg3, i1 true, i1 true) #0 %x = fadd float %arg3, 1.000000e+00 @@ -26,7 +26,7 @@ bb: ; GCN-DAG: v_mov_b32_e32 v3, -1.0 ; GCN: s_waitcnt expcnt(0) ; GCN-NOT: s_endpgm -define amdgpu_vs { float, float, float, float } @vgpr_literal([9 x <16 x i8>] addrspace(2)* byval %arg, i32 inreg %arg1, i32 inreg %arg2, float %arg3) #0 { +define amdgpu_vs { float, float, float, float } @vgpr_literal([9 x <16 x i8>] addrspace(4)* byval %arg, i32 inreg %arg1, i32 inreg %arg2, float %arg3) #0 { bb: call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %arg3, float %arg3, float %arg3, float %arg3, i1 true, i1 true) #0 ret { float, float, float, float } { float 1.000000e+00, float 2.000000e+00, float 4.000000e+00, float -1.000000e+00 } @@ -43,7 +43,7 @@ bb: ; GCN: v_mov_b32_e32 v3, v4 ; GCN: v_mov_b32_e32 v4, v6 ; GCN-NOT: s_endpgm -define amdgpu_ps { float, float, float, float, float } @vgpr_ps_addr0([9 x <16 x i8>] addrspace(2)* byval %arg, i32 inreg %arg1, i32 inreg %arg2, <2 x i32> %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <3 x i32> %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, float %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18) #1 { +define amdgpu_ps { float, float, float, float, float } @vgpr_ps_addr0([9 x <16 x i8>] addrspace(4)* byval %arg, i32 inreg %arg1, i32 inreg %arg2, <2 x i32> %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <3 x i32> %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, float %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18) #1 { bb: %i0 = extractelement <2 x i32> %arg4, i32 0 %i1 = extractelement <2 x i32> %arg4, i32 1 @@ -68,7 +68,7 @@ bb: ; GCN-LABEL: {{^}}ps_input_ena_no_inputs: ; GCN: v_mov_b32_e32 v0, 1.0 ; GCN-NOT: s_endpgm -define amdgpu_ps float @ps_input_ena_no_inputs([9 x <16 x i8>] addrspace(2)* byval %arg, i32 inreg %arg1, i32 inreg %arg2, <2 x i32> %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <3 x i32> %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, float %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18) #1 { +define amdgpu_ps float @ps_input_ena_no_inputs([9 x <16 x i8>] addrspace(4)* byval %arg, i32 inreg %arg1, i32 inreg %arg2, <2 x i32> %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <3 x i32> %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, float %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18) #1 { bb: ret float 1.000000e+00 } @@ -82,7 +82,7 @@ bb: ; GCN-DAG: v_mov_b32_e32 v1, v2 ; GCN: v_mov_b32_e32 v2, v3 ; GCN-NOT: s_endpgm -define amdgpu_ps { float, <2 x float> } @ps_input_ena_pos_w([9 x <16 x i8>] addrspace(2)* byval %arg, i32 inreg %arg1, i32 inreg %arg2, <2 x i32> %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <3 x i32> %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, float %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18) #1 { +define amdgpu_ps { float, <2 x float> } 
@ps_input_ena_pos_w([9 x <16 x i8>] addrspace(4)* byval %arg, i32 inreg %arg1, i32 inreg %arg2, <2 x i32> %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <3 x i32> %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, float %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18) #1 { bb: %f = bitcast <2 x i32> %arg8 to <2 x float> %s = insertvalue { float, <2 x float> } undef, float %arg14, 0 @@ -101,7 +101,7 @@ bb: ; GCN-DAG: v_mov_b32_e32 v3, v6 ; GCN-DAG: v_mov_b32_e32 v4, v8 ; GCN-NOT: s_endpgm -define amdgpu_ps { float, float, float, float, float } @vgpr_ps_addr1([9 x <16 x i8>] addrspace(2)* byval %arg, i32 inreg %arg1, i32 inreg %arg2, <2 x i32> %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <3 x i32> %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, float %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18) #2 { +define amdgpu_ps { float, float, float, float, float } @vgpr_ps_addr1([9 x <16 x i8>] addrspace(4)* byval %arg, i32 inreg %arg1, i32 inreg %arg2, <2 x i32> %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <3 x i32> %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, float %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18) #2 { bb: %i0 = extractelement <2 x i32> %arg4, i32 0 %i1 = extractelement <2 x i32> %arg4, i32 1 @@ -130,7 +130,7 @@ bb: ; GCN: v_mov_b32_e32 v3, v8 ; GCN: v_mov_b32_e32 v4, v12 ; GCN-NOT: s_endpgm -define amdgpu_ps { float, float, float, float, float } @vgpr_ps_addr119([9 x <16 x i8>] addrspace(2)* byval %arg, i32 inreg %arg1, i32 inreg %arg2, <2 x i32> %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <3 x i32> %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, float %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18) #3 { +define amdgpu_ps { float, float, float, float, float } @vgpr_ps_addr119([9 x <16 x i8>] addrspace(4)* byval %arg, i32 inreg %arg1, i32 inreg %arg2, <2 x i32> %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <3 x i32> %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, float %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18) #3 { bb: %i0 = extractelement <2 x i32> %arg4, i32 0 %i1 = extractelement <2 x i32> %arg4, i32 1 @@ -159,7 +159,7 @@ bb: ; GCN: v_mov_b32_e32 v3, v4 ; GCN: v_mov_b32_e32 v4, v8 ; GCN-NOT: s_endpgm -define amdgpu_ps { float, float, float, float, float } @vgpr_ps_addr418([9 x <16 x i8>] addrspace(2)* byval %arg, i32 inreg %arg1, i32 inreg %arg2, <2 x i32> %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <3 x i32> %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, float %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18) #4 { +define amdgpu_ps { float, float, float, float, float } @vgpr_ps_addr418([9 x <16 x i8>] addrspace(4)* byval %arg, i32 inreg %arg1, i32 inreg %arg2, <2 x i32> %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <3 x i32> %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, float %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18) #4 { bb: %i0 = extractelement <2 x i32> %arg4, i32 0 %i1 = extractelement <2 x i32> %arg4, i32 1 @@ -181,7 +181,7 @@ bb: ; GCN: s_add_i32 s0, s3, 2 ; GCN: s_mov_b32 s2, s3 ; GCN-NOT: s_endpgm -define amdgpu_vs { i32, 
i32, i32 } @sgpr([9 x <16 x i8>] addrspace(2)* byval %arg, i32 inreg %arg1, i32 inreg %arg2, float %arg3) #0 { +define amdgpu_vs { i32, i32, i32 } @sgpr([9 x <16 x i8>] addrspace(4)* byval %arg, i32 inreg %arg1, i32 inreg %arg2, float %arg3) #0 { bb: %x = add i32 %arg2, 2 %a = insertvalue { i32, i32, i32 } undef, i32 %x, 0 @@ -197,7 +197,7 @@ bb: ; GCN-DAG: s_mov_b32 s2, 7 ; GCN-DAG: s_mov_b32 s3, 8 ; GCN-NOT: s_endpgm -define amdgpu_vs { i32, i32, i32, i32 } @sgpr_literal([9 x <16 x i8>] addrspace(2)* byval %arg, i32 inreg %arg1, i32 inreg %arg2, float %arg3) #0 { +define amdgpu_vs { i32, i32, i32, i32 } @sgpr_literal([9 x <16 x i8>] addrspace(4)* byval %arg, i32 inreg %arg1, i32 inreg %arg2, float %arg3) #0 { bb: %x = add i32 %arg2, 2 ret { i32, i32, i32, i32 } { i32 5, i32 6, i32 7, i32 8 } @@ -212,7 +212,7 @@ bb: ; GCN-DAG: s_add_i32 s0, s3, 2 ; GCN-DAG: s_mov_b32 s2, s3 ; GCN-NOT: s_endpgm -define amdgpu_vs { float, i32, float, i32, i32 } @both([9 x <16 x i8>] addrspace(2)* byval %arg, i32 inreg %arg1, i32 inreg %arg2, float %arg3) #0 { +define amdgpu_vs { float, i32, float, i32, i32 } @both([9 x <16 x i8>] addrspace(4)* byval %arg, i32 inreg %arg1, i32 inreg %arg2, float %arg3) #0 { bb: call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %arg3, float %arg3, float %arg3, float %arg3, i1 true, i1 true) #0 %v = fadd float %arg3, 1.000000e+00 @@ -235,7 +235,7 @@ bb: ; GCN-DAG: v_mov_b32_e32 v1, 2.0 ; GCN-DAG: v_mov_b32_e32 v2, 4.0 ; GCN: s_waitcnt expcnt(0) -define amdgpu_vs { { float, i32 }, { i32, <2 x float> } } @structure_literal([9 x <16 x i8>] addrspace(2)* byval %arg, i32 inreg %arg1, i32 inreg %arg2, float %arg3) #0 { +define amdgpu_vs { { float, i32 }, { i32, <2 x float> } } @structure_literal([9 x <16 x i8>] addrspace(4)* byval %arg, i32 inreg %arg1, i32 inreg %arg2, float %arg3) #0 { bb: call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %arg3, float %arg3, float %arg3, float %arg3, i1 true, i1 true) #0 ret { { float, i32 }, { i32, <2 x float> } } { { float, i32 } { float 1.000000e+00, i32 2 }, { i32, <2 x float> } { i32 3, <2 x float> <float 2.000000e+00, float 4.000000e+00> } } diff --git a/llvm/test/CodeGen/AMDGPU/salu-to-valu.ll b/llvm/test/CodeGen/AMDGPU/salu-to-valu.ll index a826661442e..9b46962108c 100644 --- a/llvm/test/CodeGen/AMDGPU/salu-to-valu.ll +++ b/llvm/test/CodeGen/AMDGPU/salu-to-valu.ll @@ -65,24 +65,24 @@ done: ; preds = %loop ; GCN: v_mov_b32_e32 [[V_OUT:v[0-9]+]], [[OUT]] ; GCN-NOHSA: buffer_store_dword [[V_OUT]] ; GCN-HSA: flat_store_dword {{.*}}, [[V_OUT]] -define amdgpu_kernel void @smrd_valu(i32 addrspace(2)* addrspace(1)* %in, i32 %a, i32 %b, i32 addrspace(1)* %out) #1 { +define amdgpu_kernel void @smrd_valu(i32 addrspace(4)* addrspace(1)* %in, i32 %a, i32 %b, i32 addrspace(1)* %out) #1 { entry: %tmp = icmp ne i32 %a, 0 br i1 %tmp, label %if, label %else if: ; preds = %entry - %tmp1 = load i32 addrspace(2)*, i32 addrspace(2)* addrspace(1)* %in + %tmp1 = load i32 addrspace(4)*, i32 addrspace(4)* addrspace(1)* %in br label %endif else: ; preds = %entry - %tmp2 = getelementptr i32 addrspace(2)*, i32 addrspace(2)* addrspace(1)* %in - %tmp3 = load i32 addrspace(2)*, i32 addrspace(2)* addrspace(1)* %tmp2 + %tmp2 = getelementptr i32 addrspace(4)*, i32 addrspace(4)* addrspace(1)* %in + %tmp3 = load i32 addrspace(4)*, i32 addrspace(4)* addrspace(1)* %tmp2 br label %endif endif: ; preds = %else, %if - %tmp4 = phi i32 addrspace(2)* [ %tmp1, %if ], [ %tmp3, %else ] - %tmp5 = getelementptr i32, i32 addrspace(2)* %tmp4, i32 3000 - %tmp6 = load i32, i32 
addrspace(2)* %tmp5 + %tmp4 = phi i32 addrspace(4)* [ %tmp1, %if ], [ %tmp3, %else ] + %tmp5 = getelementptr i32, i32 addrspace(4)* %tmp4, i32 3000 + %tmp6 = load i32, i32 addrspace(4)* %tmp5 store i32 %tmp6, i32 addrspace(1)* %out ret void } @@ -93,12 +93,12 @@ endif: ; preds = %else, %if ; GCN-NOHSA-NOT: v_add ; GCN-NOHSA: buffer_load_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], 0 addr64 offset:16{{$}} ; GCN-HSA: flat_load_dword v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] -define amdgpu_kernel void @smrd_valu2(i32 addrspace(1)* %out, [8 x i32] addrspace(2)* %in) #1 { +define amdgpu_kernel void @smrd_valu2(i32 addrspace(1)* %out, [8 x i32] addrspace(4)* %in) #1 { entry: %tmp = call i32 @llvm.amdgcn.workitem.id.x() %tmp1 = add i32 %tmp, 4 - %tmp2 = getelementptr [8 x i32], [8 x i32] addrspace(2)* %in, i32 %tmp, i32 4 - %tmp3 = load i32, i32 addrspace(2)* %tmp2 + %tmp2 = getelementptr [8 x i32], [8 x i32] addrspace(4)* %in, i32 %tmp, i32 4 + %tmp3 = load i32, i32 addrspace(4)* %tmp2 store i32 %tmp3, i32 addrspace(1)* %out ret void } @@ -113,12 +113,12 @@ entry: ; GCN-NOHSA: buffer_store_dword ; GCN-HSA: flat_load_dword v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] ; GCN-HSA: flat_store_dword v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} -define amdgpu_kernel void @smrd_valu_ci_offset(i32 addrspace(1)* %out, i32 addrspace(2)* %in, i32 %c) #1 { +define amdgpu_kernel void @smrd_valu_ci_offset(i32 addrspace(1)* %out, i32 addrspace(4)* %in, i32 %c) #1 { entry: %tmp = call i32 @llvm.amdgcn.workitem.id.x() - %tmp2 = getelementptr i32, i32 addrspace(2)* %in, i32 %tmp - %tmp3 = getelementptr i32, i32 addrspace(2)* %tmp2, i32 5000 - %tmp4 = load i32, i32 addrspace(2)* %tmp3 + %tmp2 = getelementptr i32, i32 addrspace(4)* %in, i32 %tmp + %tmp3 = getelementptr i32, i32 addrspace(4)* %tmp2, i32 5000 + %tmp4 = load i32, i32 addrspace(4)* %tmp3 %tmp5 = add i32 %tmp4, %c store i32 %tmp5, i32 addrspace(1)* %out ret void @@ -133,12 +133,12 @@ entry: ; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} ; GCN-NOHSA: buffer_store_dwordx2 ; GCN-HSA: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] -define amdgpu_kernel void @smrd_valu_ci_offset_x2(i64 addrspace(1)* %out, i64 addrspace(2)* %in, i64 %c) #1 { +define amdgpu_kernel void @smrd_valu_ci_offset_x2(i64 addrspace(1)* %out, i64 addrspace(4)* %in, i64 %c) #1 { entry: %tmp = call i32 @llvm.amdgcn.workitem.id.x() - %tmp2 = getelementptr i64, i64 addrspace(2)* %in, i32 %tmp - %tmp3 = getelementptr i64, i64 addrspace(2)* %tmp2, i32 5000 - %tmp4 = load i64, i64 addrspace(2)* %tmp3 + %tmp2 = getelementptr i64, i64 addrspace(4)* %in, i32 %tmp + %tmp3 = getelementptr i64, i64 addrspace(4)* %tmp2, i32 5000 + %tmp4 = load i64, i64 addrspace(4)* %tmp3 %tmp5 = or i64 %tmp4, %c store i64 %tmp5, i64 addrspace(1)* %out ret void @@ -155,12 +155,12 @@ entry: ; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} ; GCN-NOHSA: buffer_store_dwordx4 ; GCN-HSA: flat_load_dwordx4 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] -define amdgpu_kernel void @smrd_valu_ci_offset_x4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(2)* %in, <4 x i32> %c) #1 { +define amdgpu_kernel void @smrd_valu_ci_offset_x4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(4)* %in, <4 x i32> %c) #1 { entry: %tmp = call i32 @llvm.amdgcn.workitem.id.x() - %tmp2 = getelementptr <4 x i32>, <4 x i32> addrspace(2)* %in, i32 %tmp - %tmp3 = getelementptr <4 x i32>, <4 x i32> addrspace(2)* %tmp2, i32 1234 - %tmp4 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp3 + %tmp2 = getelementptr <4 x i32>, <4 
x i32> addrspace(4)* %in, i32 %tmp + %tmp3 = getelementptr <4 x i32>, <4 x i32> addrspace(4)* %tmp2, i32 1234 + %tmp4 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp3 %tmp5 = or <4 x i32> %tmp4, %c store <4 x i32> %tmp5, <4 x i32> addrspace(1)* %out ret void @@ -189,12 +189,12 @@ entry: ; GCN-NOHSA: buffer_store_dwordx4 ; GCN-HSA: flat_load_dwordx4 ; GCN-HSA: flat_load_dwordx4 -define amdgpu_kernel void @smrd_valu_ci_offset_x8(<8 x i32> addrspace(1)* %out, <8 x i32> addrspace(2)* %in, <8 x i32> %c) #1 { +define amdgpu_kernel void @smrd_valu_ci_offset_x8(<8 x i32> addrspace(1)* %out, <8 x i32> addrspace(4)* %in, <8 x i32> %c) #1 { entry: %tmp = call i32 @llvm.amdgcn.workitem.id.x() - %tmp2 = getelementptr <8 x i32>, <8 x i32> addrspace(2)* %in, i32 %tmp - %tmp3 = getelementptr <8 x i32>, <8 x i32> addrspace(2)* %tmp2, i32 1234 - %tmp4 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp3 + %tmp2 = getelementptr <8 x i32>, <8 x i32> addrspace(4)* %in, i32 %tmp + %tmp3 = getelementptr <8 x i32>, <8 x i32> addrspace(4)* %tmp2, i32 1234 + %tmp4 = load <8 x i32>, <8 x i32> addrspace(4)* %tmp3 %tmp5 = or <8 x i32> %tmp4, %c store <8 x i32> %tmp5, <8 x i32> addrspace(1)* %out ret void @@ -230,12 +230,12 @@ entry: ; GCN-HSA: flat_load_dwordx4 ; GCN: s_endpgm -define amdgpu_kernel void @smrd_valu_ci_offset_x16(<16 x i32> addrspace(1)* %out, <16 x i32> addrspace(2)* %in, <16 x i32> %c) #1 { +define amdgpu_kernel void @smrd_valu_ci_offset_x16(<16 x i32> addrspace(1)* %out, <16 x i32> addrspace(4)* %in, <16 x i32> %c) #1 { entry: %tmp = call i32 @llvm.amdgcn.workitem.id.x() - %tmp2 = getelementptr <16 x i32>, <16 x i32> addrspace(2)* %in, i32 %tmp - %tmp3 = getelementptr <16 x i32>, <16 x i32> addrspace(2)* %tmp2, i32 1234 - %tmp4 = load <16 x i32>, <16 x i32> addrspace(2)* %tmp3 + %tmp2 = getelementptr <16 x i32>, <16 x i32> addrspace(4)* %in, i32 %tmp + %tmp3 = getelementptr <16 x i32>, <16 x i32> addrspace(4)* %tmp2, i32 1234 + %tmp4 = load <16 x i32>, <16 x i32> addrspace(4)* %tmp3 %tmp5 = or <16 x i32> %tmp4, %c store <16 x i32> %tmp5, <16 x i32> addrspace(1)* %out ret void @@ -247,12 +247,12 @@ entry: ; GCN: v_add_i32_e32 [[ADD:v[0-9]+]], vcc, s{{[0-9]+}}, [[MOVED]] ; GCN-NOHSA: buffer_store_dword [[ADD]] ; GCN-HSA: flat_store_dword {{.*}}, [[ADD]] -define amdgpu_kernel void @smrd_valu2_salu_user(i32 addrspace(1)* %out, [8 x i32] addrspace(2)* %in, i32 %a) #1 { +define amdgpu_kernel void @smrd_valu2_salu_user(i32 addrspace(1)* %out, [8 x i32] addrspace(4)* %in, i32 %a) #1 { entry: %tmp = call i32 @llvm.amdgcn.workitem.id.x() %tmp1 = add i32 %tmp, 4 - %tmp2 = getelementptr [8 x i32], [8 x i32] addrspace(2)* %in, i32 %tmp, i32 4 - %tmp3 = load i32, i32 addrspace(2)* %tmp2 + %tmp2 = getelementptr [8 x i32], [8 x i32] addrspace(4)* %in, i32 %tmp, i32 4 + %tmp3 = load i32, i32 addrspace(4)* %tmp2 %tmp4 = add i32 %tmp3, %a store i32 %tmp4, i32 addrspace(1)* %out ret void @@ -261,12 +261,12 @@ entry: ; GCN-LABEL: {{^}}smrd_valu2_max_smrd_offset: ; GCN-NOHSA: buffer_load_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:1020{{$}} ; GCN-HSA flat_load_dword v{{[0-9]}}, v{{[0-9]+:[0-9]+}} -define amdgpu_kernel void @smrd_valu2_max_smrd_offset(i32 addrspace(1)* %out, [1024 x i32] addrspace(2)* %in) #1 { +define amdgpu_kernel void @smrd_valu2_max_smrd_offset(i32 addrspace(1)* %out, [1024 x i32] addrspace(4)* %in) #1 { entry: %tmp = call i32 @llvm.amdgcn.workitem.id.x() %tmp1 = add i32 %tmp, 4 - %tmp2 = getelementptr [1024 x i32], [1024 x i32] addrspace(2)* %in, i32 %tmp, i32 255 - 
%tmp3 = load i32, i32 addrspace(2)* %tmp2 + %tmp2 = getelementptr [1024 x i32], [1024 x i32] addrspace(4)* %in, i32 %tmp, i32 255 + %tmp3 = load i32, i32 addrspace(4)* %tmp2 store i32 %tmp3, i32 addrspace(1)* %out ret void } @@ -275,12 +275,12 @@ entry: ; GCN-NOHSA-NOT: v_add ; GCN-NOHSA: buffer_load_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:1024{{$}} ; GCN-HSA: flat_load_dword v{{[0-9]}}, v[{{[0-9]+:[0-9]+}}] -define amdgpu_kernel void @smrd_valu2_mubuf_offset(i32 addrspace(1)* %out, [1024 x i32] addrspace(2)* %in) #1 { +define amdgpu_kernel void @smrd_valu2_mubuf_offset(i32 addrspace(1)* %out, [1024 x i32] addrspace(4)* %in) #1 { entry: %tmp = call i32 @llvm.amdgcn.workitem.id.x() %tmp1 = add i32 %tmp, 4 - %tmp2 = getelementptr [1024 x i32], [1024 x i32] addrspace(2)* %in, i32 %tmp, i32 256 - %tmp3 = load i32, i32 addrspace(2)* %tmp2 + %tmp2 = getelementptr [1024 x i32], [1024 x i32] addrspace(4)* %in, i32 %tmp, i32 256 + %tmp3 = load i32, i32 addrspace(4)* %tmp2 store i32 %tmp3, i32 addrspace(1)* %out ret void } @@ -290,12 +290,12 @@ entry: ; GCN-NOHSA: buffer_load_dwordx4 ; GCN-HSA: flat_load_dwordx4 ; GCN-HSA: flat_load_dwordx4 -define amdgpu_kernel void @s_load_imm_v8i32(<8 x i32> addrspace(1)* %out, i32 addrspace(2)* nocapture readonly %in) #1 { +define amdgpu_kernel void @s_load_imm_v8i32(<8 x i32> addrspace(1)* %out, i32 addrspace(4)* nocapture readonly %in) #1 { entry: %tmp0 = tail call i32 @llvm.amdgcn.workitem.id.x() - %tmp1 = getelementptr inbounds i32, i32 addrspace(2)* %in, i32 %tmp0 - %tmp2 = bitcast i32 addrspace(2)* %tmp1 to <8 x i32> addrspace(2)* - %tmp3 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp2, align 4 + %tmp1 = getelementptr inbounds i32, i32 addrspace(4)* %in, i32 %tmp0 + %tmp2 = bitcast i32 addrspace(4)* %tmp1 to <8 x i32> addrspace(4)* + %tmp3 = load <8 x i32>, <8 x i32> addrspace(4)* %tmp2, align 4 store <8 x i32> %tmp3, <8 x i32> addrspace(1)* %out, align 32 ret void } @@ -313,12 +313,12 @@ entry: ; GCN-NOHSA: buffer_store_dword ; GCN-HSA: flat_load_dwordx4 ; GCN-HSA: flat_load_dwordx4 -define amdgpu_kernel void @s_load_imm_v8i32_salu_user(i32 addrspace(1)* %out, i32 addrspace(2)* nocapture readonly %in) #1 { +define amdgpu_kernel void @s_load_imm_v8i32_salu_user(i32 addrspace(1)* %out, i32 addrspace(4)* nocapture readonly %in) #1 { entry: %tmp0 = tail call i32 @llvm.amdgcn.workitem.id.x() - %tmp1 = getelementptr inbounds i32, i32 addrspace(2)* %in, i32 %tmp0 - %tmp2 = bitcast i32 addrspace(2)* %tmp1 to <8 x i32> addrspace(2)* - %tmp3 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp2, align 4 + %tmp1 = getelementptr inbounds i32, i32 addrspace(4)* %in, i32 %tmp0 + %tmp2 = bitcast i32 addrspace(4)* %tmp1 to <8 x i32> addrspace(4)* + %tmp3 = load <8 x i32>, <8 x i32> addrspace(4)* %tmp2, align 4 %elt0 = extractelement <8 x i32> %tmp3, i32 0 %elt1 = extractelement <8 x i32> %tmp3, i32 1 @@ -350,12 +350,12 @@ entry: ; GCN-HSA: flat_load_dwordx4 ; GCN-HSA: flat_load_dwordx4 ; GCN-HSA: flat_load_dwordx4 -define amdgpu_kernel void @s_load_imm_v16i32(<16 x i32> addrspace(1)* %out, i32 addrspace(2)* nocapture readonly %in) #1 { +define amdgpu_kernel void @s_load_imm_v16i32(<16 x i32> addrspace(1)* %out, i32 addrspace(4)* nocapture readonly %in) #1 { entry: %tmp0 = tail call i32 @llvm.amdgcn.workitem.id.x() - %tmp1 = getelementptr inbounds i32, i32 addrspace(2)* %in, i32 %tmp0 - %tmp2 = bitcast i32 addrspace(2)* %tmp1 to <16 x i32> addrspace(2)* - %tmp3 = load <16 x i32>, <16 x i32> addrspace(2)* %tmp2, align 4 + %tmp1 = 
getelementptr inbounds i32, i32 addrspace(4)* %in, i32 %tmp0 + %tmp2 = bitcast i32 addrspace(4)* %tmp1 to <16 x i32> addrspace(4)* + %tmp3 = load <16 x i32>, <16 x i32> addrspace(4)* %tmp2, align 4 store <16 x i32> %tmp3, <16 x i32> addrspace(1)* %out, align 32 ret void } @@ -385,12 +385,12 @@ entry: ; GCN-HSA: flat_load_dwordx4 ; GCN-HSA: flat_load_dwordx4 ; GCN-HSA: flat_load_dwordx4 -define amdgpu_kernel void @s_load_imm_v16i32_salu_user(i32 addrspace(1)* %out, i32 addrspace(2)* nocapture readonly %in) #1 { +define amdgpu_kernel void @s_load_imm_v16i32_salu_user(i32 addrspace(1)* %out, i32 addrspace(4)* nocapture readonly %in) #1 { entry: %tmp0 = tail call i32 @llvm.amdgcn.workitem.id.x() - %tmp1 = getelementptr inbounds i32, i32 addrspace(2)* %in, i32 %tmp0 - %tmp2 = bitcast i32 addrspace(2)* %tmp1 to <16 x i32> addrspace(2)* - %tmp3 = load <16 x i32>, <16 x i32> addrspace(2)* %tmp2, align 4 + %tmp1 = getelementptr inbounds i32, i32 addrspace(4)* %in, i32 %tmp0 + %tmp2 = bitcast i32 addrspace(4)* %tmp1 to <16 x i32> addrspace(4)* + %tmp3 = load <16 x i32>, <16 x i32> addrspace(4)* %tmp2, align 4 %elt0 = extractelement <16 x i32> %tmp3, i32 0 %elt1 = extractelement <16 x i32> %tmp3, i32 1 diff --git a/llvm/test/CodeGen/AMDGPU/sched-crash-dbg-value.mir b/llvm/test/CodeGen/AMDGPU/sched-crash-dbg-value.mir index 8684ef2b7c2..41023d0b7ab 100644 --- a/llvm/test/CodeGen/AMDGPU/sched-crash-dbg-value.mir +++ b/llvm/test/CodeGen/AMDGPU/sched-crash-dbg-value.mir @@ -15,11 +15,11 @@ bb: %0 = getelementptr i32, i32 addrspace(1)* %arg1, i64 0, !amdgpu.uniform !3, !amdgpu.noclobber !3 %tmp5 = alloca %struct.wombat, align 16, addrspace(5) - %1 = call noalias nonnull dereferenceable(64) i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() - %2 = bitcast i8 addrspace(2)* %1 to i32 addrspace(2)* - %3 = getelementptr inbounds i32, i32 addrspace(2)* %2, i64 1 - %4 = bitcast i32 addrspace(2)* %3 to <2 x i32> addrspace(2)*, !amdgpu.uniform !3, !amdgpu.noclobber !3 - %5 = load <2 x i32>, <2 x i32> addrspace(2)* %4, align 4, !invariant.load !3 + %1 = call noalias nonnull dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() + %2 = bitcast i8 addrspace(4)* %1 to i32 addrspace(4)* + %3 = getelementptr inbounds i32, i32 addrspace(4)* %2, i64 1 + %4 = bitcast i32 addrspace(4)* %3 to <2 x i32> addrspace(4)*, !amdgpu.uniform !3, !amdgpu.noclobber !3 + %5 = load <2 x i32>, <2 x i32> addrspace(4)* %4, align 4, !invariant.load !3 %6 = extractelement <2 x i32> %5, i32 0 %7 = extractelement <2 x i32> %5, i32 1 %8 = lshr i32 %6, 16 @@ -32,7 +32,7 @@ %15 = add i32 %13, %14 %16 = add i32 %15, %11 %17 = getelementptr inbounds [256 x [16 x i8]], [256 x [16 x i8]] addrspace(3)* @sched_dbg_value_crash.tmp6, i32 0, i32 %16 - %tmp7 = load i64, i64 addrspace(2)* null, align 536870912 + %tmp7 = load i64, i64 addrspace(4)* null, align 536870912 %tmp8 = tail call i32 @llvm.amdgcn.workitem.id.x() #3, !range !4 %tmp9 = zext i32 %tmp8 to i64 %tmp10 = add i64 %tmp7, %tmp9 @@ -141,7 +141,7 @@ declare <2 x float> @llvm.fmuladd.v2f32(<2 x float>, <2 x float>, <2 x float>) #1 declare i32 @llvm.amdgcn.workitem.id.x() #1 declare void @llvm.dbg.value(metadata, metadata, metadata) #1 - declare i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() #1 + declare i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() #1 declare i32 @llvm.amdgcn.workitem.id.y() #1 declare i32 @llvm.amdgcn.workitem.id.z() #1 declare void @llvm.memcpy.p1i8.p5i8.i64(i8 addrspace(1)* nocapture writeonly, i8 addrspace(5)* nocapture readonly, i64, i32, i1) #0 @@ -199,9 +199,9 @@ body: | 
%2:vgpr_32 = COPY $vgpr2 %1:vgpr_32 = COPY $vgpr1 %0:vgpr_32 = COPY $vgpr0 - %5:sreg_64_xexec = S_LOAD_DWORDX2_IMM %4, 0, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`) - %6:sreg_64_xexec = S_LOAD_DWORDX2_IMM %4, 8, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`) - %7:sreg_64_xexec = S_LOAD_DWORDX2_IMM %4, 16, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`) + %5:sreg_64_xexec = S_LOAD_DWORDX2_IMM %4, 0, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(4)* undef`) + %6:sreg_64_xexec = S_LOAD_DWORDX2_IMM %4, 8, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(4)* undef`) + %7:sreg_64_xexec = S_LOAD_DWORDX2_IMM %4, 16, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(4)* undef`) %8:sreg_64_xexec = S_LOAD_DWORDX2_IMM %4, 24, 0 %9:sreg_64_xexec = S_LOAD_DWORDX2_IMM %4, 32, 0 %10:sreg_64_xexec = S_LOAD_DWORDX2_IMM %3, 4, 0 diff --git a/llvm/test/CodeGen/AMDGPU/sext-in-reg.ll b/llvm/test/CodeGen/AMDGPU/sext-in-reg.ll index 4cf284630c2..b8befd5b751 100644 --- a/llvm/test/CodeGen/AMDGPU/sext-in-reg.ll +++ b/llvm/test/CodeGen/AMDGPU/sext-in-reg.ll @@ -528,8 +528,8 @@ define amdgpu_kernel void @v_sext_in_reg_i32_to_i64_move_use(i64 addrspace(1)* % ; GFX89: s_lshl_b32 s{{[0-9]+}}, s{{[0-9]+}}, 15 ; GFX89: s_sext_i32_i16 s{{[0-9]+}}, s{{[0-9]+}} ; GFX89: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 15 -define amdgpu_kernel void @s_sext_in_reg_i1_i16(i16 addrspace(1)* %out, i32 addrspace(2)* %ptr) #0 { - %ld = load i32, i32 addrspace(2)* %ptr +define amdgpu_kernel void @s_sext_in_reg_i1_i16(i16 addrspace(1)* %out, i32 addrspace(4)* %ptr) #0 { + %ld = load i32, i32 addrspace(4)* %ptr %in = trunc i32 %ld to i16 %shl = shl i16 %in, 15 %sext = ashr i16 %shl, 15 @@ -547,8 +547,8 @@ define amdgpu_kernel void @s_sext_in_reg_i1_i16(i16 addrspace(1)* %out, i32 addr ; GFX89: s_lshl_b32 s{{[0-9]+}}, s{{[0-9]+}}, 14 ; GFX89: s_sext_i32_i16 s{{[0-9]+}}, s{{[0-9]+}} ; GFX89: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 14 -define amdgpu_kernel void @s_sext_in_reg_i2_i16(i16 addrspace(1)* %out, i32 addrspace(2)* %ptr) #0 { - %ld = load i32, i32 addrspace(2)* %ptr +define amdgpu_kernel void @s_sext_in_reg_i2_i16(i16 addrspace(1)* %out, i32 addrspace(4)* %ptr) #0 { + %ld = load i32, i32 addrspace(4)* %ptr %in = trunc i32 %ld to i16 %shl = shl i16 %in, 14 %sext = ashr i16 %shl, 14 diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-copy.ll b/llvm/test/CodeGen/AMDGPU/sgpr-copy.ll index 39cbf9312f0..014318964bf 100644 --- a/llvm/test/CodeGen/AMDGPU/sgpr-copy.ll +++ b/llvm/test/CodeGen/AMDGPU/sgpr-copy.ll @@ -4,10 +4,10 @@ ; CHECK-LABEL: {{^}}phi1: ; CHECK: s_buffer_load_dword [[DST:s[0-9]]], {{s\[[0-9]+:[0-9]+\]}}, 0x0 ; CHECK: v_mov_b32_e32 v{{[0-9]}}, [[DST]] -define amdgpu_ps void @phi1(<4 x i32> addrspace(2)* inreg %arg, <4 x i32> addrspace(2)* inreg %arg1, <8 x i32> addrspace(2)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) #0 { +define amdgpu_ps void @phi1(<4 x i32> addrspace(4)* inreg %arg, <4 x i32> addrspace(4)* inreg %arg1, <8 x i32> addrspace(4)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, 
float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) #0 { main_body: - %tmp = getelementptr <4 x i32>, <4 x i32> addrspace(2)* %arg, i32 0 - %tmp20 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp, !tbaa !0 + %tmp = getelementptr <4 x i32>, <4 x i32> addrspace(4)* %arg, i32 0 + %tmp20 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp, !tbaa !0 %tmp21 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp20, i32 0) %tmp22 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp20, i32 16) %tmp23 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp20, i32 32) @@ -28,10 +28,10 @@ ENDIF: ; preds = %ELSE, %main_body ; Make sure this program doesn't crash ; CHECK-LABEL: {{^}}phi2: -define amdgpu_ps void @phi2(<4 x i32> addrspace(2)* inreg %arg, <4 x i32> addrspace(2)* inreg %arg1, <8 x i32> addrspace(2)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) #1 { +define amdgpu_ps void @phi2(<4 x i32> addrspace(4)* inreg %arg, <4 x i32> addrspace(4)* inreg %arg1, <8 x i32> addrspace(4)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) #1 { main_body: - %tmp = getelementptr <4 x i32>, <4 x i32> addrspace(2)* %arg, i32 0 - %tmp20 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp, !tbaa !0 + %tmp = getelementptr <4 x i32>, <4 x i32> addrspace(4)* %arg, i32 0 + %tmp20 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp, !tbaa !0 %tmp21 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp20, i32 16) %tmp22 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp20, i32 32) %tmp23 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp20, i32 36) @@ -47,10 +47,10 @@ main_body: %tmp33 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp20, i32 84) %tmp34 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp20, i32 88) %tmp35 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp20, i32 92) - %tmp36 = getelementptr <8 x i32>, <8 x i32> addrspace(2)* %arg2, i32 0 - %tmp37 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp36, !tbaa !0 - %tmp38 = getelementptr <4 x i32>, <4 x i32> addrspace(2)* %arg1, i32 0 - %tmp39 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp38, !tbaa !0 + %tmp36 = getelementptr <8 x i32>, <8 x i32> addrspace(4)* %arg2, i32 0 + %tmp37 = load <8 x i32>, <8 x i32> addrspace(4)* %tmp36, !tbaa !0 + %tmp38 = getelementptr <4 x i32>, <4 x i32> addrspace(4)* %arg1, i32 0 + %tmp39 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp38, !tbaa !0 %i.i = extractelement <2 x i32> %arg5, i32 0 %j.i = extractelement <2 x i32> %arg5, i32 1 %i.f.i = bitcast i32 %i.i to float @@ -173,10 +173,10 @@ ENDIF24: ; preds = %IF25, %ENDIF ; We just want ot make sure the program doesn't crash ; CHECK-LABEL: {{^}}loop: -define amdgpu_ps void @loop(<4 x i32> addrspace(2)* inreg %arg, <4 x i32> addrspace(2)* inreg %arg1, <8 x i32> addrspace(2)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) #0 { +define amdgpu_ps void @loop(<4 x i32> addrspace(4)* 
inreg %arg, <4 x i32> addrspace(4)* inreg %arg1, <8 x i32> addrspace(4)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) #0 { main_body: - %tmp = getelementptr <4 x i32>, <4 x i32> addrspace(2)* %arg, i32 0 - %tmp20 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp, !tbaa !0 + %tmp = getelementptr <4 x i32>, <4 x i32> addrspace(4)* %arg, i32 0 + %tmp20 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp, !tbaa !0 %tmp21 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp20, i32 0) %tmp22 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp20, i32 4) %tmp23 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp20, i32 8) @@ -226,15 +226,15 @@ ENDIF: ; preds = %LOOP ; CHECK: image_sample v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[SAMPLE_LO]]:[[SAMPLE_HI]]{{\]}} ; CHECK: exp ; CHECK: s_endpgm -define amdgpu_ps void @sample_v3([17 x <4 x i32>] addrspace(2)* byval %arg, [32 x <4 x i32>] addrspace(2)* byval %arg1, [16 x <8 x i32>] addrspace(2)* byval %arg2, float inreg %arg3, i32 inreg %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <3 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, float %arg20) #0 { +define amdgpu_ps void @sample_v3([17 x <4 x i32>] addrspace(4)* byval %arg, [32 x <4 x i32>] addrspace(4)* byval %arg1, [16 x <8 x i32>] addrspace(4)* byval %arg2, float inreg %arg3, i32 inreg %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <3 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, float %arg20) #0 { entry: - %tmp = getelementptr [17 x <4 x i32>], [17 x <4 x i32>] addrspace(2)* %arg, i64 0, i32 0 - %tmp21 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp, !tbaa !0 + %tmp = getelementptr [17 x <4 x i32>], [17 x <4 x i32>] addrspace(4)* %arg, i64 0, i32 0 + %tmp21 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp, !tbaa !0 %tmp22 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 16) - %tmp23 = getelementptr [16 x <8 x i32>], [16 x <8 x i32>] addrspace(2)* %arg2, i64 0, i32 0 - %tmp24 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp23, !tbaa !0 - %tmp25 = getelementptr [32 x <4 x i32>], [32 x <4 x i32>] addrspace(2)* %arg1, i64 0, i32 0 - %tmp26 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp25, !tbaa !0 + %tmp23 = getelementptr [16 x <8 x i32>], [16 x <8 x i32>] addrspace(4)* %arg2, i64 0, i32 0 + %tmp24 = load <8 x i32>, <8 x i32> addrspace(4)* %tmp23, !tbaa !0 + %tmp25 = getelementptr [32 x <4 x i32>], [32 x <4 x i32>] addrspace(4)* %arg1, i64 0, i32 0 + %tmp26 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp25, !tbaa !0 %tmp27 = fcmp oeq float %tmp22, 0.000000e+00 %tmp26.bc = bitcast <4 x i32> %tmp26 to <4 x i32> br i1 %tmp27, label %if, label %else @@ -290,7 +290,7 @@ endif: ; preds = %if1, %if0, %entry ; This test is just checking that we don't crash / assertion fail. 
; CHECK-LABEL: {{^}}copy2: ; CHECK: s_endpgm -define amdgpu_ps void @copy2([17 x <4 x i32>] addrspace(2)* byval %arg, [32 x <4 x i32>] addrspace(2)* byval %arg1, [16 x <8 x i32>] addrspace(2)* byval %arg2, float inreg %arg3, i32 inreg %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <3 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, float %arg20) #0 { +define amdgpu_ps void @copy2([17 x <4 x i32>] addrspace(4)* byval %arg, [32 x <4 x i32>] addrspace(4)* byval %arg1, [16 x <8 x i32>] addrspace(4)* byval %arg2, float inreg %arg3, i32 inreg %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <3 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, float %arg20) #0 { entry: br label %LOOP68 @@ -326,15 +326,15 @@ ENDIF69: ; preds = %LOOP68 ; [[END]]: ; CHECK: image_sample v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+}}:[[ADD]]{{\]}} ; CHECK: s_endpgm -define amdgpu_ps void @sample_rsrc([6 x <4 x i32>] addrspace(2)* byval %arg, [17 x <4 x i32>] addrspace(2)* byval %arg1, [16 x <4 x i32>] addrspace(2)* byval %arg2, [32 x <8 x i32>] addrspace(2)* byval %arg3, float inreg %arg4, i32 inreg %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <3 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, <2 x i32> %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, i32 %arg19, float %arg20, float %arg21) #0 { +define amdgpu_ps void @sample_rsrc([6 x <4 x i32>] addrspace(4)* byval %arg, [17 x <4 x i32>] addrspace(4)* byval %arg1, [16 x <4 x i32>] addrspace(4)* byval %arg2, [32 x <8 x i32>] addrspace(4)* byval %arg3, float inreg %arg4, i32 inreg %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <3 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, <2 x i32> %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, i32 %arg19, float %arg20, float %arg21) #0 { bb: - %tmp = getelementptr [17 x <4 x i32>], [17 x <4 x i32>] addrspace(2)* %arg1, i32 0, i32 0 - %tmp22 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp, !tbaa !3 + %tmp = getelementptr [17 x <4 x i32>], [17 x <4 x i32>] addrspace(4)* %arg1, i32 0, i32 0 + %tmp22 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp, !tbaa !3 %tmp23 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp22, i32 16) - %tmp25 = getelementptr [32 x <8 x i32>], [32 x <8 x i32>] addrspace(2)* %arg3, i32 0, i32 0 - %tmp26 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp25, !tbaa !3 - %tmp27 = getelementptr [16 x <4 x i32>], [16 x <4 x i32>] addrspace(2)* %arg2, i32 0, i32 0 - %tmp28 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp27, !tbaa !3 + %tmp25 = getelementptr [32 x <8 x i32>], [32 x <8 x i32>] addrspace(4)* %arg3, i32 0, i32 0 + %tmp26 = load <8 x i32>, <8 x i32> addrspace(4)* %tmp25, !tbaa !3 + %tmp27 = getelementptr [16 x <4 x i32>], [16 x <4 x i32>] addrspace(4)* %arg2, i32 0, i32 0 + %tmp28 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp27, !tbaa !3 %i.i = extractelement <2 x i32> %arg7, i32 0 %j.i = extractelement <2 x i32> %arg7, i32 1 %i.f.i = bitcast i32 %i.i to float @@ -382,11 +382,11 @@ bb71: ; preds = %bb80, %bb38 ; Check the resource descriptor is stored in an sgpr. 
; CHECK-LABEL: {{^}}mimg_srsrc_sgpr: ; CHECK: image_sample v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0x1 -define amdgpu_ps void @mimg_srsrc_sgpr([34 x <8 x i32>] addrspace(2)* byval %arg) #0 { +define amdgpu_ps void @mimg_srsrc_sgpr([34 x <8 x i32>] addrspace(4)* byval %arg) #0 { bb: %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0 - %tmp7 = getelementptr [34 x <8 x i32>], [34 x <8 x i32>] addrspace(2)* %arg, i32 0, i32 %tid - %tmp8 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp7, align 32, !tbaa !0 + %tmp7 = getelementptr [34 x <8 x i32>], [34 x <8 x i32>] addrspace(4)* %arg, i32 0, i32 %tid + %tmp8 = load <8 x i32>, <8 x i32> addrspace(4)* %tmp7, align 32, !tbaa !0 %tmp = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float> <float 7.500000e-01, float 2.500000e-01>, <8 x i32> %tmp8, <4 x i32> undef, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) %tmp10 = extractelement <4 x float> %tmp, i32 0 %tmp12 = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float undef, float %tmp10) @@ -397,11 +397,11 @@ bb: ; Check the sampler is stored in an sgpr. ; CHECK-LABEL: {{^}}mimg_ssamp_sgpr: ; CHECK: image_sample v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0x1 -define amdgpu_ps void @mimg_ssamp_sgpr([17 x <4 x i32>] addrspace(2)* byval %arg) #0 { +define amdgpu_ps void @mimg_ssamp_sgpr([17 x <4 x i32>] addrspace(4)* byval %arg) #0 { bb: %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0 - %tmp7 = getelementptr [17 x <4 x i32>], [17 x <4 x i32>] addrspace(2)* %arg, i32 0, i32 %tid - %tmp8 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp7, align 16, !tbaa !0 + %tmp7 = getelementptr [17 x <4 x i32>], [17 x <4 x i32>] addrspace(4)* %arg, i32 0, i32 %tid + %tmp8 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp7, align 16, !tbaa !0 %tmp = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float> <float 7.500000e-01, float 2.500000e-01>, <8 x i32> undef, <4 x i32> %tmp8, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) %tmp10 = extractelement <4 x float> %tmp, i32 0 %tmp12 = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %tmp10, float undef) diff --git a/llvm/test/CodeGen/AMDGPU/si-lod-bias.ll b/llvm/test/CodeGen/AMDGPU/si-lod-bias.ll index 42249806650..3b09e1b7a31 100644 --- a/llvm/test/CodeGen/AMDGPU/si-lod-bias.ll +++ b/llvm/test/CodeGen/AMDGPU/si-lod-bias.ll @@ -6,15 +6,15 @@ ; GCN-LABEL: {{^}}main: ; GCN: image_sample_b v{{\[[0-9]:[0-9]\]}}, v{{\[[0-9]:[0-9]\]}}, s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0xf -define amdgpu_ps void @main(<4 x i32> addrspace(2)* inreg %arg, <4 x i32> addrspace(2)* inreg %arg1, <8 x i32> addrspace(2)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) #0 { +define amdgpu_ps void @main(<4 x i32> addrspace(4)* inreg %arg, <4 x i32> addrspace(4)* inreg %arg1, <8 x i32> addrspace(4)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) #0 { main_body: - %tmp = getelementptr <4 x i32>, <4 x i32> addrspace(2)* %arg, i32 0 - %tmp20 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp, !tbaa !0 + %tmp = 
getelementptr <4 x i32>, <4 x i32> addrspace(4)* %arg, i32 0 + %tmp20 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp, !tbaa !0 %tmp21 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp20, i32 16) - %tmp22 = getelementptr <8 x i32>, <8 x i32> addrspace(2)* %arg2, i32 0 - %tmp23 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp22, !tbaa !0 - %tmp24 = getelementptr <4 x i32>, <4 x i32> addrspace(2)* %arg1, i32 0 - %tmp25 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp24, !tbaa !0 + %tmp22 = getelementptr <8 x i32>, <8 x i32> addrspace(4)* %arg2, i32 0 + %tmp23 = load <8 x i32>, <8 x i32> addrspace(4)* %tmp22, !tbaa !0 + %tmp24 = getelementptr <4 x i32>, <4 x i32> addrspace(4)* %arg1, i32 0 + %tmp25 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp24, !tbaa !0 %i.i = extractelement <2 x i32> %arg5, i32 0 %j.i = extractelement <2 x i32> %arg5, i32 1 %i.f.i = bitcast i32 %i.i to float diff --git a/llvm/test/CodeGen/AMDGPU/si-scheduler.ll b/llvm/test/CodeGen/AMDGPU/si-scheduler.ll index 462528c4ff1..540c8283c94 100644 --- a/llvm/test/CodeGen/AMDGPU/si-scheduler.ll +++ b/llvm/test/CodeGen/AMDGPU/si-scheduler.ll @@ -16,12 +16,12 @@ ; CHECK: s_waitcnt vmcnt(0) ; CHECK: exp ; CHECK: s_endpgm -define amdgpu_ps void @main([6 x <16 x i8>] addrspace(2)* byval %arg, [17 x <16 x i8>] addrspace(2)* byval %arg1, [17 x <4 x i32>] addrspace(2)* byval %arg2, [34 x <8 x i32>] addrspace(2)* byval %arg3, float inreg %arg4, i32 inreg %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <3 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, <2 x i32> %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, i32 %arg19, float %arg20, float %arg21) #0 { +define amdgpu_ps void @main([6 x <16 x i8>] addrspace(4)* byval %arg, [17 x <16 x i8>] addrspace(4)* byval %arg1, [17 x <4 x i32>] addrspace(4)* byval %arg2, [34 x <8 x i32>] addrspace(4)* byval %arg3, float inreg %arg4, i32 inreg %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <3 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, <2 x i32> %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, i32 %arg19, float %arg20, float %arg21) #0 { main_body: - %tmp = bitcast [34 x <8 x i32>] addrspace(2)* %arg3 to <32 x i8> addrspace(2)* - %tmp22 = load <32 x i8>, <32 x i8> addrspace(2)* %tmp, align 32, !tbaa !0 - %tmp23 = bitcast [17 x <4 x i32>] addrspace(2)* %arg2 to <16 x i8> addrspace(2)* - %tmp24 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp23, align 16, !tbaa !0 + %tmp = bitcast [34 x <8 x i32>] addrspace(4)* %arg3 to <32 x i8> addrspace(4)* + %tmp22 = load <32 x i8>, <32 x i8> addrspace(4)* %tmp, align 32, !tbaa !0 + %tmp23 = bitcast [17 x <4 x i32>] addrspace(4)* %arg2 to <16 x i8> addrspace(4)* + %tmp24 = load <16 x i8>, <16 x i8> addrspace(4)* %tmp23, align 16, !tbaa !0 %i.i = extractelement <2 x i32> %arg11, i32 0 %j.i = extractelement <2 x i32> %arg11, i32 1 %i.f.i = bitcast i32 %i.i to float diff --git a/llvm/test/CodeGen/AMDGPU/si-sgpr-spill.ll b/llvm/test/CodeGen/AMDGPU/si-sgpr-spill.ll index 3e70f2c7782..f3f2611855a 100644 --- a/llvm/test/CodeGen/AMDGPU/si-sgpr-spill.ll +++ b/llvm/test/CodeGen/AMDGPU/si-sgpr-spill.ll @@ -24,10 +24,10 @@ ; GCN: s_endpgm ; TOVGPR: ScratchSize: 0{{$}} -define amdgpu_ps void @main([17 x <4 x i32>] addrspace(2)* byval %arg, [32 x <4 x i32>] addrspace(2)* byval %arg1, [16 x <8 x i32>] addrspace(2)* byval %arg2, float inreg %arg3, i32 inreg %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <3 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, <2 x 
i32> %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, float %arg20) { +define amdgpu_ps void @main([17 x <4 x i32>] addrspace(4)* byval %arg, [32 x <4 x i32>] addrspace(4)* byval %arg1, [16 x <8 x i32>] addrspace(4)* byval %arg2, float inreg %arg3, i32 inreg %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <3 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, float %arg20) { main_body: - %tmp = getelementptr [17 x <4 x i32>], [17 x <4 x i32>] addrspace(2)* %arg, i64 0, i32 0 - %tmp21 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp, !tbaa !0 + %tmp = getelementptr [17 x <4 x i32>], [17 x <4 x i32>] addrspace(4)* %arg, i64 0, i32 0 + %tmp21 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp, !tbaa !0 %tmp22 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 96) %tmp23 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 100) %tmp24 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 104) @@ -66,39 +66,39 @@ main_body: %tmp57 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 372) %tmp58 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 376) %tmp59 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 384) - %tmp60 = getelementptr [16 x <8 x i32>], [16 x <8 x i32>] addrspace(2)* %arg2, i64 0, i32 0 - %tmp61 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp60, !tbaa !0 - %tmp62 = getelementptr [32 x <4 x i32>], [32 x <4 x i32>] addrspace(2)* %arg1, i64 0, i32 0 - %tmp63 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp62, !tbaa !0 + %tmp60 = getelementptr [16 x <8 x i32>], [16 x <8 x i32>] addrspace(4)* %arg2, i64 0, i32 0 + %tmp61 = load <8 x i32>, <8 x i32> addrspace(4)* %tmp60, !tbaa !0 + %tmp62 = getelementptr [32 x <4 x i32>], [32 x <4 x i32>] addrspace(4)* %arg1, i64 0, i32 0 + %tmp63 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp62, !tbaa !0 %tmp63.bc = bitcast <4 x i32> %tmp63 to <4 x i32> - %tmp64 = getelementptr [16 x <8 x i32>], [16 x <8 x i32>] addrspace(2)* %arg2, i64 0, i32 1 - %tmp65 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp64, !tbaa !0 - %tmp66 = getelementptr [32 x <4 x i32>], [32 x <4 x i32>] addrspace(2)* %arg1, i64 0, i32 1 - %tmp67 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp66, !tbaa !0 - %tmp68 = getelementptr [16 x <8 x i32>], [16 x <8 x i32>] addrspace(2)* %arg2, i64 0, i32 2 - %tmp69 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp68, !tbaa !0 - %tmp70 = getelementptr [32 x <4 x i32>], [32 x <4 x i32>] addrspace(2)* %arg1, i64 0, i32 2 - %tmp71 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp70, !tbaa !0 - %tmp72 = getelementptr [16 x <8 x i32>], [16 x <8 x i32>] addrspace(2)* %arg2, i64 0, i32 3 - %tmp73 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp72, !tbaa !0 - %tmp74 = getelementptr [32 x <4 x i32>], [32 x <4 x i32>] addrspace(2)* %arg1, i64 0, i32 3 - %tmp75 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp74, !tbaa !0 - %tmp76 = getelementptr [16 x <8 x i32>], [16 x <8 x i32>] addrspace(2)* %arg2, i64 0, i32 4 - %tmp77 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp76, !tbaa !0 - %tmp78 = getelementptr [32 x <4 x i32>], [32 x <4 x i32>] addrspace(2)* %arg1, i64 0, i32 4 - %tmp79 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp78, !tbaa !0 - %tmp80 = getelementptr [16 x <8 x i32>], [16 x <8 x i32>] addrspace(2)* %arg2, i64 0, i32 5 - %tmp81 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp80, !tbaa !0 - %tmp82 = 
getelementptr [32 x <4 x i32>], [32 x <4 x i32>] addrspace(2)* %arg1, i64 0, i32 5 - %tmp83 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp82, !tbaa !0 - %tmp84 = getelementptr [16 x <8 x i32>], [16 x <8 x i32>] addrspace(2)* %arg2, i64 0, i32 6 - %tmp85 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp84, !tbaa !0 - %tmp86 = getelementptr [32 x <4 x i32>], [32 x <4 x i32>] addrspace(2)* %arg1, i64 0, i32 6 - %tmp87 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp86, !tbaa !0 - %tmp88 = getelementptr [16 x <8 x i32>], [16 x <8 x i32>] addrspace(2)* %arg2, i64 0, i32 7 - %tmp89 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp88, !tbaa !0 - %tmp90 = getelementptr [32 x <4 x i32>], [32 x <4 x i32>] addrspace(2)* %arg1, i64 0, i32 7 - %tmp91 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp90, !tbaa !0 + %tmp64 = getelementptr [16 x <8 x i32>], [16 x <8 x i32>] addrspace(4)* %arg2, i64 0, i32 1 + %tmp65 = load <8 x i32>, <8 x i32> addrspace(4)* %tmp64, !tbaa !0 + %tmp66 = getelementptr [32 x <4 x i32>], [32 x <4 x i32>] addrspace(4)* %arg1, i64 0, i32 1 + %tmp67 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp66, !tbaa !0 + %tmp68 = getelementptr [16 x <8 x i32>], [16 x <8 x i32>] addrspace(4)* %arg2, i64 0, i32 2 + %tmp69 = load <8 x i32>, <8 x i32> addrspace(4)* %tmp68, !tbaa !0 + %tmp70 = getelementptr [32 x <4 x i32>], [32 x <4 x i32>] addrspace(4)* %arg1, i64 0, i32 2 + %tmp71 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp70, !tbaa !0 + %tmp72 = getelementptr [16 x <8 x i32>], [16 x <8 x i32>] addrspace(4)* %arg2, i64 0, i32 3 + %tmp73 = load <8 x i32>, <8 x i32> addrspace(4)* %tmp72, !tbaa !0 + %tmp74 = getelementptr [32 x <4 x i32>], [32 x <4 x i32>] addrspace(4)* %arg1, i64 0, i32 3 + %tmp75 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp74, !tbaa !0 + %tmp76 = getelementptr [16 x <8 x i32>], [16 x <8 x i32>] addrspace(4)* %arg2, i64 0, i32 4 + %tmp77 = load <8 x i32>, <8 x i32> addrspace(4)* %tmp76, !tbaa !0 + %tmp78 = getelementptr [32 x <4 x i32>], [32 x <4 x i32>] addrspace(4)* %arg1, i64 0, i32 4 + %tmp79 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp78, !tbaa !0 + %tmp80 = getelementptr [16 x <8 x i32>], [16 x <8 x i32>] addrspace(4)* %arg2, i64 0, i32 5 + %tmp81 = load <8 x i32>, <8 x i32> addrspace(4)* %tmp80, !tbaa !0 + %tmp82 = getelementptr [32 x <4 x i32>], [32 x <4 x i32>] addrspace(4)* %arg1, i64 0, i32 5 + %tmp83 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp82, !tbaa !0 + %tmp84 = getelementptr [16 x <8 x i32>], [16 x <8 x i32>] addrspace(4)* %arg2, i64 0, i32 6 + %tmp85 = load <8 x i32>, <8 x i32> addrspace(4)* %tmp84, !tbaa !0 + %tmp86 = getelementptr [32 x <4 x i32>], [32 x <4 x i32>] addrspace(4)* %arg1, i64 0, i32 6 + %tmp87 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp86, !tbaa !0 + %tmp88 = getelementptr [16 x <8 x i32>], [16 x <8 x i32>] addrspace(4)* %arg2, i64 0, i32 7 + %tmp89 = load <8 x i32>, <8 x i32> addrspace(4)* %tmp88, !tbaa !0 + %tmp90 = getelementptr [32 x <4 x i32>], [32 x <4 x i32>] addrspace(4)* %arg1, i64 0, i32 7 + %tmp91 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp90, !tbaa !0 %i.i = extractelement <2 x i32> %arg6, i32 0 %j.i = extractelement <2 x i32> %arg6, i32 1 %i.f.i = bitcast i32 %i.i to float @@ -778,10 +778,10 @@ ENDIF66: ; preds = %LOOP65 ; GCN-LABEL: {{^}}main1: ; GCN: s_endpgm ; TOVGPR: ScratchSize: 0{{$}} -define amdgpu_ps void @main1([17 x <4 x i32>] addrspace(2)* byval %arg, [32 x <4 x i32>] addrspace(2)* byval %arg1, [16 x <8 x i32>] addrspace(2)* byval %arg2, float inreg %arg3, i32 inreg %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <2 x i32> 
%arg7, <3 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, float %arg20) #0 { +define amdgpu_ps void @main1([17 x <4 x i32>] addrspace(4)* byval %arg, [32 x <4 x i32>] addrspace(4)* byval %arg1, [16 x <8 x i32>] addrspace(4)* byval %arg2, float inreg %arg3, i32 inreg %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <3 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, float %arg20) #0 { main_body: - %tmp = getelementptr [17 x <4 x i32>], [17 x <4 x i32>] addrspace(2)* %arg, i64 0, i32 0 - %tmp21 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp, !tbaa !0 + %tmp = getelementptr [17 x <4 x i32>], [17 x <4 x i32>] addrspace(4)* %arg, i64 0, i32 0 + %tmp21 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp, !tbaa !0 %tmp22 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 0) %tmp23 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 4) %tmp24 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 8) @@ -885,42 +885,42 @@ main_body: %tmp122 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 716) %tmp123 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 864) %tmp124 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 868) - %tmp125 = getelementptr [16 x <8 x i32>], [16 x <8 x i32>] addrspace(2)* %arg2, i64 0, i32 0 - %tmp126 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp125, !tbaa !0 - %tmp127 = getelementptr [32 x <4 x i32>], [32 x <4 x i32>] addrspace(2)* %arg1, i64 0, i32 0 - %tmp128 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp127, !tbaa !0 - %tmp129 = getelementptr [16 x <8 x i32>], [16 x <8 x i32>] addrspace(2)* %arg2, i64 0, i32 1 - %tmp130 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp129, !tbaa !0 - %tmp131 = getelementptr [32 x <4 x i32>], [32 x <4 x i32>] addrspace(2)* %arg1, i64 0, i32 1 - %tmp132 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp131, !tbaa !0 - %tmp133 = getelementptr [16 x <8 x i32>], [16 x <8 x i32>] addrspace(2)* %arg2, i64 0, i32 2 - %tmp134 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp133, !tbaa !0 - %tmp135 = getelementptr [32 x <4 x i32>], [32 x <4 x i32>] addrspace(2)* %arg1, i64 0, i32 2 - %tmp136 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp135, !tbaa !0 - %tmp137 = getelementptr [16 x <8 x i32>], [16 x <8 x i32>] addrspace(2)* %arg2, i64 0, i32 3 - %tmp138 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp137, !tbaa !0 - %tmp139 = getelementptr [32 x <4 x i32>], [32 x <4 x i32>] addrspace(2)* %arg1, i64 0, i32 3 - %tmp140 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp139, !tbaa !0 - %tmp141 = getelementptr [16 x <8 x i32>], [16 x <8 x i32>] addrspace(2)* %arg2, i64 0, i32 4 - %tmp142 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp141, !tbaa !0 - %tmp143 = getelementptr [32 x <4 x i32>], [32 x <4 x i32>] addrspace(2)* %arg1, i64 0, i32 4 - %tmp144 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp143, !tbaa !0 - %tmp145 = getelementptr [16 x <8 x i32>], [16 x <8 x i32>] addrspace(2)* %arg2, i64 0, i32 5 - %tmp146 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp145, !tbaa !0 - %tmp147 = getelementptr [32 x <4 x i32>], [32 x <4 x i32>] addrspace(2)* %arg1, i64 0, i32 5 - %tmp148 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp147, !tbaa !0 - %tmp149 = getelementptr [16 x <8 x i32>], [16 x <8 x i32>] addrspace(2)* %arg2, i64 0, i32 6 - %tmp150 = load <8 x i32>, 
<8 x i32> addrspace(2)* %tmp149, !tbaa !0 - %tmp151 = getelementptr [32 x <4 x i32>], [32 x <4 x i32>] addrspace(2)* %arg1, i64 0, i32 6 - %tmp152 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp151, !tbaa !0 - %tmp153 = getelementptr [16 x <8 x i32>], [16 x <8 x i32>] addrspace(2)* %arg2, i64 0, i32 7 - %tmp154 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp153, !tbaa !0 - %tmp155 = getelementptr [32 x <4 x i32>], [32 x <4 x i32>] addrspace(2)* %arg1, i64 0, i32 7 - %tmp156 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp155, !tbaa !0 - %tmp157 = getelementptr [16 x <8 x i32>], [16 x <8 x i32>] addrspace(2)* %arg2, i64 0, i32 8 - %tmp158 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp157, !tbaa !0 - %tmp159 = getelementptr [32 x <4 x i32>], [32 x <4 x i32>] addrspace(2)* %arg1, i64 0, i32 8 - %tmp160 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp159, !tbaa !0 + %tmp125 = getelementptr [16 x <8 x i32>], [16 x <8 x i32>] addrspace(4)* %arg2, i64 0, i32 0 + %tmp126 = load <8 x i32>, <8 x i32> addrspace(4)* %tmp125, !tbaa !0 + %tmp127 = getelementptr [32 x <4 x i32>], [32 x <4 x i32>] addrspace(4)* %arg1, i64 0, i32 0 + %tmp128 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp127, !tbaa !0 + %tmp129 = getelementptr [16 x <8 x i32>], [16 x <8 x i32>] addrspace(4)* %arg2, i64 0, i32 1 + %tmp130 = load <8 x i32>, <8 x i32> addrspace(4)* %tmp129, !tbaa !0 + %tmp131 = getelementptr [32 x <4 x i32>], [32 x <4 x i32>] addrspace(4)* %arg1, i64 0, i32 1 + %tmp132 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp131, !tbaa !0 + %tmp133 = getelementptr [16 x <8 x i32>], [16 x <8 x i32>] addrspace(4)* %arg2, i64 0, i32 2 + %tmp134 = load <8 x i32>, <8 x i32> addrspace(4)* %tmp133, !tbaa !0 + %tmp135 = getelementptr [32 x <4 x i32>], [32 x <4 x i32>] addrspace(4)* %arg1, i64 0, i32 2 + %tmp136 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp135, !tbaa !0 + %tmp137 = getelementptr [16 x <8 x i32>], [16 x <8 x i32>] addrspace(4)* %arg2, i64 0, i32 3 + %tmp138 = load <8 x i32>, <8 x i32> addrspace(4)* %tmp137, !tbaa !0 + %tmp139 = getelementptr [32 x <4 x i32>], [32 x <4 x i32>] addrspace(4)* %arg1, i64 0, i32 3 + %tmp140 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp139, !tbaa !0 + %tmp141 = getelementptr [16 x <8 x i32>], [16 x <8 x i32>] addrspace(4)* %arg2, i64 0, i32 4 + %tmp142 = load <8 x i32>, <8 x i32> addrspace(4)* %tmp141, !tbaa !0 + %tmp143 = getelementptr [32 x <4 x i32>], [32 x <4 x i32>] addrspace(4)* %arg1, i64 0, i32 4 + %tmp144 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp143, !tbaa !0 + %tmp145 = getelementptr [16 x <8 x i32>], [16 x <8 x i32>] addrspace(4)* %arg2, i64 0, i32 5 + %tmp146 = load <8 x i32>, <8 x i32> addrspace(4)* %tmp145, !tbaa !0 + %tmp147 = getelementptr [32 x <4 x i32>], [32 x <4 x i32>] addrspace(4)* %arg1, i64 0, i32 5 + %tmp148 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp147, !tbaa !0 + %tmp149 = getelementptr [16 x <8 x i32>], [16 x <8 x i32>] addrspace(4)* %arg2, i64 0, i32 6 + %tmp150 = load <8 x i32>, <8 x i32> addrspace(4)* %tmp149, !tbaa !0 + %tmp151 = getelementptr [32 x <4 x i32>], [32 x <4 x i32>] addrspace(4)* %arg1, i64 0, i32 6 + %tmp152 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp151, !tbaa !0 + %tmp153 = getelementptr [16 x <8 x i32>], [16 x <8 x i32>] addrspace(4)* %arg2, i64 0, i32 7 + %tmp154 = load <8 x i32>, <8 x i32> addrspace(4)* %tmp153, !tbaa !0 + %tmp155 = getelementptr [32 x <4 x i32>], [32 x <4 x i32>] addrspace(4)* %arg1, i64 0, i32 7 + %tmp156 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp155, !tbaa !0 + %tmp157 = getelementptr [16 x <8 x i32>], [16 
x <8 x i32>] addrspace(4)* %arg2, i64 0, i32 8 + %tmp158 = load <8 x i32>, <8 x i32> addrspace(4)* %tmp157, !tbaa !0 + %tmp159 = getelementptr [32 x <4 x i32>], [32 x <4 x i32>] addrspace(4)* %arg1, i64 0, i32 8 + %tmp160 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp159, !tbaa !0 %tmp161 = fcmp ugt float %arg17, 0.000000e+00 %tmp162 = select i1 %tmp161, float 1.000000e+00, float 0.000000e+00 %i.i = extractelement <2 x i32> %arg6, i32 0 diff --git a/llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll b/llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll index 6d2f89ab96d..7c0f5b9a471 100644 --- a/llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll +++ b/llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll @@ -8,7 +8,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() #2 @stored_lds_ptr = addrspace(3) global i32 addrspace(3)* undef, align 4 -@stored_constant_ptr = addrspace(3) global i32 addrspace(2)* undef, align 8 +@stored_constant_ptr = addrspace(3) global i32 addrspace(4)* undef, align 8 @stored_global_ptr = addrspace(3) global i32 addrspace(1)* undef, align 8 ; GCN-LABEL: {{^}}reorder_local_load_global_store_local_load: @@ -100,14 +100,14 @@ define amdgpu_kernel void @no_reorder_barrier_local_load_global_store_local_load ; CI: buffer_store_dword ; GFX9: global_store_dword define amdgpu_kernel void @reorder_constant_load_global_store_constant_load(i32 addrspace(1)* %out, i32 addrspace(1)* %gptr) #0 { - %ptr0 = load i32 addrspace(2)*, i32 addrspace(2)* addrspace(3)* @stored_constant_ptr, align 8 + %ptr0 = load i32 addrspace(4)*, i32 addrspace(4)* addrspace(3)* @stored_constant_ptr, align 8 - %ptr1 = getelementptr inbounds i32, i32 addrspace(2)* %ptr0, i64 1 - %ptr2 = getelementptr inbounds i32, i32 addrspace(2)* %ptr0, i64 3 + %ptr1 = getelementptr inbounds i32, i32 addrspace(4)* %ptr0, i64 1 + %ptr2 = getelementptr inbounds i32, i32 addrspace(4)* %ptr0, i64 3 - %tmp1 = load i32, i32 addrspace(2)* %ptr1, align 4 + %tmp1 = load i32, i32 addrspace(4)* %ptr1, align 4 store i32 99, i32 addrspace(1)* %gptr, align 4 - %tmp2 = load i32, i32 addrspace(2)* %ptr2, align 4 + %tmp2 = load i32, i32 addrspace(4)* %ptr2, align 4 %add = add nsw i32 %tmp1, %tmp2 @@ -129,14 +129,14 @@ define amdgpu_kernel void @reorder_constant_load_global_store_constant_load(i32 ; CI: buffer_store_dword ; GFX9: global_store_dword define amdgpu_kernel void @reorder_constant_load_local_store_constant_load(i32 addrspace(1)* %out, i32 addrspace(3)* %lptr) #0 { - %ptr0 = load i32 addrspace(2)*, i32 addrspace(2)* addrspace(3)* @stored_constant_ptr, align 8 + %ptr0 = load i32 addrspace(4)*, i32 addrspace(4)* addrspace(3)* @stored_constant_ptr, align 8 - %ptr1 = getelementptr inbounds i32, i32 addrspace(2)* %ptr0, i64 1 - %ptr2 = getelementptr inbounds i32, i32 addrspace(2)* %ptr0, i64 3 + %ptr1 = getelementptr inbounds i32, i32 addrspace(4)* %ptr0, i64 1 + %ptr2 = getelementptr inbounds i32, i32 addrspace(4)* %ptr0, i64 3 - %tmp1 = load i32, i32 addrspace(2)* %ptr1, align 4 + %tmp1 = load i32, i32 addrspace(4)* %ptr1, align 4 store i32 99, i32 addrspace(3)* %lptr, align 4 - %tmp2 = load i32, i32 addrspace(2)* %ptr2, align 4 + %tmp2 = load i32, i32 addrspace(4)* %ptr2, align 4 %add = add nsw i32 %tmp1, %tmp2 @@ -151,13 +151,13 @@ define amdgpu_kernel void @reorder_constant_load_local_store_constant_load(i32 a ; GCN: ds_write_b32 ; CI: buffer_store_dword ; GFX9: global_store_dword -define amdgpu_kernel void @reorder_smrd_load_local_store_smrd_load(i32 addrspace(1)* %out, i32 addrspace(3)* noalias %lptr, i32 addrspace(2)* 
%ptr0) #0 { - %ptr1 = getelementptr inbounds i32, i32 addrspace(2)* %ptr0, i64 1 - %ptr2 = getelementptr inbounds i32, i32 addrspace(2)* %ptr0, i64 2 +define amdgpu_kernel void @reorder_smrd_load_local_store_smrd_load(i32 addrspace(1)* %out, i32 addrspace(3)* noalias %lptr, i32 addrspace(4)* %ptr0) #0 { + %ptr1 = getelementptr inbounds i32, i32 addrspace(4)* %ptr0, i64 1 + %ptr2 = getelementptr inbounds i32, i32 addrspace(4)* %ptr0, i64 2 - %tmp1 = load i32, i32 addrspace(2)* %ptr1, align 4 + %tmp1 = load i32, i32 addrspace(4)* %ptr1, align 4 store i32 99, i32 addrspace(3)* %lptr, align 4 - %tmp2 = load i32, i32 addrspace(2)* %ptr2, align 4 + %tmp2 = load i32, i32 addrspace(4)* %ptr2, align 4 %add = add nsw i32 %tmp1, %tmp2 diff --git a/llvm/test/CodeGen/AMDGPU/smrd-vccz-bug.ll b/llvm/test/CodeGen/AMDGPU/smrd-vccz-bug.ll index 0eaa28b39bc..3e0fd566cc7 100644 --- a/llvm/test/CodeGen/AMDGPU/smrd-vccz-bug.ll +++ b/llvm/test/CodeGen/AMDGPU/smrd-vccz-bug.ll @@ -12,10 +12,10 @@ ; GCN: buffer_store_dword ; GCN: [[EXIT]]: ; GCN: s_endpgm -define amdgpu_kernel void @vccz_workaround(i32 addrspace(2)* %in, i32 addrspace(1)* %out, float %cond) { +define amdgpu_kernel void @vccz_workaround(i32 addrspace(4)* %in, i32 addrspace(1)* %out, float %cond) { entry: %cnd = fcmp oeq float 0.0, %cond - %sgpr = load volatile i32, i32 addrspace(2)* %in + %sgpr = load volatile i32, i32 addrspace(4)* %in br i1 %cnd, label %if, label %endif if: diff --git a/llvm/test/CodeGen/AMDGPU/smrd.ll b/llvm/test/CodeGen/AMDGPU/smrd.ll index a8eaeab85e1..a326942e43d 100644 --- a/llvm/test/CodeGen/AMDGPU/smrd.ll +++ b/llvm/test/CodeGen/AMDGPU/smrd.ll @@ -7,10 +7,10 @@ ; GCN-LABEL: {{^}}smrd0: ; SICI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x1 ; encoding: [0x01 ; VIGFX9: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x4 -define amdgpu_kernel void @smrd0(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) #0 { +define amdgpu_kernel void @smrd0(i32 addrspace(1)* %out, i32 addrspace(4)* %ptr) #0 { entry: - %tmp = getelementptr i32, i32 addrspace(2)* %ptr, i64 1 - %tmp1 = load i32, i32 addrspace(2)* %tmp + %tmp = getelementptr i32, i32 addrspace(4)* %ptr, i64 1 + %tmp1 = load i32, i32 addrspace(4)* %tmp store i32 %tmp1, i32 addrspace(1)* %out ret void } @@ -19,10 +19,10 @@ entry: ; GCN-LABEL: {{^}}smrd1: ; SICI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xff ; encoding: [0xff,0x{{[0-9]+[137]}} ; VIGFX9: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x3fc -define amdgpu_kernel void @smrd1(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) #0 { +define amdgpu_kernel void @smrd1(i32 addrspace(1)* %out, i32 addrspace(4)* %ptr) #0 { entry: - %tmp = getelementptr i32, i32 addrspace(2)* %ptr, i64 255 - %tmp1 = load i32, i32 addrspace(2)* %tmp + %tmp = getelementptr i32, i32 addrspace(4)* %ptr, i64 255 + %tmp1 = load i32, i32 addrspace(4)* %tmp store i32 %tmp1, i32 addrspace(1)* %out ret void } @@ -34,10 +34,10 @@ entry: ; CI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x100 ; VIGFX9: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x400 ; GCN: s_endpgm -define amdgpu_kernel void @smrd2(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) #0 { +define amdgpu_kernel void @smrd2(i32 addrspace(1)* %out, i32 addrspace(4)* %ptr) #0 { entry: - %tmp = getelementptr i32, i32 addrspace(2)* %ptr, i64 256 - %tmp1 = load i32, i32 addrspace(2)* %tmp + %tmp = getelementptr i32, i32 addrspace(4)* %ptr, i64 256 + %tmp1 = load i32, i32 addrspace(4)* %tmp store i32 %tmp1, i32 addrspace(1)* %out ret void } @@ -49,10 +49,10 @@ entry: ; SI: s_load_dwordx2 
s[{{[0-9]:[0-9]}}], s[{{[0-9]:[0-9]}}], 0xb ; encoding: [0x0b ; TODO: Add VI checks ; GCN: s_endpgm -define amdgpu_kernel void @smrd3(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) #0 { +define amdgpu_kernel void @smrd3(i32 addrspace(1)* %out, i32 addrspace(4)* %ptr) #0 { entry: - %tmp = getelementptr i32, i32 addrspace(2)* %ptr, i64 4294967296 - %tmp1 = load i32, i32 addrspace(2)* %tmp + %tmp = getelementptr i32, i32 addrspace(4)* %ptr, i64 4294967296 + %tmp1 = load i32, i32 addrspace(4)* %tmp store i32 %tmp1, i32 addrspace(1)* %out ret void } @@ -63,10 +63,10 @@ entry: ; SI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], [[OFFSET]] ; CI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x3ffff ; VIGFX9: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xffffc -define amdgpu_kernel void @smrd4(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) #0 { +define amdgpu_kernel void @smrd4(i32 addrspace(1)* %out, i32 addrspace(4)* %ptr) #0 { entry: - %tmp = getelementptr i32, i32 addrspace(2)* %ptr, i64 262143 - %tmp1 = load i32, i32 addrspace(2)* %tmp + %tmp = getelementptr i32, i32 addrspace(4)* %ptr, i64 262143 + %tmp1 = load i32, i32 addrspace(4)* %tmp store i32 %tmp1, i32 addrspace(1)* %out ret void } @@ -77,10 +77,10 @@ entry: ; SIVIGFX9: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], [[OFFSET]] ; CI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x40000 ; GCN: s_endpgm -define amdgpu_kernel void @smrd5(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) #0 { +define amdgpu_kernel void @smrd5(i32 addrspace(1)* %out, i32 addrspace(4)* %ptr) #0 { entry: - %tmp = getelementptr i32, i32 addrspace(2)* %ptr, i64 262144 - %tmp1 = load i32, i32 addrspace(2)* %tmp + %tmp = getelementptr i32, i32 addrspace(4)* %ptr, i64 262144 + %tmp1 = load i32, i32 addrspace(4)* %tmp store i32 %tmp1, i32 addrspace(1)* %out ret void } @@ -106,10 +106,10 @@ main_body: ; GCN-LABEL: {{^}}smrd_load_const0: ; SICI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x4 ; encoding: [0x04 ; VIGFX9: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x10 -define amdgpu_ps void @smrd_load_const0(<4 x i32> addrspace(2)* inreg %arg, <4 x i32> addrspace(2)* inreg %arg1, <32 x i8> addrspace(2)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) #0 { +define amdgpu_ps void @smrd_load_const0(<4 x i32> addrspace(4)* inreg %arg, <4 x i32> addrspace(4)* inreg %arg1, <32 x i8> addrspace(4)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) #0 { main_body: - %tmp = getelementptr <4 x i32>, <4 x i32> addrspace(2)* %arg, i32 0 - %tmp20 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp + %tmp = getelementptr <4 x i32>, <4 x i32> addrspace(4)* %arg, i32 0 + %tmp20 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp %tmp21 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp20, i32 16) call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %tmp21, float %tmp21, float %tmp21, float %tmp21, i1 true, i1 true) #0 ret void @@ -120,10 +120,10 @@ main_body: ; GCN-LABEL: {{^}}smrd_load_const1: ; SICI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xff ; encoding: [0xff ; VIGFX9: s_buffer_load_dword s{{[0-9]}}, 
s[{{[0-9]:[0-9]}}], 0x3fc -define amdgpu_ps void @smrd_load_const1(<4 x i32> addrspace(2)* inreg %arg, <4 x i32> addrspace(2)* inreg %arg1, <32 x i8> addrspace(2)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) #0 { +define amdgpu_ps void @smrd_load_const1(<4 x i32> addrspace(4)* inreg %arg, <4 x i32> addrspace(4)* inreg %arg1, <32 x i8> addrspace(4)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) #0 { main_body: - %tmp = getelementptr <4 x i32>, <4 x i32> addrspace(2)* %arg, i32 0 - %tmp20 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp + %tmp = getelementptr <4 x i32>, <4 x i32> addrspace(4)* %arg, i32 0 + %tmp20 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp %tmp21 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp20, i32 1020) call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %tmp21, float %tmp21, float %tmp21, float %tmp21, i1 true, i1 true) #0 ret void @@ -137,10 +137,10 @@ main_body: ; SI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], s[[OFFSET]] ; encoding: [0x0[[OFFSET]] ; CI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x100 ; VIGFX9: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x400 -define amdgpu_ps void @smrd_load_const2(<4 x i32> addrspace(2)* inreg %arg, <4 x i32> addrspace(2)* inreg %arg1, <32 x i8> addrspace(2)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) #0 { +define amdgpu_ps void @smrd_load_const2(<4 x i32> addrspace(4)* inreg %arg, <4 x i32> addrspace(4)* inreg %arg1, <32 x i8> addrspace(4)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) #0 { main_body: - %tmp = getelementptr <4 x i32>, <4 x i32> addrspace(2)* %arg, i32 0 - %tmp20 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp + %tmp = getelementptr <4 x i32>, <4 x i32> addrspace(4)* %arg, i32 0 + %tmp20 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp %tmp21 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp20, i32 1024) call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %tmp21, float %tmp21, float %tmp21, float %tmp21, i1 true, i1 true) #0 ret void @@ -152,10 +152,10 @@ main_body: ; SI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], [[OFFSET]] ; CI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x3ffff ; VIGFX9: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xffffc -define amdgpu_ps void @smrd_load_const3(<4 x i32> addrspace(2)* inreg %arg, <4 x i32> addrspace(2)* inreg %arg1, <32 x i8> addrspace(2)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) 
#0 { +define amdgpu_ps void @smrd_load_const3(<4 x i32> addrspace(4)* inreg %arg, <4 x i32> addrspace(4)* inreg %arg1, <32 x i8> addrspace(4)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) #0 { main_body: - %tmp = getelementptr <4 x i32>, <4 x i32> addrspace(2)* %arg, i32 0 - %tmp20 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp + %tmp = getelementptr <4 x i32>, <4 x i32> addrspace(4)* %arg, i32 0 + %tmp20 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp %tmp21 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp20, i32 1048572) call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %tmp21, float %tmp21, float %tmp21, float %tmp21, i1 true, i1 true) #0 ret void @@ -167,10 +167,10 @@ main_body: ; SIVIGFX9: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], [[OFFSET]] ; CI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x40000 ; GCN: s_endpgm -define amdgpu_ps void @smrd_load_const4(<4 x i32> addrspace(2)* inreg %arg, <4 x i32> addrspace(2)* inreg %arg1, <32 x i8> addrspace(2)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) #0 { +define amdgpu_ps void @smrd_load_const4(<4 x i32> addrspace(4)* inreg %arg, <4 x i32> addrspace(4)* inreg %arg1, <32 x i8> addrspace(4)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) #0 { main_body: - %tmp = getelementptr <4 x i32>, <4 x i32> addrspace(2)* %arg, i32 0 - %tmp20 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp + %tmp = getelementptr <4 x i32>, <4 x i32> addrspace(4)* %arg, i32 0 + %tmp20 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp %tmp21 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp20, i32 1048576) call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %tmp21, float %tmp21, float %tmp21, float %tmp21, i1 true, i1 true) #0 ret void @@ -257,9 +257,9 @@ main_body: ; GCN-LABEL: {{^}}smrd_sgpr_descriptor_promoted ; GCN: v_readfirstlane -define amdgpu_cs void @smrd_sgpr_descriptor_promoted([0 x i8] addrspace(2)* inreg noalias dereferenceable(18446744073709551615), i32) #0 { +define amdgpu_cs void @smrd_sgpr_descriptor_promoted([0 x i8] addrspace(4)* inreg noalias dereferenceable(18446744073709551615), i32) #0 { main_body: - %descptr = bitcast [0 x i8] addrspace(2)* %0 to <4 x i32> addrspace(2)*, !amdgpu.uniform !0 + %descptr = bitcast [0 x i8] addrspace(4)* %0 to <4 x i32> addrspace(4)*, !amdgpu.uniform !0 br label %.outer_loop_header ret_block: ; preds = %.outer, %.label22, %main_body @@ -275,7 +275,7 @@ ret_block: ; preds = %.outer, %.label22, % br i1 %inner_br1, label %.inner_loop_body, label %ret_block .inner_loop_body: - %descriptor = load <4 x i32>, <4 x i32> addrspace(2)* %descptr, align 16, !invariant.load !0 + %descriptor = load <4 x i32>, <4 x i32> addrspace(4)* %descptr, align 16, !invariant.load !0 %load1result = call float @llvm.SI.load.const.v4i32(<4 x i32> %descriptor, i32 0) %inner_br2 = icmp uge i32 %1, 10 br i1 %inner_br2, label %.inner_loop_header, label 
%.outer_loop_body diff --git a/llvm/test/CodeGen/AMDGPU/spill-m0.ll b/llvm/test/CodeGen/AMDGPU/spill-m0.ll index a7b522165ab..ad10c7ff756 100644 --- a/llvm/test/CodeGen/AMDGPU/spill-m0.ll +++ b/llvm/test/CodeGen/AMDGPU/spill-m0.ll @@ -87,7 +87,7 @@ endif: ; GCN-NOT: v_readlane_b32 m0 ; GCN-NOT: s_buffer_store_dword m0 ; GCN-NOT: s_buffer_load_dword m0 -define amdgpu_ps void @spill_kill_m0_lds(<16 x i8> addrspace(2)* inreg %arg, <16 x i8> addrspace(2)* inreg %arg1, <32 x i8> addrspace(2)* inreg %arg2, i32 inreg %m0) #0 { +define amdgpu_ps void @spill_kill_m0_lds(<16 x i8> addrspace(4)* inreg %arg, <16 x i8> addrspace(4)* inreg %arg1, <32 x i8> addrspace(4)* inreg %arg2, i32 inreg %m0) #0 { main_body: %tmp = call float @llvm.amdgcn.interp.mov(i32 2, i32 0, i32 0, i32 %m0) %cmp = fcmp ueq float 0.000000e+00, %tmp @@ -191,7 +191,7 @@ endif: ; TOSMEM: s_endpgm define amdgpu_kernel void @restore_m0_lds(i32 %arg) { %m0 = call i32 asm sideeffect "s_mov_b32 m0, 0", "={M0}"() #0 - %sval = load volatile i64, i64 addrspace(2)* undef + %sval = load volatile i64, i64 addrspace(4)* undef %cmp = icmp eq i32 %arg, 0 br i1 %cmp, label %ret, label %bb diff --git a/llvm/test/CodeGen/AMDGPU/split-smrd.ll b/llvm/test/CodeGen/AMDGPU/split-smrd.ll index 5fc69067760..6089492b1c2 100644 --- a/llvm/test/CodeGen/AMDGPU/split-smrd.ll +++ b/llvm/test/CodeGen/AMDGPU/split-smrd.ll @@ -6,7 +6,7 @@ ; GCN-LABEL: {{^}}split_smrd_add_worklist: ; GCN: image_sample v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0x1 -define amdgpu_ps void @split_smrd_add_worklist([34 x <8 x i32>] addrspace(2)* byval %arg) #0 { +define amdgpu_ps void @split_smrd_add_worklist([34 x <8 x i32>] addrspace(4)* byval %arg) #0 { bb: %tmp = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 96) %tmp1 = bitcast float %tmp to i32 @@ -19,8 +19,8 @@ bb3: ; preds = %bb %tmp4 = bitcast float %tmp to i32 %tmp5 = add i32 %tmp4, 4 %tmp6 = sext i32 %tmp5 to i64 - %tmp7 = getelementptr [34 x <8 x i32>], [34 x <8 x i32>] addrspace(2)* %arg, i64 0, i64 %tmp6 - %tmp8 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp7, align 32, !tbaa !0 + %tmp7 = getelementptr [34 x <8 x i32>], [34 x <8 x i32>] addrspace(4)* %arg, i64 0, i64 %tmp6 + %tmp8 = load <8 x i32>, <8 x i32> addrspace(4)* %tmp7, align 32, !tbaa !0 %tmp9 = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float> <float bitcast (i32 1061158912 to float), float bitcast (i32 1048576000 to float)>, <8 x i32> %tmp8, <4 x i32> undef, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) %tmp10 = extractelement <4 x float> %tmp9, i32 0 %tmp12 = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %tmp10, float undef) diff --git a/llvm/test/CodeGen/AMDGPU/store-global.ll b/llvm/test/CodeGen/AMDGPU/store-global.ll index b5af3ead5e7..a40e6b2683e 100644 --- a/llvm/test/CodeGen/AMDGPU/store-global.ll +++ b/llvm/test/CodeGen/AMDGPU/store-global.ll @@ -394,11 +394,11 @@ entry: ; SIVI: buffer_store_dwordx2 ; GFX9: global_store_dwordx2 -define amdgpu_kernel void @vecload2(i32 addrspace(1)* nocapture %out, i32 addrspace(2)* nocapture %mem) #0 { +define amdgpu_kernel void @vecload2(i32 addrspace(1)* nocapture %out, i32 addrspace(4)* nocapture %mem) #0 { entry: - %0 = load i32, i32 addrspace(2)* %mem, align 4 - %arrayidx1.i = getelementptr inbounds i32, i32 addrspace(2)* %mem, i64 1 - %1 = load i32, i32 addrspace(2)* %arrayidx1.i, align 4 + %0 = load i32, i32 addrspace(4)* %mem, align 4 + %arrayidx1.i = getelementptr inbounds i32, i32 addrspace(4)* %mem, i64 1 + %1 = load i32, 
i32 addrspace(4)* %arrayidx1.i, align 4 store i32 %0, i32 addrspace(1)* %out, align 4 %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1 store i32 %1, i32 addrspace(1)* %arrayidx1, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/store-private.ll b/llvm/test/CodeGen/AMDGPU/store-private.ll index 617511a24b1..f9fc75023d4 100644 --- a/llvm/test/CodeGen/AMDGPU/store-private.ll +++ b/llvm/test/CodeGen/AMDGPU/store-private.ll @@ -689,11 +689,11 @@ entry: ; XSI: buffer_store_dwordx2 ; SI: buffer_store_dword ; SI: buffer_store_dword -define amdgpu_kernel void @vecload2(i32 addrspace(5)* nocapture %out, i32 addrspace(2)* nocapture %mem) #0 { +define amdgpu_kernel void @vecload2(i32 addrspace(5)* nocapture %out, i32 addrspace(4)* nocapture %mem) #0 { entry: - %0 = load i32, i32 addrspace(2)* %mem, align 4 - %arrayidx1.i = getelementptr inbounds i32, i32 addrspace(2)* %mem, i64 1 - %1 = load i32, i32 addrspace(2)* %arrayidx1.i, align 4 + %0 = load i32, i32 addrspace(4)* %mem, align 4 + %arrayidx1.i = getelementptr inbounds i32, i32 addrspace(4)* %mem, i64 1 + %1 = load i32, i32 addrspace(4)* %arrayidx1.i, align 4 store i32 %0, i32 addrspace(5)* %out, align 4 %arrayidx1 = getelementptr inbounds i32, i32 addrspace(5)* %out, i64 1 store i32 %1, i32 addrspace(5)* %arrayidx1, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll b/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll index 998cfdf395c..fcd5aaf0616 100644 --- a/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll @@ -27,9 +27,9 @@ define amdgpu_kernel void @v_test_sub_v2i16(<2 x i16> addrspace(1)* %out, <2 x i ; VI: s_sub_i32 ; VI: s_sub_i32 -define amdgpu_kernel void @s_test_sub_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %in0, <2 x i16> addrspace(2)* %in1) #1 { - %a = load <2 x i16>, <2 x i16> addrspace(2)* %in0 - %b = load <2 x i16>, <2 x i16> addrspace(2)* %in1 +define amdgpu_kernel void @s_test_sub_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %in0, <2 x i16> addrspace(4)* %in1) #1 { + %a = load <2 x i16>, <2 x i16> addrspace(4)* %in0 + %b = load <2 x i16>, <2 x i16> addrspace(4)* %in1 %add = sub <2 x i16> %a, %b store <2 x i16> %add, <2 x i16> addrspace(1)* %out ret void @@ -38,8 +38,8 @@ define amdgpu_kernel void @s_test_sub_v2i16(<2 x i16> addrspace(1)* %out, <2 x i ; GCN-LABEL: {{^}}s_test_sub_self_v2i16: ; GCN: v_mov_b32_e32 [[ZERO:v[0-9]+]] ; GCN: buffer_store_dword [[ZERO]] -define amdgpu_kernel void @s_test_sub_self_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %in0) #1 { - %a = load <2 x i16>, <2 x i16> addrspace(2)* %in0 +define amdgpu_kernel void @s_test_sub_self_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %in0) #1 { + %a = load <2 x i16>, <2 x i16> addrspace(4)* %in0 %add = sub <2 x i16> %a, %a store <2 x i16> %add, <2 x i16> addrspace(1)* %out ret void diff --git a/llvm/test/CodeGen/AMDGPU/target-cpu.ll b/llvm/test/CodeGen/AMDGPU/target-cpu.ll index d0ddd2a0bc0..bf1525e6163 100644 --- a/llvm/test/CodeGen/AMDGPU/target-cpu.ll +++ b/llvm/test/CodeGen/AMDGPU/target-cpu.ll @@ -1,6 +1,6 @@ ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck %s -declare i8 addrspace(2)* @llvm.amdgcn.kernarg.segment.ptr() #1 +declare i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() #1 declare i32 @llvm.amdgcn.workitem.id.x() #1 @@ -15,10 +15,10 @@ declare void @llvm.amdgcn.s.dcache.wb() #0 ; CHECK: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, [[OFFSETREG]] ; CHECK: buffer_store_dword v{{[0-9]+}}, 
v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 define amdgpu_kernel void @target_none() #0 { - %kernargs = call i8 addrspace(2)* @llvm.amdgcn.kernarg.segment.ptr() - %kernargs.gep = getelementptr inbounds i8, i8 addrspace(2)* %kernargs, i64 1024 - %kernargs.gep.cast = bitcast i8 addrspace(2)* %kernargs.gep to i32 addrspace(1)* addrspace(2)* - %ptr = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(2)* %kernargs.gep.cast + %kernargs = call i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() + %kernargs.gep = getelementptr inbounds i8, i8 addrspace(4)* %kernargs, i64 1024 + %kernargs.gep.cast = bitcast i8 addrspace(4)* %kernargs.gep to i32 addrspace(1)* addrspace(4)* + %ptr = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* %kernargs.gep.cast %id = call i32 @llvm.amdgcn.workitem.id.x() %id.ext = sext i32 %id to i64 %gep = getelementptr inbounds i32, i32 addrspace(1)* %ptr, i64 %id.ext @@ -31,10 +31,10 @@ define amdgpu_kernel void @target_none() #0 { ; CHECK: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, [[OFFSETREG]] ; CHECK: buffer_store_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 define amdgpu_kernel void @target_tahiti() #1 { - %kernargs = call i8 addrspace(2)* @llvm.amdgcn.kernarg.segment.ptr() - %kernargs.gep = getelementptr inbounds i8, i8 addrspace(2)* %kernargs, i64 1024 - %kernargs.gep.cast = bitcast i8 addrspace(2)* %kernargs.gep to i32 addrspace(1)* addrspace(2)* - %ptr = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(2)* %kernargs.gep.cast + %kernargs = call i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() + %kernargs.gep = getelementptr inbounds i8, i8 addrspace(4)* %kernargs, i64 1024 + %kernargs.gep.cast = bitcast i8 addrspace(4)* %kernargs.gep to i32 addrspace(1)* addrspace(4)* + %ptr = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* %kernargs.gep.cast %id = call i32 @llvm.amdgcn.workitem.id.x() %id.ext = sext i32 %id to i64 %gep = getelementptr inbounds i32, i32 addrspace(1)* %ptr, i64 %id.ext @@ -47,10 +47,10 @@ define amdgpu_kernel void @target_tahiti() #1 { ; CHECK: buffer_store_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 ; CHECK: s_dcache_inv_vol define amdgpu_kernel void @target_bonaire() #3 { - %kernargs = call i8 addrspace(2)* @llvm.amdgcn.kernarg.segment.ptr() - %kernargs.gep = getelementptr inbounds i8, i8 addrspace(2)* %kernargs, i64 1024 - %kernargs.gep.cast = bitcast i8 addrspace(2)* %kernargs.gep to i32 addrspace(1)* addrspace(2)* - %ptr = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(2)* %kernargs.gep.cast + %kernargs = call i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() + %kernargs.gep = getelementptr inbounds i8, i8 addrspace(4)* %kernargs, i64 1024 + %kernargs.gep.cast = bitcast i8 addrspace(4)* %kernargs.gep to i32 addrspace(1)* addrspace(4)* + %ptr = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* %kernargs.gep.cast %id = call i32 @llvm.amdgcn.workitem.id.x() %id.ext = sext i32 %id to i64 %gep = getelementptr inbounds i32, i32 addrspace(1)* %ptr, i64 %id.ext @@ -64,10 +64,10 @@ define amdgpu_kernel void @target_bonaire() #3 { ; CHECK: flat_store_dword ; CHECK: s_dcache_wb{{$}} define amdgpu_kernel void @target_fiji() #4 { - %kernargs = call i8 addrspace(2)* @llvm.amdgcn.kernarg.segment.ptr() - %kernargs.gep = getelementptr inbounds i8, i8 addrspace(2)* %kernargs, i64 1024 - %kernargs.gep.cast = bitcast i8 addrspace(2)* %kernargs.gep to i32 addrspace(1)* addrspace(2)* - %ptr = load i32 addrspace(1)*, i32 addrspace(1)* 
addrspace(2)* %kernargs.gep.cast + %kernargs = call i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() + %kernargs.gep = getelementptr inbounds i8, i8 addrspace(4)* %kernargs, i64 1024 + %kernargs.gep.cast = bitcast i8 addrspace(4)* %kernargs.gep to i32 addrspace(1)* addrspace(4)* + %ptr = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* %kernargs.gep.cast %id = call i32 @llvm.amdgcn.workitem.id.x() %id.ext = sext i32 %id to i64 %gep = getelementptr inbounds i32, i32 addrspace(1)* %ptr, i64 %id.ext diff --git a/llvm/test/CodeGen/AMDGPU/unaligned-load-store.ll b/llvm/test/CodeGen/AMDGPU/unaligned-load-store.ll index a08535fc859..e5fe0bd9a9d 100644 --- a/llvm/test/CodeGen/AMDGPU/unaligned-load-store.ll +++ b/llvm/test/CodeGen/AMDGPU/unaligned-load-store.ll @@ -418,8 +418,8 @@ define amdgpu_kernel void @local_store_i64_align_4_with_split_offset(i64 addrspa ; UNALIGNED: s_load_dword ; SI: buffer_store_dword -define amdgpu_kernel void @constant_unaligned_load_i32(i32 addrspace(2)* %p, i32 addrspace(1)* %r) #0 { - %v = load i32, i32 addrspace(2)* %p, align 1 +define amdgpu_kernel void @constant_unaligned_load_i32(i32 addrspace(4)* %p, i32 addrspace(1)* %r) #0 { + %v = load i32, i32 addrspace(4)* %p, align 1 store i32 %v, i32 addrspace(1)* %r, align 4 ret void } @@ -430,8 +430,8 @@ define amdgpu_kernel void @constant_unaligned_load_i32(i32 addrspace(2)* %p, i32 ; UNALIGNED: s_load_dword ; UNALIGNED: buffer_store_dword -define amdgpu_kernel void @constant_align2_load_i32(i32 addrspace(2)* %p, i32 addrspace(1)* %r) #0 { - %v = load i32, i32 addrspace(2)* %p, align 2 +define amdgpu_kernel void @constant_align2_load_i32(i32 addrspace(4)* %p, i32 addrspace(1)* %r) #0 { + %v = load i32, i32 addrspace(4)* %p, align 2 store i32 %v, i32 addrspace(1)* %r, align 4 ret void } @@ -444,8 +444,8 @@ define amdgpu_kernel void @constant_align2_load_i32(i32 addrspace(2)* %p, i32 ad ; UNALIGNED: s_load_dwordx2 ; UNALIGNED: buffer_store_dwordx2 -define amdgpu_kernel void @constant_align2_load_i64(i64 addrspace(2)* %p, i64 addrspace(1)* %r) #0 { - %v = load i64, i64 addrspace(2)* %p, align 2 +define amdgpu_kernel void @constant_align2_load_i64(i64 addrspace(4)* %p, i64 addrspace(1)* %r) #0 { + %v = load i64, i64 addrspace(4)* %p, align 2 store i64 %v, i64 addrspace(1)* %r, align 4 ret void } @@ -453,8 +453,8 @@ define amdgpu_kernel void @constant_align2_load_i64(i64 addrspace(2)* %p, i64 ad ; SI-LABEL: {{^}}constant_align4_load_i64: ; SI: s_load_dwordx2 ; SI: buffer_store_dwordx2 -define amdgpu_kernel void @constant_align4_load_i64(i64 addrspace(2)* %p, i64 addrspace(1)* %r) #0 { - %v = load i64, i64 addrspace(2)* %p, align 4 +define amdgpu_kernel void @constant_align4_load_i64(i64 addrspace(4)* %p, i64 addrspace(1)* %r) #0 { + %v = load i64, i64 addrspace(4)* %p, align 4 store i64 %v, i64 addrspace(1)* %r, align 4 ret void } @@ -462,8 +462,8 @@ define amdgpu_kernel void @constant_align4_load_i64(i64 addrspace(2)* %p, i64 ad ; SI-LABEL: {{^}}constant_align4_load_v4i32: ; SI: s_load_dwordx4 ; SI: buffer_store_dwordx4 -define amdgpu_kernel void @constant_align4_load_v4i32(<4 x i32> addrspace(2)* %p, <4 x i32> addrspace(1)* %r) #0 { - %v = load <4 x i32>, <4 x i32> addrspace(2)* %p, align 4 +define amdgpu_kernel void @constant_align4_load_v4i32(<4 x i32> addrspace(4)* %p, <4 x i32> addrspace(1)* %r) #0 { + %v = load <4 x i32>, <4 x i32> addrspace(4)* %p, align 4 store <4 x i32> %v, <4 x i32> addrspace(1)* %r, align 4 ret void } @@ -482,8 +482,8 @@ define amdgpu_kernel void @constant_align4_load_v4i32(<4 x i32> 
addrspace(2)* %p ; UNALIGNED: buffer_load_dwordx2 ; SI: buffer_store_dwordx2 -define amdgpu_kernel void @constant_unaligned_load_v2i32(<2 x i32> addrspace(2)* %p, <2 x i32> addrspace(1)* %r) #0 { - %v = load <2 x i32>, <2 x i32> addrspace(2)* %p, align 1 +define amdgpu_kernel void @constant_unaligned_load_v2i32(<2 x i32> addrspace(4)* %p, <2 x i32> addrspace(1)* %r) #0 { + %v = load <2 x i32>, <2 x i32> addrspace(4)* %p, align 1 store <2 x i32> %v, <2 x i32> addrspace(1)* %r, align 4 ret void } @@ -512,8 +512,8 @@ define amdgpu_kernel void @constant_unaligned_load_v2i32(<2 x i32> addrspace(2)* ; UNALIGNED: buffer_load_dwordx4 ; SI: buffer_store_dwordx4 -define amdgpu_kernel void @constant_unaligned_load_v4i32(<4 x i32> addrspace(2)* %p, <4 x i32> addrspace(1)* %r) #0 { - %v = load <4 x i32>, <4 x i32> addrspace(2)* %p, align 1 +define amdgpu_kernel void @constant_unaligned_load_v4i32(<4 x i32> addrspace(4)* %p, <4 x i32> addrspace(1)* %r) #0 { + %v = load <4 x i32>, <4 x i32> addrspace(4)* %p, align 1 store <4 x i32> %v, <4 x i32> addrspace(1)* %r, align 4 ret void } @@ -521,8 +521,8 @@ define amdgpu_kernel void @constant_unaligned_load_v4i32(<4 x i32> addrspace(2)* ; SI-LABEL: {{^}}constant_align4_load_i8: ; SI: s_load_dword ; SI: buffer_store_byte -define amdgpu_kernel void @constant_align4_load_i8(i8 addrspace(2)* %p, i8 addrspace(1)* %r) #0 { - %v = load i8, i8 addrspace(2)* %p, align 4 +define amdgpu_kernel void @constant_align4_load_i8(i8 addrspace(4)* %p, i8 addrspace(1)* %r) #0 { + %v = load i8, i8 addrspace(4)* %p, align 4 store i8 %v, i8 addrspace(1)* %r, align 4 ret void } @@ -530,8 +530,8 @@ define amdgpu_kernel void @constant_align4_load_i8(i8 addrspace(2)* %p, i8 addrs ; SI-LABEL: {{^}}constant_align2_load_i8: ; SI: buffer_load_ubyte ; SI: buffer_store_byte -define amdgpu_kernel void @constant_align2_load_i8(i8 addrspace(2)* %p, i8 addrspace(1)* %r) #0 { - %v = load i8, i8 addrspace(2)* %p, align 2 +define amdgpu_kernel void @constant_align2_load_i8(i8 addrspace(4)* %p, i8 addrspace(1)* %r) #0 { + %v = load i8, i8 addrspace(4)* %p, align 2 store i8 %v, i8 addrspace(1)* %r, align 2 ret void } @@ -541,10 +541,10 @@ define amdgpu_kernel void @constant_align2_load_i8(i8 addrspace(2)* %p, i8 addrs ; SI-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], s[[LO]] ; SI-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[HI]] ; SI: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}} -define amdgpu_kernel void @constant_align4_merge_load_2_i32(i32 addrspace(2)* %p, i32 addrspace(1)* %r) #0 { - %gep0 = getelementptr i32, i32 addrspace(2)* %p, i64 1 - %v0 = load i32, i32 addrspace(2)* %p, align 4 - %v1 = load i32, i32 addrspace(2)* %gep0, align 4 +define amdgpu_kernel void @constant_align4_merge_load_2_i32(i32 addrspace(4)* %p, i32 addrspace(1)* %r) #0 { + %gep0 = getelementptr i32, i32 addrspace(4)* %p, i64 1 + %v0 = load i32, i32 addrspace(4)* %p, align 4 + %v1 = load i32, i32 addrspace(4)* %gep0, align 4 %gep1 = getelementptr i32, i32 addrspace(1)* %r, i64 1 store i32 %v0, i32 addrspace(1)* %r, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/uniform-crash.ll b/llvm/test/CodeGen/AMDGPU/uniform-crash.ll index 028199ef9de..32129509269 100644 --- a/llvm/test/CodeGen/AMDGPU/uniform-crash.ll +++ b/llvm/test/CodeGen/AMDGPU/uniform-crash.ll @@ -35,7 +35,7 @@ bb2: ; preds = %bb br label %bb3 bb3: ; preds = %bb3, %bb2 - %val = load volatile i32, i32 addrspace(2)* undef + %val = load volatile i32, i32 addrspace(4)* undef %tmp4 = icmp eq i32 %val, %arg1 br i1 %tmp4, label %bb5, label %bb3 diff --git 
a/llvm/test/CodeGen/AMDGPU/vertex-fetch-encoding.ll b/llvm/test/CodeGen/AMDGPU/vertex-fetch-encoding.ll index 46a1c87184d..f002a1474e0 100644 --- a/llvm/test/CodeGen/AMDGPU/vertex-fetch-encoding.ll +++ b/llvm/test/CodeGen/AMDGPU/vertex-fetch-encoding.ll @@ -36,11 +36,11 @@ define amdgpu_kernel void @vtx_fetch32_id3(i32 addrspace(1)* %out, i32 addrspace ; EG: VTX_READ_32 T[[GPR:[0-9]]].X, T[[GPR]].X, 0, #2 ; encoding: [0x40,0x02,0x0[[GPR]],0x10,0x0[[GPR]],0xf0,0x5f,0x13,0x00,0x00,0x08,0x00 ; CM: VTX_READ_32 T[[GPR:[0-9]]].X, T[[GPR]].X, 0, #2 ; encoding: [0x40,0x02,0x0[[GPR]],0x00,0x0[[GPR]],0xf0,0x5f,0x13,0x00,0x00,0x00,0x00 -@t = internal addrspace(2) constant [4 x i32] [i32 0, i32 1, i32 2, i32 3] +@t = internal addrspace(4) constant [4 x i32] [i32 0, i32 1, i32 2, i32 3] define amdgpu_kernel void @vtx_fetch32_id2(i32 addrspace(1)* %out, i32 %in) { - %a = getelementptr inbounds [4 x i32], [4 x i32] addrspace(2)* @t, i32 0, i32 %in - %v = load i32, i32 addrspace(2)* %a + %a = getelementptr inbounds [4 x i32], [4 x i32] addrspace(4)* @t, i32 0, i32 %in + %v = load i32, i32 addrspace(4)* %a store i32 %v, i32 addrspace(1)* %out ret void } diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll b/llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll index 89327fb8f80..7cfde82de6c 100644 --- a/llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll +++ b/llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll @@ -27,15 +27,15 @@ ; GCN: NumVgprs: 256 ; GCN: ScratchSize: 1536 -define amdgpu_vs void @main([9 x <4 x i32>] addrspace(2)* byval %arg, [17 x <4 x i32>] addrspace(2)* byval %arg1, [17 x <4 x i32>] addrspace(2)* byval %arg2, [34 x <8 x i32>] addrspace(2)* byval %arg3, [16 x <4 x i32>] addrspace(2)* byval %arg4, i32 inreg %arg5, i32 inreg %arg6, i32 %arg7, i32 %arg8, i32 %arg9, i32 %arg10) #0 { +define amdgpu_vs void @main([9 x <4 x i32>] addrspace(4)* byval %arg, [17 x <4 x i32>] addrspace(4)* byval %arg1, [17 x <4 x i32>] addrspace(4)* byval %arg2, [34 x <8 x i32>] addrspace(4)* byval %arg3, [16 x <4 x i32>] addrspace(4)* byval %arg4, i32 inreg %arg5, i32 inreg %arg6, i32 %arg7, i32 %arg8, i32 %arg9, i32 %arg10) #0 { bb: - %tmp = getelementptr [17 x <4 x i32>], [17 x <4 x i32>] addrspace(2)* %arg1, i64 0, i64 0 - %tmp11 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp, align 16, !tbaa !0 + %tmp = getelementptr [17 x <4 x i32>], [17 x <4 x i32>] addrspace(4)* %arg1, i64 0, i64 0 + %tmp11 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp, align 16, !tbaa !0 %tmp12 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp11, i32 0) %tmp13 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp11, i32 16) %tmp14 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp11, i32 32) - %tmp15 = getelementptr [16 x <4 x i32>], [16 x <4 x i32>] addrspace(2)* %arg4, i64 0, i64 0 - %tmp16 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp15, align 16, !tbaa !0 + %tmp15 = getelementptr [16 x <4 x i32>], [16 x <4 x i32>] addrspace(4)* %arg4, i64 0, i64 0 + %tmp16 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp15, align 16, !tbaa !0 %tmp17 = add i32 %arg5, %arg7 %tmp16.cast = bitcast <4 x i32> %tmp16 to <4 x i32> %tmp18 = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %tmp16.cast, i32 %tmp17, i32 0, i1 false, i1 false) diff --git a/llvm/test/CodeGen/AMDGPU/wait.ll b/llvm/test/CodeGen/AMDGPU/wait.ll index 623cbeae8da..b45874e1e4c 100644 --- a/llvm/test/CodeGen/AMDGPU/wait.ll +++ b/llvm/test/CodeGen/AMDGPU/wait.ll @@ -11,19 +11,19 @@ ; DEFAULT: exp ; DEFAULT: s_waitcnt 
lgkmcnt(0) ; DEFAULT: s_endpgm -define amdgpu_vs void @main(<16 x i8> addrspace(2)* inreg %arg, <16 x i8> addrspace(2)* inreg %arg1, <32 x i8> addrspace(2)* inreg %arg2, <16 x i8> addrspace(2)* inreg %arg3, <16 x i8> addrspace(2)* inreg %arg4, i32 inreg %arg5, i32 %arg6, i32 %arg7, i32 %arg8, i32 %arg9, float addrspace(2)* inreg %constptr) #0 { +define amdgpu_vs void @main(<16 x i8> addrspace(4)* inreg %arg, <16 x i8> addrspace(4)* inreg %arg1, <32 x i8> addrspace(4)* inreg %arg2, <16 x i8> addrspace(4)* inreg %arg3, <16 x i8> addrspace(4)* inreg %arg4, i32 inreg %arg5, i32 %arg6, i32 %arg7, i32 %arg8, i32 %arg9, float addrspace(4)* inreg %constptr) #0 { main_body: - %tmp = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %arg3, i32 0 - %tmp10 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp, !tbaa !0 + %tmp = getelementptr <16 x i8>, <16 x i8> addrspace(4)* %arg3, i32 0 + %tmp10 = load <16 x i8>, <16 x i8> addrspace(4)* %tmp, !tbaa !0 %tmp10.cast = bitcast <16 x i8> %tmp10 to <4 x i32> %tmp11 = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %tmp10.cast, i32 %arg6, i32 0, i1 false, i1 false) %tmp12 = extractelement <4 x float> %tmp11, i32 0 %tmp13 = extractelement <4 x float> %tmp11, i32 1 call void @llvm.amdgcn.s.barrier() #1 %tmp14 = extractelement <4 x float> %tmp11, i32 2 - %tmp15 = load float, float addrspace(2)* %constptr, align 4 - %tmp16 = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %arg3, i32 1 - %tmp17 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp16, !tbaa !0 + %tmp15 = load float, float addrspace(4)* %constptr, align 4 + %tmp16 = getelementptr <16 x i8>, <16 x i8> addrspace(4)* %arg3, i32 1 + %tmp17 = load <16 x i8>, <16 x i8> addrspace(4)* %tmp16, !tbaa !0 %tmp17.cast = bitcast <16 x i8> %tmp17 to <4 x i32> %tmp18 = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %tmp17.cast, i32 %arg6, i32 0, i1 false, i1 false) %tmp19 = extractelement <4 x float> %tmp18, i32 0 @@ -46,10 +46,10 @@ main_body: ; ILPMAX: exp pos0 ; ILPMAX-NEXT: exp param0 ; ILPMAX: s_endpgm -define amdgpu_vs void @main2([6 x <16 x i8>] addrspace(2)* byval %arg, [17 x <16 x i8>] addrspace(2)* byval %arg1, [17 x <4 x i32>] addrspace(2)* byval %arg2, [34 x <8 x i32>] addrspace(2)* byval %arg3, [16 x <16 x i8>] addrspace(2)* byval %arg4, i32 inreg %arg5, i32 inreg %arg6, i32 %arg7, i32 %arg8, i32 %arg9, i32 %arg10) #0 { +define amdgpu_vs void @main2([6 x <16 x i8>] addrspace(4)* byval %arg, [17 x <16 x i8>] addrspace(4)* byval %arg1, [17 x <4 x i32>] addrspace(4)* byval %arg2, [34 x <8 x i32>] addrspace(4)* byval %arg3, [16 x <16 x i8>] addrspace(4)* byval %arg4, i32 inreg %arg5, i32 inreg %arg6, i32 %arg7, i32 %arg8, i32 %arg9, i32 %arg10) #0 { main_body: - %tmp = getelementptr [16 x <16 x i8>], [16 x <16 x i8>] addrspace(2)* %arg4, i64 0, i64 0 - %tmp11 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp, align 16, !tbaa !0 + %tmp = getelementptr [16 x <16 x i8>], [16 x <16 x i8>] addrspace(4)* %arg4, i64 0, i64 0 + %tmp11 = load <16 x i8>, <16 x i8> addrspace(4)* %tmp, align 16, !tbaa !0 %tmp12 = add i32 %arg5, %arg7 %tmp11.cast = bitcast <16 x i8> %tmp11 to <4 x i32> %tmp13 = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %tmp11.cast, i32 %tmp12, i32 0, i1 false, i1 false) @@ -57,8 +57,8 @@ main_body: %tmp15 = extractelement <4 x float> %tmp13, i32 1 %tmp16 = extractelement <4 x float> %tmp13, i32 2 %tmp17 = extractelement <4 x float> %tmp13, i32 3 - %tmp18 = getelementptr [16 x <16 x i8>], [16 x <16 x i8>] addrspace(2)* %arg4, i64 0, i64 1 - %tmp19 = load <16 x 
i8>, <16 x i8> addrspace(2)* %tmp18, align 16, !tbaa !0 + %tmp18 = getelementptr [16 x <16 x i8>], [16 x <16 x i8>] addrspace(4)* %arg4, i64 0, i64 1 + %tmp19 = load <16 x i8>, <16 x i8> addrspace(4)* %tmp18, align 16, !tbaa !0 %tmp20 = add i32 %arg5, %arg7 %tmp19.cast = bitcast <16 x i8> %tmp19 to <4 x i32> %tmp21 = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %tmp19.cast, i32 %tmp20, i32 0, i1 false, i1 false) diff --git a/llvm/test/CodeGen/AMDGPU/waitcnt-looptest.ll b/llvm/test/CodeGen/AMDGPU/waitcnt-looptest.ll index f71a568e3e0..0af9bbcc25e 100644 --- a/llvm/test/CodeGen/AMDGPU/waitcnt-looptest.ll +++ b/llvm/test/CodeGen/AMDGPU/waitcnt-looptest.ll @@ -22,19 +22,19 @@ bb: br label %bb18 bb1: ; preds = %bb18 - %tmp = tail call i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() + %tmp = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() %tmp2 = tail call i32 @llvm.amdgcn.workitem.id.x() %tmp3 = tail call i32 @llvm.amdgcn.workgroup.id.x() - %tmp4 = getelementptr inbounds i8, i8 addrspace(2)* %tmp, i64 4 - %tmp5 = bitcast i8 addrspace(2)* %tmp4 to i16 addrspace(2)* - %tmp6 = load i16, i16 addrspace(2)* %tmp5, align 4 + %tmp4 = getelementptr inbounds i8, i8 addrspace(4)* %tmp, i64 4 + %tmp5 = bitcast i8 addrspace(4)* %tmp4 to i16 addrspace(4)* + %tmp6 = load i16, i16 addrspace(4)* %tmp5, align 4 %tmp7 = zext i16 %tmp6 to i32 %tmp8 = mul i32 %tmp3, %tmp7 %tmp9 = add i32 %tmp8, %tmp2 - %tmp10 = tail call i8 addrspace(2)* @llvm.amdgcn.implicitarg.ptr() + %tmp10 = tail call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr() %tmp11 = zext i32 %tmp9 to i64 - %tmp12 = bitcast i8 addrspace(2)* %tmp10 to i64 addrspace(2)* - %tmp13 = load i64, i64 addrspace(2)* %tmp12, align 8 + %tmp12 = bitcast i8 addrspace(4)* %tmp10 to i64 addrspace(4)* + %tmp13 = load i64, i64 addrspace(4)* %tmp12, align 8 %tmp14 = add i64 %tmp13, %tmp11 %tmp15 = zext i1 %tmp99 to i32 %tmp16 = and i64 %tmp14, 4294967295 @@ -131,7 +131,7 @@ bb18: ; preds = %bb18, %bb } ; Function Attrs: nounwind readnone speculatable -declare i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() #1 +declare i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() #1 ; Function Attrs: nounwind readnone speculatable declare i32 @llvm.amdgcn.workitem.id.x() #1 @@ -140,7 +140,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() #1 declare i32 @llvm.amdgcn.workgroup.id.x() #1 ; Function Attrs: nounwind readnone speculatable -declare i8 addrspace(2)* @llvm.amdgcn.implicitarg.ptr() #1 +declare i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr() #1 attributes #0 = { "target-cpu"="fiji" "target-features"="-flat-for-global" } attributes #1 = { nounwind readnone speculatable } diff --git a/llvm/test/CodeGen/AMDGPU/widen_extending_scalar_loads.ll b/llvm/test/CodeGen/AMDGPU/widen_extending_scalar_loads.ll index e200f7b64d1..7c0dc6f58d8 100644 --- a/llvm/test/CodeGen/AMDGPU/widen_extending_scalar_loads.ll +++ b/llvm/test/CodeGen/AMDGPU/widen_extending_scalar_loads.ll @@ -1,12 +1,12 @@ ; RUN: opt -S -mtriple=amdgcn-- -amdgpu-codegenprepare < %s | FileCheck -check-prefix=OPT %s -declare i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() #0 +declare i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() #0 ; OPT-LABEL: @constant_load_i1 ; OPT: load i1 ; OPT-NEXT: store i1 -define amdgpu_kernel void @constant_load_i1(i1 addrspace(1)* %out, i1 addrspace(2)* %in) #0 { - %val = load i1, i1 addrspace(2)* %in +define amdgpu_kernel void @constant_load_i1(i1 addrspace(1)* %out, i1 addrspace(4)* %in) #0 { + %val = load i1, i1 addrspace(4)* %in store i1 %val, i1 addrspace(1)* %out ret void } @@ -14,8 +14,8 
@@ define amdgpu_kernel void @constant_load_i1(i1 addrspace(1)* %out, i1 addrspace( ; OPT-LABEL: @constant_load_i1_align2 ; OPT: load i1 ; OPT-NEXT: store -define amdgpu_kernel void @constant_load_i1_align2(i1 addrspace(1)* %out, i1 addrspace(2)* %in) #0 { - %val = load i1, i1 addrspace(2)* %in, align 2 +define amdgpu_kernel void @constant_load_i1_align2(i1 addrspace(1)* %out, i1 addrspace(4)* %in) #0 { + %val = load i1, i1 addrspace(4)* %in, align 2 store i1 %val, i1 addrspace(1)* %out, align 2 ret void } @@ -25,8 +25,8 @@ define amdgpu_kernel void @constant_load_i1_align2(i1 addrspace(1)* %out, i1 add ; OPT-NEXT: load i32 ; OPT-NEXT: trunc ; OPT-NEXT: store -define amdgpu_kernel void @constant_load_i1_align4(i1 addrspace(1)* %out, i1 addrspace(2)* %in) #0 { - %val = load i1, i1 addrspace(2)* %in, align 4 +define amdgpu_kernel void @constant_load_i1_align4(i1 addrspace(1)* %out, i1 addrspace(4)* %in) #0 { + %val = load i1, i1 addrspace(4)* %in, align 4 store i1 %val, i1 addrspace(1)* %out, align 4 ret void } @@ -34,8 +34,8 @@ define amdgpu_kernel void @constant_load_i1_align4(i1 addrspace(1)* %out, i1 add ; OPT-LABEL: @constant_load_i8 ; OPT: load i8 ; OPT-NEXT: store -define amdgpu_kernel void @constant_load_i8(i8 addrspace(1)* %out, i8 addrspace(2)* %in) #0 { - %val = load i8, i8 addrspace(2)* %in +define amdgpu_kernel void @constant_load_i8(i8 addrspace(1)* %out, i8 addrspace(4)* %in) #0 { + %val = load i8, i8 addrspace(4)* %in store i8 %val, i8 addrspace(1)* %out ret void } @@ -43,8 +43,8 @@ define amdgpu_kernel void @constant_load_i8(i8 addrspace(1)* %out, i8 addrspace( ; OPT-LABEL: @constant_load_i8_align2 ; OPT: load i8 ; OPT-NEXT: store -define amdgpu_kernel void @constant_load_i8_align2(i8 addrspace(1)* %out, i8 addrspace(2)* %in) #0 { - %val = load i8, i8 addrspace(2)* %in, align 2 +define amdgpu_kernel void @constant_load_i8_align2(i8 addrspace(1)* %out, i8 addrspace(4)* %in) #0 { + %val = load i8, i8 addrspace(4)* %in, align 2 store i8 %val, i8 addrspace(1)* %out, align 2 ret void } @@ -54,8 +54,8 @@ define amdgpu_kernel void @constant_load_i8_align2(i8 addrspace(1)* %out, i8 add ; OPT-NEXT: load i32 ; OPT-NEXT: trunc ; OPT-NEXT: store -define amdgpu_kernel void @constant_load_i8align4(i8 addrspace(1)* %out, i8 addrspace(2)* %in) #0 { - %val = load i8, i8 addrspace(2)* %in, align 4 +define amdgpu_kernel void @constant_load_i8align4(i8 addrspace(1)* %out, i8 addrspace(4)* %in) #0 { + %val = load i8, i8 addrspace(4)* %in, align 4 store i8 %val, i8 addrspace(1)* %out, align 4 ret void } @@ -64,8 +64,8 @@ define amdgpu_kernel void @constant_load_i8align4(i8 addrspace(1)* %out, i8 addr ; OPT-LABEL: @constant_load_v2i8 ; OPT: load <2 x i8> ; OPT-NEXT: store -define amdgpu_kernel void @constant_load_v2i8(<2 x i8> addrspace(1)* %out, <2 x i8> addrspace(2)* %in) #0 { - %ld = load <2 x i8>, <2 x i8> addrspace(2)* %in +define amdgpu_kernel void @constant_load_v2i8(<2 x i8> addrspace(1)* %out, <2 x i8> addrspace(4)* %in) #0 { + %ld = load <2 x i8>, <2 x i8> addrspace(4)* %in store <2 x i8> %ld, <2 x i8> addrspace(1)* %out ret void } @@ -76,32 +76,32 @@ define amdgpu_kernel void @constant_load_v2i8(<2 x i8> addrspace(1)* %out, <2 x ; OPT-NEXT: trunc ; OPT-NEXT: bitcast ; OPT-NEXT: store -define amdgpu_kernel void @constant_load_v2i8_align4(<2 x i8> addrspace(1)* %out, <2 x i8> addrspace(2)* %in) #0 { - %ld = load <2 x i8>, <2 x i8> addrspace(2)* %in, align 4 +define amdgpu_kernel void @constant_load_v2i8_align4(<2 x i8> addrspace(1)* %out, <2 x i8> addrspace(4)* %in) #0 { + %ld = load <2 x 
i8>, <2 x i8> addrspace(4)* %in, align 4 store <2 x i8> %ld, <2 x i8> addrspace(1)* %out, align 4 ret void } ; OPT-LABEL: @constant_load_v3i8 ; OPT: bitcast <3 x i8> -; OPT-NEXT: load i32, i32 addrspace(2) +; OPT-NEXT: load i32, i32 addrspace(4) ; OPT-NEXT: trunc i32 ; OPT-NEXT: bitcast i24 ; OPT-NEXT: store <3 x i8> -define amdgpu_kernel void @constant_load_v3i8(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(2)* %in) #0 { - %ld = load <3 x i8>, <3 x i8> addrspace(2)* %in +define amdgpu_kernel void @constant_load_v3i8(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(4)* %in) #0 { + %ld = load <3 x i8>, <3 x i8> addrspace(4)* %in store <3 x i8> %ld, <3 x i8> addrspace(1)* %out ret void } ; OPT-LABEL: @constant_load_v3i8_align4 ; OPT: bitcast <3 x i8> -; OPT-NEXT: load i32, i32 addrspace(2) +; OPT-NEXT: load i32, i32 addrspace(4) ; OPT-NEXT: trunc i32 ; OPT-NEXT: bitcast i24 ; OPT-NEXT: store <3 x i8> -define amdgpu_kernel void @constant_load_v3i8_align4(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(2)* %in) #0 { - %ld = load <3 x i8>, <3 x i8> addrspace(2)* %in, align 4 +define amdgpu_kernel void @constant_load_v3i8_align4(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(4)* %in) #0 { + %ld = load <3 x i8>, <3 x i8> addrspace(4)* %in, align 4 store <3 x i8> %ld, <3 x i8> addrspace(1)* %out, align 4 ret void } @@ -110,8 +110,8 @@ define amdgpu_kernel void @constant_load_v3i8_align4(<3 x i8> addrspace(1)* %out ; OPT: load i16 ; OPT: sext ; OPT-NEXT: store -define amdgpu_kernel void @constant_load_i16(i32 addrspace(1)* %out, i16 addrspace(2)* %in) #0 { - %ld = load i16, i16 addrspace(2)* %in +define amdgpu_kernel void @constant_load_i16(i32 addrspace(1)* %out, i16 addrspace(4)* %in) #0 { + %ld = load i16, i16 addrspace(4)* %in %ext = sext i16 %ld to i32 store i32 %ext, i32 addrspace(1)* %out ret void @@ -123,8 +123,8 @@ define amdgpu_kernel void @constant_load_i16(i32 addrspace(1)* %out, i16 addrspa ; OPT-NEXT: trunc ; OPT-NEXT: sext ; OPT-NEXT: store -define amdgpu_kernel void @constant_load_i16_align4(i32 addrspace(1)* %out, i16 addrspace(2)* %in) #0 { - %ld = load i16, i16 addrspace(2)* %in, align 4 +define amdgpu_kernel void @constant_load_i16_align4(i32 addrspace(1)* %out, i16 addrspace(4)* %in) #0 { + %ld = load i16, i16 addrspace(4)* %in, align 4 %ext = sext i16 %ld to i32 store i32 %ext, i32 addrspace(1)* %out, align 4 ret void @@ -133,8 +133,8 @@ define amdgpu_kernel void @constant_load_i16_align4(i32 addrspace(1)* %out, i16 ; OPT-LABEL: @constant_load_f16 ; OPT: load half ; OPT-NEXT: store -define amdgpu_kernel void @constant_load_f16(half addrspace(1)* %out, half addrspace(2)* %in) #0 { - %ld = load half, half addrspace(2)* %in +define amdgpu_kernel void @constant_load_f16(half addrspace(1)* %out, half addrspace(4)* %in) #0 { + %ld = load half, half addrspace(4)* %in store half %ld, half addrspace(1)* %out ret void } @@ -142,8 +142,8 @@ define amdgpu_kernel void @constant_load_f16(half addrspace(1)* %out, half addrs ; OPT-LABEL: @constant_load_v2f16 ; OPT: load <2 x half> ; OPT-NEXT: store -define amdgpu_kernel void @constant_load_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(2)* %in) #0 { - %ld = load <2 x half>, <2 x half> addrspace(2)* %in +define amdgpu_kernel void @constant_load_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(4)* %in) #0 { + %ld = load <2 x half>, <2 x half> addrspace(4)* %in store <2 x half> %ld, <2 x half> addrspace(1)* %out ret void } @@ -151,8 +151,8 @@ define amdgpu_kernel void @constant_load_v2f16(<2 x half> addrspace(1)* %out, <2 ; 
 ; OPT-LABEL: @load_volatile
 ; OPT: load volatile i16
 ; OPT-NEXT: store
-define amdgpu_kernel void @load_volatile(i16 addrspace(1)* %out, i16 addrspace(2)* %in) {
-  %a = load volatile i16, i16 addrspace(2)* %in
+define amdgpu_kernel void @load_volatile(i16 addrspace(1)* %out, i16 addrspace(4)* %in) {
+  %a = load volatile i16, i16 addrspace(4)* %in
   store i16 %a, i16 addrspace(1)* %out
   ret void
 }
@@ -160,8 +160,8 @@ define amdgpu_kernel void @load_volatile(i16 addrspace(1)* %out, i16 addrspace(2
 ; OPT-LABEL: @constant_load_v2i8_volatile
 ; OPT: load volatile <2 x i8>
 ; OPT-NEXT: store
-define amdgpu_kernel void @constant_load_v2i8_volatile(<2 x i8> addrspace(1)* %out, <2 x i8> addrspace(2)* %in) #0 {
-  %ld = load volatile <2 x i8>, <2 x i8> addrspace(2)* %in
+define amdgpu_kernel void @constant_load_v2i8_volatile(<2 x i8> addrspace(1)* %out, <2 x i8> addrspace(4)* %in) #0 {
+  %ld = load volatile <2 x i8>, <2 x i8> addrspace(4)* %in
   store <2 x i8> %ld, <2 x i8> addrspace(1)* %out
   ret void
 }
@@ -182,8 +182,8 @@ define amdgpu_kernel void @constant_load_v2i8_addrspace1(<2 x i8> addrspace(1)*
 ; OPT-NEXT: zext
 ; OPT-NEXT: store
 define amdgpu_kernel void @use_dispatch_ptr(i32 addrspace(1)* %ptr) #1 {
-  %dispatch.ptr = call i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr()
-  %val = load i8, i8 addrspace(2)* %dispatch.ptr, align 4
+  %dispatch.ptr = call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
+  %val = load i8, i8 addrspace(4)* %dispatch.ptr, align 4
   %ld = zext i8 %val to i32
   store i32 %ld, i32 addrspace(1)* %ptr
   ret void
diff --git a/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/split-gep-and-gvn-addrspace-addressing-modes.ll b/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/split-gep-and-gvn-addrspace-addressing-modes.ll
index 43fe18f1aa2..39a22ad74a1 100644
--- a/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/split-gep-and-gvn-addrspace-addressing-modes.ll
+++ b/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/split-gep-and-gvn-addrspace-addressing-modes.ll
@@ -2,64 +2,64 @@
 target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p24:64:64-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64"
-@array = internal addrspace(2) constant [4096 x [32 x float]] zeroinitializer, align 4
+@array = internal addrspace(4) constant [4096 x [32 x float]] zeroinitializer, align 4
 ; IR-LABEL: @sum_of_array(
-; IR: [[BASE_PTR:%[a-zA-Z0-9]+]] = getelementptr [4096 x [32 x float]], [4096 x [32 x float]] addrspace(2)* @array, i64 0, i64 %{{[a-zA-Z0-9]+}}, i64 %{{[a-zA-Z0-9]+}}
-; IR: getelementptr inbounds float, float addrspace(2)* [[BASE_PTR]], i64 1
-; IR: getelementptr inbounds float, float addrspace(2)* [[BASE_PTR]], i64 32
-; IR: getelementptr inbounds float, float addrspace(2)* [[BASE_PTR]], i64 33
+; IR: [[BASE_PTR:%[a-zA-Z0-9]+]] = getelementptr [4096 x [32 x float]], [4096 x [32 x float]] addrspace(4)* @array, i64 0, i64 %{{[a-zA-Z0-9]+}}, i64 %{{[a-zA-Z0-9]+}}
+; IR: getelementptr inbounds float, float addrspace(4)* [[BASE_PTR]], i64 1
+; IR: getelementptr inbounds float, float addrspace(4)* [[BASE_PTR]], i64 32
+; IR: getelementptr inbounds float, float addrspace(4)* [[BASE_PTR]], i64 33
 define amdgpu_kernel void @sum_of_array(i32 %x, i32 %y, float addrspace(1)* nocapture %output) {
   %tmp = sext i32 %y to i64
   %tmp1 = sext i32 %x to i64
-  %tmp2 = getelementptr inbounds [4096 x [32 x float]], [4096 x [32 x float]] addrspace(2)* @array, i64 0, i64 %tmp1, i64 %tmp
-  %tmp4 = load float, float addrspace(2)* %tmp2, align 4
+  %tmp2 = getelementptr inbounds [4096 x [32 x float]], [4096 x [32 x float]] addrspace(4)* @array, i64 0, i64 %tmp1, i64 %tmp
+  %tmp4 = load float, float addrspace(4)* %tmp2, align 4
   %tmp5 = fadd float %tmp4, 0.000000e+00
   %tmp6 = add i32 %y, 1
   %tmp7 = sext i32 %tmp6 to i64
-  %tmp8 = getelementptr inbounds [4096 x [32 x float]], [4096 x [32 x float]] addrspace(2)* @array, i64 0, i64 %tmp1, i64 %tmp7
-  %tmp10 = load float, float addrspace(2)* %tmp8, align 4
+  %tmp8 = getelementptr inbounds [4096 x [32 x float]], [4096 x [32 x float]] addrspace(4)* @array, i64 0, i64 %tmp1, i64 %tmp7
+  %tmp10 = load float, float addrspace(4)* %tmp8, align 4
   %tmp11 = fadd float %tmp5, %tmp10
   %tmp12 = add i32 %x, 1
   %tmp13 = sext i32 %tmp12 to i64
-  %tmp14 = getelementptr inbounds [4096 x [32 x float]], [4096 x [32 x float]] addrspace(2)* @array, i64 0, i64 %tmp13, i64 %tmp
-  %tmp16 = load float, float addrspace(2)* %tmp14, align 4
+  %tmp14 = getelementptr inbounds [4096 x [32 x float]], [4096 x [32 x float]] addrspace(4)* @array, i64 0, i64 %tmp13, i64 %tmp
+  %tmp16 = load float, float addrspace(4)* %tmp14, align 4
   %tmp17 = fadd float %tmp11, %tmp16
-  %tmp18 = getelementptr inbounds [4096 x [32 x float]], [4096 x [32 x float]] addrspace(2)* @array, i64 0, i64 %tmp13, i64 %tmp7
-  %tmp20 = load float, float addrspace(2)* %tmp18, align 4
+  %tmp18 = getelementptr inbounds [4096 x [32 x float]], [4096 x [32 x float]] addrspace(4)* @array, i64 0, i64 %tmp13, i64 %tmp7
+  %tmp20 = load float, float addrspace(4)* %tmp18, align 4
   %tmp21 = fadd float %tmp17, %tmp20
   store float %tmp21, float addrspace(1)* %output, align 4
   ret void
 }
-@array2 = internal addrspace(2) constant [4096 x [4 x float]] zeroinitializer, align 4
+@array2 = internal addrspace(4) constant [4096 x [4 x float]] zeroinitializer, align 4
 ; Some of the indices go over the maximum mubuf offset, so don't split them.
 ; IR-LABEL: @sum_of_array_over_max_mubuf_offset(
-; IR: [[BASE_PTR:%[a-zA-Z0-9]+]] = getelementptr [4096 x [4 x float]], [4096 x [4 x float]] addrspace(2)* @array2, i64 0, i64 %{{[a-zA-Z0-9]+}}, i64 %{{[a-zA-Z0-9]+}}
-; IR: getelementptr inbounds float, float addrspace(2)* [[BASE_PTR]], i64 255
+; IR: [[BASE_PTR:%[a-zA-Z0-9]+]] = getelementptr [4096 x [4 x float]], [4096 x [4 x float]] addrspace(4)* @array2, i64 0, i64 %{{[a-zA-Z0-9]+}}, i64 %{{[a-zA-Z0-9]+}}
+; IR: getelementptr inbounds float, float addrspace(4)* [[BASE_PTR]], i64 255
 ; IR: add i32 %x, 256
-; IR: getelementptr inbounds [4096 x [4 x float]], [4096 x [4 x float]] addrspace(2)* @array2, i64 0, i64 %{{[a-zA-Z0-9]+}}, i64 %{{[a-zA-Z0-9]+}}
-; IR: getelementptr inbounds [4096 x [4 x float]], [4096 x [4 x float]] addrspace(2)* @array2, i64 0, i64 %{{[a-zA-Z0-9]+}}, i64 %{{[a-zA-Z0-9]+}}
+; IR: getelementptr inbounds [4096 x [4 x float]], [4096 x [4 x float]] addrspace(4)* @array2, i64 0, i64 %{{[a-zA-Z0-9]+}}, i64 %{{[a-zA-Z0-9]+}}
+; IR: getelementptr inbounds [4096 x [4 x float]], [4096 x [4 x float]] addrspace(4)* @array2, i64 0, i64 %{{[a-zA-Z0-9]+}}, i64 %{{[a-zA-Z0-9]+}}
 define amdgpu_kernel void @sum_of_array_over_max_mubuf_offset(i32 %x, i32 %y, float addrspace(1)* nocapture %output) {
   %tmp = sext i32 %y to i64
   %tmp1 = sext i32 %x to i64
-  %tmp2 = getelementptr inbounds [4096 x [4 x float]], [4096 x [4 x float]] addrspace(2)* @array2, i64 0, i64 %tmp1, i64 %tmp
-  %tmp4 = load float, float addrspace(2)* %tmp2, align 4
+  %tmp2 = getelementptr inbounds [4096 x [4 x float]], [4096 x [4 x float]] addrspace(4)* @array2, i64 0, i64 %tmp1, i64 %tmp
+  %tmp4 = load float, float addrspace(4)* %tmp2, align 4
   %tmp5 = fadd float %tmp4, 0.000000e+00
   %tmp6 = add i32 %y, 255
   %tmp7 = sext i32 %tmp6 to i64
-  %tmp8 = getelementptr inbounds [4096 x [4 x float]], [4096 x [4 x float]] addrspace(2)* @array2, i64 0, i64 %tmp1, i64 %tmp7
-  %tmp10 = load float, float addrspace(2)* %tmp8, align 4
+  %tmp8 = getelementptr inbounds [4096 x [4 x float]], [4096 x [4 x float]] addrspace(4)* @array2, i64 0, i64 %tmp1, i64 %tmp7
+  %tmp10 = load float, float addrspace(4)* %tmp8, align 4
   %tmp11 = fadd float %tmp5, %tmp10
   %tmp12 = add i32 %x, 256
   %tmp13 = sext i32 %tmp12 to i64
-  %tmp14 = getelementptr inbounds [4096 x [4 x float]], [4096 x [4 x float]] addrspace(2)* @array2, i64 0, i64 %tmp13, i64 %tmp
-  %tmp16 = load float, float addrspace(2)* %tmp14, align 4
+  %tmp14 = getelementptr inbounds [4096 x [4 x float]], [4096 x [4 x float]] addrspace(4)* @array2, i64 0, i64 %tmp13, i64 %tmp
+  %tmp16 = load float, float addrspace(4)* %tmp14, align 4
   %tmp17 = fadd float %tmp11, %tmp16
-  %tmp18 = getelementptr inbounds [4096 x [4 x float]], [4096 x [4 x float]] addrspace(2)* @array2, i64 0, i64 %tmp13, i64 %tmp7
-  %tmp20 = load float, float addrspace(2)* %tmp18, align 4
+  %tmp18 = getelementptr inbounds [4096 x [4 x float]], [4096 x [4 x float]] addrspace(4)* @array2, i64 0, i64 %tmp13, i64 %tmp7
+  %tmp20 = load float, float addrspace(4)* %tmp18, align 4
   %tmp21 = fadd float %tmp17, %tmp20
   store float %tmp21, float addrspace(1)* %output, align 4
   ret void
@@ -97,18 +97,18 @@ define amdgpu_kernel void @sum_of_lds_array_over_max_mubuf_offset(i32 %x, i32 %y
 ; IR: getelementptr {{.*}} !amdgpu.uniform
 ; IR: getelementptr {{.*}} !amdgpu.uniform
 ; IR: getelementptr {{.*}} !amdgpu.uniform
-define amdgpu_ps <{ i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> @keep_metadata([0 x <4 x i32>] addrspace(2)* inreg noalias dereferenceable(18446744073709551615), [0 x <8 x i32>] addrspace(2)* inreg noalias dereferenceable(18446744073709551615), [0 x <4 x i32>] addrspace(2)* inreg noalias dereferenceable(18446744073709551615), [0 x <8 x i32>] addrspace(2)* inreg noalias dereferenceable(18446744073709551615), float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, i32, i32, float, i32) #5 {
+define amdgpu_ps <{ i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> @keep_metadata([0 x <4 x i32>] addrspace(4)* inreg noalias dereferenceable(18446744073709551615), [0 x <8 x i32>] addrspace(4)* inreg noalias dereferenceable(18446744073709551615), [0 x <4 x i32>] addrspace(4)* inreg noalias dereferenceable(18446744073709551615), [0 x <8 x i32>] addrspace(4)* inreg noalias dereferenceable(18446744073709551615), float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, i32, i32, float, i32) #5 {
 main_body:
   %22 = call nsz float @llvm.amdgcn.interp.mov(i32 2, i32 0, i32 0, i32 %5) #8
   %23 = bitcast float %22 to i32
   %24 = shl i32 %23, 1
-  %25 = getelementptr [0 x <8 x i32>], [0 x <8 x i32>] addrspace(2)* %1, i32 0, i32 %24, !amdgpu.uniform !0
-  %26 = load <8 x i32>, <8 x i32> addrspace(2)* %25, align 32, !invariant.load !0
+  %25 = getelementptr [0 x <8 x i32>], [0 x <8 x i32>] addrspace(4)* %1, i32 0, i32 %24, !amdgpu.uniform !0
+  %26 = load <8 x i32>, <8 x i32> addrspace(4)* %25, align 32, !invariant.load !0
   %27 = shl i32 %23, 2
   %28 = or i32 %27, 3
-  %29 = bitcast [0 x <8 x i32>] addrspace(2)* %1 to [0 x <4 x i32>] addrspace(2)*
-  %30 = getelementptr [0 x <4 x i32>], [0 x <4 x i32>] addrspace(2)* %29, i32 0, i32 %28, !amdgpu.uniform !0
-  %31 = load <4 x i32>, <4 x i32> addrspace(2)* %30, align 16, !invariant.load !0
+  %29 = bitcast [0 x <8 x i32>] addrspace(4)* %1 to [0 x <4 x i32>] addrspace(4)*
+  %30 = getelementptr [0 x <4 x i32>], [0 x <4 x i32>] addrspace(4)* %29, i32 0, i32 %28, !amdgpu.uniform !0
+  %31 = load <4 x i32>, <4 x i32> addrspace(4)* %30, align 16, !invariant.load !0
   %32 = call nsz <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float> zeroinitializer, <8 x i32> %26, <4 x i32> %31, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #8
   %33 = extractelement <4 x float> %32, i32 0
   %34 = extractelement <4 x float> %32, i32 1