| author | Tom Stellard <thomas.stellard@amd.com> | 2016-04-14 16:27:07 +0000 |
| --- | --- | --- |
| committer | Tom Stellard <thomas.stellard@amd.com> | 2016-04-14 16:27:07 +0000 |
| commit | 79a1fd718c71b4480dc9f00e8e77f4408ec9e6fa | |
| tree | d7ad0eed911b428e15f871225d742595de9420b8 | |
| parent | f110f8f9f7f46c668e03f4808e03aa54c2157269 | |
AMDGPU: allow specifying a workgroup size that needs to fit in a compute unit
Summary:
For GL_ARB_compute_shader we need to support workgroup sizes of at least 1024. However, if we want to allow large workgroup sizes, we may need to use fewer registers, as we have to run more waves per SIMD.
This patch adds an attribute to specify the maximum workgroup size the compiled program needs to support. It defaults to 256, as that size imposes no wave restrictions.
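For illustration, a frontend that needs a larger workgroup for a particular kernel can attach the attribute as an ordinary string function attribute. A minimal sketch follows; `Function::addFnAttr` is the stock LLVM API, while the helper name and the value 1024 are only examples:

```cpp
// Hypothetical frontend helper: ask the backend to support workgroups of
// up to 1024 invocations for this kernel. Kernels without the attribute
// keep the default of 256 (see the AMDGPUBaseInfo.cpp hunk below).
#include "llvm/IR/Function.h"

void requestLargeWorkGroup(llvm::Function &Kernel) {
  Kernel.addFnAttr("amdgpu-max-work-group-size", "1024");
}
```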
The number of available registers is reduced in the same way that registers were previously reserved for chips with the SGPR init bug.
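To make the wave arithmetic concrete, here is a self-contained sketch of the VGPR calculation the patch performs (the constants mirror the new SIRegisterInfo.cpp helpers below: 4 SIMDs per CU, 64-wide waves, 256 VGPRs per SIMD allocated in groups of 4; `alignTo`/`alignDown` are reimplemented locally so the example compiles on its own):

```cpp
#include <cstdint>
#include <cstdio>
#include <initializer_list>

// Local stand-ins for llvm::alignTo / llvm::alignDown (Skew omitted).
static uint64_t alignTo(uint64_t Value, uint64_t Align) {
  return (Value + Align - 1) / Align * Align;
}
static uint64_t alignDown(uint64_t Value, uint64_t Align) {
  return Value / Align * Align;
}

int main() {
  const unsigned SIMDPerCU = 4, WavefrontSize = 64;
  const unsigned TotalVGPRsPerSIMD = 256, VGPRUsageAlignment = 4;
  for (unsigned WorkGroupSize : {256u, 1024u, 1600u}) {
    // A workgroup must fit on one compute unit, so its waves are spread
    // over the CU's four SIMDs; this many waves must coexist per SIMD.
    unsigned WavesPerSIMD =
        alignTo(WorkGroupSize, SIMDPerCU * WavefrontSize) /
        (SIMDPerCU * WavefrontSize);
    // Each wave gets an equal share of the SIMD's VGPR file, rounded
    // down to the hardware allocation granularity.
    unsigned VGPRsPerWave =
        alignDown(TotalVGPRsPerSIMD / WavesPerSIMD, VGPRUsageAlignment);
    printf("workgroup %4u -> %u wave(s)/SIMD -> %3u VGPRs per wave\n",
           WorkGroupSize, WavesPerSIMD, VGPRsPerWave);
  }
  return 0;
}
```

At 1024 invocations this gives 4 waves per SIMD and a 64-VGPR budget per wave, which is consistent with the NumVgprs check in the new large-work-group-registers.ll test.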
Reviewers: mareko, arsenm, tstellarAMD, nhaehnle
Subscribers: FireBurn, kerberizer, llvm-commits, arsenm
Differential Revision: http://reviews.llvm.org/D18340
Patch By: Bas Nieuwenhuizen
llvm-svn: 266337
-rw-r--r-- | llvm/include/llvm/Support/MathExtras.h | 7 |
-rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp | 12 |
-rw-r--r-- | llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp | 13 |
-rw-r--r-- | llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h | 2 |
-rw-r--r-- | llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp | 125 |
-rw-r--r-- | llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp | 4 |
-rw-r--r-- | llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h | 1 |
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/large-work-group-promote-alloca.ll | 72 |
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/large-work-group-registers.ll | 41 |
9 files changed, 214 insertions, 63 deletions
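One building block worth noting before reading the diff: the new `alignDown` in MathExtras.h rounds down to an alignment boundary, optionally with a skew, mirroring the existing `alignTo`. A few illustrative values, assuming the implementation in the first hunk below:

```cpp
#include <cassert>
#include <cstdint>

// Copy of the alignDown added in MathExtras.h, so the checks run standalone.
inline uint64_t alignDown(uint64_t Value, uint64_t Align, uint64_t Skew = 0) {
  Skew %= Align;
  return (Value - Skew) / Align * Align + Skew;
}

int main() {
  assert(alignDown(103, 8) == 96);    // largest multiple of 8 <= 103
  assert(alignDown(96, 8) == 96);     // aligned values are unchanged
  assert(alignDown(103, 8, 3) == 99); // largest value <= 103 that is 3 mod 8
  return 0;
}
```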
diff --git a/llvm/include/llvm/Support/MathExtras.h b/llvm/include/llvm/Support/MathExtras.h
index 66024123133..12a4334d563 100644
--- a/llvm/include/llvm/Support/MathExtras.h
+++ b/llvm/include/llvm/Support/MathExtras.h
@@ -621,6 +621,13 @@ inline uint64_t alignTo(uint64_t Value, uint64_t Align, uint64_t Skew = 0) {
   return (Value + Align - 1 - Skew) / Align * Align + Skew;
 }
 
+/// Returns the largest uint64_t less than or equal to \p Value and is
+/// \p Skew mod \p Align. \p Align must be non-zero
+inline uint64_t alignDown(uint64_t Value, uint64_t Align, uint64_t Skew = 0) {
+  Skew %= Align;
+  return (Value - Skew) / Align * Align + Skew;
+}
+
 /// Returns the offset to the next integer (mod 2**64) that is greater than
 /// or equal to \p Value and is a multiple of \p Align. \p Align must be
 /// non-zero.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index 321323fb21c..007321d4704 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -496,10 +496,12 @@ void AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I) {
     DEBUG(dbgs() << " alloca is not a candidate for vectorization.\n");
 
-  // FIXME: This is the maximum work group size. We should try to get
-  // value from the reqd_work_group_size function attribute if it is
-  // available.
-  unsigned WorkGroupSize = 256;
+  const Function &ContainingFunction = *I.getParent()->getParent();
+
+  // FIXME: We should also try to get this value from the reqd_work_group_size
+  // function attribute if it is available.
+  unsigned WorkGroupSize = AMDGPU::getMaximumWorkGroupSize(ContainingFunction);
+
   int AllocaSize =
       WorkGroupSize * Mod->getDataLayout().getTypeAllocSize(AllocaTy);
 
@@ -520,7 +522,7 @@ void AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I) {
 
   Function *F = I.getParent()->getParent();
 
-  Type *GVTy = ArrayType::get(I.getAllocatedType(), 256);
+  Type *GVTy = ArrayType::get(I.getAllocatedType(), WorkGroupSize);
   GlobalVariable *GV = new GlobalVariable(
       *Mod, GVTy, false, GlobalValue::InternalLinkage,
       UndefValue::get(GVTy),
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
index ef413cb1d1a..43f4b9f7dde 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -48,6 +48,7 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
     PrivateSegmentWaveByteOffsetSystemSGPR(AMDGPU::NoRegister),
     PSInputAddr(0),
     ReturnsVoid(true),
+    MaximumWorkGroupSize(0),
     LDSWaveSpillSize(0),
     PSInputEna(0),
     NumUserSGPRs(0),
@@ -123,6 +124,11 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
   if (HasStackObjects && ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS &&
       ST.isAmdHsaOS())
     FlatScratchInit = true;
+
+  if (AMDGPU::isCompute(F->getCallingConv()))
+    MaximumWorkGroupSize = AMDGPU::getMaximumWorkGroupSize(*F);
+  else
+    MaximumWorkGroupSize = ST.getWavefrontSize();
 }
 
 unsigned SIMachineFunctionInfo::addPrivateSegmentBuffer(
@@ -202,10 +208,5 @@ SIMachineFunctionInfo::SpilledReg SIMachineFunctionInfo::getSpilledReg(
 
 unsigned SIMachineFunctionInfo::getMaximumWorkGroupSize(
   const MachineFunction &MF) const {
-  const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>();
-  // FIXME: We should get this information from kernel attributes if it
-  // is available.
-  if (AMDGPU::isCompute(MF.getFunction()->getCallingConv()))
-    return 256;
-  return ST.getWavefrontSize();
+  return MaximumWorkGroupSize;
 }
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
index ac3497c31d1..acbd276848f 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -60,6 +60,8 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction {
   unsigned PSInputAddr;
   bool ReturnsVoid;
 
+  unsigned MaximumWorkGroupSize;
+
 public:
   // FIXME: Make private
   unsigned LDSWaveSpillSize;
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index 7090a582a98..7edd41832ec 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -23,6 +23,53 @@
 
 using namespace llvm;
 
+static unsigned getMaxWaveCountPerSIMD(const MachineFunction &MF) {
+  const SIMachineFunctionInfo& MFI = *MF.getInfo<SIMachineFunctionInfo>();
+  const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>();
+  unsigned SIMDPerCU = 4;
+
+  unsigned MaxInvocationsPerWave = SIMDPerCU * ST.getWavefrontSize();
+  return alignTo(MFI.getMaximumWorkGroupSize(MF), MaxInvocationsPerWave) /
+           MaxInvocationsPerWave;
+}
+
+static unsigned getMaxWorkGroupSGPRCount(const MachineFunction &MF) {
+  const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>();
+  unsigned MaxWaveCountPerSIMD = getMaxWaveCountPerSIMD(MF);
+
+  unsigned TotalSGPRCountPerSIMD, AddressableSGPRCount, SGPRUsageAlignment;
+  unsigned ReservedSGPRCount;
+
+  if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
+    TotalSGPRCountPerSIMD = 800;
+    AddressableSGPRCount = 102;
+    SGPRUsageAlignment = 16;
+    ReservedSGPRCount = 6; // VCC, FLAT_SCRATCH, XNACK
+  } else {
+    TotalSGPRCountPerSIMD = 512;
+    AddressableSGPRCount = 104;
+    SGPRUsageAlignment = 8;
+    ReservedSGPRCount = 2; // VCC
+  }
+
+  unsigned MaxSGPRCount = (TotalSGPRCountPerSIMD / MaxWaveCountPerSIMD);
+  MaxSGPRCount = alignDown(MaxSGPRCount, SGPRUsageAlignment);
+
+  if (ST.hasSGPRInitBug())
+    MaxSGPRCount = AMDGPUSubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG;
+
+  return std::min(MaxSGPRCount - ReservedSGPRCount, AddressableSGPRCount);
+}
+
+static unsigned getMaxWorkGroupVGPRCount(const MachineFunction &MF) {
+  unsigned MaxWaveCountPerSIMD = getMaxWaveCountPerSIMD(MF);
+  unsigned TotalVGPRCountPerSIMD = 256;
+  unsigned VGPRUsageAlignment = 4;
+
+  return alignDown(TotalVGPRCountPerSIMD / MaxWaveCountPerSIMD,
+                   VGPRUsageAlignment);
+}
+
 static bool hasPressureSet(const int *PSets, unsigned PSetID) {
   for (unsigned i = 0; PSets[i] != -1; ++i) {
     if (PSets[i] == (int)PSetID)
@@ -71,38 +118,27 @@ void SIRegisterInfo::reserveRegisterTuples(BitVector &Reserved, unsigned Reg) co
 
 unsigned SIRegisterInfo::reservedPrivateSegmentBufferReg(
   const MachineFunction &MF) const {
-  const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>();
-  if (ST.hasSGPRInitBug()) {
-    // Leave space for flat_scr, xnack_mask, vcc, and alignment
-    unsigned BaseIdx = AMDGPUSubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG - 8 - 4;
-    unsigned BaseReg(AMDGPU::SGPR_32RegClass.getRegister(BaseIdx));
-    return getMatchingSuperReg(BaseReg, AMDGPU::sub0, &AMDGPU::SReg_128RegClass);
-  }
-
-  if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
-    // 96/97 need to be reserved for flat_scr, 98/99 for xnack_mask, and
-    // 100/101 for vcc. This is the next sgpr128 down.
-    return AMDGPU::SGPR92_SGPR93_SGPR94_SGPR95;
-  }
-
-  return AMDGPU::SGPR96_SGPR97_SGPR98_SGPR99;
+  unsigned BaseIdx = alignDown(getMaxWorkGroupSGPRCount(MF), 4) - 4;
+  unsigned BaseReg(AMDGPU::SGPR_32RegClass.getRegister(BaseIdx));
+  return getMatchingSuperReg(BaseReg, AMDGPU::sub0, &AMDGPU::SReg_128RegClass);
 }
 
 unsigned SIRegisterInfo::reservedPrivateSegmentWaveByteOffsetReg(
   const MachineFunction &MF) const {
-  const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>();
-  if (ST.hasSGPRInitBug()) {
-    unsigned Idx = AMDGPUSubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG - 6 - 1;
-    return AMDGPU::SGPR_32RegClass.getRegister(Idx);
-  }
-
-  if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
-    // Next register before reservations for flat_scr, xnack_mask, vcc,
-    // and scratch resource.
-    return AMDGPU::SGPR91;
+  unsigned RegCount = getMaxWorkGroupSGPRCount(MF);
+  unsigned Reg;
+
+  // Try to place it in a hole after PrivateSegmentbufferReg.
+  if (RegCount & 3) {
+    // We cannot put the segment buffer in (Idx - 4) ... (Idx - 1) due to
+    // alignment constraints, so we have a hole where can put the wave offset.
+    Reg = RegCount - 1;
+  } else {
+    // We can put the segment buffer in (Idx - 4) ... (Idx - 1) and put the
+    // wave offset before it.
+    Reg = RegCount - 5;
   }
-
-  return AMDGPU::SGPR95;
+  return AMDGPU::SGPR_32RegClass.getRegister(Reg);
 }
 
 BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
@@ -124,35 +160,20 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
   reserveRegisterTuples(Reserved, AMDGPU::TTMP8_TTMP9);
   reserveRegisterTuples(Reserved, AMDGPU::TTMP10_TTMP11);
 
-  // Reserve the last 2 registers so we will always have at least 2 more that
-  // will physically contain VCC.
-  reserveRegisterTuples(Reserved, AMDGPU::SGPR102_SGPR103);
+  unsigned MaxWorkGroupSGPRCount = getMaxWorkGroupSGPRCount(MF);
+  unsigned MaxWorkGroupVGPRCount = getMaxWorkGroupVGPRCount(MF);
 
-  const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>();
-
-  if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
-    // SI/CI have 104 SGPRs. VI has 102. We need to shift down the reservation
-    // for VCC/XNACK_MASK/FLAT_SCR.
-    //
-    // TODO The SGPRs that alias to XNACK_MASK could be used as general purpose
-    // SGPRs when the XNACK feature is not used. This is currently not done
-    // because the code that counts SGPRs cannot account for such holes.
-    reserveRegisterTuples(Reserved, AMDGPU::SGPR96_SGPR97);
-    reserveRegisterTuples(Reserved, AMDGPU::SGPR98_SGPR99);
-    reserveRegisterTuples(Reserved, AMDGPU::SGPR100_SGPR101);
+  unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
+  unsigned NumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs();
+  for (unsigned i = MaxWorkGroupSGPRCount; i < NumSGPRs; ++i) {
+    unsigned Reg = AMDGPU::SGPR_32RegClass.getRegister(i);
+    reserveRegisterTuples(Reserved, Reg);
   }
 
-  // Tonga and Iceland can only allocate a fixed number of SGPRs due
-  // to a hw bug.
-  if (ST.hasSGPRInitBug()) {
-    unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
-    // Reserve some SGPRs for FLAT_SCRATCH, XNACK_MASK, and VCC (6 SGPRs).
-    unsigned Limit = AMDGPUSubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG - 6;
-    for (unsigned i = Limit; i < NumSGPRs; ++i) {
-      unsigned Reg = AMDGPU::SGPR_32RegClass.getRegister(i);
-      reserveRegisterTuples(Reserved, Reg);
-    }
+  for (unsigned i = MaxWorkGroupVGPRCount; i < NumVGPRs; ++i) {
+    unsigned Reg = AMDGPU::VGPR_32RegClass.getRegister(i);
+    reserveRegisterTuples(Reserved, Reg);
   }
 
   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index 5e3498f86d6..52f764876f3 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -124,6 +124,10 @@ static unsigned getIntegerAttribute(const Function &F, const char *Name,
   return Result;
 }
 
+unsigned getMaximumWorkGroupSize(const Function &F) {
+  return getIntegerAttribute(F, "amdgpu-max-work-group-size", 256);
+}
+
 unsigned getInitialPSInputAddr(const Function &F) {
   return getIntegerAttribute(F, "InitialPSInputAddr", 0);
 }
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index d229dec8036..4feb57c9e34 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -45,6 +45,7 @@ bool isGroupSegment(const GlobalValue *GV);
 bool isGlobalSegment(const GlobalValue *GV);
 bool isReadOnlySegment(const GlobalValue *GV);
 
+unsigned getMaximumWorkGroupSize(const Function &F);
 unsigned getInitialPSInputAddr(const Function &F);
 
 bool isShader(CallingConv::ID cc);
diff --git a/llvm/test/CodeGen/AMDGPU/large-work-group-promote-alloca.ll b/llvm/test/CodeGen/AMDGPU/large-work-group-promote-alloca.ll
new file mode 100644
index 00000000000..fefca0f0d4e
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/large-work-group-promote-alloca.ll
@@ -0,0 +1,72 @@
+; RUN: opt -S -mtriple=amdgcn-unknown-unknown -amdgpu-promote-alloca < %s | FileCheck %s
+
+; CHECK: @promote_alloca_size_63.stack = internal unnamed_addr addrspace(3) global [63 x [5 x i32]] undef, align 4
+
+define void @promote_alloca_size_63(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #0 {
+entry:
+  %stack = alloca [5 x i32], align 4
+  %0 = load i32, i32 addrspace(1)* %in, align 4
+  %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %0
+  store i32 4, i32* %arrayidx1, align 4
+  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
+  %1 = load i32, i32 addrspace(1)* %arrayidx2, align 4
+  %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %1
+  store i32 5, i32* %arrayidx3, align 4
+  %arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 0
+  %2 = load i32, i32* %arrayidx10, align 4
+  store i32 %2, i32 addrspace(1)* %out, align 4
+  %arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 1
+  %3 = load i32, i32* %arrayidx12
+  %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1
+  store i32 %3, i32 addrspace(1)* %arrayidx13
+  ret void
+}
+
+; CHECK: @promote_alloca_size_256.stack = internal unnamed_addr addrspace(3) global [256 x [5 x i32]] undef, align 4
+
+define void @promote_alloca_size_256(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #1 {
+entry:
+  %stack = alloca [5 x i32], align 4
+  %0 = load i32, i32 addrspace(1)* %in, align 4
+  %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %0
+  store i32 4, i32* %arrayidx1, align 4
+  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
+  %1 = load i32, i32 addrspace(1)* %arrayidx2, align 4
+  %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %1
+  store i32 5, i32* %arrayidx3, align 4
+  %arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 0
+  %2 = load i32, i32* %arrayidx10, align 4
+  store i32 %2, i32 addrspace(1)* %out, align 4
+  %arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 1
+  %3 = load i32, i32* %arrayidx12
+  %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1
+  store i32 %3, i32 addrspace(1)* %arrayidx13
+  ret void
+}
+
+; CHECK: @promote_alloca_size_1600.stack = internal unnamed_addr addrspace(3) global [1600 x [5 x i32]] undef, align 4
+
+define void @promote_alloca_size_1600(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #2 {
+entry:
+  %stack = alloca [5 x i32], align 4
+  %0 = load i32, i32 addrspace(1)* %in, align 4
+  %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %0
+  store i32 4, i32* %arrayidx1, align 4
+  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
+  %1 = load i32, i32 addrspace(1)* %arrayidx2, align 4
+  %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %1
+  store i32 5, i32* %arrayidx3, align 4
+  %arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 0
+  %2 = load i32, i32* %arrayidx10, align 4
+  store i32 %2, i32 addrspace(1)* %out, align 4
+  %arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 1
+  %3 = load i32, i32* %arrayidx12
+  %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1
+  store i32 %3, i32 addrspace(1)* %arrayidx13
+  ret void
+}
+
+attributes #0 = { nounwind "amdgpu-max-work-group-size"="63" }
+attributes #1 = { nounwind "amdgpu-max-work-group-size"="256" }
+attributes #2 = { nounwind "amdgpu-max-work-group-size"="1600" }
diff --git a/llvm/test/CodeGen/AMDGPU/large-work-group-registers.ll b/llvm/test/CodeGen/AMDGPU/large-work-group-registers.ll
new file mode 100644
index 00000000000..8a2fcb70cb6
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/large-work-group-registers.ll
@@ -0,0 +1,41 @@
+; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck %s
+
+; CHECK: NumVgprs: 63
+define void @main([9 x <16 x i8>] addrspace(2)* byval, [17 x <16 x i8>] addrspace(2)* byval, [17 x <8 x i32>] addrspace(2)* byval, [16 x <8 x i32>] addrspace(2)* byval, [16 x <4 x i32>] addrspace(2)* byval, <3 x i32> inreg, <3 x i32> inreg, <3 x i32>) #0 {
+main_body:
+  %8 = getelementptr [16 x <4 x i32>], [16 x <4 x i32>] addrspace(2)* %4, i64 0, i64 8
+  %9 = load <4 x i32>, <4 x i32> addrspace(2)* %8, align 16, !tbaa !0
+  %10 = extractelement <3 x i32> %7, i32 0
+  %11 = extractelement <3 x i32> %7, i32 1
+  %12 = mul i32 %10, %11
+  %bc = bitcast <3 x i32> %7 to <3 x float>
+  %13 = extractelement <3 x float> %bc, i32 1
+  %14 = insertelement <512 x float> undef, float %13, i32 %12
+  call void @llvm.amdgcn.s.barrier()
+  %15 = extractelement <3 x i32> %6, i32 0
+  %16 = extractelement <3 x i32> %7, i32 0
+  %17 = shl i32 %15, 5
+  %18 = add i32 %17, %16
+  %19 = shl i32 %18, 4
+  %20 = extractelement <3 x i32> %7, i32 1
+  %21 = shl i32 %20, 2
+  %22 = sext i32 %21 to i64
+  %23 = getelementptr i8, i8 addrspace(3)* null, i64 %22
+  %24 = bitcast i8 addrspace(3)* %23 to i32 addrspace(3)*
+  %25 = load i32, i32 addrspace(3)* %24, align 4
+  %26 = extractelement <512 x float> %14, i32 %25
+  %27 = insertelement <4 x float> undef, float %26, i32 0
+  call void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float> %27, <4 x i32> %9, i32 0, i32 %19, i1 false, i1 false)
+  ret void
+}
+
+declare void @llvm.amdgcn.s.barrier() #1
+
+declare void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float>, <4 x i32>, i32, i32, i1, i1) #2
+
+attributes #0 = { "amdgpu-max-work-group-size"="1024" }
+attributes #1 = { convergent nounwind }
+attributes #2 = { nounwind }
+
+!0 = !{!1, !1, i64 0, i32 1}
+!1 = !{!"const", null}