-rw-r--r--  llvm/include/llvm/Support/MathExtras.h                         7
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp                12
-rw-r--r--  llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp              13
-rw-r--r--  llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h                 2
-rw-r--r--  llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp                    125
-rw-r--r--  llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp                4
-rw-r--r--  llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h                  1
-rw-r--r--  llvm/test/CodeGen/AMDGPU/large-work-group-promote-alloca.ll   72
-rw-r--r--  llvm/test/CodeGen/AMDGPU/large-work-group-registers.ll        41
9 files changed, 214 insertions, 63 deletions
diff --git a/llvm/include/llvm/Support/MathExtras.h b/llvm/include/llvm/Support/MathExtras.h
index 66024123133..12a4334d563 100644
--- a/llvm/include/llvm/Support/MathExtras.h
+++ b/llvm/include/llvm/Support/MathExtras.h
@@ -621,6 +621,13 @@ inline uint64_t alignTo(uint64_t Value, uint64_t Align, uint64_t Skew = 0) {
return (Value + Align - 1 - Skew) / Align * Align + Skew;
}
+/// Returns the largest uint64_t less than or equal to \p Value that is
+/// \p Skew mod \p Align. \p Align must be non-zero.
+inline uint64_t alignDown(uint64_t Value, uint64_t Align, uint64_t Skew = 0) {
+ Skew %= Align;
+ return (Value - Skew) / Align * Align + Skew;
+}
+
/// Returns the offset to the next integer (mod 2**64) that is greater than
/// or equal to \p Value and is a multiple of \p Align. \p Align must be
/// non-zero.
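
For reference, here is a minimal standalone sketch of how the new helper behaves; the function body is copied from the hunk above, and the checked values are purely illustrative:

    #include <cassert>
    #include <cstdint>

    // Copy of the new helper so the example compiles on its own; the real
    // definition lives in llvm/include/llvm/Support/MathExtras.h.
    inline uint64_t alignDown(uint64_t Value, uint64_t Align, uint64_t Skew = 0) {
      Skew %= Align;
      return (Value - Skew) / Align * Align + Skew;
    }

    int main() {
      assert(alignDown(103, 4) == 100);  // largest multiple of 4 that is <= 103
      assert(alignDown(100, 4) == 100);  // already-aligned values are unchanged
      assert(alignDown(13, 8, 3) == 11); // largest value <= 13 that is 3 mod 8
      return 0;
    }

It is the mirror image of alignTo directly above: alignTo rounds up to the next multiple, alignDown truncates to the previous one, with the same optional skew.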
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index 321323fb21c..007321d4704 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -496,10 +496,12 @@ void AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I) {
DEBUG(dbgs() << " alloca is not a candidate for vectorization.\n");
- // FIXME: This is the maximum work group size. We should try to get
- // value from the reqd_work_group_size function attribute if it is
- // available.
- unsigned WorkGroupSize = 256;
+ const Function &ContainingFunction = *I.getParent()->getParent();
+
+ // FIXME: We should also try to get this value from the reqd_work_group_size
+ // function attribute if it is available.
+ unsigned WorkGroupSize = AMDGPU::getMaximumWorkGroupSize(ContainingFunction);
+
int AllocaSize =
WorkGroupSize * Mod->getDataLayout().getTypeAllocSize(AllocaTy);
@@ -520,7 +522,7 @@ void AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I) {
Function *F = I.getParent()->getParent();
- Type *GVTy = ArrayType::get(I.getAllocatedType(), 256);
+ Type *GVTy = ArrayType::get(I.getAllocatedType(), WorkGroupSize);
GlobalVariable *GV = new GlobalVariable(
*Mod, GVTy, false, GlobalValue::InternalLinkage,
UndefValue::get(GVTy),
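
The effect of this change is easiest to see numerically: each work item gets its own copy of the alloca in the promoted LDS array, so the global's size now scales with the work group size instead of being fixed at 256 entries. A back-of-the-envelope sketch, not part of the patch — the 32 KiB local-memory budget is an assumption for illustration, and the sizes match the new test at the end of this patch:

    #include <cstdio>

    int main() {
      const unsigned LDSBudgetBytes = 32768; // assumed LDS capacity
      const unsigned AllocaBytes = 5 * 4;    // one [5 x i32] alloca = 20 bytes
      for (unsigned WG : {63u, 256u, 1600u}) // work group sizes from the test
        printf("work group %4u -> [%4u x [5 x i32]] = %5u bytes\n",
               WG, WG, WG * AllocaBytes);
      return 0;
    }

Even the 1600-item case (32000 bytes) still fits under that assumed budget, which is consistent with all three functions in the new test being promoted.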
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
index ef413cb1d1a..43f4b9f7dde 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -48,6 +48,7 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
PrivateSegmentWaveByteOffsetSystemSGPR(AMDGPU::NoRegister),
PSInputAddr(0),
ReturnsVoid(true),
+ MaximumWorkGroupSize(0),
LDSWaveSpillSize(0),
PSInputEna(0),
NumUserSGPRs(0),
@@ -123,6 +124,11 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
if (HasStackObjects && ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS &&
ST.isAmdHsaOS())
FlatScratchInit = true;
+
+ if (AMDGPU::isCompute(F->getCallingConv()))
+ MaximumWorkGroupSize = AMDGPU::getMaximumWorkGroupSize(*F);
+ else
+ MaximumWorkGroupSize = ST.getWavefrontSize();
}
unsigned SIMachineFunctionInfo::addPrivateSegmentBuffer(
@@ -202,10 +208,5 @@ SIMachineFunctionInfo::SpilledReg SIMachineFunctionInfo::getSpilledReg(
unsigned SIMachineFunctionInfo::getMaximumWorkGroupSize(
const MachineFunction &MF) const {
- const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>();
- // FIXME: We should get this information from kernel attributes if it
- // is available.
- if (AMDGPU::isCompute(MF.getFunction()->getCallingConv()))
- return 256;
- return ST.getWavefrontSize();
+ return MaximumWorkGroupSize;
}
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
index ac3497c31d1..acbd276848f 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -60,6 +60,8 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction {
unsigned PSInputAddr;
bool ReturnsVoid;
+ unsigned MaximumWorkGroupSize;
+
public:
// FIXME: Make private
unsigned LDSWaveSpillSize;
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index 7090a582a98..7edd41832ec 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -23,6 +23,53 @@
using namespace llvm;
+static unsigned getMaxWaveCountPerSIMD(const MachineFunction &MF) {
+ const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
+ const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>();
+ unsigned SIMDPerCU = 4;
+
+ unsigned MaxInvocationsPerWave = SIMDPerCU * ST.getWavefrontSize();
+ return alignTo(MFI.getMaximumWorkGroupSize(MF), MaxInvocationsPerWave) /
+ MaxInvocationsPerWave;
+}
+
+static unsigned getMaxWorkGroupSGPRCount(const MachineFunction &MF) {
+ const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>();
+ unsigned MaxWaveCountPerSIMD = getMaxWaveCountPerSIMD(MF);
+
+ unsigned TotalSGPRCountPerSIMD, AddressableSGPRCount, SGPRUsageAlignment;
+ unsigned ReservedSGPRCount;
+
+ if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
+ TotalSGPRCountPerSIMD = 800;
+ AddressableSGPRCount = 102;
+ SGPRUsageAlignment = 16;
+ ReservedSGPRCount = 6; // VCC, FLAT_SCRATCH, XNACK
+ } else {
+ TotalSGPRCountPerSIMD = 512;
+ AddressableSGPRCount = 104;
+ SGPRUsageAlignment = 8;
+ ReservedSGPRCount = 2; // VCC
+ }
+
+ unsigned MaxSGPRCount = (TotalSGPRCountPerSIMD / MaxWaveCountPerSIMD);
+ MaxSGPRCount = alignDown(MaxSGPRCount, SGPRUsageAlignment);
+
+ if (ST.hasSGPRInitBug())
+ MaxSGPRCount = AMDGPUSubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG;
+
+ return std::min(MaxSGPRCount - ReservedSGPRCount, AddressableSGPRCount);
+}
+
+static unsigned getMaxWorkGroupVGPRCount(const MachineFunction &MF) {
+ unsigned MaxWaveCountPerSIMD = getMaxWaveCountPerSIMD(MF);
+ unsigned TotalVGPRCountPerSIMD = 256;
+ unsigned VGPRUsageAlignment = 4;
+
+ return alignDown(TotalVGPRCountPerSIMD / MaxWaveCountPerSIMD,
+ VGPRUsageAlignment);
+}
+
static bool hasPressureSet(const int *PSets, unsigned PSetID) {
for (unsigned i = 0; PSets[i] != -1; ++i) {
if (PSets[i] == (int)PSetID)
@@ -71,38 +118,27 @@ void SIRegisterInfo::reserveRegisterTuples(BitVector &Reserved, unsigned Reg) co
unsigned SIRegisterInfo::reservedPrivateSegmentBufferReg(
const MachineFunction &MF) const {
- const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>();
- if (ST.hasSGPRInitBug()) {
- // Leave space for flat_scr, xnack_mask, vcc, and alignment
- unsigned BaseIdx = AMDGPUSubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG - 8 - 4;
- unsigned BaseReg(AMDGPU::SGPR_32RegClass.getRegister(BaseIdx));
- return getMatchingSuperReg(BaseReg, AMDGPU::sub0, &AMDGPU::SReg_128RegClass);
- }
-
- if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
- // 96/97 need to be reserved for flat_scr, 98/99 for xnack_mask, and
- // 100/101 for vcc. This is the next sgpr128 down.
- return AMDGPU::SGPR92_SGPR93_SGPR94_SGPR95;
- }
-
- return AMDGPU::SGPR96_SGPR97_SGPR98_SGPR99;
+ unsigned BaseIdx = alignDown(getMaxWorkGroupSGPRCount(MF), 4) - 4;
+ unsigned BaseReg(AMDGPU::SGPR_32RegClass.getRegister(BaseIdx));
+ return getMatchingSuperReg(BaseReg, AMDGPU::sub0, &AMDGPU::SReg_128RegClass);
}
unsigned SIRegisterInfo::reservedPrivateSegmentWaveByteOffsetReg(
const MachineFunction &MF) const {
- const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>();
- if (ST.hasSGPRInitBug()) {
- unsigned Idx = AMDGPUSubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG - 6 - 1;
- return AMDGPU::SGPR_32RegClass.getRegister(Idx);
- }
-
- if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
- // Next register before reservations for flat_scr, xnack_mask, vcc,
- // and scratch resource.
- return AMDGPU::SGPR91;
+ unsigned RegCount = getMaxWorkGroupSGPRCount(MF);
+ unsigned Reg;
+
+ // Try to place it in a hole after PrivateSegmentBufferReg.
+ if (RegCount & 3) {
+ // We cannot put the segment buffer in (Idx - 4) ... (Idx - 1) due to
+ // alignment constraints, so we have a hole where we can put the wave offset.
+ Reg = RegCount - 1;
+ } else {
+ // We can put the segment buffer in (Idx - 4) ... (Idx - 1) and put the
+ // wave offset before it.
+ Reg = RegCount - 5;
}
-
- return AMDGPU::SGPR95;
+ return AMDGPU::SGPR_32RegClass.getRegister(Reg);
}
BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
@@ -124,35 +160,20 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
reserveRegisterTuples(Reserved, AMDGPU::TTMP8_TTMP9);
reserveRegisterTuples(Reserved, AMDGPU::TTMP10_TTMP11);
- // Reserve the last 2 registers so we will always have at least 2 more that
- // will physically contain VCC.
- reserveRegisterTuples(Reserved, AMDGPU::SGPR102_SGPR103);
+ unsigned MaxWorkGroupSGPRCount = getMaxWorkGroupSGPRCount(MF);
+ unsigned MaxWorkGroupVGPRCount = getMaxWorkGroupVGPRCount(MF);
- const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>();
-
- if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
- // SI/CI have 104 SGPRs. VI has 102. We need to shift down the reservation
- // for VCC/XNACK_MASK/FLAT_SCR.
- //
- // TODO The SGPRs that alias to XNACK_MASK could be used as general purpose
- // SGPRs when the XNACK feature is not used. This is currently not done
- // because the code that counts SGPRs cannot account for such holes.
- reserveRegisterTuples(Reserved, AMDGPU::SGPR96_SGPR97);
- reserveRegisterTuples(Reserved, AMDGPU::SGPR98_SGPR99);
- reserveRegisterTuples(Reserved, AMDGPU::SGPR100_SGPR101);
+ unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
+ unsigned NumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs();
+ for (unsigned i = MaxWorkGroupSGPRCount; i < NumSGPRs; ++i) {
+ unsigned Reg = AMDGPU::SGPR_32RegClass.getRegister(i);
+ reserveRegisterTuples(Reserved, Reg);
}
- // Tonga and Iceland can only allocate a fixed number of SGPRs due
- // to a hw bug.
- if (ST.hasSGPRInitBug()) {
- unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
- // Reserve some SGPRs for FLAT_SCRATCH, XNACK_MASK, and VCC (6 SGPRs).
- unsigned Limit = AMDGPUSubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG - 6;
- for (unsigned i = Limit; i < NumSGPRs; ++i) {
- unsigned Reg = AMDGPU::SGPR_32RegClass.getRegister(i);
- reserveRegisterTuples(Reserved, Reg);
- }
+ for (unsigned i = MaxWorkGroupVGPRCount; i < NumVGPRs; ++i) {
+ unsigned Reg = AMDGPU::VGPR_32RegClass.getRegister(i);
+ reserveRegisterTuples(Reserved, Reg);
}
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
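
To make the budget arithmetic above concrete, here is a standalone sketch for the 1024-item work group used by the new llc test. It assumes the VI-class constants from getMaxWorkGroupSGPRCount (4 SIMDs per CU, wavefront size 64, 800 SGPRs and 256 VGPRs per SIMD) and ignores the Tonga/Iceland SGPR init bug special case:

    #include <algorithm>
    #include <cstdio>

    static unsigned alignTo(unsigned V, unsigned A) { return (V + A - 1) / A * A; }
    static unsigned alignDown(unsigned V, unsigned A) { return V / A * A; }

    int main() {
      const unsigned SIMDPerCU = 4, WavefrontSize = 64;
      const unsigned WorkGroupSize = 1024; // from large-work-group-registers.ll

      // One "round" of waves covers SIMDPerCU * WavefrontSize work items.
      unsigned WavesPerSIMD =
          alignTo(WorkGroupSize, SIMDPerCU * WavefrontSize) /
          (SIMDPerCU * WavefrontSize);                       // 1024 / 256 = 4

      unsigned SGPRs = alignDown(800 / WavesPerSIMD, 16);    // 200 -> 192
      unsigned SGPRBudget = std::min(SGPRs - 6, 102u);       // reserved, capped
      unsigned VGPRBudget = alignDown(256 / WavesPerSIMD, 4); // 64

      printf("waves/SIMD=%u  SGPRs=%u  VGPRs=%u\n",
             WavesPerSIMD, SGPRBudget, VGPRBudget);
      return 0;
    }

With a 64-VGPR budget, the loop in getReservedRegs blocks v64 and everything above it, which is what keeps the new test at NumVgprs: 63.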
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index 5e3498f86d6..52f764876f3 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -124,6 +124,10 @@ static unsigned getIntegerAttribute(const Function &F, const char *Name,
return Result;
}
+unsigned getMaximumWorkGroupSize(const Function &F) {
+ return getIntegerAttribute(F, "amdgpu-max-work-group-size", 256);
+}
+
unsigned getInitialPSInputAddr(const Function &F) {
return getIntegerAttribute(F, "InitialPSInputAddr", 0);
}
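
Since getIntegerAttribute (context above) parses LLVM string attributes, the new helper amounts to a string-to-integer lookup with a default of 256 when the attribute is absent. A rough standalone sketch of that pattern, where getAttr is a hypothetical stand-in for the real Function::getFnAttribute lookup:

    #include <cstdio>
    #include <cstdlib>
    #include <cstring>

    // Hypothetical stand-in for querying a string function attribute.
    static const char *getAttr(const char *Name) {
      return std::strcmp(Name, "amdgpu-max-work-group-size") == 0 ? "1600"
                                                                  : nullptr;
    }

    static unsigned getIntegerAttribute(const char *Name, unsigned Default) {
      const char *Value = getAttr(Name);
      return Value ? (unsigned)std::strtoul(Value, nullptr, 10) : Default;
    }

    int main() {
      // Prints 1600; a function without the attribute falls back to 256.
      std::printf("%u\n", getIntegerAttribute("amdgpu-max-work-group-size", 256));
      return 0;
    }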
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index d229dec8036..4feb57c9e34 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -45,6 +45,7 @@ bool isGroupSegment(const GlobalValue *GV);
bool isGlobalSegment(const GlobalValue *GV);
bool isReadOnlySegment(const GlobalValue *GV);
+unsigned getMaximumWorkGroupSize(const Function &F);
unsigned getInitialPSInputAddr(const Function &F);
bool isShader(CallingConv::ID cc);
diff --git a/llvm/test/CodeGen/AMDGPU/large-work-group-promote-alloca.ll b/llvm/test/CodeGen/AMDGPU/large-work-group-promote-alloca.ll
new file mode 100644
index 00000000000..fefca0f0d4e
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/large-work-group-promote-alloca.ll
@@ -0,0 +1,72 @@
+; RUN: opt -S -mtriple=amdgcn-unknown-unknown -amdgpu-promote-alloca < %s | FileCheck %s
+
+; CHECK: @promote_alloca_size_63.stack = internal unnamed_addr addrspace(3) global [63 x [5 x i32]] undef, align 4
+
+define void @promote_alloca_size_63(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #0 {
+entry:
+ %stack = alloca [5 x i32], align 4
+ %0 = load i32, i32 addrspace(1)* %in, align 4
+ %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %0
+ store i32 4, i32* %arrayidx1, align 4
+ %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
+ %1 = load i32, i32 addrspace(1)* %arrayidx2, align 4
+ %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %1
+ store i32 5, i32* %arrayidx3, align 4
+ %arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 0
+ %2 = load i32, i32* %arrayidx10, align 4
+ store i32 %2, i32 addrspace(1)* %out, align 4
+ %arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 1
+ %3 = load i32, i32* %arrayidx12
+ %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1
+ store i32 %3, i32 addrspace(1)* %arrayidx13
+ ret void
+}
+
+; CHECK: @promote_alloca_size_256.stack = internal unnamed_addr addrspace(3) global [256 x [5 x i32]] undef, align 4
+
+define void @promote_alloca_size_256(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #1 {
+entry:
+ %stack = alloca [5 x i32], align 4
+ %0 = load i32, i32 addrspace(1)* %in, align 4
+ %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %0
+ store i32 4, i32* %arrayidx1, align 4
+ %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
+ %1 = load i32, i32 addrspace(1)* %arrayidx2, align 4
+ %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %1
+ store i32 5, i32* %arrayidx3, align 4
+ %arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 0
+ %2 = load i32, i32* %arrayidx10, align 4
+ store i32 %2, i32 addrspace(1)* %out, align 4
+ %arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 1
+ %3 = load i32, i32* %arrayidx12
+ %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1
+ store i32 %3, i32 addrspace(1)* %arrayidx13
+ ret void
+}
+
+; CHECK: @promote_alloca_size_1600.stack = internal unnamed_addr addrspace(3) global [1600 x [5 x i32]] undef, align 4
+
+define void @promote_alloca_size_1600(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #2 {
+entry:
+ %stack = alloca [5 x i32], align 4
+ %0 = load i32, i32 addrspace(1)* %in, align 4
+ %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %0
+ store i32 4, i32* %arrayidx1, align 4
+ %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
+ %1 = load i32, i32 addrspace(1)* %arrayidx2, align 4
+ %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %1
+ store i32 5, i32* %arrayidx3, align 4
+ %arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 0
+ %2 = load i32, i32* %arrayidx10, align 4
+ store i32 %2, i32 addrspace(1)* %out, align 4
+ %arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 1
+ %3 = load i32, i32* %arrayidx12
+ %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1
+ store i32 %3, i32 addrspace(1)* %arrayidx13
+ ret void
+}
+
+attributes #0 = { nounwind "amdgpu-max-work-group-size"="63" }
+attributes #1 = { nounwind "amdgpu-max-work-group-size"="256" }
+attributes #2 = { nounwind "amdgpu-max-work-group-size"="1600" }
+
diff --git a/llvm/test/CodeGen/AMDGPU/large-work-group-registers.ll b/llvm/test/CodeGen/AMDGPU/large-work-group-registers.ll
new file mode 100644
index 00000000000..8a2fcb70cb6
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/large-work-group-registers.ll
@@ -0,0 +1,41 @@
+; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck %s
+
+; CHECK: NumVgprs: 63
+define void @main([9 x <16 x i8>] addrspace(2)* byval, [17 x <16 x i8>] addrspace(2)* byval, [17 x <8 x i32>] addrspace(2)* byval, [16 x <8 x i32>] addrspace(2)* byval, [16 x <4 x i32>] addrspace(2)* byval, <3 x i32> inreg, <3 x i32> inreg, <3 x i32>) #0 {
+main_body:
+ %8 = getelementptr [16 x <4 x i32>], [16 x <4 x i32>] addrspace(2)* %4, i64 0, i64 8
+ %9 = load <4 x i32>, <4 x i32> addrspace(2)* %8, align 16, !tbaa !0
+ %10 = extractelement <3 x i32> %7, i32 0
+ %11 = extractelement <3 x i32> %7, i32 1
+ %12 = mul i32 %10, %11
+ %bc = bitcast <3 x i32> %7 to <3 x float>
+ %13 = extractelement <3 x float> %bc, i32 1
+ %14 = insertelement <512 x float> undef, float %13, i32 %12
+ call void @llvm.amdgcn.s.barrier()
+ %15 = extractelement <3 x i32> %6, i32 0
+ %16 = extractelement <3 x i32> %7, i32 0
+ %17 = shl i32 %15, 5
+ %18 = add i32 %17, %16
+ %19 = shl i32 %18, 4
+ %20 = extractelement <3 x i32> %7, i32 1
+ %21 = shl i32 %20, 2
+ %22 = sext i32 %21 to i64
+ %23 = getelementptr i8, i8 addrspace(3)* null, i64 %22
+ %24 = bitcast i8 addrspace(3)* %23 to i32 addrspace(3)*
+ %25 = load i32, i32 addrspace(3)* %24, align 4
+ %26 = extractelement <512 x float> %14, i32 %25
+ %27 = insertelement <4 x float> undef, float %26, i32 0
+ call void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float> %27, <4 x i32> %9, i32 0, i32 %19, i1 false, i1 false)
+ ret void
+}
+
+declare void @llvm.amdgcn.s.barrier() #1
+
+declare void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float>, <4 x i32>, i32, i32, i1, i1) #2
+
+attributes #0 = { "amdgpu-max-work-group-size"="1024" }
+attributes #1 = { convergent nounwind }
+attributes #2 = { nounwind }
+
+!0 = !{!1, !1, i64 0, i32 1}
+!1 = !{!"const", null}