-rw-r--r--  llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp       |  85
-rw-r--r--  llvm/test/CodeGen/AMDGPU/si-spill-sgpr-stack.ll |  22
-rw-r--r--  llvm/test/CodeGen/AMDGPU/spill-m0.ll            |  29
-rw-r--r--  llvm/test/CodeGen/AMDGPU/spill-wide-sgpr.ll     | 176
4 files changed, 259 insertions, 53 deletions
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index 936bb8b656f..3c96cc2d7ca 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -513,6 +513,22 @@ void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI,
   }
 }
 
+static std::pair<unsigned, unsigned> getSpillEltSize(unsigned SuperRegSize,
+                                                     bool Store) {
+  if (SuperRegSize % 16 == 0) {
+    return { 16, Store ? AMDGPU::S_BUFFER_STORE_DWORDX4_SGPR :
+                         AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR };
+  }
+
+  if (SuperRegSize % 8 == 0) {
+    return { 8, Store ? AMDGPU::S_BUFFER_STORE_DWORDX2_SGPR :
+                        AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR };
+  }
+
+  return { 4, Store ? AMDGPU::S_BUFFER_STORE_DWORD_SGPR :
+                      AMDGPU::S_BUFFER_LOAD_DWORD_SGPR};
+}
+
 void SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI,
                                int Index,
                                RegScavenger *RS) const {
@@ -522,7 +538,6 @@ void SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI,
   const SISubtarget &ST = MF->getSubtarget<SISubtarget>();
   const SIInstrInfo *TII = ST.getInstrInfo();
 
-  unsigned NumSubRegs = getNumSubRegsForSpillOp(MI->getOpcode());
   unsigned SuperReg = MI->getOperand(0).getReg();
   bool IsKill = MI->getOperand(0).isKill();
   const DebugLoc &DL = MI->getDebugLoc();
@@ -534,7 +549,6 @@ void SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI,
 
   assert(SuperReg != AMDGPU::M0 && "m0 should never spill");
 
-  const unsigned EltSize = 4;
   unsigned OffsetReg = AMDGPU::M0;
   unsigned M0CopyReg = AMDGPU::NoRegister;
 
@@ -546,14 +560,40 @@ void SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI,
     }
   }
 
+  unsigned ScalarStoreOp;
+  unsigned EltSize = 4;
+  const TargetRegisterClass *RC = getPhysRegClass(SuperReg);
+  if (SpillToSMEM && isSGPRClass(RC)) {
+    // XXX - if private_element_size is larger than 4 it might be useful to be
+    // able to spill wider vmem spills.
+    std::tie(EltSize, ScalarStoreOp) = getSpillEltSize(RC->getSize(), true);
+  }
+
+  const TargetRegisterClass *SubRC = nullptr;
+  unsigned NumSubRegs = 1;
+  ArrayRef<int16_t> SplitParts = getRegSplitParts(RC, EltSize);
+
+  if (!SplitParts.empty()) {
+    NumSubRegs = SplitParts.size();
+    SubRC = getSubRegClass(RC, SplitParts[0]);
+  }
+
   // SubReg carries the "Kill" flag when SubReg == SuperReg.
   unsigned SubKillState = getKillRegState((NumSubRegs == 1) && IsKill);
   for (unsigned i = 0, e = NumSubRegs; i < e; ++i) {
     unsigned SubReg = NumSubRegs == 1 ?
-      SuperReg : getSubReg(SuperReg, getSubRegFromChannel(i));
+      SuperReg : getSubReg(SuperReg, SplitParts[i]);
 
     if (SpillToSMEM) {
       int64_t FrOffset = FrameInfo.getObjectOffset(Index);
+
+      // The allocated memory size is really the wavefront size * the frame
+      // index size. The widest register class is 64 bytes, so a 4-byte scratch
+      // allocation is enough to spill this in a single stack object.
+      //
+      // FIXME: Frame size/offsets are computed earlier than this, so the extra
+      // space is still unnecessarily allocated.
+
       unsigned Align = FrameInfo.getObjectAlignment(Index);
       MachinePointerInfo PtrInfo
         = MachinePointerInfo::getFixedStack(*MF, Index, EltSize * i);
@@ -561,12 +601,10 @@ void SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI,
         = MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore,
                                    EltSize, MinAlign(Align, EltSize * i));
 
-      // Add i * 4 wave offset.
-      //
       // SMEM instructions only support a single offset, so increment the wave
       // offset.
-      int64_t Offset = ST.getWavefrontSize() * (FrOffset + 4 * i);
+      int64_t Offset = (ST.getWavefrontSize() * FrOffset) + (EltSize * i);
       if (Offset != 0) {
         BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), OffsetReg)
           .addReg(MFI->getScratchWaveOffsetReg())
@@ -576,7 +614,7 @@ void SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI,
           .addReg(MFI->getScratchWaveOffsetReg());
       }
 
-      BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_BUFFER_STORE_DWORD_SGPR))
+      BuildMI(*MBB, MI, DL, TII->get(ScalarStoreOp))
         .addReg(SubReg, getKillRegState(IsKill)) // sdata
         .addReg(MFI->getScratchRSrcReg())        // sbase
         .addReg(OffsetReg, RegState::Kill)       // soff
@@ -656,7 +694,6 @@ void SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI,
   const SIInstrInfo *TII = ST.getInstrInfo();
   const DebugLoc &DL = MI->getDebugLoc();
 
-  unsigned NumSubRegs = getNumSubRegsForSpillOp(MI->getOpcode());
   unsigned SuperReg = MI->getOperand(0).getReg();
 
   bool SpillToSMEM = ST.hasScalarStores() && EnableSpillSGPRToSMEM;
@@ -673,16 +710,34 @@ void SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI,
     }
   }
 
+  unsigned EltSize = 4;
+  unsigned ScalarLoadOp;
+
+  const TargetRegisterClass *RC = getPhysRegClass(SuperReg);
+  if (SpillToSMEM && isSGPRClass(RC)) {
+    // XXX - if private_element_size is larger than 4 it might be useful to be
+    // able to spill wider vmem spills.
+    std::tie(EltSize, ScalarLoadOp) = getSpillEltSize(RC->getSize(), false);
+  }
+
+  const TargetRegisterClass *SubRC = nullptr;
+  unsigned NumSubRegs = 1;
+  ArrayRef<int16_t> SplitParts = getRegSplitParts(RC, EltSize);
+
+  if (!SplitParts.empty()) {
+    NumSubRegs = SplitParts.size();
+    SubRC = getSubRegClass(RC, SplitParts[0]);
+  }
+
   // SubReg carries the "Kill" flag when SubReg == SuperReg.
   int64_t FrOffset = FrameInfo.getObjectOffset(Index);
 
-  const unsigned EltSize = 4;
-
   for (unsigned i = 0, e = NumSubRegs; i < e; ++i) {
     unsigned SubReg = NumSubRegs == 1 ?
-      SuperReg : getSubReg(SuperReg, getSubRegFromChannel(i));
+      SuperReg : getSubReg(SuperReg, SplitParts[i]);
 
     if (SpillToSMEM) {
+      // FIXME: Size may be > 4 but extra bytes wasted.
       unsigned Align = FrameInfo.getObjectAlignment(Index);
       MachinePointerInfo PtrInfo
         = MachinePointerInfo::getFixedStack(*MF, Index, EltSize * i);
@@ -691,7 +746,7 @@ void SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI,
                                    EltSize, MinAlign(Align, EltSize * i));
 
       // Add i * 4 offset
-      int64_t Offset = ST.getWavefrontSize() * (FrOffset + 4 * i);
+      int64_t Offset = (ST.getWavefrontSize() * FrOffset) + (EltSize * i);
       if (Offset != 0) {
         BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), OffsetReg)
           .addReg(MFI->getScratchWaveOffsetReg())
@@ -702,14 +757,14 @@ void SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI,
       }
 
       auto MIB =
-        BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_BUFFER_LOAD_DWORD_SGPR), SubReg)
+        BuildMI(*MBB, MI, DL, TII->get(ScalarLoadOp), SubReg)
         .addReg(MFI->getScratchRSrcReg()) // sbase
         .addReg(OffsetReg, RegState::Kill) // soff
         .addImm(0) // glc
         .addMemOperand(MMO);
 
       if (NumSubRegs > 1)
-        MIB.addReg(MI->getOperand(0).getReg(), RegState::ImplicitDefine);
+        MIB.addReg(SuperReg, RegState::ImplicitDefine);
 
       continue;
     }
@@ -725,7 +780,7 @@ void SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI,
         .addImm(Spill.Lane);
 
       if (NumSubRegs > 1)
-        MIB.addReg(MI->getOperand(0).getReg(), RegState::ImplicitDefine);
+        MIB.addReg(SuperReg, RegState::ImplicitDefine);
     } else {
       // Restore SGPR from a stack slot.
       // FIXME: We should use S_LOAD_DWORD here for VI.
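Taken together, the hunks above implement the patch's core idea: getSpillEltSize returns the widest scalar buffer opcode whose width evenly divides the spilled register, and the spill loop then walks the register in EltSize chunks, offsetting each SMEM access by EltSize * i on top of the wave-scaled frame offset. Below is a minimal standalone sketch of that selection and offset arithmetic — plain C++ with opcode names reduced to strings and a wave size of 64 assumed, not the actual LLVM code:

#include <cstdint>
#include <cstdio>
#include <utility>

// Mirrors the patch's getSpillEltSize: the widest SMEM op whose size
// (in bytes) evenly divides the spilled register's size wins.
static std::pair<unsigned, const char *> getSpillEltSize(unsigned SuperRegSize,
                                                         bool Store) {
  if (SuperRegSize % 16 == 0)
    return {16, Store ? "s_buffer_store_dwordx4" : "s_buffer_load_dwordx4"};
  if (SuperRegSize % 8 == 0)
    return {8, Store ? "s_buffer_store_dwordx2" : "s_buffer_load_dwordx2"};
  return {4, Store ? "s_buffer_store_dword" : "s_buffer_load_dword"};
}

int main() {
  const int64_t WavefrontSize = 64; // assumed wave size on the tested targets
  const int64_t FrOffset = 0;       // frame offset of the spill slot

  // A 32-byte (8 x 32-bit) SGPR tuple, as in the spill_sgpr_x8 test below.
  const unsigned SuperRegSize = 32;
  const auto [EltSize, Op] = getSpillEltSize(SuperRegSize, /*Store=*/true);

  // One instruction per EltSize chunk; as in the patched loop, the SMEM
  // offset is (WavefrontSize * FrOffset) + (EltSize * i).
  for (unsigned i = 0; i != SuperRegSize / EltSize; ++i) {
    const int64_t Offset = WavefrontSize * FrOffset + EltSize * i;
    std::printf("%s at m0 = scratch_wave_offset + %lld\n", Op,
                (long long)Offset);
  }
  return 0;
}

For the 32-byte tuple this prints two dwordx4 stores at offsets 0 and 16, matching the "s_add_u32 m0, s3, 16" sequence checked in spill_sgpr_x8 in the new test file.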
diff --git a/llvm/test/CodeGen/AMDGPU/si-spill-sgpr-stack.ll b/llvm/test/CodeGen/AMDGPU/si-spill-sgpr-stack.ll
index ed397f5009c..5d03094bb4f 100644
--- a/llvm/test/CodeGen/AMDGPU/si-spill-sgpr-stack.ll
+++ b/llvm/test/CodeGen/AMDGPU/si-spill-sgpr-stack.ll
@@ -13,31 +13,15 @@
 ; SGPR-NEXT: s_nop 4
 ; SGPR-NEXT: buffer_store_dword v0, off, s[0:[[HI]]{{\]}}, 0
 
-
 ; Make sure scratch wave offset register is correctly incremented and
 ; then restored.
 ; SMEM: s_mov_b32 m0, s91{{$}}
-; SMEM: s_buffer_store_dword s{{[0-9]+}}, s[92:95], m0 ; 4-byte Folded Spill
-; SMEM: s_add_u32 m0, s91, 0x100{{$}}
-; SMEM: s_buffer_store_dword s{{[0-9]+}}, s[92:95], m0 ; 4-byte Folded Spill
-; SMEM: s_add_u32 m0, s91, 0x200{{$}}
-; SMEM: s_buffer_store_dword s{{[0-9]+}}, s[92:95], m0 ; 4-byte Folded Spill
-; SMEM: s_add_u32 m0, s91, 0x300{{$}}
-; SMEM: s_buffer_store_dword s{{[0-9]+}}, s[92:95], m0 ; 4-byte Folded Spill
-
+; SMEM: s_buffer_store_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s[92:95], m0 ; 16-byte Folded Spill
 
 ; SMEM: s_mov_b32 m0, s91{{$}}
-; SMEM: s_buffer_load_dword s{{[0-9]+}}, s[92:95], m0 ; 4-byte Folded Reload
-; SMEM: s_add_u32 m0, s91, 0x100{{$}}
-; SMEM: s_waitcnt lgkmcnt(0)
-; SMEM: s_buffer_load_dword s{{[0-9]+}}, s[92:95], m0 ; 4-byte Folded Reload
-; SMEM: s_add_u32 m0, s91, 0x200{{$}}
-; SMEM: s_waitcnt lgkmcnt(0)
-; SMEM: s_buffer_load_dword s{{[0-9]+}}, s[92:95], m0 ; 4-byte Folded Reload
-; SMEM: s_add_u32 m0, s91, 0x300{{$}}
-; SMEM: s_waitcnt lgkmcnt(0)
-; SMEM: s_buffer_load_dword s{{[0-9]+}}, s[92:95], m0 ; 4-byte Folded Reload
+; SMEM: s_buffer_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s[92:95], m0 ; 16-byte Folded Reload
 
+; SMEM: s_dcache_wb
 ; ALL: s_endpgm
 define void @test(i32 addrspace(1)* %out, i32 %in) {
   call void asm sideeffect "", "~{SGPR0_SGPR1_SGPR2_SGPR3_SGPR4_SGPR5_SGPR6_SGPR7}" ()
diff --git a/llvm/test/CodeGen/AMDGPU/spill-m0.ll b/llvm/test/CodeGen/AMDGPU/spill-m0.ll
index 83ab44d191c..24b0a81c18e 100644
--- a/llvm/test/CodeGen/AMDGPU/spill-m0.ll
+++ b/llvm/test/CodeGen/AMDGPU/spill-m0.ll
@@ -72,9 +72,7 @@ endif:
 
 ; TOSMEM-NOT: m0
 ; TOSMEM: s_add_u32 m0, s7, 0x100
-; TOSMEM: s_buffer_store_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, m0 ; 4-byte Folded Spill
-; TOSMEM: s_add_u32 m0, s7, 0x200
-; TOSMEM: s_buffer_store_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, m0 ; 4-byte Folded Spill
+; TOSMEM: s_buffer_store_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, m0 ; 8-byte Folded Spill
 ; TOSMEM-NOT: m0
 ; TOSMEM: s_mov_b64 exec,
 
@@ -83,7 +81,7 @@ endif:
 
 ; TOSMEM: BB{{[0-9]+_[0-9]+}}:
 ; TOSMEM-NEXT: s_add_u32 m0, s7, 0x100
-; TOSMEM-NEXT: s_buffer_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, m0 ; 4-byte Folded Reload
+; TOSMEM-NEXT: s_buffer_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, m0 ; 8-byte Folded Reload
 
 ; GCN-NOT: v_readlane_b32 m0
 
@@ -124,9 +122,7 @@ endif: ; preds = %else, %if
 
 ; TOSMEM: s_mov_b32 vcc_hi, m0
 ; TOSMEM: s_mov_b32 m0, s3
-; TOSMEM-NEXT: s_buffer_store_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, m0 ; 4-byte Folded Spill
-; TOSMEM: s_add_u32 m0, s3, 0x100
-; TOSMEM: s_buffer_store_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, m0 ; 4-byte Folded Spill
+; TOSMEM-NEXT: s_buffer_store_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, m0 ; 8-byte Folded Spill
 ; TOSMEM: s_mov_b32 m0, vcc_hi
 ; TOSMEM: s_mov_b64 exec,
 
@@ -135,13 +131,7 @@ endif: ; preds = %else, %if
 
 ; TOSMEM: BB{{[0-9]+_[0-9]+}}:
 ; TOSMEM-NEXT: s_mov_b32 m0, s3
-; TOSMEM-NEXT: s_buffer_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, m0 ; 4-byte Folded Reload
-; TOSMEM-NEXT: s_add_u32 m0, s3, 0x100
-
-; FIXME: Could delay this wait
-; TOSMEM-NEXT: s_waitcnt lgkmcnt(0)
-; TOSMEM-NEXT: s_buffer_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, m0 ; 4-byte Folded Reload
-
+; TOSMEM-NEXT: s_buffer_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, m0 ; 8-byte Folded Reload
 
 ; GCN-NOT: v_readlane_b32 m0
 ; GCN-NOT: s_buffer_store_dword m0
@@ -167,10 +157,14 @@ endif:
 }
 
 ; GCN-LABEL: {{^}}restore_m0_lds:
+; TOSMEM: s_load_dwordx2 [[REG:s\[[0-9]+:[0-9]+\]]]
 ; TOSMEM: s_cmp_eq_u32
 ; TOSMEM-NOT: m0
 ; TOSMEM: s_mov_b32 m0, s3
-; TOSMEM: s_buffer_store_dword s4, s[84:87], m0 ; 4-byte Folded Spill
+; TOSMEM: s_buffer_store_dwordx2 [[REG]], s[84:87], m0 ; 8-byte Folded Spill
+; TOSMEM-NOT: m0
+; TOSMEM: s_add_u32 m0, s3, 0x200
+; TOSMEM: s_buffer_store_dword s{{[0-9]+}}, s[84:87], m0 ; 4-byte Folded Spill
 ; TOSMEM-NOT: m0
 ; TOSMEM: s_cbranch_scc1
 
@@ -178,10 +172,7 @@ endif:
 
 ; TOSMEM: s_mov_b32 vcc_hi, m0
 ; TOSMEM: s_mov_b32 m0, s3
-; TOSMEM: s_buffer_load_dword s4, s[84:87], m0 ; 4-byte Folded Reload
-; TOSMEM: s_add_u32 m0, s3, 0x100
-; TOSMEM: s_waitcnt lgkmcnt(0)
-; TOSMEM: s_buffer_load_dword s5, s[84:87], m0 ; 4-byte Folded Reload
+; TOSMEM: s_buffer_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[84:87], m0 ; 8-byte Folded Reload
 ; TOSMEM: s_mov_b32 m0, vcc_hi
 ; TOSMEM: s_waitcnt lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/spill-wide-sgpr.ll b/llvm/test/CodeGen/AMDGPU/spill-wide-sgpr.ll
new file mode 100644
index 00000000000..cab45be8da5
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/spill-wide-sgpr.ll
@@ -0,0 +1,176 @@
+; RUN: llc -O0 -march=amdgcn -mcpu=fiji -amdgpu-spill-sgpr-to-smem=0 -verify-machineinstrs < %s | FileCheck -check-prefix=ALL -check-prefix=VGPR %s
+; RUN: llc -O0 -march=amdgcn -mcpu=fiji -amdgpu-spill-sgpr-to-smem=1 -verify-machineinstrs < %s | FileCheck -check-prefix=ALL -check-prefix=SMEM %s
+; RUN: llc -O0 -march=amdgcn -mcpu=fiji -amdgpu-spill-sgpr-to-smem=0 -amdgpu-spill-sgpr-to-vgpr=0 -verify-machineinstrs < %s | FileCheck -check-prefix=ALL -check-prefix=VMEM %s
+
+; ALL-LABEL: {{^}}spill_sgpr_x2:
+; SMEM: s_mov_b32 m0, s3{{$}}
+; SMEM: s_buffer_store_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[8:11], m0 ; 8-byte Folded Spill
+; SMEM: s_cbranch_scc1
+
+; SMEM: s_mov_b32 m0, s3{{$}}
+; SMEM: s_buffer_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[8:11], m0 ; 8-byte Folded Reload
+
+; SMEM: s_dcache_wb
+; SMEM: s_endpgm
+
+; FIXME: Should only need 4 bytes
+; SMEM: ScratchSize: 12
+
+
+; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 0
+; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 1
+; VGPR: s_cbranch_scc1
+
+; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 0
+; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 1
+
+; VMEM: buffer_store_dword
+; VMEM: buffer_store_dword
+; VMEM: s_cbranch_scc1
+
+; VMEM: buffer_load_dword
+; VMEM: buffer_load_dword
+define amdgpu_kernel void @spill_sgpr_x2(i32 addrspace(1)* %out, i32 %in) #0 {
+  %wide.sgpr = call <2 x i32> asm sideeffect "; def $0", "=s" () #0
+  %cmp = icmp eq i32 %in, 0
+  br i1 %cmp, label %bb0, label %ret
+
+bb0:
+  call void asm sideeffect "; use $0", "s"(<2 x i32> %wide.sgpr) #0
+  br label %ret
+
+ret:
+  ret void
+}
+
+; ALL-LABEL: {{^}}spill_sgpr_x4:
+; SMEM: s_mov_b32 m0, s3{{$}}
+; SMEM: s_buffer_store_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s[12:15], m0 ; 16-byte Folded Spill
+; SMEM: s_cbranch_scc1
+
+; SMEM: s_mov_b32 m0, s3{{$}}
+; SMEM: s_buffer_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s[12:15], m0 ; 16-byte Folded Reload
+; SMEM: s_dcache_wb
+; SMEM: s_endpgm
+
+; FIXME: Should only need 4 bytes
+; SMEM: ScratchSize: 20
+
+; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 0
+; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 1
+; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 2
+; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 3
+; VGPR: s_cbranch_scc1
+
+; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 0
+; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 1
+; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 2
+; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 3
+
+
+; VMEM: buffer_store_dword
+; VMEM: buffer_store_dword
+; VMEM: buffer_store_dword
+; VMEM: buffer_store_dword
+; VMEM: s_cbranch_scc1
+
+; VMEM: buffer_load_dword
+; VMEM: buffer_load_dword
+; VMEM: buffer_load_dword
+; VMEM: buffer_load_dword
+define amdgpu_kernel void @spill_sgpr_x4(i32 addrspace(1)* %out, i32 %in) #0 {
+  %wide.sgpr = call <4 x i32> asm sideeffect "; def $0", "=s" () #0
+  %cmp = icmp eq i32 %in, 0
+  br i1 %cmp, label %bb0, label %ret
+
+bb0:
+  call void asm sideeffect "; use $0", "s"(<4 x i32> %wide.sgpr) #0
+  br label %ret
+
+ret:
+  ret void
+}
+
+; ALL-LABEL: {{^}}spill_sgpr_x8:
+
+; SMEM: s_mov_b32 m0, s3{{$}}
+; SMEM: s_buffer_store_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s[16:19], m0 ; 16-byte Folded Spill
+; SMEM: s_add_u32 m0, s3, 16
+; SMEM: s_buffer_store_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s[16:19], m0 ; 16-byte Folded Spill
+; SMEM: s_cbranch_scc1
+
+; SMEM: s_mov_b32 m0, s3{{$}}
+; SMEM: s_buffer_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s[16:19], m0 ; 16-byte Folded Reload
+; SMEM: s_add_u32 m0, s3, 16
+; SMEM: s_buffer_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s[16:19], m0 ; 16-byte Folded Reload
+
+; SMEM: s_dcache_wb
+; SMEM: s_endpgm
+
+; SMEM: ScratchSize: 36
+
+; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 0
+; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 1
+; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 2
+; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 3
+; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 4
+; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 5
+; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 6
+; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 7
+; VGPR: s_cbranch_scc1
+
+; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 0
+; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 1
+; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 2
+; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 3
+; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 4
+; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 5
+; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 6
+; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 7
+
+; VMEM: buffer_store_dword
+; VMEM: buffer_store_dword
+; VMEM: buffer_store_dword
+; VMEM: buffer_store_dword
+; VMEM: buffer_store_dword
+; VMEM: buffer_store_dword
+; VMEM: buffer_store_dword
+; VMEM: buffer_store_dword
+; VMEM: s_cbranch_scc1
+
+; VMEM: buffer_load_dword
+; VMEM: buffer_load_dword
+; VMEM: buffer_load_dword
+; VMEM: buffer_load_dword
+; VMEM: buffer_load_dword
+; VMEM: buffer_load_dword
+; VMEM: buffer_load_dword
+; VMEM: buffer_load_dword
+define amdgpu_kernel void @spill_sgpr_x8(i32 addrspace(1)* %out, i32 %in) #0 {
+  %wide.sgpr = call <8 x i32> asm sideeffect "; def $0", "=s" () #0
+  %cmp = icmp eq i32 %in, 0
+  br i1 %cmp, label %bb0, label %ret
+
+bb0:
+  call void asm sideeffect "; use $0", "s"(<8 x i32> %wide.sgpr) #0
+  br label %ret
+
+ret:
+  ret void
+}
+
+; FIXME: x16 inlineasm seems broken
+; define amdgpu_kernel void @spill_sgpr_x16(i32 addrspace(1)* %out, i32 %in) #0 {
+;   %wide.sgpr = call <16 x i32> asm sideeffect "; def $0", "=s" () #0
+;   %cmp = icmp eq i32 %in, 0
+;   br i1 %cmp, label %bb0, label %ret
+
+; bb0:
+;   call void asm sideeffect "; use $0", "s"(<16 x i32> %wide.sgpr) #0
+;   br label %ret
+
+; ret:
+;   ret void
+; }
+
+attributes #0 = { nounwind }
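A note on the "Should only need 4 bytes" FIXMEs above: as the comment added in spillSGPR explains, a scratch frame object is backed by wavefront-size copies of itself, so a single 4-byte per-lane object already provides enough backing for even the widest SGPR tuple. A back-of-the-envelope check of that claim, as a plain C++ sketch with a wave size of 64 assumed:

#include <cstdio>

int main() {
  const unsigned WavefrontSize = 64; // lanes per wave on the tested targets
  const unsigned ObjectSize = 4;     // bytes allocated per lane

  // One 4-byte per-lane object is backed by 64 * 4 = 256 bytes of scratch,
  // enough for even a 64-byte (16 x 32-bit) SGPR tuple. The ScratchSize
  // values of 12/20/36 in the checks above show the frame lowering still
  // sizing the slot proportionally to the spilled register, which the
  // FIXME in spillSGPR calls out as unnecessary.
  std::printf("backing bytes per 4-byte object: %u\n",
              WavefrontSize * ObjectSize);
  return 0;
}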