diff options
Diffstat (limited to 'llvm')
-rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPU.td | 5 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp | 26 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp | 8 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h | 5 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/CIInstructions.td | 60 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/VIInstructions.td | 55 | ||||
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/ci-use-flat-for-global.ll | 15 | ||||
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/hsa.ll | 11 | ||||
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/large-alloca-compute.ll | 4 | ||||
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/llvm.dbg.value.ll | 2 | ||||
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/move-addr64-rsrc-dead-subreg-writes.ll | 2 | ||||
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/register-count-comments.ll | 2 | ||||
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/work-item-intrinsics.ll | 18 |
13 files changed, 131 insertions, 82 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td index 68b50504ee4..5913bf7f773 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -108,6 +108,11 @@ def FeatureEnableUnsafeDSOffsetFolding : SubtargetFeature <"unsafe-ds-offset-fol "true", "Force using DS instruction immediate offsets on SI">; +def FeatureFlatForGlobal : SubtargetFeature<"flat-for-global", + "FlatForGlobal", + "true", + "Force to generate flat instruction for global">; + def FeatureFlatAddressSpace : SubtargetFeature<"flat-address-space", "FlatAddressSpace", "true", diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index ea7c6429b7d..b33040b4d06 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -95,7 +95,7 @@ private: bool SelectDS1Addr1Offset(SDValue Ptr, SDValue &Base, SDValue &Offset) const; bool SelectDS64Bit4ByteAligned(SDValue Ptr, SDValue &Base, SDValue &Offset0, SDValue &Offset1) const; - void SelectMUBUF(SDValue Addr, SDValue &SRsrc, SDValue &VAddr, + bool SelectMUBUF(SDValue Addr, SDValue &SRsrc, SDValue &VAddr, SDValue &SOffset, SDValue &Offset, SDValue &Offen, SDValue &Idxen, SDValue &Addr64, SDValue &GLC, SDValue &SLC, SDValue &TFE) const; @@ -920,12 +920,16 @@ static bool isLegalMUBUFImmOffset(const ConstantSDNode *Imm) { return isUInt<12>(Imm->getZExtValue()); } -void AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr, +bool AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr, SDValue &VAddr, SDValue &SOffset, SDValue &Offset, SDValue &Offen, SDValue &Idxen, SDValue &Addr64, SDValue &GLC, SDValue &SLC, SDValue &TFE) const { + // Subtarget prefers to use flat instruction + if (Subtarget->useFlatForGlobal()) + return false; + SDLoc DL(Addr); GLC = CurDAG->getTargetConstant(0, DL, MVT::i1); @@ -958,14 +962,14 @@ void AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr, if (isLegalMUBUFImmOffset(C1)) { Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16); - return; + return true; } else if (isUInt<32>(C1->getZExtValue())) { // Illegal offset, store it in soffset. Offset = CurDAG->getTargetConstant(0, DL, MVT::i16); SOffset = SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32)), 0); - return; + return true; } } @@ -977,13 +981,15 @@ void AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr, Ptr = N0; VAddr = N1; Offset = CurDAG->getTargetConstant(0, DL, MVT::i16); - return; + return true; } // default case -> offset VAddr = CurDAG->getTargetConstant(0, DL, MVT::i32); Ptr = Addr; Offset = CurDAG->getTargetConstant(0, DL, MVT::i16); + + return true; } bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc, @@ -996,8 +1002,9 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc, if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) return false; - SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64, - GLC, SLC, TFE); + if (!SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64, + GLC, SLC, TFE)) + return false; ConstantSDNode *C = cast<ConstantSDNode>(Addr64); if (C->getSExtValue()) { @@ -1063,8 +1070,9 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, const SIInstrInfo *TII = static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo()); - SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64, - GLC, SLC, TFE); + if (!SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64, + GLC, SLC, TFE)) + return false; if (!cast<ConstantSDNode>(Offen)->getSExtValue() && !cast<ConstantSDNode>(Idxen)->getSExtValue() && diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp index 0aee09bab59..44e0c47877a 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -45,6 +45,8 @@ AMDGPUSubtarget::initializeSubtargetDependencies(const Triple &TT, // disable it. SmallString<256> FullFS("+promote-alloca,+fp64-denormals,"); + if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA. + FullFS += "+flat-for-global,"; FullFS += FS; if (GPU == "" && TT.getArch() == Triple::amdgcn) @@ -68,9 +70,9 @@ AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS, DumpCode(false), R600ALUInst(false), HasVertexCache(false), TexVTXClauseSize(0), Gen(AMDGPUSubtarget::R600), FP64(false), FP64Denormals(false), FP32Denormals(false), FastFMAF32(false), - CaymanISA(false), FlatAddressSpace(false), EnableIRStructurizer(true), - EnablePromoteAlloca(false), EnableIfCvt(true), EnableLoadStoreOpt(false), - EnableUnsafeDSOffsetFolding(false), + CaymanISA(false), FlatAddressSpace(false), FlatForGlobal(false), + EnableIRStructurizer(true), EnablePromoteAlloca(false), EnableIfCvt(true), + EnableLoadStoreOpt(false), EnableUnsafeDSOffsetFolding(false), WavefrontSize(0), CFALUBug(false), LocalMemorySize(0), EnableVGPRSpilling(false), SGPRInitBug(false), IsGCN(false), GCN1Encoding(false), GCN3Encoding(false), CIInsts(false), LDSBankCount(0), diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h index 971b5179b13..9c7bb88f8f4 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h @@ -70,6 +70,7 @@ private: bool FastFMAF32; bool CaymanISA; bool FlatAddressSpace; + bool FlatForGlobal; bool EnableIRStructurizer; bool EnablePromoteAlloca; bool EnableIfCvt; @@ -159,6 +160,10 @@ public: return FlatAddressSpace; } + bool useFlatForGlobal() const { + return FlatForGlobal; + } + bool hasBFE() const { return (getGeneration() >= EVERGREEN); } diff --git a/llvm/lib/Target/AMDGPU/CIInstructions.td b/llvm/lib/Target/AMDGPU/CIInstructions.td index 7b8cb125dad..afbd155fee5 100644 --- a/llvm/lib/Target/AMDGPU/CIInstructions.td +++ b/llvm/lib/Target/AMDGPU/CIInstructions.td @@ -234,3 +234,63 @@ def : Pat < >; } // End Predicates = [isCI] + + +//===----------------------------------------------------------------------===// +// Patterns to generate flat for global +//===----------------------------------------------------------------------===// + +def useFlatForGlobal : Predicate < + "Subtarget->useFlatForGlobal() || " + "Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS">; + +let Predicates = [useFlatForGlobal] in { + +// 1. Offset as 20bit DWORD immediate +def : Pat < + (SIload_constant v4i32:$sbase, IMM20bit:$offset), + (S_BUFFER_LOAD_DWORD_IMM $sbase, (as_i32imm $offset)) +>; + +// Patterns for global loads with no offset +class FlatLoadPat <FLAT inst, SDPatternOperator node, ValueType vt> : Pat < + (vt (node i64:$addr)), + (inst $addr, 0, 0, 0) +>; + +def : FlatLoadPat <FLAT_LOAD_UBYTE, az_extloadi8_global, i32>; +def : FlatLoadPat <FLAT_LOAD_SBYTE, sextloadi8_global, i32>; +def : FlatLoadPat <FLAT_LOAD_USHORT, az_extloadi16_global, i32>; +def : FlatLoadPat <FLAT_LOAD_SSHORT, sextloadi16_global, i32>; +def : FlatLoadPat <FLAT_LOAD_DWORD, global_load, i32>; +def : FlatLoadPat <FLAT_LOAD_DWORDX2, global_load, v2i32>; +def : FlatLoadPat <FLAT_LOAD_DWORDX4, global_load, v4i32>; + +class FlatStorePat <FLAT inst, SDPatternOperator node, ValueType vt> : Pat < + (node vt:$data, i64:$addr), + (inst $data, $addr, 0, 0, 0) +>; + +def : FlatStorePat <FLAT_STORE_BYTE, truncstorei8_global, i32>; +def : FlatStorePat <FLAT_STORE_SHORT, truncstorei16_global, i32>; +def : FlatStorePat <FLAT_STORE_DWORD, global_store, i32>; +def : FlatStorePat <FLAT_STORE_DWORDX2, global_store, v2i32>; +def : FlatStorePat <FLAT_STORE_DWORDX4, global_store, v4i32>; + +class FlatAtomicPat <FLAT inst, SDPatternOperator node, ValueType vt> : Pat < + (vt (node i64:$addr, vt:$data)), + (inst $addr, $data, 0, 0) +>; + +def : FlatAtomicPat <FLAT_ATOMIC_ADD_RTN, atomic_add_global, i32>; +def : FlatAtomicPat <FLAT_ATOMIC_AND_RTN, atomic_and_global, i32>; +def : FlatAtomicPat <FLAT_ATOMIC_SUB_RTN, atomic_sub_global, i32>; +def : FlatAtomicPat <FLAT_ATOMIC_SMAX_RTN, atomic_max_global, i32>; +def : FlatAtomicPat <FLAT_ATOMIC_UMAX_RTN, atomic_umax_global, i32>; +def : FlatAtomicPat <FLAT_ATOMIC_SMIN_RTN, atomic_min_global, i32>; +def : FlatAtomicPat <FLAT_ATOMIC_UMIN_RTN, atomic_umin_global, i32>; +def : FlatAtomicPat <FLAT_ATOMIC_OR_RTN, atomic_or_global, i32>; +def : FlatAtomicPat <FLAT_ATOMIC_SWAP_RTN, atomic_swap_global, i32>; +def : FlatAtomicPat <FLAT_ATOMIC_XOR_RTN, atomic_xor_global, i32>; + +} // End Predicates = [useFlatForGlobal] diff --git a/llvm/lib/Target/AMDGPU/VIInstructions.td b/llvm/lib/Target/AMDGPU/VIInstructions.td index 9d29e4700f1..20a026a822e 100644 --- a/llvm/lib/Target/AMDGPU/VIInstructions.td +++ b/llvm/lib/Target/AMDGPU/VIInstructions.td @@ -101,58 +101,3 @@ def S_DCACHE_WB_VOL : SMEM_Inval <0x23, } // End SIAssemblerPredicate = DisableInst, SubtargetPredicate = isVI -//===----------------------------------------------------------------------===// -// SMEM Patterns -//===----------------------------------------------------------------------===// - -let Predicates = [isVI] in { - -// 1. Offset as 20bit DWORD immediate -def : Pat < - (SIload_constant v4i32:$sbase, IMM20bit:$offset), - (S_BUFFER_LOAD_DWORD_IMM $sbase, (as_i32imm $offset)) ->; - -// Patterns for global loads with no offset -class FlatLoadPat <FLAT inst, SDPatternOperator node, ValueType vt> : Pat < - (vt (node i64:$addr)), - (inst $addr, 0, 0, 0) ->; - -def : FlatLoadPat <FLAT_LOAD_UBYTE, az_extloadi8_global, i32>; -def : FlatLoadPat <FLAT_LOAD_SBYTE, sextloadi8_global, i32>; -def : FlatLoadPat <FLAT_LOAD_USHORT, az_extloadi16_global, i32>; -def : FlatLoadPat <FLAT_LOAD_SSHORT, sextloadi16_global, i32>; -def : FlatLoadPat <FLAT_LOAD_DWORD, global_load, i32>; -def : FlatLoadPat <FLAT_LOAD_DWORDX2, global_load, v2i32>; -def : FlatLoadPat <FLAT_LOAD_DWORDX4, global_load, v4i32>; - -class FlatStorePat <FLAT inst, SDPatternOperator node, ValueType vt> : Pat < - (node vt:$data, i64:$addr), - (inst $data, $addr, 0, 0, 0) ->; - -def : FlatStorePat <FLAT_STORE_BYTE, truncstorei8_global, i32>; -def : FlatStorePat <FLAT_STORE_SHORT, truncstorei16_global, i32>; -def : FlatStorePat <FLAT_STORE_DWORD, global_store, i32>; -def : FlatStorePat <FLAT_STORE_DWORDX2, global_store, v2i32>; -def : FlatStorePat <FLAT_STORE_DWORDX4, global_store, v4i32>; - -class FlatAtomicPat <FLAT inst, SDPatternOperator node, ValueType vt> : Pat < - (vt (node i64:$addr, vt:$data)), - (inst $addr, $data, 0, 0) ->; - -def : FlatAtomicPat <FLAT_ATOMIC_ADD_RTN, atomic_add_global, i32>; -def : FlatAtomicPat <FLAT_ATOMIC_AND_RTN, atomic_and_global, i32>; -def : FlatAtomicPat <FLAT_ATOMIC_SUB_RTN, atomic_sub_global, i32>; -def : FlatAtomicPat <FLAT_ATOMIC_SMAX_RTN, atomic_max_global, i32>; -def : FlatAtomicPat <FLAT_ATOMIC_UMAX_RTN, atomic_umax_global, i32>; -def : FlatAtomicPat <FLAT_ATOMIC_SMIN_RTN, atomic_min_global, i32>; -def : FlatAtomicPat <FLAT_ATOMIC_UMIN_RTN, atomic_umin_global, i32>; -def : FlatAtomicPat <FLAT_ATOMIC_OR_RTN, atomic_or_global, i32>; -def : FlatAtomicPat <FLAT_ATOMIC_SWAP_RTN, atomic_swap_global, i32>; -def : FlatAtomicPat <FLAT_ATOMIC_XOR_RTN, atomic_xor_global, i32>; - - -} // End Predicates = [isVI] diff --git a/llvm/test/CodeGen/AMDGPU/ci-use-flat-for-global.ll b/llvm/test/CodeGen/AMDGPU/ci-use-flat-for-global.ll new file mode 100644 index 00000000000..1a37e3c75fa --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/ci-use-flat-for-global.ll @@ -0,0 +1,15 @@ +; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri | FileCheck -check-prefix=HSA-DEFAULT %s +; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-flat-for-global | FileCheck -check-prefix=HSA-NODEFAULT %s +; RUN: llc < %s -mtriple=amdgcn -mcpu=kaveri | FileCheck -check-prefix=NOHSA-DEFAULT %s +; RUN: llc < %s -mtriple=amdgcn -mcpu=kaveri -mattr=+flat-for-global | FileCheck -check-prefix=NOHSA-NODEFAULT %s + + +; HSA-DEFAULT: flat_store_dword +; HSA-NODEFAULT: buffer_store_dword +; NOHSA-DEFAULT: buffer_store_dword +; NOHSA-NODEFAULT: flat_store_dword +define void @test(i32 addrspace(1)* %out) { +entry: + store i32 0, i32 addrspace(1)* %out + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/hsa.ll b/llvm/test/CodeGen/AMDGPU/hsa.ll index d9bb586163d..abc89b7fd83 100644 --- a/llvm/test/CodeGen/AMDGPU/hsa.ll +++ b/llvm/test/CodeGen/AMDGPU/hsa.ll @@ -1,6 +1,8 @@ -; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri | FileCheck --check-prefix=HSA-CI --check-prefix=HSA %s -; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=carrizo | FileCheck --check-prefix=HSA-VI --check-prefix=HSA %s -; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri -filetype=obj | llvm-readobj -symbols -s -sd | FileCheck --check-prefix=ELF %s +; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri | FileCheck --check-prefix=HSA %s +; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-flat-for-global | FileCheck --check-prefix=HSA-CI %s +; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=carrizo | FileCheck --check-prefix=HSA %s +; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=carrizo -mattr=-flat-for-global | FileCheck --check-prefix=HSA-VI %s +; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri -filetype=obj | llvm-readobj -symbols -s -sd | FileCheck --check-prefix=ELF %s ; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri | llvm-mc -filetype=obj -triple amdgcn--amdhsa -mcpu=kaveri | llvm-readobj -symbols -s -sd | FileCheck %s --check-prefix=ELF ; The SHT_NOTE section contains the output from the .hsa_code_object_* @@ -47,7 +49,8 @@ ; HSA-CI: s_mov_b32 s[[HI:[0-9]]], 0x100f000 ; On VI+ we also need to set MTYPE = 2 ; HSA-VI: s_mov_b32 s[[HI:[0-9]]], 0x1100f000 -; HSA: buffer_store_dword v{{[0-9]+}}, s[0:[[HI]]], 0 +; Make sure we generate flat store for HSA +; HSA: flat_store_dword v{{[0-9]+}} define void @simple(i32 addrspace(1)* %out) { entry: diff --git a/llvm/test/CodeGen/AMDGPU/large-alloca-compute.ll b/llvm/test/CodeGen/AMDGPU/large-alloca-compute.ll index de802c49ed4..8347b8c96ec 100644 --- a/llvm/test/CodeGen/AMDGPU/large-alloca-compute.ll +++ b/llvm/test/CodeGen/AMDGPU/large-alloca-compute.ll @@ -1,7 +1,7 @@ ; RUN: llc -march=amdgcn -mcpu=bonaire < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=ALL %s ; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=ALL %s -; RUN: llc -march=amdgcn -mcpu=bonaire -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck -check-prefix=GCNHSA -check-prefix=CIHSA -check-prefix=ALL %s -; RUN: llc -march=amdgcn -mcpu=tonga -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck -check-prefix=GCNHSA -check-prefix=VIHSA -check-prefix=ALL %s +; RUN: llc -march=amdgcn -mcpu=bonaire -mtriple=amdgcn-unknown-amdhsa < %s -mattr=-flat-for-global | FileCheck -check-prefix=GCNHSA -check-prefix=CIHSA -check-prefix=ALL %s +; RUN: llc -march=amdgcn -mcpu=tonga -mtriple=amdgcn-unknown-amdhsa -mattr=-flat-for-global < %s | FileCheck -check-prefix=GCNHSA -check-prefix=VIHSA -check-prefix=ALL %s ; FIXME: align on alloca seems to be ignored for private_segment_alignment diff --git a/llvm/test/CodeGen/AMDGPU/llvm.dbg.value.ll b/llvm/test/CodeGen/AMDGPU/llvm.dbg.value.ll index cc109327d92..014b08502b6 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.dbg.value.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.dbg.value.ll @@ -3,7 +3,7 @@ ; CHECK-LABEL: {{^}}test_debug_value: ; CHECK: s_load_dwordx2 s[4:5] ; CHECK: DEBUG_VALUE: test_debug_value:globalptr_arg <- %SGPR4_SGPR5 -; CHECK: buffer_store_dword +; CHECK: flat_store_dword ; CHECK: s_endpgm define void @test_debug_value(i32 addrspace(1)* nocapture %globalptr_arg) #0 !dbg !4 { entry: diff --git a/llvm/test/CodeGen/AMDGPU/move-addr64-rsrc-dead-subreg-writes.ll b/llvm/test/CodeGen/AMDGPU/move-addr64-rsrc-dead-subreg-writes.ll index f4fa6211210..e9f641b736d 100644 --- a/llvm/test/CodeGen/AMDGPU/move-addr64-rsrc-dead-subreg-writes.ll +++ b/llvm/test/CodeGen/AMDGPU/move-addr64-rsrc-dead-subreg-writes.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=amdgcn -mcpu=kaveri -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -march=amdgcn -mcpu=kaveri -mtriple=amdgcn-unknown-amdhsa -mattr=-flat-for-global < %s | FileCheck -check-prefix=GCN %s ; Check that when mubuf addr64 instruction is handled in moveToVALU ; from the pointer, dead register writes are not emitted. diff --git a/llvm/test/CodeGen/AMDGPU/register-count-comments.ll b/llvm/test/CodeGen/AMDGPU/register-count-comments.ll index 8c104d9e34b..4bb315049be 100644 --- a/llvm/test/CodeGen/AMDGPU/register-count-comments.ll +++ b/llvm/test/CodeGen/AMDGPU/register-count-comments.ll @@ -1,5 +1,5 @@ ; RUN: llc -march=amdgcn -verify-machineinstrs -asm-verbose < %s | FileCheck -check-prefix=SI %s -; RUN: llc -march=amdgcn -mtriple=amdgcn-unknown-amdhsa -verify-machineinstrs -asm-verbose < %s | FileCheck -check-prefix=SI %s +; RUN: llc -march=amdgcn -mtriple=amdgcn-unknown-amdhsa -verify-machineinstrs -asm-verbose -mattr=-flat-for-global < %s | FileCheck -check-prefix=SI %s declare i32 @llvm.SI.tid() nounwind readnone diff --git a/llvm/test/CodeGen/AMDGPU/work-item-intrinsics.ll b/llvm/test/CodeGen/AMDGPU/work-item-intrinsics.ll index f420ec9c7d2..e7fcd1ff365 100644 --- a/llvm/test/CodeGen/AMDGPU/work-item-intrinsics.ll +++ b/llvm/test/CodeGen/AMDGPU/work-item-intrinsics.ll @@ -129,7 +129,8 @@ entry: ; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], s2{{$}} ; HSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], s6{{$}} -; GCN: buffer_store_dword [[VVAL]] +; GCN-NOHSA: buffer_store_dword [[VVAL]] +; HSA: flat_store_dword [[VVAL]] ; HSA: COMPUTE_PGM_RSRC2:USER_SGPR: 6 ; GCN-NOHSA: COMPUTE_PGM_RSRC2:USER_SGPR: 2 @@ -155,7 +156,8 @@ entry: ; HSA: enable_sgpr_grid_workgroup_count_z = 0 ; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], s3 ; GCN-HSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], s7 -; GCN: buffer_store_dword [[VVAL]] +; GCN-NOHSA: buffer_store_dword [[VVAL]] +; HSA: flat_store_dword [[VVAL]] ; HSA: COMPUTE_PGM_RSRC2:USER_SGPR: 6 ; GCN-NOHSA: COMPUTE_PGM_RSRC2:USER_SGPR: 2 @@ -190,7 +192,8 @@ entry: ; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], s3{{$}} ; HSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], s7{{$}} -; GCN: buffer_store_dword [[VVAL]] +; GCN-NOHSA: buffer_store_dword [[VVAL]] +; HSA: flat_store_dword [[VVAL]] ; HSA: COMPUTE_PGM_RSRC2:USER_SGPR: 6 ; GCN-NOHSA: COMPUTE_PGM_RSRC2:USER_SGPR: 2 @@ -211,7 +214,8 @@ entry: ; FUNC-LABEL: {{^}}tidig_x: ; HSA: compute_pgm_rsrc2_tidig_comp_cnt = 0 -; GCN: buffer_store_dword v0 +; GCN-NOHSA: buffer_store_dword v0 +; HSA: flat_store_dword v0 define void @tidig_x(i32 addrspace(1)* %out) { entry: %0 = call i32 @llvm.r600.read.tidig.x() #0 @@ -226,7 +230,8 @@ entry: ; FUNC-LABEL: {{^}}tidig_y: ; HSA: compute_pgm_rsrc2_tidig_comp_cnt = 1 -; GCN: buffer_store_dword v1 +; GCN-NOHSA: buffer_store_dword v1 +; HSA: flat_store_dword v1 define void @tidig_y(i32 addrspace(1)* %out) { entry: %0 = call i32 @llvm.r600.read.tidig.y() #0 @@ -240,7 +245,8 @@ entry: ; FUNC-LABEL: {{^}}tidig_z: ; HSA: compute_pgm_rsrc2_tidig_comp_cnt = 2 -; GCN: buffer_store_dword v2 +; GCN-NOHSA: buffer_store_dword v2 +; HSA: flat_store_dword v2 define void @tidig_z(i32 addrspace(1)* %out) { entry: %0 = call i32 @llvm.r600.read.tidig.z() #0 |