Diffstat (limited to 'llvm/lib')
-rw-r--r--   llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td   141
-rw-r--r--   llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp    275
-rw-r--r--   llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp     18
-rw-r--r--   llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h       18
4 files changed, 444 insertions, 8 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
index 26b8b784027..8d70536ec21 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
@@ -30,6 +30,147 @@ foreach intr = !listconcat(AMDGPUBufferIntrinsics,
   def : RsrcIntrinsic<!cast<AMDGPURsrcIntrinsic>(intr)>;
 }
 
+class GcnBufferFormatBase<bits<8> f, bits<8> bpc, bits<8> numc, bits<8> nfmt, bits<8> dfmt> {
+  bits<8> Format = f;
+  bits<8> BitsPerComp = bpc;
+  bits<8> NumComponents = numc;
+  bits<8> NumFormat = nfmt;
+  bits<8> DataFormat = dfmt;
+}
+
+class Gfx9BufferFormat<bits<8> f, bits<8> bpc, bits<8> numc, bits<8> nfmt, bits<8> dfmt> : GcnBufferFormatBase<f, bpc, numc, nfmt, dfmt>;
+class Gfx10PlusBufferFormat<bits<8> f, bits<8> bpc, bits<8> numc, bits<8> nfmt, bits<8> dfmt> : GcnBufferFormatBase<f, bpc, numc, nfmt, dfmt>;
+
+class GcnBufferFormatTable : GenericTable {
+  let CppTypeName = "GcnBufferFormatInfo";
+  let Fields = ["Format", "BitsPerComp", "NumComponents", "NumFormat", "DataFormat"];
+  let PrimaryKey = ["BitsPerComp", "NumComponents", "NumFormat"];
+}
+
+def Gfx9BufferFormat : GcnBufferFormatTable {
+  let FilterClass = "Gfx9BufferFormat";
+  let PrimaryKeyName = "getGfx9BufferFormatInfo";
+}
+def Gfx10PlusBufferFormat : GcnBufferFormatTable {
+  let FilterClass = "Gfx10PlusBufferFormat";
+  let PrimaryKeyName = "getGfx10PlusBufferFormatInfo";
+}
+
+def getGfx9BufferFormatInfo : SearchIndex {
+  let Table = Gfx9BufferFormat;
+  let Key = ["Format"];
+}
+def getGfx10PlusBufferFormatInfo : SearchIndex {
+  let Table = Gfx10PlusBufferFormat;
+  let Key = ["Format"];
+}
+
+// Buffer formats with equal component sizes (GFX9 and earlier)
+def : Gfx9BufferFormat< /*FORMAT_8_UNORM*/ 0x01, 8, 1, /*NUM_FORMAT_UNORM*/ 0, /*DATA_FORMAT_8*/ 1>;
+def : Gfx9BufferFormat< /*FORMAT_8_SNORM*/ 0x11, 8, 1, /*NUM_FORMAT_SNORM*/ 1, /*DATA_FORMAT_8*/ 1>;
+def : Gfx9BufferFormat< /*FORMAT_8_USCALED*/ 0x21, 8, 1, /*NUM_FORMAT_USCALED*/ 2, /*DATA_FORMAT_8*/ 1>;
+def : Gfx9BufferFormat< /*FORMAT_8_SSCALED*/ 0x31, 8, 1, /*NUM_FORMAT_SSCALED*/ 3, /*DATA_FORMAT_8*/ 1>;
+def : Gfx9BufferFormat< /*FORMAT_8_UINT*/ 0x41, 8, 1, /*NUM_FORMAT_UINT*/ 4, /*DATA_FORMAT_8*/ 1>;
+def : Gfx9BufferFormat< /*FORMAT_8_SINT*/ 0x51, 8, 1, /*NUM_FORMAT_SINT*/ 5, /*DATA_FORMAT_8*/ 1>;
+def : Gfx9BufferFormat< /*FORMAT_16_UNORM*/ 0x02, 16, 1, /*NUM_FORMAT_UNORM*/ 0, /*DATA_FORMAT_16*/ 2>;
+def : Gfx9BufferFormat< /*FORMAT_16_SNORM*/ 0x12, 16, 1, /*NUM_FORMAT_SNORM*/ 1, /*DATA_FORMAT_16*/ 2>;
+def : Gfx9BufferFormat< /*FORMAT_16_USCALED*/ 0x22, 16, 1, /*NUM_FORMAT_USCALED*/ 2, /*DATA_FORMAT_16*/ 2>;
+def : Gfx9BufferFormat< /*FORMAT_16_SSCALED*/ 0x32, 16, 1, /*NUM_FORMAT_SSCALED*/ 3, /*DATA_FORMAT_16*/ 2>;
+def : Gfx9BufferFormat< /*FORMAT_16_UINT*/ 0x42, 16, 1, /*NUM_FORMAT_UINT*/ 4, /*DATA_FORMAT_16*/ 2>;
+def : Gfx9BufferFormat< /*FORMAT_16_SINT*/ 0x52, 16, 1, /*NUM_FORMAT_SINT*/ 5, /*DATA_FORMAT_16*/ 2>;
+def : Gfx9BufferFormat< /*FORMAT_16_FLOAT*/ 0x72, 16, 1, /*NUM_FORMAT_FLOAT*/ 7, /*DATA_FORMAT_16*/ 2>;
+def : Gfx9BufferFormat< /*FORMAT_8_8_UNORM*/ 0x03, 8, 2, /*NUM_FORMAT_UNORM*/ 0, /*DATA_FORMAT_8_8*/ 3>;
+def : Gfx9BufferFormat< /*FORMAT_8_8_SNORM*/ 0x13, 8, 2, /*NUM_FORMAT_SNORM*/ 1, /*DATA_FORMAT_8_8*/ 3>;
+def : Gfx9BufferFormat< /*FORMAT_8_8_USCALED*/ 0x23, 8, 2, /*NUM_FORMAT_USCALED*/ 2, /*DATA_FORMAT_8_8*/ 3>;
+def : Gfx9BufferFormat< /*FORMAT_8_8_SSCALED*/ 0x33, 8, 2, /*NUM_FORMAT_SSCALED*/ 3, /*DATA_FORMAT_8_8*/ 3>;
+def : Gfx9BufferFormat< /*FORMAT_8_8_UINT*/ 0x43, 8, 2, /*NUM_FORMAT_UINT*/ 4, /*DATA_FORMAT_8_8*/ 3>;
+def : Gfx9BufferFormat< /*FORMAT_8_8_SINT*/ 0x53, 8, 2, /*NUM_FORMAT_SINT*/ 5, /*DATA_FORMAT_8_8*/ 3>;
+def : Gfx9BufferFormat< /*FORMAT_32_UINT*/ 0x44, 32, 1, /*NUM_FORMAT_UINT*/ 4, /*DATA_FORMAT_32*/ 4>;
+def : Gfx9BufferFormat< /*FORMAT_32_SINT*/ 0x54, 32, 1, /*NUM_FORMAT_SINT*/ 5, /*DATA_FORMAT_32*/ 4>;
+def : Gfx9BufferFormat< /*FORMAT_32_FLOAT*/ 0x74, 32, 1, /*NUM_FORMAT_FLOAT*/ 7, /*DATA_FORMAT_32*/ 4>;
+def : Gfx9BufferFormat< /*FORMAT_16_16_UNORM*/ 0x05, 16, 2, /*NUM_FORMAT_UNORM*/ 0, /*DATA_FORMAT_16_16*/ 5>;
+def : Gfx9BufferFormat< /*FORMAT_16_16_SNORM*/ 0x15, 16, 2, /*NUM_FORMAT_SNORM*/ 1, /*DATA_FORMAT_16_16*/ 5>;
+def : Gfx9BufferFormat< /*FORMAT_16_16_USCALED*/ 0x25, 16, 2, /*NUM_FORMAT_USCALED*/ 2, /*DATA_FORMAT_16_16*/ 5>;
+def : Gfx9BufferFormat< /*FORMAT_16_16_SSCALED*/ 0x35, 16, 2, /*NUM_FORMAT_SSCALED*/ 3, /*DATA_FORMAT_16_16*/ 5>;
+def : Gfx9BufferFormat< /*FORMAT_16_16_UINT*/ 0x45, 16, 2, /*NUM_FORMAT_UINT*/ 4, /*DATA_FORMAT_16_16*/ 5>;
+def : Gfx9BufferFormat< /*FORMAT_16_16_SINT*/ 0x55, 16, 2, /*NUM_FORMAT_SINT*/ 5, /*DATA_FORMAT_16_16*/ 5>;
+def : Gfx9BufferFormat< /*FORMAT_16_16_FLOAT*/ 0x75, 16, 2, /*NUM_FORMAT_FLOAT*/ 7, /*DATA_FORMAT_16_16*/ 5>;
+def : Gfx9BufferFormat< /*FORMAT_8_8_8_8_UNORM*/ 0x0A, 8, 4, /*NUM_FORMAT_UNORM*/ 0, /*DATA_FORMAT_8_8_8_8*/ 10>;
+def : Gfx9BufferFormat< /*FORMAT_8_8_8_8_SNORM*/ 0x1A, 8, 4, /*NUM_FORMAT_SNORM*/ 1, /*DATA_FORMAT_8_8_8_8*/ 10>;
+def : Gfx9BufferFormat< /*FORMAT_8_8_8_8_USCALED*/ 0x2A, 8, 4, /*NUM_FORMAT_USCALED*/ 2, /*DATA_FORMAT_8_8_8_8*/ 10>;
+def : Gfx9BufferFormat< /*FORMAT_8_8_8_8_SSCALED*/ 0x3A, 8, 4, /*NUM_FORMAT_SSCALED*/ 3, /*DATA_FORMAT_8_8_8_8*/ 10>;
+def : Gfx9BufferFormat< /*FORMAT_8_8_8_8_UINT*/ 0x4A, 8, 4, /*NUM_FORMAT_UINT*/ 4, /*DATA_FORMAT_8_8_8_8*/ 10>;
+def : Gfx9BufferFormat< /*FORMAT_8_8_8_8_SINT*/ 0x5A, 8, 4, /*NUM_FORMAT_SINT*/ 5, /*DATA_FORMAT_8_8_8_8*/ 10>;
+def : Gfx9BufferFormat< /*FORMAT_32_32_UINT*/ 0x4B, 32, 2, /*NUM_FORMAT_UINT*/ 4, /*DATA_FORMAT_32_32*/ 11>;
+def : Gfx9BufferFormat< /*FORMAT_32_32_SINT*/ 0x5B, 32, 2, /*NUM_FORMAT_SINT*/ 5, /*DATA_FORMAT_32_32*/ 11>;
+def : Gfx9BufferFormat< /*FORMAT_32_32_FLOAT*/ 0x7B, 32, 2, /*NUM_FORMAT_FLOAT*/ 7, /*DATA_FORMAT_32_32*/ 11>;
+def : Gfx9BufferFormat< /*FORMAT_16_16_16_16_UNORM*/ 0x0C, 16, 4, /*NUM_FORMAT_UNORM*/ 0, /*DATA_FORMAT_16_16_16_16*/ 12>;
+def : Gfx9BufferFormat< /*FORMAT_16_16_16_16_SNORM*/ 0x1C, 16, 4, /*NUM_FORMAT_SNORM*/ 1, /*DATA_FORMAT_16_16_16_16*/ 12>;
+def : Gfx9BufferFormat< /*FORMAT_16_16_16_16_USCALED*/ 0x2C, 16, 4, /*NUM_FORMAT_USCALED*/ 2, /*DATA_FORMAT_16_16_16_16*/ 12>;
+def : Gfx9BufferFormat< /*FORMAT_16_16_16_16_SSCALED*/ 0x3C, 16, 4, /*NUM_FORMAT_SSCALED*/ 3, /*DATA_FORMAT_16_16_16_16*/ 12>;
+def : Gfx9BufferFormat< /*FORMAT_16_16_16_16_UINT*/ 0x4C, 16, 4, /*NUM_FORMAT_UINT*/ 4, /*DATA_FORMAT_16_16_16_16*/ 12>;
+def : Gfx9BufferFormat< /*FORMAT_16_16_16_16_SINT*/ 0x5C, 16, 4, /*NUM_FORMAT_SINT*/ 5, /*DATA_FORMAT_16_16_16_16*/ 12>;
+def : Gfx9BufferFormat< /*FORMAT_16_16_16_16_FLOAT*/ 0x7C, 16, 4, /*NUM_FORMAT_FLOAT*/ 7, /*DATA_FORMAT_16_16_16_16*/ 12>;
+def : Gfx9BufferFormat< /*FORMAT_32_32_32_UINT*/ 0x4D, 32, 3, /*NUM_FORMAT_UINT*/ 4, /*DATA_FORMAT_32_32_32*/ 13>;
+def : Gfx9BufferFormat< /*FORMAT_32_32_32_SINT*/ 0x5D, 32, 3, /*NUM_FORMAT_SINT*/ 5, /*DATA_FORMAT_32_32_32*/ 13>;
+def : Gfx9BufferFormat< /*FORMAT_32_32_32_FLOAT*/ 0x7D, 32, 3, /*NUM_FORMAT_FLOAT*/ 7, /*DATA_FORMAT_32_32_32*/ 13>;
+def : Gfx9BufferFormat< /*FORMAT_32_32_32_32_UINT*/ 0x4E, 32, 4, /*NUM_FORMAT_UINT*/ 4, /*DATA_FORMAT_32_32_32_32*/ 14>;
+def : Gfx9BufferFormat< /*FORMAT_32_32_32_32_SINT*/ 0x5E, 32, 4, /*NUM_FORMAT_SINT*/ 5, /*DATA_FORMAT_32_32_32_32*/ 14>;
+def : Gfx9BufferFormat< /*FORMAT_32_32_32_32_FLOAT*/ 0x7E, 32, 4, /*NUM_FORMAT_FLOAT*/ 7, /*DATA_FORMAT_32_32_32_32*/ 14>;
+
+// Buffer formats with equal component sizes (GFX10 and later)
+def : Gfx10PlusBufferFormat< /*FORMAT_8_UNORM*/ 0x01, 8, 1, /*NUM_FORMAT_UNORM*/ 0, /*DATA_FORMAT_8*/ 1>;
+def : Gfx10PlusBufferFormat< /*FORMAT_8_SNORM*/ 0x02, 8, 1, /*NUM_FORMAT_SNORM*/ 1, /*DATA_FORMAT_8*/ 1>;
+def : Gfx10PlusBufferFormat< /*FORMAT_8_USCALED*/ 0x03, 8, 1, /*NUM_FORMAT_USCALED*/ 2, /*DATA_FORMAT_8*/ 1>;
+def : Gfx10PlusBufferFormat< /*FORMAT_8_SSCALED*/ 0x04, 8, 1, /*NUM_FORMAT_SSCALED*/ 3, /*DATA_FORMAT_8*/ 1>;
+def : Gfx10PlusBufferFormat< /*FORMAT_8_UINT*/ 0x05, 8, 1, /*NUM_FORMAT_UINT*/ 4, /*DATA_FORMAT_8*/ 1>;
+def : Gfx10PlusBufferFormat< /*FORMAT_8_SINT*/ 0x06, 8, 1, /*NUM_FORMAT_SINT*/ 5, /*DATA_FORMAT_8*/ 1>;
+def : Gfx10PlusBufferFormat< /*FORMAT_16_UNORM*/ 0x07, 16, 1, /*NUM_FORMAT_UNORM*/ 0, /*DATA_FORMAT_16*/ 2>;
+def : Gfx10PlusBufferFormat< /*FORMAT_16_SNORM*/ 0x08, 16, 1, /*NUM_FORMAT_SNORM*/ 1, /*DATA_FORMAT_16*/ 2>;
+def : Gfx10PlusBufferFormat< /*FORMAT_16_USCALED*/ 0x09, 16, 1, /*NUM_FORMAT_USCALED*/ 2, /*DATA_FORMAT_16*/ 2>;
+def : Gfx10PlusBufferFormat< /*FORMAT_16_SSCALED*/ 0x0A, 16, 1, /*NUM_FORMAT_SSCALED*/ 3, /*DATA_FORMAT_16*/ 2>;
+def : Gfx10PlusBufferFormat< /*FORMAT_16_UINT*/ 0x0B, 16, 1, /*NUM_FORMAT_UINT*/ 4, /*DATA_FORMAT_16*/ 2>;
+def : Gfx10PlusBufferFormat< /*FORMAT_16_SINT*/ 0x0C, 16, 1, /*NUM_FORMAT_SINT*/ 5, /*DATA_FORMAT_16*/ 2>;
+def : Gfx10PlusBufferFormat< /*FORMAT_16_FLOAT*/ 0x0D, 16, 1, /*NUM_FORMAT_FLOAT*/ 7, /*DATA_FORMAT_16*/ 2>;
+def : Gfx10PlusBufferFormat< /*FORMAT_8_8_UNORM*/ 0x0E, 8, 2, /*NUM_FORMAT_UNORM*/ 0, /*DATA_FORMAT_8_8*/ 3>;
+def : Gfx10PlusBufferFormat< /*FORMAT_8_8_SNORM*/ 0x0F, 8, 2, /*NUM_FORMAT_SNORM*/ 1, /*DATA_FORMAT_8_8*/ 3>;
+def : Gfx10PlusBufferFormat< /*FORMAT_8_8_USCALED*/ 0x10, 8, 2, /*NUM_FORMAT_USCALED*/ 2, /*DATA_FORMAT_8_8*/ 3>;
+def : Gfx10PlusBufferFormat< /*FORMAT_8_8_SSCALED*/ 0x11, 8, 2, /*NUM_FORMAT_SSCALED*/ 3, /*DATA_FORMAT_8_8*/ 3>;
+def : Gfx10PlusBufferFormat< /*FORMAT_8_8_UINT*/ 0x12, 8, 2, /*NUM_FORMAT_UINT*/ 4, /*DATA_FORMAT_8_8*/ 3>;
+def : Gfx10PlusBufferFormat< /*FORMAT_8_8_SINT*/ 0x13, 8, 2, /*NUM_FORMAT_SINT*/ 5, /*DATA_FORMAT_8_8*/ 3>;
+def : Gfx10PlusBufferFormat< /*FORMAT_32_UINT*/ 0x14, 32, 1, /*NUM_FORMAT_UINT*/ 4, /*DATA_FORMAT_32*/ 4>;
+def : Gfx10PlusBufferFormat< /*FORMAT_32_SINT*/ 0x15, 32, 1, /*NUM_FORMAT_SINT*/ 5, /*DATA_FORMAT_32*/ 4>;
+def : Gfx10PlusBufferFormat< /*FORMAT_32_FLOAT*/ 0x16, 32, 1, /*NUM_FORMAT_FLOAT*/ 7, /*DATA_FORMAT_32*/ 4>;
+def : Gfx10PlusBufferFormat< /*FORMAT_16_16_UNORM*/ 0x17, 16, 2, /*NUM_FORMAT_UNORM*/ 0, /*DATA_FORMAT_16_16*/ 5>;
+def : Gfx10PlusBufferFormat< /*FORMAT_16_16_SNORM*/ 0x18, 16, 2, /*NUM_FORMAT_SNORM*/ 1, /*DATA_FORMAT_16_16*/ 5>;
+def : Gfx10PlusBufferFormat< /*FORMAT_16_16_USCALED*/ 0x19, 16, 2, /*NUM_FORMAT_USCALED*/ 2, /*DATA_FORMAT_16_16*/ 5>;
+def : Gfx10PlusBufferFormat< /*FORMAT_16_16_SSCALED*/ 0x1A, 16, 2, /*NUM_FORMAT_SSCALED*/ 3, /*DATA_FORMAT_16_16*/ 5>;
+def : Gfx10PlusBufferFormat< /*FORMAT_16_16_UINT*/ 0x1B, 16, 2, /*NUM_FORMAT_UINT*/ 4, /*DATA_FORMAT_16_16*/ 5>;
+def : Gfx10PlusBufferFormat< /*FORMAT_16_16_SINT*/ 0x1C, 16, 2, /*NUM_FORMAT_SINT*/ 5, /*DATA_FORMAT_16_16*/ 5>;
+def : Gfx10PlusBufferFormat< /*FORMAT_16_16_FLOAT*/ 0x1D, 16, 2, /*NUM_FORMAT_FLOAT*/ 7, /*DATA_FORMAT_16_16*/ 5>;
+def : Gfx10PlusBufferFormat< /*FORMAT_8_8_8_8_UNORM*/ 0x38, 8, 4, /*NUM_FORMAT_UNORM*/ 0, /*DATA_FORMAT_8_8_8_8*/ 10>;
+def : Gfx10PlusBufferFormat< /*FORMAT_8_8_8_8_SNORM*/ 0x39, 8, 4, /*NUM_FORMAT_SNORM*/ 1, /*DATA_FORMAT_8_8_8_8*/ 10>;
+def : Gfx10PlusBufferFormat< /*FORMAT_8_8_8_8_USCALED*/ 0x3A, 8, 4, /*NUM_FORMAT_USCALED*/ 2, /*DATA_FORMAT_8_8_8_8*/ 10>;
+def : Gfx10PlusBufferFormat< /*FORMAT_8_8_8_8_SSCALED*/ 0x3B, 8, 4, /*NUM_FORMAT_SSCALED*/ 3, /*DATA_FORMAT_8_8_8_8*/ 10>;
+def : Gfx10PlusBufferFormat< /*FORMAT_8_8_8_8_UINT*/ 0x3C, 8, 4, /*NUM_FORMAT_UINT*/ 4, /*DATA_FORMAT_8_8_8_8*/ 10>;
+def : Gfx10PlusBufferFormat< /*FORMAT_8_8_8_8_SINT*/ 0x3D, 8, 4, /*NUM_FORMAT_SINT*/ 5, /*DATA_FORMAT_8_8_8_8*/ 10>;
+def : Gfx10PlusBufferFormat< /*FORMAT_32_32_UINT*/ 0x3E, 32, 2, /*NUM_FORMAT_UINT*/ 4, /*DATA_FORMAT_32_32*/ 11>;
+def : Gfx10PlusBufferFormat< /*FORMAT_32_32_SINT*/ 0x3F, 32, 2, /*NUM_FORMAT_SINT*/ 5, /*DATA_FORMAT_32_32*/ 11>;
+def : Gfx10PlusBufferFormat< /*FORMAT_32_32_FLOAT*/ 0x40, 32, 2, /*NUM_FORMAT_FLOAT*/ 7, /*DATA_FORMAT_32_32*/ 11>;
+def : Gfx10PlusBufferFormat< /*FORMAT_16_16_16_16_UNORM*/ 0x41, 16, 4, /*NUM_FORMAT_UNORM*/ 0, /*DATA_FORMAT_16_16_16_16*/ 12>;
+def : Gfx10PlusBufferFormat< /*FORMAT_16_16_16_16_SNORM*/ 0x42, 16, 4, /*NUM_FORMAT_SNORM*/ 1, /*DATA_FORMAT_16_16_16_16*/ 12>;
+def : Gfx10PlusBufferFormat< /*FORMAT_16_16_16_16_USCALED*/ 0x43, 16, 4, /*NUM_FORMAT_USCALED*/ 2, /*DATA_FORMAT_16_16_16_16*/ 12>;
+def : Gfx10PlusBufferFormat< /*FORMAT_16_16_16_16_SSCALED*/ 0x44, 16, 4, /*NUM_FORMAT_SSCALED*/ 3, /*DATA_FORMAT_16_16_16_16*/ 12>;
+def : Gfx10PlusBufferFormat< /*FORMAT_16_16_16_16_UINT*/ 0x45, 16, 4, /*NUM_FORMAT_UINT*/ 4, /*DATA_FORMAT_16_16_16_16*/ 12>;
+def : Gfx10PlusBufferFormat< /*FORMAT_16_16_16_16_SINT*/ 0x46, 16, 4, /*NUM_FORMAT_SINT*/ 5, /*DATA_FORMAT_16_16_16_16*/ 12>;
+def : Gfx10PlusBufferFormat< /*FORMAT_16_16_16_16_FLOAT*/ 0x47, 16, 4, /*NUM_FORMAT_FLOAT*/ 7, /*DATA_FORMAT_16_16_16_16*/ 12>;
+def : Gfx10PlusBufferFormat< /*FORMAT_32_32_32_UINT*/ 0x48, 32, 3, /*NUM_FORMAT_UINT*/ 4, /*DATA_FORMAT_32_32_32*/ 13>;
+def : Gfx10PlusBufferFormat< /*FORMAT_32_32_32_SINT*/ 0x49, 32, 3, /*NUM_FORMAT_SINT*/ 5, /*DATA_FORMAT_32_32_32*/ 13>;
+def : Gfx10PlusBufferFormat< /*FORMAT_32_32_32_FLOAT*/ 0x4A, 32, 3, /*NUM_FORMAT_FLOAT*/ 7, /*DATA_FORMAT_32_32_32*/ 13>;
+def : Gfx10PlusBufferFormat< /*FORMAT_32_32_32_32_UINT*/ 0x4B, 32, 4, /*NUM_FORMAT_UINT*/ 4, /*DATA_FORMAT_32_32_32_32*/ 14>;
+def : Gfx10PlusBufferFormat< /*FORMAT_32_32_32_32_SINT*/ 0x4C, 32, 4, /*NUM_FORMAT_SINT*/ 5, /*DATA_FORMAT_32_32_32_32*/ 14>;
+def : Gfx10PlusBufferFormat< /*FORMAT_32_32_32_32_FLOAT*/ 0x4D, 32, 4, /*NUM_FORMAT_FLOAT*/ 7, /*DATA_FORMAT_32_32_32_32*/ 14>;
+
 class SourceOfDivergence<Intrinsic intr> {
   Intrinsic Intr = intr;
 }
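The GenericTable/SearchIndex definitions above make TableGen emit lookups for GcnBufferFormatInfo keyed either by the packed format value or by the (BitsPerComp, NumComponents, NumFormat) primary key; AMDGPUBaseInfo.h (further down in this patch) wraps them behind two getGcnBufferFormatInfo overloads. A minimal sketch of how a backend caller might combine the two overloads; the helper name widenedFormatExists is illustrative and not part of the patch:

    // Illustrative sketch, not part of the patch: combines the two
    // getGcnBufferFormatInfo overloads declared in AMDGPUBaseInfo.h.
    // Assumes it is compiled inside the AMDGPU backend, where the header
    // and MCSubtargetInfo are available.
    #include "Utils/AMDGPUBaseInfo.h"

    static bool widenedFormatExists(unsigned Fmt, unsigned NumComponents,
                                    const llvm::MCSubtargetInfo &STI) {
      using namespace llvm::AMDGPU;
      // First overload: decode the packed format value via the SearchIndex.
      const GcnBufferFormatInfo *Info = getGcnBufferFormatInfo(Fmt, STI);
      if (!Info)
        return false;
      // Second overload: query the (BitsPerComp, NumComponents, NumFormat)
      // primary key for a wider variant of the same format.
      return getGcnBufferFormatInfo(Info->BitsPerComp, NumComponents,
                                    Info->NumFormat, STI) != nullptr;
    }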
diff --git a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
index 24769a89b9b..1d64bf892ed 100644
--- a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
@@ -99,6 +99,8 @@ enum InstClassEnum {
   BUFFER_LOAD,
   BUFFER_STORE,
   MIMG,
+  TBUFFER_LOAD,
+  TBUFFER_STORE,
 };
 
 enum RegisterEnum {
@@ -119,6 +121,8 @@ class SILoadStoreOptimizer : public MachineFunctionPass {
     unsigned Offset1;
     unsigned Width0;
     unsigned Width1;
+    unsigned Format0;
+    unsigned Format1;
     unsigned BaseOff;
     unsigned DMask0;
     unsigned DMask1;
@@ -206,12 +210,14 @@ private:
   const GCNSubtarget *STM = nullptr;
   const SIInstrInfo *TII = nullptr;
   const SIRegisterInfo *TRI = nullptr;
+  const MCSubtargetInfo *STI = nullptr;
   MachineRegisterInfo *MRI = nullptr;
   AliasAnalysis *AA = nullptr;
   bool OptimizeAgain;
 
-  static bool dmasksCanBeCombined(const CombineInfo &CI, const SIInstrInfo &TII);
-  static bool offsetsCanBeCombined(CombineInfo &CI);
+  static bool dmasksCanBeCombined(const CombineInfo &CI,
+                                  const SIInstrInfo &TII);
+  static bool offsetsCanBeCombined(CombineInfo &CI, const MCSubtargetInfo &STI);
   static bool widthsFit(const GCNSubtarget &STM, const CombineInfo &CI);
   static unsigned getNewOpcode(const CombineInfo &CI);
   static std::pair<unsigned, unsigned> getSubRegIdxs(const CombineInfo &CI);
@@ -230,6 +236,8 @@ private:
   MachineBasicBlock::iterator mergeSBufferLoadImmPair(CombineInfo &CI);
   MachineBasicBlock::iterator mergeBufferLoadPair(CombineInfo &CI);
   MachineBasicBlock::iterator mergeBufferStorePair(CombineInfo &CI);
+  MachineBasicBlock::iterator mergeTBufferLoadPair(CombineInfo &CI);
+  MachineBasicBlock::iterator mergeTBufferStorePair(CombineInfo &CI);
 
   void updateBaseAndOffset(MachineInstr &I, unsigned NewBase,
                            int32_t NewOffset) const;
@@ -285,6 +293,9 @@ static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
         TII.getNamedOperand(MI, AMDGPU::OpName::dmask)->getImm();
     return countPopulation(DMaskImm);
   }
+  if (TII.isMTBUF(Opc)) {
+    return AMDGPU::getMTBUFElements(Opc);
+  }
 
   switch (Opc) {
   case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
@@ -323,10 +334,27 @@ static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) {
       if (AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr) == -1)
         return UNKNOWN;
       // TODO: Support IMAGE_GET_RESINFO and IMAGE_GET_LOD.
-      if (TII.get(Opc).mayStore() || !TII.get(Opc).mayLoad() || TII.isGather4(Opc))
+      if (TII.get(Opc).mayStore() || !TII.get(Opc).mayLoad() ||
+          TII.isGather4(Opc))
         return UNKNOWN;
       return MIMG;
     }
+    if (TII.isMTBUF(Opc)) {
+      switch (AMDGPU::getMTBUFBaseOpcode(Opc)) {
+      default:
+        return UNKNOWN;
+      case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN:
+      case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN_exact:
+      case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET:
+      case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET_exact:
+        return TBUFFER_LOAD;
+      case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN:
+      case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN_exact:
+      case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET:
+      case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET_exact:
+        return TBUFFER_STORE;
+      }
+    }
     return UNKNOWN;
   case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
   case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
@@ -357,6 +385,8 @@ static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) {
       assert(Info);
       return Info->BaseOpcode;
    }
+    if (TII.isMTBUF(Opc))
+      return AMDGPU::getMTBUFBaseOpcode(Opc);
     return -1;
   case AMDGPU::DS_READ_B32:
   case AMDGPU::DS_READ_B32_gfx9:
@@ -398,6 +428,24 @@ static unsigned getRegs(unsigned Opc, const SIInstrInfo &TII) {
     const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc);
     if (Info && AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode)->Sampler)
       result |= SSAMP;
+
+    return result;
+  }
+  if (TII.isMTBUF(Opc)) {
+    unsigned result = 0;
+
+    if (AMDGPU::getMTBUFHasVAddr(Opc)) {
+      result |= VADDR;
+    }
+
+    if (AMDGPU::getMTBUFHasSrsrc(Opc)) {
+      result |= SRSRC;
+    }
+
+    if (AMDGPU::getMTBUFHasSoffset(Opc)) {
+      result |= SOFFSET;
+    }
+
     return result;
   }
 
@@ -420,7 +468,6 @@ static unsigned getRegs(unsigned Opc, const SIInstrInfo &TII) {
   }
 }
 
-
 void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI,
                                               const SIInstrInfo &TII,
                                               const GCNSubtarget &STM) {
@@ -457,6 +504,9 @@ void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI,
     Offset0 = I->getOperand(OffsetIdx).getImm();
   }
 
+  if (InstClass == TBUFFER_LOAD || InstClass == TBUFFER_STORE)
+    Format0 = TII.getNamedOperand(*I, AMDGPU::OpName::format)->getImm();
+
   Width0 = getOpcodeWidth(*I, TII);
 
   if ((InstClass == DS_READ) || (InstClass == DS_WRITE)) {
@@ -518,6 +568,9 @@ void SILoadStoreOptimizer::CombineInfo::setPaired(MachineBasicBlock::iterator MI
     Offset1 = Paired->getOperand(OffsetIdx).getImm();
   }
 
+  if (InstClass == TBUFFER_LOAD || InstClass == TBUFFER_STORE)
+    Format1 = TII.getNamedOperand(*Paired, AMDGPU::OpName::format)->getImm();
+
   Width1 = getOpcodeWidth(*Paired, TII);
   if ((InstClass == DS_READ) || (InstClass == DS_WRITE)) {
     Offset1 &= 0xffff;
@@ -530,7 +583,6 @@ void SILoadStoreOptimizer::CombineInfo::setPaired(MachineBasicBlock::iterator MI
   }
 }
 
-
 } // end anonymous namespace.
 
 INITIALIZE_PASS_BEGIN(SILoadStoreOptimizer, DEBUG_TYPE,
@@ -671,7 +723,33 @@ bool SILoadStoreOptimizer::dmasksCanBeCombined(const CombineInfo &CI, const SIIn
   return true;
 }
 
-bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI) {
+static unsigned getBufferFormatWithCompCount(unsigned OldFormat,
+                                             unsigned ComponentCount,
+                                             const MCSubtargetInfo &STI) {
+  if (ComponentCount > 4)
+    return 0;
+
+  const llvm::AMDGPU::GcnBufferFormatInfo *OldFormatInfo =
+      llvm::AMDGPU::getGcnBufferFormatInfo(OldFormat, STI);
+  if (!OldFormatInfo)
+    return 0;
+
+  const llvm::AMDGPU::GcnBufferFormatInfo *NewFormatInfo =
+      llvm::AMDGPU::getGcnBufferFormatInfo(OldFormatInfo->BitsPerComp,
+                                           ComponentCount,
+                                           OldFormatInfo->NumFormat, STI);
+
+  if (!NewFormatInfo)
+    return 0;
+
+  assert(NewFormatInfo->NumFormat == OldFormatInfo->NumFormat &&
+         NewFormatInfo->BitsPerComp == OldFormatInfo->BitsPerComp);
+
+  return NewFormatInfo->Format;
+}
+
+bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI,
+                                                const MCSubtargetInfo &STI) {
   assert(CI.InstClass != MIMG);
 
   // XXX - Would the same offset be OK? Is there any reason this would happen or
@@ -683,6 +761,30 @@ bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI) {
   if ((CI.Offset0 % CI.EltSize != 0) || (CI.Offset1 % CI.EltSize != 0))
     return false;
 
+  if (CI.InstClass == TBUFFER_LOAD || CI.InstClass == TBUFFER_STORE) {
+
+    const llvm::AMDGPU::GcnBufferFormatInfo *Info0 =
+        llvm::AMDGPU::getGcnBufferFormatInfo(CI.Format0, STI);
+    if (!Info0)
+      return false;
+    const llvm::AMDGPU::GcnBufferFormatInfo *Info1 =
+        llvm::AMDGPU::getGcnBufferFormatInfo(CI.Format1, STI);
+    if (!Info1)
+      return false;
+
+    if (Info0->BitsPerComp != Info1->BitsPerComp ||
+        Info0->NumFormat != Info1->NumFormat)
+      return false;
+
+    // TODO: Should be possible to support more formats, but if format loads
+    // are not dword-aligned, the merged load might not be valid.
+    if (Info0->BitsPerComp != 32)
+      return false;
+
+    if (getBufferFormatWithCompCount(CI.Format0, CI.Width0 + CI.Width1, STI) == 0)
+      return false;
+  }
+
   unsigned EltOffset0 = CI.Offset0 / CI.EltSize;
   unsigned EltOffset1 = CI.Offset1 / CI.EltSize;
   CI.UseST64 = false;
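getBufferFormatWithCompCount encodes the whole merge rule: keep BitsPerComp and NumFormat, and look up the entry with the combined component count; a return value of 0 means no such wider format exists. A standalone sketch (no LLVM dependencies; the table is a hand-copied subset of the GFX10 32-bit FLOAT rows from AMDGPUSearchableTables.td above) shows the rule with concrete values:

    // Standalone illustration of the format-widening rule implemented by
    // getBufferFormatWithCompCount; not code from the patch itself.
    #include <cstdio>

    struct Fmt {
      unsigned Format, BitsPerComp, NumComponents, NumFormat;
    };

    // GFX10 rows from the patch: FORMAT_32_FLOAT (0x16),
    // FORMAT_32_32_FLOAT (0x40), FORMAT_32_32_32_FLOAT (0x4A),
    // FORMAT_32_32_32_32_FLOAT (0x4D); NUM_FORMAT_FLOAT == 7.
    static const Fmt Gfx10Floats[] = {
        {0x16, 32, 1, 7}, {0x40, 32, 2, 7}, {0x4A, 32, 3, 7}, {0x4D, 32, 4, 7}};

    // Keep BitsPerComp and NumFormat; find the entry with the combined
    // component count. 0 means "no merged format exists".
    static unsigned widen(unsigned OldFormat, unsigned Count) {
      if (Count > 4)
        return 0;
      const Fmt *Old = nullptr;
      for (const Fmt &F : Gfx10Floats)
        if (F.Format == OldFormat)
          Old = &F;
      if (!Old)
        return 0;
      for (const Fmt &F : Gfx10Floats)
        if (F.BitsPerComp == Old->BitsPerComp &&
            F.NumFormat == Old->NumFormat && F.NumComponents == Count)
          return F.Format;
      return 0;
    }

    int main() {
      // Two adjacent one-component FORMAT_32_FLOAT accesses (width 1 + 1):
      std::printf("merged: 0x%x\n", widen(0x16, 2)); // prints "merged: 0x40"
    }

Running it prints 0x40 (FORMAT_32_32_FLOAT), while any combined count above four yields 0 and the merge is rejected.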
@@ -814,6 +916,11 @@ bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) {
     if (MBBI->hasOrderedMemoryRef())
       return false;
 
+    int Swizzled =
+        AMDGPU::getNamedOperandIdx(MBBI->getOpcode(), AMDGPU::OpName::swz);
+    if (Swizzled != -1 && MBBI->getOperand(Swizzled).getImm())
+      return false;
+
     // Handle a case like
     //   DS_WRITE_B32 addr, v, idx0
     //   w = DS_READ_B32 addr, idx0
@@ -834,7 +941,7 @@ bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) {
 
       bool canBeCombined = CI.InstClass == MIMG
                                ? dmasksCanBeCombined(CI, *TII)
-                               : widthsFit(*STM, CI) && offsetsCanBeCombined(CI);
+                               : widthsFit(*STM, CI) && offsetsCanBeCombined(CI, *STI);
 
       // We also need to go through the list of instructions that we plan to
       // move and make sure they are all safe to move down past the merged
@@ -1201,6 +1308,136 @@ SILoadStoreOptimizer::mergeBufferLoadPair(CombineInfo &CI) {
   return New;
 }
 
+MachineBasicBlock::iterator
+SILoadStoreOptimizer::mergeTBufferLoadPair(CombineInfo &CI) {
+  MachineBasicBlock *MBB = CI.I->getParent();
+  DebugLoc DL = CI.I->getDebugLoc();
+
+  const unsigned Opcode = getNewOpcode(CI);
+
+  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI);
+
+  // Copy to the new source register.
+  Register DestReg = MRI->createVirtualRegister(SuperRC);
+  unsigned MergedOffset = std::min(CI.Offset0, CI.Offset1);
+
+  auto MIB = BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode), DestReg);
+
+  const unsigned Regs = getRegs(Opcode, *TII);
+
+  if (Regs & VADDR)
+    MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
+
+  unsigned JoinedFormat =
+      getBufferFormatWithCompCount(CI.Format0, CI.Width0 + CI.Width1, *STI);
+
+  // It shouldn't be possible to get this far if the two instructions
+  // don't have a single memoperand, because MachineInstr::mayAlias()
+  // will return true if this is the case.
+  assert(CI.I->hasOneMemOperand() && CI.Paired->hasOneMemOperand());
+
+  const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
+  const MachineMemOperand *MMOb = *CI.Paired->memoperands_begin();
+
+  MachineInstr *New =
+      MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
+          .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
+          .addImm(MergedOffset) // offset
+          .addImm(JoinedFormat) // format
+          .addImm(CI.GLC0)      // glc
+          .addImm(CI.SLC0)      // slc
+          .addImm(0)            // tfe
+          .addImm(CI.DLC0)      // dlc
+          .addImm(0)            // swz
+          .addMemOperand(
+              combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
+
+  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI);
+  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
+  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
+
+  // Copy to the old destination registers.
+  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
+  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
+  const auto *Dest1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::vdata);
+
+  BuildMI(*MBB, CI.Paired, DL, CopyDesc)
+      .add(*Dest0) // Copy to same destination including flags and sub reg.
+      .addReg(DestReg, 0, SubRegIdx0);
+  MachineInstr *Copy1 = BuildMI(*MBB, CI.Paired, DL, CopyDesc)
+                            .add(*Dest1)
+                            .addReg(DestReg, RegState::Kill, SubRegIdx1);
+
+  moveInstsAfter(Copy1, CI.InstsToMove);
+
+  CI.I->eraseFromParent();
+  CI.Paired->eraseFromParent();
+  return New;
+}
+
+MachineBasicBlock::iterator
+SILoadStoreOptimizer::mergeTBufferStorePair(CombineInfo &CI) {
+  MachineBasicBlock *MBB = CI.I->getParent();
+  DebugLoc DL = CI.I->getDebugLoc();
+
+  const unsigned Opcode = getNewOpcode(CI);
+
+  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI);
+  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
+  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
+
+  // Copy to the new source register.
+  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI);
+  Register SrcReg = MRI->createVirtualRegister(SuperRC);
+
+  const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
+  const auto *Src1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::vdata);
+
+  BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
+      .add(*Src0)
+      .addImm(SubRegIdx0)
+      .add(*Src1)
+      .addImm(SubRegIdx1);
+
+  auto MIB = BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode))
+                 .addReg(SrcReg, RegState::Kill);
+
+  const unsigned Regs = getRegs(Opcode, *TII);
+
+  if (Regs & VADDR)
+    MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
+
+  unsigned JoinedFormat =
+      getBufferFormatWithCompCount(CI.Format0, CI.Width0 + CI.Width1, *STI);
+
+  // It shouldn't be possible to get this far if the two instructions
+  // don't have a single memoperand, because MachineInstr::mayAlias()
+  // will return true if this is the case.
+  assert(CI.I->hasOneMemOperand() && CI.Paired->hasOneMemOperand());
+
+  const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
+  const MachineMemOperand *MMOb = *CI.Paired->memoperands_begin();
+
+  MachineInstr *New =
+      MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
+          .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
+          .addImm(std::min(CI.Offset0, CI.Offset1)) // offset
+          .addImm(JoinedFormat)                     // format
+          .addImm(CI.GLC0)                          // glc
+          .addImm(CI.SLC0)                          // slc
+          .addImm(0)                                // tfe
+          .addImm(CI.DLC0)                          // dlc
+          .addImm(0)                                // swz
+          .addMemOperand(
+              combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
+
+  moveInstsAfter(MIB, CI.InstsToMove);
+
+  CI.I->eraseFromParent();
+  CI.Paired->eraseFromParent();
+  return New;
+}
+
 unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI) {
   const unsigned Width = CI.Width0 + CI.Width1;
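Concretely, for a typical pair these two routines turn two TBUFFER_LOAD_FORMAT_X instructions of FORMAT_32_FLOAT at offsets 0 and 4 into a single two-component load at offset 0 (the std::min of the two offsets) whose format comes from getBufferFormatWithCompCount, followed by two COPYs that extract sub0 and sub1 into the original destination registers; the store path instead assembles the wide source value with a REG_SEQUENCE before the merged store. The merged opcode itself is chosen next, in getNewOpcode, via getMTBUFOpcode(base opcode, combined width).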
@@ -1210,6 +1447,11 @@ unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI) {
     // FIXME: Handle d16 correctly
     return AMDGPU::getMUBUFOpcode(AMDGPU::getMUBUFBaseOpcode(CI.I->getOpcode()),
                                   Width);
+  case TBUFFER_LOAD:
+  case TBUFFER_STORE:
+    return AMDGPU::getMTBUFOpcode(AMDGPU::getMTBUFBaseOpcode(CI.I->getOpcode()),
+                                  Width);
+
   case UNKNOWN:
     llvm_unreachable("Unknown instruction class");
   case S_BUFFER_LOAD_IMM:
@@ -1819,6 +2061,24 @@ SILoadStoreOptimizer::optimizeInstsWithSameBaseAddr(
       OptimizeListAgain |= (CI.Width0 + CI.Width1) < 4;
     }
     break;
+  case TBUFFER_LOAD:
+    if (findMatchingInst(CI)) {
+      Modified = true;
+      removeCombinedInst(MergeList, *CI.Paired);
+      MachineBasicBlock::iterator NewMI = mergeTBufferLoadPair(CI);
+      CI.setMI(NewMI, *TII, *STM);
+      OptimizeListAgain |= (CI.Width0 + CI.Width1) < 4;
+    }
+    break;
+  case TBUFFER_STORE:
+    if (findMatchingInst(CI)) {
+      Modified = true;
+      removeCombinedInst(MergeList, *CI.Paired);
+      MachineBasicBlock::iterator NewMI = mergeTBufferStorePair(CI);
+      CI.setMI(NewMI, *TII, *STM);
+      OptimizeListAgain |= (CI.Width0 + CI.Width1) < 4;
+    }
+    break;
   }
   // Clear the InstsToMove after we have finished searching so we don't have
   // stale values left over if we search for this CI again in another pass
@@ -1839,6 +2099,7 @@ bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) {
   TII = STM->getInstrInfo();
   TRI = &TII->getRegisterInfo();
+  STI = &MF.getSubtarget<MCSubtargetInfo>();
   MRI = &MF.getRegInfo();
   AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
 
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index a4b216f583d..81d3697e79b 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -1328,6 +1328,8 @@ struct SourceOfDivergence {
 const SourceOfDivergence *lookupSourceOfDivergence(unsigned Intr);
 
 #define GET_SourcesOfDivergence_IMPL
+#define GET_Gfx9BufferFormat_IMPL
+#define GET_Gfx10PlusBufferFormat_IMPL
 #include "AMDGPUGenSearchableTables.inc"
 
 } // end anonymous namespace
@@ -1336,5 +1338,21 @@ bool isIntrinsicSourceOfDivergence(unsigned IntrID) {
   return lookupSourceOfDivergence(IntrID);
 }
 
+const GcnBufferFormatInfo *getGcnBufferFormatInfo(uint8_t BitsPerComp,
+                                                  uint8_t NumComponents,
+                                                  uint8_t NumFormat,
+                                                  const MCSubtargetInfo &STI) {
+  return isGFX10(STI)
+             ? getGfx10PlusBufferFormatInfo(BitsPerComp, NumComponents,
+                                            NumFormat)
+             : getGfx9BufferFormatInfo(BitsPerComp, NumComponents, NumFormat);
+}
+
+const GcnBufferFormatInfo *getGcnBufferFormatInfo(uint8_t Format,
+                                                  const MCSubtargetInfo &STI) {
+  return isGFX10(STI) ? getGfx10PlusBufferFormatInfo(Format)
+                      : getGfx9BufferFormatInfo(Format);
+}
+
 } // namespace AMDGPU
 } // namespace llvm
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index 05bb39235a4..a5bada2890d 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -41,6 +41,14 @@ class Triple;
 
 namespace AMDGPU {
 
+struct GcnBufferFormatInfo {
+  unsigned Format;
+  unsigned BitsPerComp;
+  unsigned NumComponents;
+  unsigned NumFormat;
+  unsigned DataFormat;
+};
+
 #define GET_MIMGBaseOpcode_DECL
 #define GET_MIMGDim_DECL
 #define GET_MIMGEncoding_DECL
@@ -300,6 +308,15 @@ LLVM_READONLY
 bool getMUBUFHasSoffset(unsigned Opc);
 
 LLVM_READONLY
+const GcnBufferFormatInfo *getGcnBufferFormatInfo(uint8_t BitsPerComp,
+                                                  uint8_t NumComponents,
+                                                  uint8_t NumFormat,
+                                                  const MCSubtargetInfo &STI);
+LLVM_READONLY
+const GcnBufferFormatInfo *getGcnBufferFormatInfo(uint8_t Format,
+                                                  const MCSubtargetInfo &STI);
+
+LLVM_READONLY
 int getMCOpcode(uint16_t Opcode, unsigned Gen);
 
 void initDefaultAMDKernelCodeT(amd_kernel_code_t &Header,
@@ -646,7 +663,6 @@ bool splitMUBUFOffset(uint32_t Imm, uint32_t &SOffset, uint32_t &ImmOffset,
 /// \returns true if the intrinsic is divergent
 bool isIntrinsicSourceOfDivergence(unsigned IntrID);
 
-
 // Track defaults for fields in the MODE registser.
 struct SIModeRegisterDefaults {
   /// Floating point opcodes that support exception flag gathering quiet and
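The two AMDGPUBaseInfo wrappers dispatch on isGFX10(STI) because the same logical format has a different packed encoding per generation (FORMAT_32_32_FLOAT is 0x7B in the GFX9 table but 0x40 in the GFX10+ table above), which is why the load/store optimizer threads an MCSubtargetInfo through every query. A minimal sketch; the helper name is illustrative and not part of the patch:

    // Illustrative only: the same logical query yields a generation-specific
    // packed format value, per the tables in AMDGPUSearchableTables.td.
    static unsigned twoDwordFloatFormat(const llvm::MCSubtargetInfo &STI) {
      using namespace llvm::AMDGPU;
      const GcnBufferFormatInfo *Info =
          getGcnBufferFormatInfo(/*BitsPerComp=*/32, /*NumComponents=*/2,
                                 /*NumFormat=*/7, STI); // 7 == NUM_FORMAT_FLOAT
      return Info ? Info->Format : 0; // 0x7B on GFX9, 0x40 on GFX10+
    }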