-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPU.td                             |   2
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp                | 209
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp                |   6
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h                  |   7
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUInstructions.td                 |   6
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h                     |   4
-rw-r--r--  llvm/lib/Target/AMDGPU/BUFInstructions.td                    |  75
-rw-r--r--  llvm/lib/Target/AMDGPU/DSInstructions.td                     |  51
-rw-r--r--  llvm/lib/Target/AMDGPU/FLATInstructions.td                   | 101
-rw-r--r--  llvm/lib/Target/AMDGPU/SIInstrInfo.td                        |  82
-rw-r--r--  llvm/test/CodeGen/AMDGPU/build-vector-insert-elt-infloop.ll  |   3
-rw-r--r--  llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll                   | 127
-rw-r--r--  llvm/test/CodeGen/AMDGPU/load-hi16.ll                        |  15
13 files changed, 490 insertions(+), 198 deletions(-)
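The patch replaces the old build_vector-based d16 load patterns with a PreprocessISelDAG hook: matchLoadD16FromBuildVector rewrites a qualifying build_vector whose high (or low) half comes from a load into a new LOAD_D16_HI/LO (or the _U8/_I8 extending forms) target memory node carrying a tied pass-through operand, and much simpler tied-operand TableGen patterns then select it. A rough sketch of the kind of input IR this targets, written in the style of the patch's own tests (the function name is hypothetical, not from the patch):

; The high half of a v2i16 comes from LDS, so on gfx900 (where
; d16PreservesUnusedBits holds) this can select ds_read_u16_d16_hi,
; which writes only bits 16-31 of the destination register.
define <2 x i16> @load_hi_half(i16 addrspace(3)* %ptr, i16 %lo) {
  %hi = load i16, i16 addrspace(3)* %ptr
  %v0 = insertelement <2 x i16> undef, i16 %lo, i32 0
  %v1 = insertelement <2 x i16> %v0, i16 %hi, i32 1
  ret <2 x i16> %v1
}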
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index 159be2070ac..39fb3949dae 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -691,7 +691,7 @@ def HasPackedD16VMem : Predicate<"!Subtarget->hasUnpackedD16VMem()">,
   AssemblerPredicate<"!FeatureUnpackedD16VMem">;
 
 def D16PreservesUnusedBits :
-  Predicate<"Subtarget->hasD16LoadStore() && !Subtarget->isSRAMECCEnabled()">,
+  Predicate<"Subtarget->d16PreservesUnusedBits()">,
   AssemblerPredicate<"FeatureGFX9Insts,!FeatureSRAMECC">;
 
 def LDSRequiresM0Init : Predicate<"Subtarget->ldsRequiresM0Init()">;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index 2cdd691fc10..c62420ec032 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -51,6 +51,8 @@
 #include <new>
 #include <vector>
 
+#define DEBUG_TYPE "isel"
+
 using namespace llvm;
 
 namespace llvm {
@@ -88,7 +90,10 @@ public:
     SelectionDAGISel::getAnalysisUsage(AU);
   }
 
+  bool matchLoadD16FromBuildVector(SDNode *N) const;
+
   bool runOnMachineFunction(MachineFunction &MF) override;
+  void PreprocessISelDAG() override;
   void Select(SDNode *N) override;
   StringRef getPassName() const override;
   void PostprocessISelDAG() override;
@@ -193,6 +198,7 @@ private:
   bool SelectVOP3PMadMixModsImpl(SDValue In, SDValue &Src, unsigned &Mods) const;
   bool SelectVOP3PMadMixMods(SDValue In, SDValue &Src, SDValue &SrcMods) const;
 
+  SDValue getHi16Elt(SDValue In) const;
   bool SelectHi16Elt(SDValue In, SDValue &Src) const;
 
   void SelectADD_SUB_I64(SDNode *N);
@@ -236,11 +242,49 @@ public:
                         SDValue &Offset) override;
 
   bool runOnMachineFunction(MachineFunction &MF) override;
+
+  void PreprocessISelDAG() override {}
+
 protected:
   // Include the pieces autogenerated from the target description.
 #include "R600GenDAGISel.inc"
 };
 
+static SDValue stripBitcast(SDValue Val) {
+  return Val.getOpcode() == ISD::BITCAST ? Val.getOperand(0) : Val;
+}
+
+// Figure out if this is really an extract of the high 16-bits of a dword.
+static bool isExtractHiElt(SDValue In, SDValue &Out) {
+  In = stripBitcast(In);
+  if (In.getOpcode() != ISD::TRUNCATE)
+    return false;
+
+  SDValue Srl = In.getOperand(0);
+  if (Srl.getOpcode() == ISD::SRL) {
+    if (ConstantSDNode *ShiftAmt = dyn_cast<ConstantSDNode>(Srl.getOperand(1))) {
+      if (ShiftAmt->getZExtValue() == 16) {
+        Out = stripBitcast(Srl.getOperand(0));
+        return true;
+      }
+    }
+  }
+
+  return false;
+}
+
+// Look through operations that obscure just looking at the low 16-bits of the
+// same register.
+static SDValue stripExtractLoElt(SDValue In) {
+  if (In.getOpcode() == ISD::TRUNCATE) {
+    SDValue Src = In.getOperand(0);
+    if (Src.getValueType().getSizeInBits() == 32)
+      return stripBitcast(Src);
+  }
+
+  return In;
+}
+
 } // end anonymous namespace
 
 INITIALIZE_PASS_BEGIN(AMDGPUDAGToDAGISel, "amdgpu-isel",
@@ -270,6 +314,114 @@ bool AMDGPUDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
   return SelectionDAGISel::runOnMachineFunction(MF);
 }
 
+bool AMDGPUDAGToDAGISel::matchLoadD16FromBuildVector(SDNode *N) const {
+  assert(Subtarget->d16PreservesUnusedBits());
+  MVT VT = N->getValueType(0).getSimpleVT();
+  if (VT != MVT::v2i16 && VT != MVT::v2f16)
+    return false;
+
+  SDValue Lo = N->getOperand(0);
+  SDValue Hi = N->getOperand(1);
+
+  LoadSDNode *LdHi = dyn_cast<LoadSDNode>(stripBitcast(Hi));
+
+  // build_vector lo, (load ptr) -> load_d16_hi ptr, lo
+  // build_vector lo, (zextload ptr from i8) -> load_d16_hi_u8 ptr, lo
+  // build_vector lo, (sextload ptr from i8) -> load_d16_hi_i8 ptr, lo
+
+  // Need to check for possible indirect dependencies on the other half of the
+  // vector to avoid introducing a cycle.
+  if (LdHi && Hi.hasOneUse() && !LdHi->isPredecessorOf(Lo.getNode())) {
+    SDVTList VTList = CurDAG->getVTList(VT, MVT::Other);
+
+    SDValue TiedIn = CurDAG->getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N), VT, Lo);
+    SDValue Ops[] = {
+      LdHi->getChain(), LdHi->getBasePtr(), TiedIn
+    };
+
+    unsigned LoadOp = AMDGPUISD::LOAD_D16_HI;
+    if (LdHi->getMemoryVT() == MVT::i8) {
+      LoadOp = LdHi->getExtensionType() == ISD::SEXTLOAD ?
+        AMDGPUISD::LOAD_D16_HI_I8 : AMDGPUISD::LOAD_D16_HI_U8;
+    } else {
+      assert(LdHi->getMemoryVT() == MVT::i16);
+    }
+
+    SDValue NewLoadHi =
+      CurDAG->getMemIntrinsicNode(LoadOp, SDLoc(LdHi), VTList,
+                                  Ops, LdHi->getMemoryVT(),
+                                  LdHi->getMemOperand());
+
+    CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), NewLoadHi);
+    CurDAG->ReplaceAllUsesOfValueWith(SDValue(LdHi, 1), NewLoadHi.getValue(1));
+    return true;
+  }
+
+  // build_vector (load ptr), hi -> load_d16_lo ptr, hi
+  // build_vector (zextload ptr from i8), hi -> load_d16_lo_u8 ptr, hi
+  // build_vector (sextload ptr from i8), hi -> load_d16_lo_i8 ptr, hi
+  LoadSDNode *LdLo = dyn_cast<LoadSDNode>(stripBitcast(Lo));
+  if (LdLo && Lo.hasOneUse()) {
+    SDValue TiedIn = getHi16Elt(Hi);
+    if (!TiedIn || LdLo->isPredecessorOf(TiedIn.getNode()))
+      return false;
+
+    SDVTList VTList = CurDAG->getVTList(VT, MVT::Other);
+    unsigned LoadOp = AMDGPUISD::LOAD_D16_LO;
+    if (LdLo->getMemoryVT() == MVT::i8) {
+      LoadOp = LdLo->getExtensionType() == ISD::SEXTLOAD ?
+        AMDGPUISD::LOAD_D16_LO_I8 : AMDGPUISD::LOAD_D16_LO_U8;
+    } else {
+      assert(LdLo->getMemoryVT() == MVT::i16);
+    }
+
+    TiedIn = CurDAG->getNode(ISD::BITCAST, SDLoc(N), VT, TiedIn);
+
+    SDValue Ops[] = {
+      LdLo->getChain(), LdLo->getBasePtr(), TiedIn
+    };
+
+    SDValue NewLoadLo =
+      CurDAG->getMemIntrinsicNode(LoadOp, SDLoc(LdLo), VTList,
+                                  Ops, LdLo->getMemoryVT(),
+                                  LdLo->getMemOperand());
+
+    CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), NewLoadLo);
+    CurDAG->ReplaceAllUsesOfValueWith(SDValue(LdLo, 1), NewLoadLo.getValue(1));
+    return true;
+  }
+
+  return false;
+}
+
+void AMDGPUDAGToDAGISel::PreprocessISelDAG() {
+  if (!Subtarget->d16PreservesUnusedBits())
+    return;
+
+  SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_end();
+
+  bool MadeChange = false;
+  while (Position != CurDAG->allnodes_begin()) {
+    SDNode *N = &*--Position;
+    if (N->use_empty())
+      continue;
+
+    switch (N->getOpcode()) {
+    case ISD::BUILD_VECTOR:
+      MadeChange |= matchLoadD16FromBuildVector(N);
+      break;
+    default:
+      break;
+    }
+  }
+
+  if (MadeChange) {
+    CurDAG->RemoveDeadNodes();
+    LLVM_DEBUG(dbgs() << "After PreProcess:\n";
+               CurDAG->dump(););
+  }
+}
+
 bool AMDGPUDAGToDAGISel::isNoNanSrc(SDValue N) const {
   if (TM.Options.NoNaNsFPMath)
     return true;
@@ -1889,41 +2041,6 @@ bool AMDGPUDAGToDAGISel::SelectVOP3OMods(SDValue In, SDValue &Src,
   return true;
 }
 
-static SDValue stripBitcast(SDValue Val) {
-  return Val.getOpcode() == ISD::BITCAST ? Val.getOperand(0) : Val;
-}
-
-// Figure out if this is really an extract of the high 16-bits of a dword.
-static bool isExtractHiElt(SDValue In, SDValue &Out) {
-  In = stripBitcast(In);
-  if (In.getOpcode() != ISD::TRUNCATE)
-    return false;
-
-  SDValue Srl = In.getOperand(0);
-  if (Srl.getOpcode() == ISD::SRL) {
-    if (ConstantSDNode *ShiftAmt = dyn_cast<ConstantSDNode>(Srl.getOperand(1))) {
-      if (ShiftAmt->getZExtValue() == 16) {
-        Out = stripBitcast(Srl.getOperand(0));
-        return true;
-      }
-    }
-  }
-
-  return false;
-}
-
-// Look through operations that obscure just looking at the low 16-bits of the
-// same register.
-static SDValue stripExtractLoElt(SDValue In) {
-  if (In.getOpcode() == ISD::TRUNCATE) {
-    SDValue Src = In.getOperand(0);
-    if (Src.getValueType().getSizeInBits() == 32)
-      return stripBitcast(Src);
-  }
-
-  return In;
-}
-
 bool AMDGPUDAGToDAGISel::SelectVOP3PMods(SDValue In, SDValue &Src,
                                          SDValue &SrcMods) const {
   unsigned Mods = 0;
@@ -2076,6 +2193,28 @@ bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixMods(SDValue In, SDValue &Src,
   return true;
 }
 
+SDValue AMDGPUDAGToDAGISel::getHi16Elt(SDValue In) const {
+  if (In.isUndef())
+    return CurDAG->getUNDEF(MVT::i32);
+
+  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(In)) {
+    SDLoc SL(In);
+    return CurDAG->getConstant(C->getZExtValue() << 16, SL, MVT::i32);
+  }
+
+  if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(In)) {
+    SDLoc SL(In);
+    return CurDAG->getConstant(
+      C->getValueAPF().bitcastToAPInt().getZExtValue() << 16, SL, MVT::i32);
+  }
+
+  SDValue Src;
+  if (isExtractHiElt(In, Src))
+    return Src;
+
+  return SDValue();
+}
+
 // TODO: Can we identify things like v_mad_mixhi_f16?
 bool AMDGPUDAGToDAGISel::SelectHi16Elt(SDValue In, SDValue &Src) const {
   if (In.isUndef()) {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 68171179c46..141b76cc303 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -4186,6 +4186,12 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
   NODE_NAME_CASE(INTERP_P1LL_F16)
   NODE_NAME_CASE(INTERP_P1LV_F16)
   NODE_NAME_CASE(INTERP_P2_F16)
+  NODE_NAME_CASE(LOAD_D16_HI)
+  NODE_NAME_CASE(LOAD_D16_LO)
+  NODE_NAME_CASE(LOAD_D16_HI_I8)
+  NODE_NAME_CASE(LOAD_D16_HI_U8)
+  NODE_NAME_CASE(LOAD_D16_LO_I8)
+  NODE_NAME_CASE(LOAD_D16_LO_U8)
   NODE_NAME_CASE(STORE_MSKOR)
   NODE_NAME_CASE(LOAD_CONSTANT)
   NODE_NAME_CASE(TBUFFER_STORE_FORMAT)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
index 359e16cfa56..fa05ee52a8f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -469,6 +469,13 @@ enum NodeType : unsigned {
   KILL,
   DUMMY_CHAIN,
   FIRST_MEM_OPCODE_NUMBER = ISD::FIRST_TARGET_MEMORY_OPCODE,
+  LOAD_D16_HI,
+  LOAD_D16_LO,
+  LOAD_D16_HI_I8,
+  LOAD_D16_HI_U8,
+  LOAD_D16_LO_I8,
+  LOAD_D16_LO_U8,
+
   STORE_MSKOR,
   LOAD_CONSTANT,
   TBUFFER_STORE_FORMAT,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
index bdbfdaf1dc1..7b981ea5639 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
@@ -802,7 +802,7 @@ multiclass IntMed3Pat<Instruction med3Inst,
                       SDPatternOperator max_oneuse,
                       ValueType vt = i32> {
 
-  // This matches 16 permutations of 
+  // This matches 16 permutations of
   // min(max(a, b), max(min(a, b), c))
   def : AMDGPUPat <
   (min (max_oneuse vt:$src0, vt:$src1),
@@ -810,7 +810,7 @@ multiclass IntMed3Pat<Instruction med3Inst,
   (med3Inst vt:$src0, vt:$src1, vt:$src2)
 >;
 
-  // This matches 16 permutations of 
+  // This matches 16 permutations of
   // max(min(x, y), min(max(x, y), z))
   def : AMDGPUPat <
   (max (min_oneuse vt:$src0, vt:$src1),
@@ -818,7 +818,7 @@ multiclass IntMed3Pat<Instruction med3Inst,
   (med3Inst $src0, $src1, $src2)
 >;
 }
- 
+
 // Special conversion patterns
 
 def cvt_rpi_i32_f32 : PatFrag <
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
index 0002e8e51be..91cc44cbd11 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -614,6 +614,10 @@ public:
     return getGeneration() >= GFX9;
   }
 
+  bool d16PreservesUnusedBits() const {
+    return hasD16LoadStore() && !isSRAMECCEnabled();
+  }
+
   /// Return if most LDS instructions have an m0 use that require m0 to be
   /// iniitalized.
   bool ldsRequiresM0Init() const {
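For the low-half rewrite in matchLoadD16FromBuildVector above, getHi16Elt must prove the other build_vector operand already lives in the high 16 bits of a dword: an undef, a constant (which it shifts left by 16), or an isExtractHiElt match of the form (trunc (srl x, 16)). A minimal sketch of that last shape, with a hypothetical function name, in the style of the tests added below:

; isExtractHiElt recognizes %hi as the high half of %packed, so the i16
; load can become a d16 load into the low half, tied to %packed, rather
; than a separate load plus a shift-and-or merge.
define <2 x i16> @load_lo_half(i16 addrspace(3)* %ptr, i32 %packed) {
  %lo = load i16, i16 addrspace(3)* %ptr
  %srl = lshr i32 %packed, 16
  %hi = trunc i32 %srl to i16
  %v0 = insertelement <2 x i16> undef, i16 %lo, i32 0
  %v1 = insertelement <2 x i16> %v0, i16 %hi, i32 1
  ret <2 x i16> %v1
}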
diff --git a/llvm/lib/Target/AMDGPU/BUFInstructions.td b/llvm/lib/Target/AMDGPU/BUFInstructions.td
index 20cc79ddaed..902cc3e0d4b 100644
--- a/llvm/lib/Target/AMDGPU/BUFInstructions.td
+++ b/llvm/lib/Target/AMDGPU/BUFInstructions.td
@@ -1376,60 +1376,17 @@ multiclass MUBUFScratchLoadPat <MUBUF_Pseudo InstrOffen,
 }
 
 // XXX - Is it possible to have a complex pattern in a PatFrag?
-multiclass MUBUFScratchLoadPat_Hi16 <MUBUF_Pseudo InstrOffen,
+multiclass MUBUFScratchLoadPat_D16 <MUBUF_Pseudo InstrOffen,
                                   MUBUF_Pseudo InstrOffset,
-                                  ValueType vt, PatFrag ld> {
-  def : GCNPat <
-    (build_vector vt:$lo, (vt (ld (MUBUFScratchOffen v4i32:$srsrc, i32:$vaddr,
-                                                     i32:$soffset, u16imm:$offset)))),
-    (v2i16 (InstrOffen $vaddr, $srsrc, $soffset, $offset, 0, 0, 0, $lo))
-  >;
-
-  def : GCNPat <
-    (build_vector f16:$lo, (f16 (bitconvert (vt (ld (MUBUFScratchOffen v4i32:$srsrc, i32:$vaddr,
-                                                                       i32:$soffset, u16imm:$offset)))))),
-    (v2f16 (InstrOffen $vaddr, $srsrc, $soffset, $offset, 0, 0, 0, $lo))
-  >;
-
-
-  def : GCNPat <
-    (build_vector vt:$lo, (vt (ld (MUBUFScratchOffset v4i32:$srsrc, i32:$soffset, u16imm:$offset)))),
-    (v2i16 (InstrOffset $srsrc, $soffset, $offset, 0, 0, 0, $lo))
-  >;
-
+                                  ValueType vt, PatFrag ld_frag> {
   def : GCNPat <
-    (build_vector f16:$lo, (f16 (bitconvert (vt (ld (MUBUFScratchOffset v4i32:$srsrc, i32:$soffset, u16imm:$offset)))))),
-    (v2f16 (InstrOffset $srsrc, $soffset, $offset, 0, 0, 0, $lo))
-  >;
-}
-
-multiclass MUBUFScratchLoadPat_Lo16 <MUBUF_Pseudo InstrOffen,
-                                     MUBUF_Pseudo InstrOffset,
-                                     ValueType vt, PatFrag ld> {
-  def : GCNPat <
-    (build_vector (vt (ld (MUBUFScratchOffen v4i32:$srsrc, i32:$vaddr,
-                                             i32:$soffset, u16imm:$offset))),
-                  (vt (Hi16Elt vt:$hi))),
-    (v2i16 (InstrOffen $vaddr, $srsrc, $soffset, $offset, 0, 0, 0, $hi))
+    (ld_frag (MUBUFScratchOffen v4i32:$srsrc, i32:$vaddr, i32:$soffset, u16imm:$offset), vt:$in),
+    (InstrOffen $vaddr, $srsrc, $soffset, $offset, 0, 0, 0, $in)
   >;
 
   def : GCNPat <
-    (build_vector (f16 (bitconvert (vt (ld (MUBUFScratchOffen v4i32:$srsrc, i32:$vaddr,
-                                                              i32:$soffset, u16imm:$offset))))),
-                  (f16 (Hi16Elt f16:$hi))),
-    (v2f16 (InstrOffen $vaddr, $srsrc, $soffset, $offset, 0, 0, 0, $hi))
-  >;
-
-  def : GCNPat <
-    (build_vector (vt (ld (MUBUFScratchOffset v4i32:$srsrc, i32:$soffset, u16imm:$offset))),
-                  (vt (Hi16Elt vt:$hi))),
-    (v2i16 (InstrOffset $srsrc, $soffset, $offset, 0, 0, 0, $hi))
-  >;
-
-  def : GCNPat <
-    (build_vector (f16 (bitconvert (vt (ld (MUBUFScratchOffset v4i32:$srsrc, i32:$soffset, u16imm:$offset))))),
-                  (f16 (Hi16Elt f16:$hi))),
-    (v2f16 (InstrOffset $srsrc, $soffset, $offset, 0, 0, 0, $hi))
+    (ld_frag (MUBUFScratchOffset v4i32:$srsrc, i32:$soffset, u16imm:$offset), vt:$in),
+    (InstrOffset $srsrc, $soffset, $offset, 0, 0, 0, $in)
   >;
 }
 
@@ -1445,13 +1402,19 @@ defm : MUBUFScratchLoadPat <BUFFER_LOAD_DWORDX2_OFFEN, BUFFER_LOAD_DWORDX2_OFFSE
 defm : MUBUFScratchLoadPat <BUFFER_LOAD_DWORDX4_OFFEN, BUFFER_LOAD_DWORDX4_OFFSET, v4i32, load_private>;
 
 let OtherPredicates = [D16PreservesUnusedBits] in {
-defm : MUBUFScratchLoadPat_Hi16<BUFFER_LOAD_SHORT_D16_HI_OFFEN, BUFFER_LOAD_SHORT_D16_HI_OFFSET, i16, load_private>;
-defm : MUBUFScratchLoadPat_Hi16<BUFFER_LOAD_UBYTE_D16_HI_OFFEN, BUFFER_LOAD_UBYTE_D16_HI_OFFSET, i16, az_extloadi8_private>;
-defm : MUBUFScratchLoadPat_Hi16<BUFFER_LOAD_SBYTE_D16_HI_OFFEN, BUFFER_LOAD_SBYTE_D16_HI_OFFSET, i16, sextloadi8_private>;
-
-defm : MUBUFScratchLoadPat_Lo16<BUFFER_LOAD_SHORT_D16_OFFEN, BUFFER_LOAD_SHORT_D16_OFFSET, i16, load_private>;
-defm : MUBUFScratchLoadPat_Lo16<BUFFER_LOAD_UBYTE_D16_OFFEN, BUFFER_LOAD_UBYTE_D16_OFFSET, i16, az_extloadi8_private>;
-defm : MUBUFScratchLoadPat_Lo16<BUFFER_LOAD_SBYTE_D16_OFFEN, BUFFER_LOAD_SBYTE_D16_OFFSET, i16, sextloadi8_private>;
+defm : MUBUFScratchLoadPat_D16<BUFFER_LOAD_SHORT_D16_HI_OFFEN, BUFFER_LOAD_SHORT_D16_HI_OFFSET, v2i16, load_d16_hi_private>;
+defm : MUBUFScratchLoadPat_D16<BUFFER_LOAD_UBYTE_D16_HI_OFFEN, BUFFER_LOAD_UBYTE_D16_HI_OFFSET, v2i16, az_extloadi8_d16_hi_private>;
+defm : MUBUFScratchLoadPat_D16<BUFFER_LOAD_SBYTE_D16_HI_OFFEN, BUFFER_LOAD_SBYTE_D16_HI_OFFSET, v2i16, sextloadi8_d16_hi_private>;
+defm : MUBUFScratchLoadPat_D16<BUFFER_LOAD_SHORT_D16_HI_OFFEN, BUFFER_LOAD_SHORT_D16_HI_OFFSET, v2f16, load_d16_hi_private>;
+defm : MUBUFScratchLoadPat_D16<BUFFER_LOAD_UBYTE_D16_HI_OFFEN, BUFFER_LOAD_UBYTE_D16_HI_OFFSET, v2f16, az_extloadi8_d16_hi_private>;
+defm : MUBUFScratchLoadPat_D16<BUFFER_LOAD_SBYTE_D16_HI_OFFEN, BUFFER_LOAD_SBYTE_D16_HI_OFFSET, v2f16, sextloadi8_d16_hi_private>;
+
+defm : MUBUFScratchLoadPat_D16<BUFFER_LOAD_SHORT_D16_OFFEN, BUFFER_LOAD_SHORT_D16_OFFSET, v2i16, load_d16_lo_private>;
+defm : MUBUFScratchLoadPat_D16<BUFFER_LOAD_UBYTE_D16_OFFEN, BUFFER_LOAD_UBYTE_D16_OFFSET, v2i16, az_extloadi8_d16_lo_private>;
+defm : MUBUFScratchLoadPat_D16<BUFFER_LOAD_SBYTE_D16_OFFEN, BUFFER_LOAD_SBYTE_D16_OFFSET, v2i16, sextloadi8_d16_lo_private>;
+defm : MUBUFScratchLoadPat_D16<BUFFER_LOAD_SHORT_D16_OFFEN, BUFFER_LOAD_SHORT_D16_OFFSET, v2f16, load_d16_lo_private>;
+defm : MUBUFScratchLoadPat_D16<BUFFER_LOAD_UBYTE_D16_OFFEN, BUFFER_LOAD_UBYTE_D16_OFFSET, v2f16, az_extloadi8_d16_lo_private>;
+defm : MUBUFScratchLoadPat_D16<BUFFER_LOAD_SBYTE_D16_OFFEN, BUFFER_LOAD_SBYTE_D16_OFFSET, v2f16, sextloadi8_d16_lo_private>;
 }
 
 multiclass MUBUFStore_Atomic_Pattern <MUBUF_Pseudo Instr_ADDR64, MUBUF_Pseudo Instr_OFFSET,
                                       ValueType vt, PatFrag atomic_st> {
diff --git a/llvm/lib/Target/AMDGPU/DSInstructions.td b/llvm/lib/Target/AMDGPU/DSInstructions.td
index d255763d39a..246be340c32 100644
--- a/llvm/lib/Target/AMDGPU/DSInstructions.td
+++ b/llvm/lib/Target/AMDGPU/DSInstructions.td
@@ -611,30 +611,10 @@ multiclass DSReadPat_mc<DS_Pseudo inst, ValueType vt, string frag> {
   }
 }
 
-
-multiclass DSReadPat_Hi16 <DS_Pseudo inst, PatFrag frag, ValueType vt = i16> {
-  def : GCNPat <
-    (build_vector vt:$lo, (vt (frag (DS1Addr1Offset i32:$ptr, i32:$offset)))),
-    (v2i16 (inst $ptr, (as_i16imm $offset), (i1 0), $lo))
-  >;
-
-  def : GCNPat <
-    (build_vector f16:$lo, (f16 (bitconvert (vt (frag (DS1Addr1Offset i32:$ptr, i32:$offset)))))),
-    (v2f16 (inst $ptr, (as_i16imm $offset), (i1 0), $lo))
-  >;
-}
-
-multiclass DSReadPat_Lo16 <DS_Pseudo inst, PatFrag frag, ValueType vt = i16> {
-  def : GCNPat <
-    (build_vector (vt (frag (DS1Addr1Offset i32:$ptr, i32:$offset))), (vt (Hi16Elt vt:$hi))),
-    (v2i16 (inst $ptr, (as_i16imm $offset), 0, $hi))
-  >;
-
-  def : GCNPat <
-    (build_vector (f16 (bitconvert (vt (frag (DS1Addr1Offset i32:$ptr, i32:$offset))))), (f16 (Hi16Elt f16:$hi))),
-    (v2f16 (inst $ptr, (as_i16imm $offset), 0, $hi))
-  >;
-}
+class DSReadPat_D16 <DS_Pseudo inst, PatFrag frag, ValueType vt> : GCNPat <
+  (frag (DS1Addr1Offset i32:$ptr, i32:$offset), vt:$in),
+  (inst $ptr, (as_i16imm $offset), (i1 0), $in)
+>;
 
 defm : DSReadPat_mc <DS_READ_I8, i32, "sextloadi8_local">;
 defm : DSReadPat_mc <DS_READ_U8, i32, "az_extloadi8_local">;
@@ -656,16 +636,19 @@ defm : DSReadPat_mc <DS_READ_B128, v4i32, "load_align16_local">;
 } // End AddedComplexity = 100
 
 let OtherPredicates = [D16PreservesUnusedBits] in {
-let AddedComplexity = 100 in {
-defm : DSReadPat_Hi16<DS_READ_U16_D16_HI, load_local>;
-defm : DSReadPat_Hi16<DS_READ_U8_D16_HI, az_extloadi8_local>;
-defm : DSReadPat_Hi16<DS_READ_I8_D16_HI, sextloadi8_local>;
-
-defm : DSReadPat_Lo16<DS_READ_U16_D16, load_local>;
-defm : DSReadPat_Lo16<DS_READ_U8_D16, az_extloadi8_local>;
-defm : DSReadPat_Lo16<DS_READ_I8_D16, sextloadi8_local>;
-
-}
+def : DSReadPat_D16<DS_READ_U16_D16_HI, load_d16_hi_local, v2i16>;
+def : DSReadPat_D16<DS_READ_U16_D16_HI, load_d16_hi_local, v2f16>;
+def : DSReadPat_D16<DS_READ_U8_D16_HI, az_extloadi8_d16_hi_local, v2i16>;
+def : DSReadPat_D16<DS_READ_U8_D16_HI, az_extloadi8_d16_hi_local, v2f16>;
+def : DSReadPat_D16<DS_READ_I8_D16_HI, sextloadi8_d16_hi_local, v2i16>;
+def : DSReadPat_D16<DS_READ_I8_D16_HI, sextloadi8_d16_hi_local, v2f16>;
+
+def : DSReadPat_D16<DS_READ_U16_D16, load_d16_lo_local, v2i16>;
+def : DSReadPat_D16<DS_READ_U16_D16, load_d16_lo_local, v2f16>;
+def : DSReadPat_D16<DS_READ_U8_D16, az_extloadi8_d16_lo_local, v2i16>;
+def : DSReadPat_D16<DS_READ_U8_D16, az_extloadi8_d16_lo_local, v2f16>;
+def : DSReadPat_D16<DS_READ_I8_D16, sextloadi8_d16_lo_local, v2i16>;
+def : DSReadPat_D16<DS_READ_I8_D16, sextloadi8_d16_lo_local, v2f16>;
 }
 
 class DSWritePat <DS_Pseudo inst, ValueType vt, PatFrag frag> : GCNPat <
diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td
index 2179b21e0a6..cefcf90d0d2 100644
--- a/llvm/lib/Target/AMDGPU/FLATInstructions.td
+++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td
@@ -663,53 +663,15 @@ class FlatLoadPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCN
   (inst $vaddr, $offset, 0, $slc)
 >;
 
-multiclass FlatLoadPat_Hi16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt = i16> {
-  def : GCNPat <
-    (build_vector vt:$elt0, (vt (node (FLATOffset i64:$vaddr, i16:$offset, i1:$slc)))),
-    (v2i16 (inst $vaddr, $offset, 0, $slc, $elt0))
-  >;
-
-  def : GCNPat <
-    (build_vector f16:$elt0, (f16 (bitconvert (vt (node (FLATOffset i64:$vaddr, i16:$offset, i1:$slc)))))),
-    (v2f16 (inst $vaddr, $offset, 0, $slc, $elt0))
-  >;
-}
-
-multiclass FlatSignedLoadPat_Hi16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt = i16> {
-  def : GCNPat <
-    (build_vector vt:$elt0, (vt (node (FLATOffsetSigned i64:$vaddr, i16:$offset, i1:$slc)))),
-    (v2i16 (inst $vaddr, $offset, 0, $slc, $elt0))
-  >;
-
-  def : GCNPat <
-    (build_vector f16:$elt0, (f16 (bitconvert (vt (node (FLATOffsetSigned i64:$vaddr, i16:$offset, i1:$slc)))))),
-    (v2f16 (inst $vaddr, $offset, 0, $slc, $elt0))
-  >;
-}
-
-multiclass FlatLoadPat_Lo16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt = i16> {
-  def : GCNPat <
-    (build_vector (vt (node (FLATOffset i64:$vaddr, i16:$offset, i1:$slc))), (vt (Hi16Elt vt:$hi))),
-    (v2i16 (inst $vaddr, $offset, 0, $slc, $hi))
-  >;
-
-  def : GCNPat <
-    (build_vector (f16 (bitconvert (vt (node (FLATOffset i64:$vaddr, i16:$offset, i1:$slc))))), (f16 (Hi16Elt f16:$hi))),
-    (v2f16 (inst $vaddr, $offset, 0, $slc, $hi))
-  >;
-}
-
-multiclass FlatSignedLoadPat_Lo16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt = i16> {
-  def : GCNPat <
-    (build_vector (vt (node (FLATOffsetSigned i64:$vaddr, i16:$offset, i1:$slc))), (vt (Hi16Elt vt:$hi))),
-    (v2i16 (inst $vaddr, $offset, 0, $slc, $hi))
-  >;
+class FlatLoadPat_D16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
+  (node (FLATOffset i64:$vaddr, i16:$offset, i1:$slc), vt:$in),
+  (inst $vaddr, $offset, 0, $slc, $in)
+>;
 
-  def : GCNPat <
-    (build_vector (f16 (bitconvert (vt (node (FLATOffsetSigned i64:$vaddr, i16:$offset, i1:$slc))))), (f16 (Hi16Elt f16:$hi))),
-    (v2f16 (inst $vaddr, $offset, 0, $slc, $hi))
-  >;
-}
+class FlatSignedLoadPat_D16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
+  (node (FLATOffsetSigned i64:$vaddr, i16:$offset, i1:$slc), vt:$in),
+  (inst $vaddr, $offset, 0, $slc, $in)
+>;
 
 class FlatLoadAtomicPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
   (vt (node (FLATAtomic i64:$vaddr, i16:$offset, i1:$slc))),
@@ -817,17 +779,19 @@ let OtherPredicates = [D16PreservesUnusedBits] in {
 def : FlatStorePat <FLAT_STORE_SHORT_D16_HI, truncstorei16_hi16_flat, i32>;
 def : FlatStorePat <FLAT_STORE_BYTE_D16_HI, truncstorei8_hi16_flat, i32>;
 
-let AddedComplexity = 3 in {
-defm : FlatLoadPat_Hi16 <FLAT_LOAD_UBYTE_D16_HI, az_extloadi8_flat>;
-defm : FlatLoadPat_Hi16 <FLAT_LOAD_SBYTE_D16_HI, sextloadi8_flat>;
-defm : FlatLoadPat_Hi16 <FLAT_LOAD_SHORT_D16_HI, load_flat>;
-}
-
-let AddedComplexity = 9 in {
-defm : FlatLoadPat_Lo16 <FLAT_LOAD_UBYTE_D16, az_extloadi8_flat>;
-defm : FlatLoadPat_Lo16 <FLAT_LOAD_SBYTE_D16, sextloadi8_flat>;
-defm : FlatLoadPat_Lo16 <FLAT_LOAD_SHORT_D16, load_flat>;
-}
+def : FlatLoadPat_D16 <FLAT_LOAD_UBYTE_D16_HI, az_extloadi8_d16_hi_flat, v2i16>;
+def : FlatLoadPat_D16 <FLAT_LOAD_UBYTE_D16_HI, az_extloadi8_d16_hi_flat, v2f16>;
+def : FlatLoadPat_D16 <FLAT_LOAD_SBYTE_D16_HI, sextloadi8_d16_hi_flat, v2i16>;
+def : FlatLoadPat_D16 <FLAT_LOAD_SBYTE_D16_HI, sextloadi8_d16_hi_flat, v2f16>;
+def : FlatLoadPat_D16 <FLAT_LOAD_SHORT_D16_HI, load_d16_hi_flat, v2i16>;
+def : FlatLoadPat_D16 <FLAT_LOAD_SHORT_D16_HI, load_d16_hi_flat, v2f16>;
+
+def : FlatLoadPat_D16 <FLAT_LOAD_UBYTE_D16, az_extloadi8_d16_lo_flat, v2i16>;
+def : FlatLoadPat_D16 <FLAT_LOAD_UBYTE_D16, az_extloadi8_d16_lo_flat, v2f16>;
+def : FlatLoadPat_D16 <FLAT_LOAD_SBYTE_D16, sextloadi8_d16_lo_flat, v2i16>;
+def : FlatLoadPat_D16 <FLAT_LOAD_SBYTE_D16, sextloadi8_d16_lo_flat, v2f16>;
+def : FlatLoadPat_D16 <FLAT_LOAD_SHORT_D16, load_d16_lo_flat, v2i16>;
+def : FlatLoadPat_D16 <FLAT_LOAD_SHORT_D16, load_d16_lo_flat, v2f16>;
 }
 
 } // End OtherPredicates = [HasFlatAddressSpace]
@@ -861,14 +825,19 @@ let OtherPredicates = [D16PreservesUnusedBits] in {
 def : FlatStoreSignedPat <GLOBAL_STORE_SHORT_D16_HI, truncstorei16_hi16_global, i32>;
 def : FlatStoreSignedPat <GLOBAL_STORE_BYTE_D16_HI, truncstorei8_hi16_global, i32>;
 
-defm : FlatSignedLoadPat_Hi16 <GLOBAL_LOAD_UBYTE_D16_HI, az_extloadi8_global>;
-defm : FlatSignedLoadPat_Hi16 <GLOBAL_LOAD_SBYTE_D16_HI, sextloadi8_global>;
-defm : FlatSignedLoadPat_Hi16 <GLOBAL_LOAD_SHORT_D16_HI, load_global>;
-
-defm : FlatSignedLoadPat_Lo16 <GLOBAL_LOAD_UBYTE_D16, az_extloadi8_global>;
-defm : FlatSignedLoadPat_Lo16 <GLOBAL_LOAD_SBYTE_D16, sextloadi8_global>;
-defm : FlatSignedLoadPat_Lo16 <GLOBAL_LOAD_SHORT_D16, load_global>;
-
+def : FlatSignedLoadPat_D16 <GLOBAL_LOAD_UBYTE_D16_HI, az_extloadi8_d16_hi_global, v2i16>;
+def : FlatSignedLoadPat_D16 <GLOBAL_LOAD_UBYTE_D16_HI, az_extloadi8_d16_hi_global, v2f16>;
+def : FlatSignedLoadPat_D16 <GLOBAL_LOAD_SBYTE_D16_HI, sextloadi8_d16_hi_global, v2i16>;
+def : FlatSignedLoadPat_D16 <GLOBAL_LOAD_SBYTE_D16_HI, sextloadi8_d16_hi_global, v2f16>;
+def : FlatSignedLoadPat_D16 <GLOBAL_LOAD_SHORT_D16_HI, load_d16_hi_global, v2i16>;
+def : FlatSignedLoadPat_D16 <GLOBAL_LOAD_SHORT_D16_HI, load_d16_hi_global, v2f16>;
+
+def : FlatSignedLoadPat_D16 <GLOBAL_LOAD_UBYTE_D16, az_extloadi8_d16_lo_global, v2i16>;
+def : FlatSignedLoadPat_D16 <GLOBAL_LOAD_UBYTE_D16, az_extloadi8_d16_lo_global, v2f16>;
+def : FlatSignedLoadPat_D16 <GLOBAL_LOAD_SBYTE_D16, sextloadi8_d16_lo_global, v2i16>;
+def : FlatSignedLoadPat_D16 <GLOBAL_LOAD_SBYTE_D16, sextloadi8_d16_lo_global, v2f16>;
+def : FlatSignedLoadPat_D16 <GLOBAL_LOAD_SHORT_D16, load_d16_lo_global, v2i16>;
+def : FlatSignedLoadPat_D16 <GLOBAL_LOAD_SHORT_D16, load_d16_lo_global, v2f16>;
 }
 
 def : FlatStoreSignedAtomicPat <GLOBAL_STORE_DWORD, store_atomic_global, i32>;
@@ -902,7 +871,7 @@ def : FlatSignedAtomicPat <GLOBAL_ATOMIC_SWAP_X2_RTN, atomic_swap_global, i64>;
 def : FlatSignedAtomicPat <GLOBAL_ATOMIC_CMPSWAP_X2_RTN, AMDGPUatomic_cmp_swap_global, i64, v2i64>;
 def : FlatSignedAtomicPat <GLOBAL_ATOMIC_XOR_X2_RTN, atomic_xor_global, i64>;
 
-} // End OtherPredicates = [HasFlatGlobalInsts]
+} // End OtherPredicates = [HasFlatGlobalInsts], AddedComplexity = 10
 
 
 //===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index 8310b42d28b..c5811e47145 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -69,6 +69,13 @@ def SIatomic_fmax : SDNode<"AMDGPUISD::ATOMIC_LOAD_FMAX", SDTAtomic2_f32,
   [SDNPMayLoad, SDNPMayStore, SDNPMemOperand, SDNPHasChain]
 >;
 
+// load_d16_{lo|hi} ptr, tied_input
+def SIload_d16 : SDTypeProfile<1, 2, [
+  SDTCisPtrTy<1>,
+  SDTCisSameAs<0, 2>
+]>;
+
+
 def SDTtbuffer_load : SDTypeProfile<1, 8,
   [                     // vdata
    SDTCisVT<1, v4i32>,  // rsrc
@@ -187,6 +194,36 @@ def SIpc_add_rel_offset : SDNode<"AMDGPUISD::PC_ADD_REL_OFFSET",
   SDTypeProfile<1, 2, [SDTCisVT<0, iPTR>, SDTCisSameAs<0,1>, SDTCisSameAs<0,2>]>
 >;
 
+def SIload_d16_lo : SDNode<"AMDGPUISD::LOAD_D16_LO",
+  SIload_d16,
+  [SDNPMayLoad, SDNPMemOperand, SDNPHasChain]
+>;
+
+def SIload_d16_lo_u8 : SDNode<"AMDGPUISD::LOAD_D16_LO_U8",
+  SIload_d16,
+  [SDNPMayLoad, SDNPMemOperand, SDNPHasChain]
+>;
+
+def SIload_d16_lo_i8 : SDNode<"AMDGPUISD::LOAD_D16_LO_I8",
+  SIload_d16,
+  [SDNPMayLoad, SDNPMemOperand, SDNPHasChain]
+>;
+
+def SIload_d16_hi : SDNode<"AMDGPUISD::LOAD_D16_HI",
+  SIload_d16,
+  [SDNPMayLoad, SDNPMemOperand, SDNPHasChain]
+>;
+
+def SIload_d16_hi_u8 : SDNode<"AMDGPUISD::LOAD_D16_HI_U8",
+  SIload_d16,
+  [SDNPMayLoad, SDNPMemOperand, SDNPHasChain]
+>;
+
+def SIload_d16_hi_i8 : SDNode<"AMDGPUISD::LOAD_D16_HI_I8",
+  SIload_d16,
+  [SDNPMayLoad, SDNPMemOperand, SDNPHasChain]
+>;
+
 //===----------------------------------------------------------------------===//
 // ValueType helpers
 //===----------------------------------------------------------------------===//
@@ -384,6 +421,51 @@ def si_setcc_uniform : PatFrag <
   return true;
 }]>;
 
+//===----------------------------------------------------------------------===//
+// SDNodes PatFrags for d16 loads
+//===----------------------------------------------------------------------===//
+
+class LoadD16Frag <SDPatternOperator op> : PatFrag<(ops node:$ptr, node:$tied_in), (op node:$ptr, node:$tied_in)>;
+class LocalLoadD16 <SDPatternOperator op> : LoadD16Frag <op>, LocalAddress;
+class GlobalLoadD16 <SDPatternOperator op> : LoadD16Frag <op>, GlobalLoadAddress;
+class PrivateLoadD16 <SDPatternOperator op> : LoadD16Frag <op>, PrivateAddress;
+class FlatLoadD16 <SDPatternOperator op> : LoadD16Frag <op>, FlatLoadAddress;
+
+def load_d16_hi_local : LocalLoadD16 <SIload_d16_hi>;
+def az_extloadi8_d16_hi_local : LocalLoadD16 <SIload_d16_hi_u8>;
+def sextloadi8_d16_hi_local : LocalLoadD16 <SIload_d16_hi_i8>;
+
+def load_d16_hi_global : GlobalLoadD16 <SIload_d16_hi>;
+def az_extloadi8_d16_hi_global : GlobalLoadD16 <SIload_d16_hi_u8>;
+def sextloadi8_d16_hi_global : GlobalLoadD16 <SIload_d16_hi_i8>;
+
+def load_d16_hi_private : PrivateLoadD16 <SIload_d16_hi>;
+def az_extloadi8_d16_hi_private : PrivateLoadD16 <SIload_d16_hi_u8>;
+def sextloadi8_d16_hi_private : PrivateLoadD16 <SIload_d16_hi_i8>;
+
+def load_d16_hi_flat : FlatLoadD16 <SIload_d16_hi>;
+def az_extloadi8_d16_hi_flat : FlatLoadD16 <SIload_d16_hi_u8>;
+def sextloadi8_d16_hi_flat : FlatLoadD16 <SIload_d16_hi_i8>;
+
+
+def load_d16_lo_local : LocalLoadD16 <SIload_d16_lo>;
+def az_extloadi8_d16_lo_local : LocalLoadD16 <SIload_d16_lo_u8>;
+def sextloadi8_d16_lo_local : LocalLoadD16 <SIload_d16_lo_i8>;
+
+def load_d16_lo_global : GlobalLoadD16 <SIload_d16_lo>;
+def az_extloadi8_d16_lo_global : GlobalLoadD16 <SIload_d16_lo_u8>;
+def sextloadi8_d16_lo_global : GlobalLoadD16 <SIload_d16_lo_i8>;
+
+def load_d16_lo_private : PrivateLoadD16 <SIload_d16_lo>;
+def az_extloadi8_d16_lo_private : PrivateLoadD16 <SIload_d16_lo_u8>;
+def sextloadi8_d16_lo_private : PrivateLoadD16 <SIload_d16_lo_i8>;
+
+def load_d16_lo_flat : FlatLoadD16 <SIload_d16_lo>;
+def az_extloadi8_d16_lo_flat : FlatLoadD16 <SIload_d16_lo_u8>;
+def sextloadi8_d16_lo_flat : FlatLoadD16 <SIload_d16_lo_i8>;
+
+
 
 def lshr_rev : PatFrag <
   (ops node:$src1, node:$src0),
   (srl $src0, $src1)
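Because the new DSReadPat_D16, FlatLoadPat_D16, and MUBUFScratchLoadPat_D16 patterns are keyed on the result type, each instruction is instantiated once for v2i16 and once for v2f16, replacing the old f16 bitconvert build_vector variants. The v2f16 instantiation covers inputs like this sketch (hypothetical name, same shape as the earlier example but with half elements):

; With half elements the same tied-operand pattern, instantiated at
; v2f16, still selects ds_read_u16_d16_hi for the high-half load.
define <2 x half> @load_hi_half_f16(half addrspace(3)* %ptr, half %lo) {
  %hi = load half, half addrspace(3)* %ptr
  %v0 = insertelement <2 x half> undef, half %lo, i32 0
  %v1 = insertelement <2 x half> %v0, half %hi, i32 1
  ret <2 x half> %v1
}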
diff --git a/llvm/test/CodeGen/AMDGPU/build-vector-insert-elt-infloop.ll b/llvm/test/CodeGen/AMDGPU/build-vector-insert-elt-infloop.ll
index fd81c0438d6..433baf43861 100644
--- a/llvm/test/CodeGen/AMDGPU/build-vector-insert-elt-infloop.ll
+++ b/llvm/test/CodeGen/AMDGPU/build-vector-insert-elt-infloop.ll
@@ -4,9 +4,8 @@
 ; combine and a generic insert_vector_elt combine.
 
 ; GCN-LABEL: {{^}}combine_loop:
-; GCN: flat_load_ushort
+; GCN: flat_load_short_d16_hi
 ; GCN: flat_store_short
-; GCN: v_lshlrev_b32_e32 v{{[0-9]+}}, 16,
 define amdgpu_kernel void @combine_loop(i16* %arg) #0 {
 bb:
   br label %bb1
diff --git a/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll b/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll
index 696b33e75fe..5fd4e065ccd 100644
--- a/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll
+++ b/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX900 %s
 
 ; GCN-LABEL: {{^}}chain_hi_to_lo_private:
 ; GCN: buffer_load_ushort [[DST:v[0-9]+]], off, [[RSRC:s\[[0-9]+:[0-9]+\]]], [[SOFF:s[0-9]+]] offset:2
@@ -175,3 +175,128 @@ entry:
   %loc.0.sroa_cast2 = bitcast [3 x i16] addrspace(5)* %loc to i8 addrspace(5)*
   ret void
 }
+
+; There is another instruction between the misordered instruction and
+; the value dependent load, so a simple operand check is insufficient.
+; GCN-LABEL: {{^}}chain_hi_to_lo_group_other_dep:
+; GFX900: ds_read_u16_d16_hi v1, v0
+; GFX900-NEXT: s_waitcnt lgkmcnt(0)
+; GFX900-NEXT: v_pk_add_u16 v1, v1, 12 op_sel_hi:[1,0]
+; GFX900-NEXT: ds_read_u16_d16 v1, v0 offset:2
+; GFX900-NEXT: s_waitcnt lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v0, v1
+; GFX900-NEXT: s_setpc_b64
+define <2 x i16> @chain_hi_to_lo_group_other_dep(i16 addrspace(3)* %ptr) {
+bb:
+  %gep_lo = getelementptr inbounds i16, i16 addrspace(3)* %ptr, i64 1
+  %load_lo = load i16, i16 addrspace(3)* %gep_lo
+  %gep_hi = getelementptr inbounds i16, i16 addrspace(3)* %ptr, i64 0
+  %load_hi = load i16, i16 addrspace(3)* %gep_hi
+  %to.hi = insertelement <2 x i16> undef, i16 %load_hi, i32 1
+  %op.hi = add <2 x i16> %to.hi, <i16 12, i16 12>
+  %result = insertelement <2 x i16> %op.hi, i16 %load_lo, i32 0
+  ret <2 x i16> %result
+}
+
+; The volatile operations aren't put on the same chain
+; GCN-LABEL: {{^}}chain_hi_to_lo_group_other_dep_multi_chain:
+; GFX900: ds_read_u16 v1, v0 offset:2
+; GFX900-NEXT: ds_read_u16_d16_hi v0, v0
+; GFX900-NEXT: v_mov_b32_e32 [[MASK:v[0-9]+]], 0xffff
+; GFX900-NEXT: s_waitcnt lgkmcnt(0)
+; GFX900-NEXT: v_pk_add_u16 v0, v0, 12 op_sel_hi:[1,0]
+; GFX900-NEXT: v_bfi_b32 v0, [[MASK]], v1, v0
+; GFX900-NEXT: s_setpc_b64
+define <2 x i16> @chain_hi_to_lo_group_other_dep_multi_chain(i16 addrspace(3)* %ptr) {
+bb:
+  %gep_lo = getelementptr inbounds i16, i16 addrspace(3)* %ptr, i64 1
+  %load_lo = load volatile i16, i16 addrspace(3)* %gep_lo
+  %gep_hi = getelementptr inbounds i16, i16 addrspace(3)* %ptr, i64 0
+  %load_hi = load volatile i16, i16 addrspace(3)* %gep_hi
+  %to.hi = insertelement <2 x i16> undef, i16 %load_hi, i32 1
+  %op.hi = add <2 x i16> %to.hi, <i16 12, i16 12>
+  %result = insertelement <2 x i16> %op.hi, i16 %load_lo, i32 0
+  ret <2 x i16> %result
+}
+
+; GCN-LABEL: {{^}}chain_hi_to_lo_private_other_dep:
+; GFX900: buffer_load_short_d16_hi v1, v0, s[0:3], s4 offen
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: v_pk_add_u16 v1, v1, 12 op_sel_hi:[1,0]
+; GFX900-NEXT: buffer_load_short_d16 v1, v0, s[0:3], s4 offen offset:2
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v0, v1
+; GFX900-NEXT: s_setpc_b64
+define <2 x i16> @chain_hi_to_lo_private_other_dep(i16 addrspace(5)* %ptr) {
+bb:
+  %gep_lo = getelementptr inbounds i16, i16 addrspace(5)* %ptr, i64 1
+  %load_lo = load i16, i16 addrspace(5)* %gep_lo
+  %gep_hi = getelementptr inbounds i16, i16 addrspace(5)* %ptr, i64 0
+  %load_hi = load i16, i16 addrspace(5)* %gep_hi
+  %to.hi = insertelement <2 x i16> undef, i16 %load_hi, i32 1
+  %op.hi = add <2 x i16> %to.hi, <i16 12, i16 12>
+  %result = insertelement <2 x i16> %op.hi, i16 %load_lo, i32 0
+  ret <2 x i16> %result
+}
+
+; GCN-LABEL: {{^}}chain_hi_to_lo_global_other_dep:
+; GFX900: global_load_ushort v2, v[0:1], off offset:2
+; GFX900-NEXT: global_load_short_d16_hi v0, v[0:1], off
+; GFX900-NEXT: v_mov_b32_e32 [[MASK:v[0-9]+]], 0xffff
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: v_pk_add_u16 v0, v0, 12 op_sel_hi:[1,0]
+; GFX900-NEXT: v_bfi_b32 v0, [[MASK]], v2, v0
+; GFX900-NEXT: s_setpc_b64
+define <2 x i16> @chain_hi_to_lo_global_other_dep(i16 addrspace(1)* %ptr) {
+bb:
+  %gep_lo = getelementptr inbounds i16, i16 addrspace(1)* %ptr, i64 1
+  %load_lo = load volatile i16, i16 addrspace(1)* %gep_lo
+  %gep_hi = getelementptr inbounds i16, i16 addrspace(1)* %ptr, i64 0
+  %load_hi = load volatile i16, i16 addrspace(1)* %gep_hi
+  %to.hi = insertelement <2 x i16> undef, i16 %load_hi, i32 1
+  %op.hi = add <2 x i16> %to.hi, <i16 12, i16 12>
+  %result = insertelement <2 x i16> %op.hi, i16 %load_lo, i32 0
+  ret <2 x i16> %result
+}
+
+; GCN-LABEL: {{^}}chain_hi_to_lo_flat_other_dep:
+; GFX900: flat_load_ushort v2, v[0:1] offset:2
+; GFX900-NEXT: flat_load_short_d16_hi v0, v[0:1]
+; GFX900-NEXT: v_mov_b32_e32 [[MASK:v[0-9]+]], 0xffff
+; GFX900-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_pk_add_u16 v0, v0, 12 op_sel_hi:[1,0]
+; GFX900-NEXT: v_bfi_b32 v0, v1, v2, v0
+; GFX900-NEXT: s_setpc_b64
+define <2 x i16> @chain_hi_to_lo_flat_other_dep(i16 addrspace(0)* %ptr) {
+bb:
+  %gep_lo = getelementptr inbounds i16, i16 addrspace(0)* %ptr, i64 1
+  %load_lo = load volatile i16, i16 addrspace(0)* %gep_lo
+  %gep_hi = getelementptr inbounds i16, i16 addrspace(0)* %ptr, i64 0
+  %load_hi = load volatile i16, i16 addrspace(0)* %gep_hi
+  %to.hi = insertelement <2 x i16> undef, i16 %load_hi, i32 1
+  %op.hi = add <2 x i16> %to.hi, <i16 12, i16 12>
+  %result = insertelement <2 x i16> %op.hi, i16 %load_lo, i32 0
+  ret <2 x i16> %result
+}
+
+; GCN-LABEL: {{^}}chain_hi_to_lo_group_may_alias_store:
+; GFX900: v_mov_b32_e32 [[K:v[0-9]+]], 0x7b
+; GFX900-NEXT: ds_read_u16 v3, v0
+; GFX900-NEXT: ds_write_b16 v1, [[K]]
+; GFX900-NEXT: ds_read_u16 v0, v0 offset:2
+; GFX900-NEXT: s_waitcnt lgkmcnt(0)
+; GFX900-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX900-NEXT: v_lshl_or_b32 v0, v3, 16, v0
+; GFX900-NEXT: s_setpc_b64
+define <2 x i16> @chain_hi_to_lo_group_may_alias_store(i16 addrspace(3)* %ptr, i16 addrspace(3)* %may.alias) {
+bb:
+  %gep_lo = getelementptr inbounds i16, i16 addrspace(3)* %ptr, i64 1
+  %gep_hi = getelementptr inbounds i16, i16 addrspace(3)* %ptr, i64 0
+  %load_hi = load i16, i16 addrspace(3)* %gep_hi
+  store i16 123, i16 addrspace(3)* %may.alias
+  %load_lo = load i16, i16 addrspace(3)* %gep_lo
+
+  %to.hi = insertelement <2 x i16> undef, i16 %load_hi, i32 1
+  %result = insertelement <2 x i16> %to.hi, i16 %load_lo, i32 0
+  ret <2 x i16> %result
+}
diff --git a/llvm/test/CodeGen/AMDGPU/load-hi16.ll b/llvm/test/CodeGen/AMDGPU/load-hi16.ll
index ee5737c2065..357ce3d9a9a 100644
--- a/llvm/test/CodeGen/AMDGPU/load-hi16.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-hi16.ll
@@ -880,6 +880,21 @@ entry:
   ret <2 x i16> %build1
 }
 
+; FIXME: Remove and
+; GCN-LABEL: {{^}}load_local_v2i16_broadcast:
+; GCN: ds_read_u16 [[LOAD:v[0-9]+]]
+; GCN-NOT: ds_read
+; GFX9: v_and_b32_e32 [[AND:v[0-9]+]], 0xffff, [[LOAD]]
+; GFX9: v_lshl_or_b32 v0, [[LOAD]], 16, [[AND]]
+define <2 x i16> @load_local_v2i16_broadcast(i16 addrspace(3)* %in) #0 {
+entry:
+  %gep = getelementptr inbounds i16, i16 addrspace(3)* %in, i32 1
+  %load0 = load i16, i16 addrspace(3)* %in
+  %build0 = insertelement <2 x i16> undef, i16 %load0, i32 0
+  %build1 = insertelement <2 x i16> %build0, i16 %load0, i32 1
+  ret <2 x i16> %build1
+}
+
 ; GCN-LABEL: {{^}}load_local_lo_hi_v2i16_side_effect:
 ; GFX900: ds_read_u16 [[LOAD0:v[0-9]+]], v0
 ; GFX900: ds_write_b16