-rw-r--r--   llvm/lib/Target/AMDGPU/AMDGPU.td                               2
-rw-r--r--   llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp                209
-rw-r--r--   llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp                  6
-rw-r--r--   llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h                    7
-rw-r--r--   llvm/lib/Target/AMDGPU/AMDGPUInstructions.td                   6
-rw-r--r--   llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h                       4
-rw-r--r--   llvm/lib/Target/AMDGPU/BUFInstructions.td                     75
-rw-r--r--   llvm/lib/Target/AMDGPU/DSInstructions.td                      51
-rw-r--r--   llvm/lib/Target/AMDGPU/FLATInstructions.td                   101
-rw-r--r--   llvm/lib/Target/AMDGPU/SIInstrInfo.td                         82
-rw-r--r--   llvm/test/CodeGen/AMDGPU/build-vector-insert-elt-infloop.ll    3
-rw-r--r--   llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll                   127
-rw-r--r--   llvm/test/CodeGen/AMDGPU/load-hi16.ll                         15
13 files changed, 490 insertions, 198 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index 159be2070ac..39fb3949dae 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -691,7 +691,7 @@ def HasPackedD16VMem : Predicate<"!Subtarget->hasUnpackedD16VMem()">,
AssemblerPredicate<"!FeatureUnpackedD16VMem">;
def D16PreservesUnusedBits :
- Predicate<"Subtarget->hasD16LoadStore() && !Subtarget->isSRAMECCEnabled()">,
+ Predicate<"Subtarget->d16PreservesUnusedBits()">,
AssemblerPredicate<"FeatureGFX9Insts,!FeatureSRAMECC">;
def LDSRequiresM0Init : Predicate<"Subtarget->ldsRequiresM0Init()">;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index 2cdd691fc10..c62420ec032 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -51,6 +51,8 @@
#include <new>
#include <vector>
+#define DEBUG_TYPE "isel"
+
using namespace llvm;
namespace llvm {
@@ -88,7 +90,10 @@ public:
SelectionDAGISel::getAnalysisUsage(AU);
}
+ bool matchLoadD16FromBuildVector(SDNode *N) const;
+
bool runOnMachineFunction(MachineFunction &MF) override;
+ void PreprocessISelDAG() override;
void Select(SDNode *N) override;
StringRef getPassName() const override;
void PostprocessISelDAG() override;
@@ -193,6 +198,7 @@ private:
bool SelectVOP3PMadMixModsImpl(SDValue In, SDValue &Src, unsigned &Mods) const;
bool SelectVOP3PMadMixMods(SDValue In, SDValue &Src, SDValue &SrcMods) const;
+ SDValue getHi16Elt(SDValue In) const;
bool SelectHi16Elt(SDValue In, SDValue &Src) const;
void SelectADD_SUB_I64(SDNode *N);
@@ -236,11 +242,49 @@ public:
SDValue &Offset) override;
bool runOnMachineFunction(MachineFunction &MF) override;
+
+ void PreprocessISelDAG() override {}
+
protected:
// Include the pieces autogenerated from the target description.
#include "R600GenDAGISel.inc"
};
+static SDValue stripBitcast(SDValue Val) {
+ return Val.getOpcode() == ISD::BITCAST ? Val.getOperand(0) : Val;
+}
+
+// Figure out if this is really an extract of the high 16-bits of a dword.
+static bool isExtractHiElt(SDValue In, SDValue &Out) {
+ In = stripBitcast(In);
+ if (In.getOpcode() != ISD::TRUNCATE)
+ return false;
+
+ SDValue Srl = In.getOperand(0);
+ if (Srl.getOpcode() == ISD::SRL) {
+ if (ConstantSDNode *ShiftAmt = dyn_cast<ConstantSDNode>(Srl.getOperand(1))) {
+ if (ShiftAmt->getZExtValue() == 16) {
+ Out = stripBitcast(Srl.getOperand(0));
+ return true;
+ }
+ }
+ }
+
+ return false;
+}
+
+// Look through operations that obscure just looking at the low 16-bits of the
+// same register.
+static SDValue stripExtractLoElt(SDValue In) {
+ if (In.getOpcode() == ISD::TRUNCATE) {
+ SDValue Src = In.getOperand(0);
+ if (Src.getValueType().getSizeInBits() == 32)
+ return stripBitcast(Src);
+ }
+
+ return In;
+}
+
} // end anonymous namespace
INITIALIZE_PASS_BEGIN(AMDGPUDAGToDAGISel, "amdgpu-isel",
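
[Editor's note] A scalar restatement of the two helpers above: isExtractHiElt matches a trunc of an srl-by-16 and hands back the 32-bit source, while stripExtractLoElt peels a trunc of a 32-bit value, so both halves of a dword can be traced to the same register. A minimal standalone C++ sketch of those shapes (illustrative only, not part of the patch):

#include <cassert>
#include <cstdint>

int main() {
  uint32_t x = 0xbeefcafeu;
  // Shape isExtractHiElt recognizes: trunc (srl x, 16); x is returned as Out.
  uint16_t hi = uint16_t(x >> 16);
  // Shape stripExtractLoElt looks through: trunc of a 32-bit source.
  uint16_t lo = uint16_t(x);
  assert(hi == 0xbeef && lo == 0xcafe);
  return 0;
}
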
@@ -270,6 +314,114 @@ bool AMDGPUDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
return SelectionDAGISel::runOnMachineFunction(MF);
}
+bool AMDGPUDAGToDAGISel::matchLoadD16FromBuildVector(SDNode *N) const {
+ assert(Subtarget->d16PreservesUnusedBits());
+ MVT VT = N->getValueType(0).getSimpleVT();
+ if (VT != MVT::v2i16 && VT != MVT::v2f16)
+ return false;
+
+ SDValue Lo = N->getOperand(0);
+ SDValue Hi = N->getOperand(1);
+
+ LoadSDNode *LdHi = dyn_cast<LoadSDNode>(stripBitcast(Hi));
+
+ // build_vector lo, (load ptr) -> load_d16_hi ptr, lo
+ // build_vector lo, (zextload ptr from i8) -> load_d16_hi_u8 ptr, lo
+ // build_vector lo, (sextload ptr from i8) -> load_d16_hi_i8 ptr, lo
+
+ // Need to check for possible indirect dependencies on the other half of the
+ // vector to avoid introducing a cycle.
+ if (LdHi && Hi.hasOneUse() && !LdHi->isPredecessorOf(Lo.getNode())) {
+ SDVTList VTList = CurDAG->getVTList(VT, MVT::Other);
+
+ SDValue TiedIn = CurDAG->getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N), VT, Lo);
+ SDValue Ops[] = {
+ LdHi->getChain(), LdHi->getBasePtr(), TiedIn
+ };
+
+ unsigned LoadOp = AMDGPUISD::LOAD_D16_HI;
+ if (LdHi->getMemoryVT() == MVT::i8) {
+ LoadOp = LdHi->getExtensionType() == ISD::SEXTLOAD ?
+ AMDGPUISD::LOAD_D16_HI_I8 : AMDGPUISD::LOAD_D16_HI_U8;
+ } else {
+ assert(LdHi->getMemoryVT() == MVT::i16);
+ }
+
+ SDValue NewLoadHi =
+ CurDAG->getMemIntrinsicNode(LoadOp, SDLoc(LdHi), VTList,
+ Ops, LdHi->getMemoryVT(),
+ LdHi->getMemOperand());
+
+ CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), NewLoadHi);
+ CurDAG->ReplaceAllUsesOfValueWith(SDValue(LdHi, 1), NewLoadHi.getValue(1));
+ return true;
+ }
+
+ // build_vector (load ptr), hi -> load_d16_lo ptr, hi
+ // build_vector (zextload ptr from i8), hi -> load_d16_lo_u8 ptr, hi
+ // build_vector (sextload ptr from i8), hi -> load_d16_lo_i8 ptr, hi
+ LoadSDNode *LdLo = dyn_cast<LoadSDNode>(stripBitcast(Lo));
+ if (LdLo && Lo.hasOneUse()) {
+ SDValue TiedIn = getHi16Elt(Hi);
+ if (!TiedIn || LdLo->isPredecessorOf(TiedIn.getNode()))
+ return false;
+
+ SDVTList VTList = CurDAG->getVTList(VT, MVT::Other);
+ unsigned LoadOp = AMDGPUISD::LOAD_D16_LO;
+ if (LdLo->getMemoryVT() == MVT::i8) {
+ LoadOp = LdLo->getExtensionType() == ISD::SEXTLOAD ?
+ AMDGPUISD::LOAD_D16_LO_I8 : AMDGPUISD::LOAD_D16_LO_U8;
+ } else {
+ assert(LdLo->getMemoryVT() == MVT::i16);
+ }
+
+ TiedIn = CurDAG->getNode(ISD::BITCAST, SDLoc(N), VT, TiedIn);
+
+ SDValue Ops[] = {
+ LdLo->getChain(), LdLo->getBasePtr(), TiedIn
+ };
+
+ SDValue NewLoadLo =
+ CurDAG->getMemIntrinsicNode(LoadOp, SDLoc(LdLo), VTList,
+ Ops, LdLo->getMemoryVT(),
+ LdLo->getMemOperand());
+
+ CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), NewLoadLo);
+ CurDAG->ReplaceAllUsesOfValueWith(SDValue(LdLo, 1), NewLoadLo.getValue(1));
+ return true;
+ }
+
+ return false;
+}
+
+void AMDGPUDAGToDAGISel::PreprocessISelDAG() {
+ if (!Subtarget->d16PreservesUnusedBits())
+ return;
+
+ SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_end();
+
+ bool MadeChange = false;
+ while (Position != CurDAG->allnodes_begin()) {
+ SDNode *N = &*--Position;
+ if (N->use_empty())
+ continue;
+
+ switch (N->getOpcode()) {
+ case ISD::BUILD_VECTOR:
+ MadeChange |= matchLoadD16FromBuildVector(N);
+ break;
+ default:
+ break;
+ }
+ }
+
+ if (MadeChange) {
+ CurDAG->RemoveDeadNodes();
+ LLVM_DEBUG(dbgs() << "After PreProcess:\n";
+ CurDAG->dump(););
+ }
+}
+
bool AMDGPUDAGToDAGISel::isNoNanSrc(SDValue N) const {
if (TM.Options.NoNaNsFPMath)
return true;
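
[Editor's note] The rewrite above turns build_vector (lo, load) and (load, hi) pairs into single tied-operand loads before instruction selection. Concretely, a d16 load writes one 16-bit half of a 32-bit register and the tied third operand passes the other half through unchanged. A host-side model of that behavior (a sketch; the function names are borrowed from the node names for illustration and assume the packed layout with element 0 in the low 16 bits):

#include <cstdint>
#include <cstring>

uint32_t load_d16_hi(const uint16_t *ptr, uint32_t tied_in) {
  uint16_t mem;
  std::memcpy(&mem, ptr, sizeof(mem));                 // the 16-bit access
  return (uint32_t(mem) << 16) | (tied_in & 0xffffu);  // hi := mem, lo kept
}

uint32_t load_d16_lo(const uint16_t *ptr, uint32_t tied_in) {
  uint16_t mem;
  std::memcpy(&mem, ptr, sizeof(mem));
  return (tied_in & 0xffff0000u) | uint32_t(mem);      // lo := mem, hi kept
}
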
@@ -1889,41 +2041,6 @@ bool AMDGPUDAGToDAGISel::SelectVOP3OMods(SDValue In, SDValue &Src,
return true;
}
-static SDValue stripBitcast(SDValue Val) {
- return Val.getOpcode() == ISD::BITCAST ? Val.getOperand(0) : Val;
-}
-
-// Figure out if this is really an extract of the high 16-bits of a dword.
-static bool isExtractHiElt(SDValue In, SDValue &Out) {
- In = stripBitcast(In);
- if (In.getOpcode() != ISD::TRUNCATE)
- return false;
-
- SDValue Srl = In.getOperand(0);
- if (Srl.getOpcode() == ISD::SRL) {
- if (ConstantSDNode *ShiftAmt = dyn_cast<ConstantSDNode>(Srl.getOperand(1))) {
- if (ShiftAmt->getZExtValue() == 16) {
- Out = stripBitcast(Srl.getOperand(0));
- return true;
- }
- }
- }
-
- return false;
-}
-
-// Look through operations that obscure just looking at the low 16-bits of the
-// same register.
-static SDValue stripExtractLoElt(SDValue In) {
- if (In.getOpcode() == ISD::TRUNCATE) {
- SDValue Src = In.getOperand(0);
- if (Src.getValueType().getSizeInBits() == 32)
- return stripBitcast(Src);
- }
-
- return In;
-}
-
bool AMDGPUDAGToDAGISel::SelectVOP3PMods(SDValue In, SDValue &Src,
SDValue &SrcMods) const {
unsigned Mods = 0;
@@ -2076,6 +2193,28 @@ bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixMods(SDValue In, SDValue &Src,
return true;
}
+SDValue AMDGPUDAGToDAGISel::getHi16Elt(SDValue In) const {
+ if (In.isUndef())
+ return CurDAG->getUNDEF(MVT::i32);
+
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(In)) {
+ SDLoc SL(In);
+ return CurDAG->getConstant(C->getZExtValue() << 16, SL, MVT::i32);
+ }
+
+ if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(In)) {
+ SDLoc SL(In);
+ return CurDAG->getConstant(
+ C->getValueAPF().bitcastToAPInt().getZExtValue() << 16, SL, MVT::i32);
+ }
+
+ SDValue Src;
+ if (isExtractHiElt(In, Src))
+ return Src;
+
+ return SDValue();
+}
+
// TODO: Can we identify things like v_mad_mixhi_f16?
bool AMDGPUDAGToDAGISel::SelectHi16Elt(SDValue In, SDValue &Src) const {
if (In.isUndef()) {
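
[Editor's note] getHi16Elt above folds constant high halves straight into the tied input rather than leaving a shift in the DAG: a 16-bit immediate K becomes the 32-bit constant K << 16, and an f16 constant is handled the same way via its bit pattern. In scalar terms (sketch; the helper name is invented):

#include <cstdint>

uint32_t hi16_imm(uint16_t k) {
  // The tied input already carries the immediate in the high half.
  return uint32_t(k) << 16;
}
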
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 68171179c46..141b76cc303 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -4186,6 +4186,12 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(INTERP_P1LL_F16)
NODE_NAME_CASE(INTERP_P1LV_F16)
NODE_NAME_CASE(INTERP_P2_F16)
+ NODE_NAME_CASE(LOAD_D16_HI)
+ NODE_NAME_CASE(LOAD_D16_LO)
+ NODE_NAME_CASE(LOAD_D16_HI_I8)
+ NODE_NAME_CASE(LOAD_D16_HI_U8)
+ NODE_NAME_CASE(LOAD_D16_LO_I8)
+ NODE_NAME_CASE(LOAD_D16_LO_U8)
NODE_NAME_CASE(STORE_MSKOR)
NODE_NAME_CASE(LOAD_CONSTANT)
NODE_NAME_CASE(TBUFFER_STORE_FORMAT)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
index 359e16cfa56..fa05ee52a8f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -469,6 +469,13 @@ enum NodeType : unsigned {
KILL,
DUMMY_CHAIN,
FIRST_MEM_OPCODE_NUMBER = ISD::FIRST_TARGET_MEMORY_OPCODE,
+ LOAD_D16_HI,
+ LOAD_D16_LO,
+ LOAD_D16_HI_I8,
+ LOAD_D16_HI_U8,
+ LOAD_D16_LO_I8,
+ LOAD_D16_LO_U8,
+
STORE_MSKOR,
LOAD_CONSTANT,
TBUFFER_STORE_FORMAT,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
index bdbfdaf1dc1..7b981ea5639 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
@@ -802,7 +802,7 @@ multiclass IntMed3Pat<Instruction med3Inst,
SDPatternOperator max_oneuse,
ValueType vt = i32> {
- // This matches 16 permutations of
+ // This matches 16 permutations of
// min(max(a, b), max(min(a, b), c))
def : AMDGPUPat <
(min (max_oneuse vt:$src0, vt:$src1),
@@ -810,7 +810,7 @@ multiclass IntMed3Pat<Instruction med3Inst,
(med3Inst vt:$src0, vt:$src1, vt:$src2)
>;
- // This matches 16 permutations of
+ // This matches 16 permutations of
// max(min(x, y), min(max(x, y), z))
def : AMDGPUPat <
(max (min_oneuse vt:$src0, vt:$src1),
@@ -818,7 +818,7 @@ multiclass IntMed3Pat<Instruction med3Inst,
(med3Inst $src0, $src1, $src2)
>;
}
-
+
// Special conversion patterns
def cvt_rpi_i32_f32 : PatFrag <
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
index 0002e8e51be..91cc44cbd11 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -614,6 +614,10 @@ public:
return getGeneration() >= GFX9;
}
+ bool d16PreservesUnusedBits() const {
+ return hasD16LoadStore() && !isSRAMECCEnabled();
+ }
+
/// Return if most LDS instructions have an m0 use that requires m0 to be
/// initialized.
bool ldsRequiresM0Init() const {
diff --git a/llvm/lib/Target/AMDGPU/BUFInstructions.td b/llvm/lib/Target/AMDGPU/BUFInstructions.td
index 20cc79ddaed..902cc3e0d4b 100644
--- a/llvm/lib/Target/AMDGPU/BUFInstructions.td
+++ b/llvm/lib/Target/AMDGPU/BUFInstructions.td
@@ -1376,60 +1376,17 @@ multiclass MUBUFScratchLoadPat <MUBUF_Pseudo InstrOffen,
}
// XXX - Is it possible to have a complex pattern in a PatFrag?
-multiclass MUBUFScratchLoadPat_Hi16 <MUBUF_Pseudo InstrOffen,
+multiclass MUBUFScratchLoadPat_D16 <MUBUF_Pseudo InstrOffen,
MUBUF_Pseudo InstrOffset,
- ValueType vt, PatFrag ld> {
- def : GCNPat <
- (build_vector vt:$lo, (vt (ld (MUBUFScratchOffen v4i32:$srsrc, i32:$vaddr,
- i32:$soffset, u16imm:$offset)))),
- (v2i16 (InstrOffen $vaddr, $srsrc, $soffset, $offset, 0, 0, 0, $lo))
- >;
-
- def : GCNPat <
- (build_vector f16:$lo, (f16 (bitconvert (vt (ld (MUBUFScratchOffen v4i32:$srsrc, i32:$vaddr,
- i32:$soffset, u16imm:$offset)))))),
- (v2f16 (InstrOffen $vaddr, $srsrc, $soffset, $offset, 0, 0, 0, $lo))
- >;
-
-
- def : GCNPat <
- (build_vector vt:$lo, (vt (ld (MUBUFScratchOffset v4i32:$srsrc, i32:$soffset, u16imm:$offset)))),
- (v2i16 (InstrOffset $srsrc, $soffset, $offset, 0, 0, 0, $lo))
- >;
-
+ ValueType vt, PatFrag ld_frag> {
def : GCNPat <
- (build_vector f16:$lo, (f16 (bitconvert (vt (ld (MUBUFScratchOffset v4i32:$srsrc, i32:$soffset, u16imm:$offset)))))),
- (v2f16 (InstrOffset $srsrc, $soffset, $offset, 0, 0, 0, $lo))
- >;
-}
-
-multiclass MUBUFScratchLoadPat_Lo16 <MUBUF_Pseudo InstrOffen,
- MUBUF_Pseudo InstrOffset,
- ValueType vt, PatFrag ld> {
- def : GCNPat <
- (build_vector (vt (ld (MUBUFScratchOffen v4i32:$srsrc, i32:$vaddr,
- i32:$soffset, u16imm:$offset))),
- (vt (Hi16Elt vt:$hi))),
- (v2i16 (InstrOffen $vaddr, $srsrc, $soffset, $offset, 0, 0, 0, $hi))
+ (ld_frag (MUBUFScratchOffen v4i32:$srsrc, i32:$vaddr, i32:$soffset, u16imm:$offset), vt:$in),
+ (InstrOffen $vaddr, $srsrc, $soffset, $offset, 0, 0, 0, $in)
>;
def : GCNPat <
- (build_vector (f16 (bitconvert (vt (ld (MUBUFScratchOffen v4i32:$srsrc, i32:$vaddr,
- i32:$soffset, u16imm:$offset))))),
- (f16 (Hi16Elt f16:$hi))),
- (v2f16 (InstrOffen $vaddr, $srsrc, $soffset, $offset, 0, 0, 0, $hi))
- >;
-
- def : GCNPat <
- (build_vector (vt (ld (MUBUFScratchOffset v4i32:$srsrc, i32:$soffset, u16imm:$offset))),
- (vt (Hi16Elt vt:$hi))),
- (v2i16 (InstrOffset $srsrc, $soffset, $offset, 0, 0, 0, $hi))
- >;
-
- def : GCNPat <
- (build_vector (f16 (bitconvert (vt (ld (MUBUFScratchOffset v4i32:$srsrc, i32:$soffset, u16imm:$offset))))),
- (f16 (Hi16Elt f16:$hi))),
- (v2f16 (InstrOffset $srsrc, $soffset, $offset, 0, 0, 0, $hi))
+ (ld_frag (MUBUFScratchOffset v4i32:$srsrc, i32:$soffset, u16imm:$offset), vt:$in),
+ (InstrOffset $srsrc, $soffset, $offset, 0, 0, 0, $in)
>;
}
@@ -1445,13 +1402,19 @@ defm : MUBUFScratchLoadPat <BUFFER_LOAD_DWORDX2_OFFEN, BUFFER_LOAD_DWORDX2_OFFSE
defm : MUBUFScratchLoadPat <BUFFER_LOAD_DWORDX4_OFFEN, BUFFER_LOAD_DWORDX4_OFFSET, v4i32, load_private>;
let OtherPredicates = [D16PreservesUnusedBits] in {
-defm : MUBUFScratchLoadPat_Hi16<BUFFER_LOAD_SHORT_D16_HI_OFFEN, BUFFER_LOAD_SHORT_D16_HI_OFFSET, i16, load_private>;
-defm : MUBUFScratchLoadPat_Hi16<BUFFER_LOAD_UBYTE_D16_HI_OFFEN, BUFFER_LOAD_UBYTE_D16_HI_OFFSET, i16, az_extloadi8_private>;
-defm : MUBUFScratchLoadPat_Hi16<BUFFER_LOAD_SBYTE_D16_HI_OFFEN, BUFFER_LOAD_SBYTE_D16_HI_OFFSET, i16, sextloadi8_private>;
-
-defm : MUBUFScratchLoadPat_Lo16<BUFFER_LOAD_SHORT_D16_OFFEN, BUFFER_LOAD_SHORT_D16_OFFSET, i16, load_private>;
-defm : MUBUFScratchLoadPat_Lo16<BUFFER_LOAD_UBYTE_D16_OFFEN, BUFFER_LOAD_UBYTE_D16_OFFSET, i16, az_extloadi8_private>;
-defm : MUBUFScratchLoadPat_Lo16<BUFFER_LOAD_SBYTE_D16_OFFEN, BUFFER_LOAD_SBYTE_D16_OFFSET, i16, sextloadi8_private>;
+defm : MUBUFScratchLoadPat_D16<BUFFER_LOAD_SHORT_D16_HI_OFFEN, BUFFER_LOAD_SHORT_D16_HI_OFFSET, v2i16, load_d16_hi_private>;
+defm : MUBUFScratchLoadPat_D16<BUFFER_LOAD_UBYTE_D16_HI_OFFEN, BUFFER_LOAD_UBYTE_D16_HI_OFFSET, v2i16, az_extloadi8_d16_hi_private>;
+defm : MUBUFScratchLoadPat_D16<BUFFER_LOAD_SBYTE_D16_HI_OFFEN, BUFFER_LOAD_SBYTE_D16_HI_OFFSET, v2i16, sextloadi8_d16_hi_private>;
+defm : MUBUFScratchLoadPat_D16<BUFFER_LOAD_SHORT_D16_HI_OFFEN, BUFFER_LOAD_SHORT_D16_HI_OFFSET, v2f16, load_d16_hi_private>;
+defm : MUBUFScratchLoadPat_D16<BUFFER_LOAD_UBYTE_D16_HI_OFFEN, BUFFER_LOAD_UBYTE_D16_HI_OFFSET, v2f16, az_extloadi8_d16_hi_private>;
+defm : MUBUFScratchLoadPat_D16<BUFFER_LOAD_SBYTE_D16_HI_OFFEN, BUFFER_LOAD_SBYTE_D16_HI_OFFSET, v2f16, sextloadi8_d16_hi_private>;
+
+defm : MUBUFScratchLoadPat_D16<BUFFER_LOAD_SHORT_D16_OFFEN, BUFFER_LOAD_SHORT_D16_OFFSET, v2i16, load_d16_lo_private>;
+defm : MUBUFScratchLoadPat_D16<BUFFER_LOAD_UBYTE_D16_OFFEN, BUFFER_LOAD_UBYTE_D16_OFFSET, v2i16, az_extloadi8_d16_lo_private>;
+defm : MUBUFScratchLoadPat_D16<BUFFER_LOAD_SBYTE_D16_OFFEN, BUFFER_LOAD_SBYTE_D16_OFFSET, v2i16, sextloadi8_d16_lo_private>;
+defm : MUBUFScratchLoadPat_D16<BUFFER_LOAD_SHORT_D16_OFFEN, BUFFER_LOAD_SHORT_D16_OFFSET, v2f16, load_d16_lo_private>;
+defm : MUBUFScratchLoadPat_D16<BUFFER_LOAD_UBYTE_D16_OFFEN, BUFFER_LOAD_UBYTE_D16_OFFSET, v2f16, az_extloadi8_d16_lo_private>;
+defm : MUBUFScratchLoadPat_D16<BUFFER_LOAD_SBYTE_D16_OFFEN, BUFFER_LOAD_SBYTE_D16_OFFSET, v2f16, sextloadi8_d16_lo_private>;
}
multiclass MUBUFStore_Atomic_Pattern <MUBUF_Pseudo Instr_ADDR64, MUBUF_Pseudo Instr_OFFSET,
ValueType vt, PatFrag atomic_st> {
diff --git a/llvm/lib/Target/AMDGPU/DSInstructions.td b/llvm/lib/Target/AMDGPU/DSInstructions.td
index d255763d39a..246be340c32 100644
--- a/llvm/lib/Target/AMDGPU/DSInstructions.td
+++ b/llvm/lib/Target/AMDGPU/DSInstructions.td
@@ -611,30 +611,10 @@ multiclass DSReadPat_mc<DS_Pseudo inst, ValueType vt, string frag> {
}
}
-
-multiclass DSReadPat_Hi16 <DS_Pseudo inst, PatFrag frag, ValueType vt = i16> {
- def : GCNPat <
- (build_vector vt:$lo, (vt (frag (DS1Addr1Offset i32:$ptr, i32:$offset)))),
- (v2i16 (inst $ptr, (as_i16imm $offset), (i1 0), $lo))
- >;
-
- def : GCNPat <
- (build_vector f16:$lo, (f16 (bitconvert (vt (frag (DS1Addr1Offset i32:$ptr, i32:$offset)))))),
- (v2f16 (inst $ptr, (as_i16imm $offset), (i1 0), $lo))
- >;
-}
-
-multiclass DSReadPat_Lo16 <DS_Pseudo inst, PatFrag frag, ValueType vt = i16> {
- def : GCNPat <
- (build_vector (vt (frag (DS1Addr1Offset i32:$ptr, i32:$offset))), (vt (Hi16Elt vt:$hi))),
- (v2i16 (inst $ptr, (as_i16imm $offset), 0, $hi))
- >;
-
- def : GCNPat <
- (build_vector (f16 (bitconvert (vt (frag (DS1Addr1Offset i32:$ptr, i32:$offset))))), (f16 (Hi16Elt f16:$hi))),
- (v2f16 (inst $ptr, (as_i16imm $offset), 0, $hi))
- >;
-}
+class DSReadPat_D16 <DS_Pseudo inst, PatFrag frag, ValueType vt> : GCNPat <
+ (frag (DS1Addr1Offset i32:$ptr, i32:$offset), vt:$in),
+ (inst $ptr, (as_i16imm $offset), (i1 0), $in)
+>;
defm : DSReadPat_mc <DS_READ_I8, i32, "sextloadi8_local">;
defm : DSReadPat_mc <DS_READ_U8, i32, "az_extloadi8_local">;
@@ -656,16 +636,19 @@ defm : DSReadPat_mc <DS_READ_B128, v4i32, "load_align16_local">;
} // End AddedComplexity = 100
let OtherPredicates = [D16PreservesUnusedBits] in {
-let AddedComplexity = 100 in {
-defm : DSReadPat_Hi16<DS_READ_U16_D16_HI, load_local>;
-defm : DSReadPat_Hi16<DS_READ_U8_D16_HI, az_extloadi8_local>;
-defm : DSReadPat_Hi16<DS_READ_I8_D16_HI, sextloadi8_local>;
-
-defm : DSReadPat_Lo16<DS_READ_U16_D16, load_local>;
-defm : DSReadPat_Lo16<DS_READ_U8_D16, az_extloadi8_local>;
-defm : DSReadPat_Lo16<DS_READ_I8_D16, sextloadi8_local>;
-
-}
+def : DSReadPat_D16<DS_READ_U16_D16_HI, load_d16_hi_local, v2i16>;
+def : DSReadPat_D16<DS_READ_U16_D16_HI, load_d16_hi_local, v2f16>;
+def : DSReadPat_D16<DS_READ_U8_D16_HI, az_extloadi8_d16_hi_local, v2i16>;
+def : DSReadPat_D16<DS_READ_U8_D16_HI, az_extloadi8_d16_hi_local, v2f16>;
+def : DSReadPat_D16<DS_READ_I8_D16_HI, sextloadi8_d16_hi_local, v2i16>;
+def : DSReadPat_D16<DS_READ_I8_D16_HI, sextloadi8_d16_hi_local, v2f16>;
+
+def : DSReadPat_D16<DS_READ_U16_D16, load_d16_lo_local, v2i16>;
+def : DSReadPat_D16<DS_READ_U16_D16, load_d16_lo_local, v2f16>;
+def : DSReadPat_D16<DS_READ_U8_D16, az_extloadi8_d16_lo_local, v2i16>;
+def : DSReadPat_D16<DS_READ_U8_D16, az_extloadi8_d16_lo_local, v2f16>;
+def : DSReadPat_D16<DS_READ_I8_D16, sextloadi8_d16_lo_local, v2i16>;
+def : DSReadPat_D16<DS_READ_I8_D16, sextloadi8_d16_lo_local, v2f16>;
}
class DSWritePat <DS_Pseudo inst, ValueType vt, PatFrag frag> : GCNPat <
diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td
index 2179b21e0a6..cefcf90d0d2 100644
--- a/llvm/lib/Target/AMDGPU/FLATInstructions.td
+++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td
@@ -663,53 +663,15 @@ class FlatLoadPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCN
(inst $vaddr, $offset, 0, $slc)
>;
-multiclass FlatLoadPat_Hi16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt = i16> {
- def : GCNPat <
- (build_vector vt:$elt0, (vt (node (FLATOffset i64:$vaddr, i16:$offset, i1:$slc)))),
- (v2i16 (inst $vaddr, $offset, 0, $slc, $elt0))
- >;
-
- def : GCNPat <
- (build_vector f16:$elt0, (f16 (bitconvert (vt (node (FLATOffset i64:$vaddr, i16:$offset, i1:$slc)))))),
- (v2f16 (inst $vaddr, $offset, 0, $slc, $elt0))
- >;
-}
-
-multiclass FlatSignedLoadPat_Hi16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt = i16> {
- def : GCNPat <
- (build_vector vt:$elt0, (vt (node (FLATOffsetSigned i64:$vaddr, i16:$offset, i1:$slc)))),
- (v2i16 (inst $vaddr, $offset, 0, $slc, $elt0))
- >;
-
- def : GCNPat <
- (build_vector f16:$elt0, (f16 (bitconvert (vt (node (FLATOffsetSigned i64:$vaddr, i16:$offset, i1:$slc)))))),
- (v2f16 (inst $vaddr, $offset, 0, $slc, $elt0))
- >;
-}
-
-multiclass FlatLoadPat_Lo16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt = i16> {
- def : GCNPat <
- (build_vector (vt (node (FLATOffset i64:$vaddr, i16:$offset, i1:$slc))), (vt (Hi16Elt vt:$hi))),
- (v2i16 (inst $vaddr, $offset, 0, $slc, $hi))
- >;
-
- def : GCNPat <
- (build_vector (f16 (bitconvert (vt (node (FLATOffset i64:$vaddr, i16:$offset, i1:$slc))))), (f16 (Hi16Elt f16:$hi))),
- (v2f16 (inst $vaddr, $offset, 0, $slc, $hi))
- >;
-}
-
-multiclass FlatSignedLoadPat_Lo16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt = i16> {
- def : GCNPat <
- (build_vector (vt (node (FLATOffsetSigned i64:$vaddr, i16:$offset, i1:$slc))), (vt (Hi16Elt vt:$hi))),
- (v2i16 (inst $vaddr, $offset, 0, $slc, $hi))
- >;
+class FlatLoadPat_D16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
+ (node (FLATOffset i64:$vaddr, i16:$offset, i1:$slc), vt:$in),
+ (inst $vaddr, $offset, 0, $slc, $in)
+>;
- def : GCNPat <
- (build_vector (f16 (bitconvert (vt (node (FLATOffsetSigned i64:$vaddr, i16:$offset, i1:$slc))))), (f16 (Hi16Elt f16:$hi))),
- (v2f16 (inst $vaddr, $offset, 0, $slc, $hi))
- >;
-}
+class FlatSignedLoadPat_D16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
+ (node (FLATOffsetSigned i64:$vaddr, i16:$offset, i1:$slc), vt:$in),
+ (inst $vaddr, $offset, 0, $slc, $in)
+>;
class FlatLoadAtomicPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
(vt (node (FLATAtomic i64:$vaddr, i16:$offset, i1:$slc))),
@@ -817,17 +779,19 @@ let OtherPredicates = [D16PreservesUnusedBits] in {
def : FlatStorePat <FLAT_STORE_SHORT_D16_HI, truncstorei16_hi16_flat, i32>;
def : FlatStorePat <FLAT_STORE_BYTE_D16_HI, truncstorei8_hi16_flat, i32>;
-let AddedComplexity = 3 in {
-defm : FlatLoadPat_Hi16 <FLAT_LOAD_UBYTE_D16_HI, az_extloadi8_flat>;
-defm : FlatLoadPat_Hi16 <FLAT_LOAD_SBYTE_D16_HI, sextloadi8_flat>;
-defm : FlatLoadPat_Hi16 <FLAT_LOAD_SHORT_D16_HI, load_flat>;
-}
-
-let AddedComplexity = 9 in {
-defm : FlatLoadPat_Lo16 <FLAT_LOAD_UBYTE_D16, az_extloadi8_flat>;
-defm : FlatLoadPat_Lo16 <FLAT_LOAD_SBYTE_D16, sextloadi8_flat>;
-defm : FlatLoadPat_Lo16 <FLAT_LOAD_SHORT_D16, load_flat>;
-}
+def : FlatLoadPat_D16 <FLAT_LOAD_UBYTE_D16_HI, az_extloadi8_d16_hi_flat, v2i16>;
+def : FlatLoadPat_D16 <FLAT_LOAD_UBYTE_D16_HI, az_extloadi8_d16_hi_flat, v2f16>;
+def : FlatLoadPat_D16 <FLAT_LOAD_SBYTE_D16_HI, sextloadi8_d16_hi_flat, v2i16>;
+def : FlatLoadPat_D16 <FLAT_LOAD_SBYTE_D16_HI, sextloadi8_d16_hi_flat, v2f16>;
+def : FlatLoadPat_D16 <FLAT_LOAD_SHORT_D16_HI, load_d16_hi_flat, v2i16>;
+def : FlatLoadPat_D16 <FLAT_LOAD_SHORT_D16_HI, load_d16_hi_flat, v2f16>;
+
+def : FlatLoadPat_D16 <FLAT_LOAD_UBYTE_D16, az_extloadi8_d16_lo_flat, v2i16>;
+def : FlatLoadPat_D16 <FLAT_LOAD_UBYTE_D16, az_extloadi8_d16_lo_flat, v2f16>;
+def : FlatLoadPat_D16 <FLAT_LOAD_SBYTE_D16, sextloadi8_d16_lo_flat, v2i16>;
+def : FlatLoadPat_D16 <FLAT_LOAD_SBYTE_D16, sextloadi8_d16_lo_flat, v2f16>;
+def : FlatLoadPat_D16 <FLAT_LOAD_SHORT_D16, load_d16_lo_flat, v2i16>;
+def : FlatLoadPat_D16 <FLAT_LOAD_SHORT_D16, load_d16_lo_flat, v2f16>;
}
} // End OtherPredicates = [HasFlatAddressSpace]
@@ -861,14 +825,19 @@ let OtherPredicates = [D16PreservesUnusedBits] in {
def : FlatStoreSignedPat <GLOBAL_STORE_SHORT_D16_HI, truncstorei16_hi16_global, i32>;
def : FlatStoreSignedPat <GLOBAL_STORE_BYTE_D16_HI, truncstorei8_hi16_global, i32>;
-defm : FlatSignedLoadPat_Hi16 <GLOBAL_LOAD_UBYTE_D16_HI, az_extloadi8_global>;
-defm : FlatSignedLoadPat_Hi16 <GLOBAL_LOAD_SBYTE_D16_HI, sextloadi8_global>;
-defm : FlatSignedLoadPat_Hi16 <GLOBAL_LOAD_SHORT_D16_HI, load_global>;
-
-defm : FlatSignedLoadPat_Lo16 <GLOBAL_LOAD_UBYTE_D16, az_extloadi8_global>;
-defm : FlatSignedLoadPat_Lo16 <GLOBAL_LOAD_SBYTE_D16, sextloadi8_global>;
-defm : FlatSignedLoadPat_Lo16 <GLOBAL_LOAD_SHORT_D16, load_global>;
-
+def : FlatSignedLoadPat_D16 <GLOBAL_LOAD_UBYTE_D16_HI, az_extloadi8_d16_hi_global, v2i16>;
+def : FlatSignedLoadPat_D16 <GLOBAL_LOAD_UBYTE_D16_HI, az_extloadi8_d16_hi_global, v2f16>;
+def : FlatSignedLoadPat_D16 <GLOBAL_LOAD_SBYTE_D16_HI, sextloadi8_d16_hi_global, v2i16>;
+def : FlatSignedLoadPat_D16 <GLOBAL_LOAD_SBYTE_D16_HI, sextloadi8_d16_hi_global, v2f16>;
+def : FlatSignedLoadPat_D16 <GLOBAL_LOAD_SHORT_D16_HI, load_d16_hi_global, v2i16>;
+def : FlatSignedLoadPat_D16 <GLOBAL_LOAD_SHORT_D16_HI, load_d16_hi_global, v2f16>;
+
+def : FlatSignedLoadPat_D16 <GLOBAL_LOAD_UBYTE_D16, az_extloadi8_d16_lo_global, v2i16>;
+def : FlatSignedLoadPat_D16 <GLOBAL_LOAD_UBYTE_D16, az_extloadi8_d16_lo_global, v2f16>;
+def : FlatSignedLoadPat_D16 <GLOBAL_LOAD_SBYTE_D16, sextloadi8_d16_lo_global, v2i16>;
+def : FlatSignedLoadPat_D16 <GLOBAL_LOAD_SBYTE_D16, sextloadi8_d16_lo_global, v2f16>;
+def : FlatSignedLoadPat_D16 <GLOBAL_LOAD_SHORT_D16, load_d16_lo_global, v2i16>;
+def : FlatSignedLoadPat_D16 <GLOBAL_LOAD_SHORT_D16, load_d16_lo_global, v2f16>;
}
def : FlatStoreSignedAtomicPat <GLOBAL_STORE_DWORD, store_atomic_global, i32>;
@@ -902,7 +871,7 @@ def : FlatSignedAtomicPat <GLOBAL_ATOMIC_SWAP_X2_RTN, atomic_swap_global, i64>;
def : FlatSignedAtomicPat <GLOBAL_ATOMIC_CMPSWAP_X2_RTN, AMDGPUatomic_cmp_swap_global, i64, v2i64>;
def : FlatSignedAtomicPat <GLOBAL_ATOMIC_XOR_X2_RTN, atomic_xor_global, i64>;
-} // End OtherPredicates = [HasFlatGlobalInsts]
+} // End OtherPredicates = [HasFlatGlobalInsts], AddedComplexity = 10
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index 8310b42d28b..c5811e47145 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -69,6 +69,13 @@ def SIatomic_fmax : SDNode<"AMDGPUISD::ATOMIC_LOAD_FMAX", SDTAtomic2_f32,
[SDNPMayLoad, SDNPMayStore, SDNPMemOperand, SDNPHasChain]
>;
+// load_d16_{lo|hi} ptr, tied_input
+def SIload_d16 : SDTypeProfile<1, 2, [
+ SDTCisPtrTy<1>,
+ SDTCisSameAs<0, 2>
+]>;
+
+
def SDTtbuffer_load : SDTypeProfile<1, 8,
[ // vdata
SDTCisVT<1, v4i32>, // rsrc
@@ -187,6 +194,36 @@ def SIpc_add_rel_offset : SDNode<"AMDGPUISD::PC_ADD_REL_OFFSET",
SDTypeProfile<1, 2, [SDTCisVT<0, iPTR>, SDTCisSameAs<0,1>, SDTCisSameAs<0,2>]>
>;
+def SIload_d16_lo : SDNode<"AMDGPUISD::LOAD_D16_LO",
+ SIload_d16,
+ [SDNPMayLoad, SDNPMemOperand, SDNPHasChain]
+>;
+
+def SIload_d16_lo_u8 : SDNode<"AMDGPUISD::LOAD_D16_LO_U8",
+ SIload_d16,
+ [SDNPMayLoad, SDNPMemOperand, SDNPHasChain]
+>;
+
+def SIload_d16_lo_i8 : SDNode<"AMDGPUISD::LOAD_D16_LO_I8",
+ SIload_d16,
+ [SDNPMayLoad, SDNPMemOperand, SDNPHasChain]
+>;
+
+def SIload_d16_hi : SDNode<"AMDGPUISD::LOAD_D16_HI",
+ SIload_d16,
+ [SDNPMayLoad, SDNPMemOperand, SDNPHasChain]
+>;
+
+def SIload_d16_hi_u8 : SDNode<"AMDGPUISD::LOAD_D16_HI_U8",
+ SIload_d16,
+ [SDNPMayLoad, SDNPMemOperand, SDNPHasChain]
+>;
+
+def SIload_d16_hi_i8 : SDNode<"AMDGPUISD::LOAD_D16_HI_I8",
+ SIload_d16,
+ [SDNPMayLoad, SDNPMemOperand, SDNPHasChain]
+>;
+
//===----------------------------------------------------------------------===//
// ValueType helpers
//===----------------------------------------------------------------------===//
@@ -384,6 +421,51 @@ def si_setcc_uniform : PatFrag <
return true;
}]>;
+//===----------------------------------------------------------------------===//
+// SDNodes PatFrags for d16 loads
+//===----------------------------------------------------------------------===//
+
+class LoadD16Frag <SDPatternOperator op> : PatFrag<(ops node:$ptr, node:$tied_in), (op node:$ptr, node:$tied_in)>;
+class LocalLoadD16 <SDPatternOperator op> : LoadD16Frag <op>, LocalAddress;
+class GlobalLoadD16 <SDPatternOperator op> : LoadD16Frag <op>, GlobalLoadAddress;
+class PrivateLoadD16 <SDPatternOperator op> : LoadD16Frag <op>, PrivateAddress;
+class FlatLoadD16 <SDPatternOperator op> : LoadD16Frag <op>, FlatLoadAddress;
+
+def load_d16_hi_local : LocalLoadD16 <SIload_d16_hi>;
+def az_extloadi8_d16_hi_local : LocalLoadD16 <SIload_d16_hi_u8>;
+def sextloadi8_d16_hi_local : LocalLoadD16 <SIload_d16_hi_i8>;
+
+def load_d16_hi_global : GlobalLoadD16 <SIload_d16_hi>;
+def az_extloadi8_d16_hi_global : GlobalLoadD16 <SIload_d16_hi_u8>;
+def sextloadi8_d16_hi_global : GlobalLoadD16 <SIload_d16_hi_i8>;
+
+def load_d16_hi_private : PrivateLoadD16 <SIload_d16_hi>;
+def az_extloadi8_d16_hi_private : PrivateLoadD16 <SIload_d16_hi_u8>;
+def sextloadi8_d16_hi_private : PrivateLoadD16 <SIload_d16_hi_i8>;
+
+def load_d16_hi_flat : FlatLoadD16 <SIload_d16_hi>;
+def az_extloadi8_d16_hi_flat : FlatLoadD16 <SIload_d16_hi_u8>;
+def sextloadi8_d16_hi_flat : FlatLoadD16 <SIload_d16_hi_i8>;
+
+
+def load_d16_lo_local : LocalLoadD16 <SIload_d16_lo>;
+def az_extloadi8_d16_lo_local : LocalLoadD16 <SIload_d16_lo_u8>;
+def sextloadi8_d16_lo_local : LocalLoadD16 <SIload_d16_lo_i8>;
+
+def load_d16_lo_global : GlobalLoadD16 <SIload_d16_lo>;
+def az_extloadi8_d16_lo_global : GlobalLoadD16 <SIload_d16_lo_u8>;
+def sextloadi8_d16_lo_global : GlobalLoadD16 <SIload_d16_lo_i8>;
+
+def load_d16_lo_private : PrivateLoadD16 <SIload_d16_lo>;
+def az_extloadi8_d16_lo_private : PrivateLoadD16 <SIload_d16_lo_u8>;
+def sextloadi8_d16_lo_private : PrivateLoadD16 <SIload_d16_lo_i8>;
+
+def load_d16_lo_flat : FlatLoadD16 <SIload_d16_lo>;
+def az_extloadi8_d16_lo_flat : FlatLoadD16 <SIload_d16_lo_u8>;
+def sextloadi8_d16_lo_flat : FlatLoadD16 <SIload_d16_lo_i8>;
+
+
+
def lshr_rev : PatFrag <
(ops node:$src1, node:$src0),
(srl $src0, $src1)
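
[Editor's note] The SIload_d16 profile introduced above encodes the tied-operand contract the preprocessing step relies on: counting the result as index 0, the first operand (index 1) is the pointer and the second (index 2) must have the result's type, since it is the pass-through half. A rough C++ picture of one such node's layout (purely a model, not an LLVM API):

#include <cstdint>

// Mirrors SDTypeProfile<1, 2, [SDTCisPtrTy<1>, SDTCisSameAs<0, 2>]>.
struct LoadD16Node {
  const void *chain;  // implicit chain operand, from SDNPHasChain
  const void *ptr;    // operand at index 1: SDTCisPtrTy<1>
  uint32_t tied_in;   // operand at index 2: same type as the result
  uint32_t result;    // result at index 0: the packed v2i16/v2f16 value
};
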
diff --git a/llvm/test/CodeGen/AMDGPU/build-vector-insert-elt-infloop.ll b/llvm/test/CodeGen/AMDGPU/build-vector-insert-elt-infloop.ll
index fd81c0438d6..433baf43861 100644
--- a/llvm/test/CodeGen/AMDGPU/build-vector-insert-elt-infloop.ll
+++ b/llvm/test/CodeGen/AMDGPU/build-vector-insert-elt-infloop.ll
@@ -4,9 +4,8 @@
; combine and a generic insert_vector_elt combine.
; GCN-LABEL: {{^}}combine_loop:
-; GCN: flat_load_ushort
+; GCN: flat_load_short_d16_hi
; GCN: flat_store_short
-; GCN: v_lshlrev_b32_e32 v{{[0-9]+}}, 16,
define amdgpu_kernel void @combine_loop(i16* %arg) #0 {
bb:
br label %bb1
diff --git a/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll b/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll
index 696b33e75fe..5fd4e065ccd 100644
--- a/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll
+++ b/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX900 %s
; GCN-LABEL: {{^}}chain_hi_to_lo_private:
; GCN: buffer_load_ushort [[DST:v[0-9]+]], off, [[RSRC:s\[[0-9]+:[0-9]+\]]], [[SOFF:s[0-9]+]] offset:2
@@ -175,3 +175,128 @@ entry:
%loc.0.sroa_cast2 = bitcast [3 x i16] addrspace(5)* %loc to i8 addrspace(5)*
ret void
}
+
+; There is another instruction between the misordered instruction and
+; the value dependent load, so a simple operand check is insufficient.
+; GCN-LABEL: {{^}}chain_hi_to_lo_group_other_dep:
+; GFX900: ds_read_u16_d16_hi v1, v0
+; GFX900-NEXT: s_waitcnt lgkmcnt(0)
+; GFX900-NEXT: v_pk_add_u16 v1, v1, 12 op_sel_hi:[1,0]
+; GFX900-NEXT: ds_read_u16_d16 v1, v0 offset:2
+; GFX900-NEXT: s_waitcnt lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v0, v1
+; GFX900-NEXT: s_setpc_b64
+define <2 x i16> @chain_hi_to_lo_group_other_dep(i16 addrspace(3)* %ptr) {
+bb:
+ %gep_lo = getelementptr inbounds i16, i16 addrspace(3)* %ptr, i64 1
+ %load_lo = load i16, i16 addrspace(3)* %gep_lo
+ %gep_hi = getelementptr inbounds i16, i16 addrspace(3)* %ptr, i64 0
+ %load_hi = load i16, i16 addrspace(3)* %gep_hi
+ %to.hi = insertelement <2 x i16> undef, i16 %load_hi, i32 1
+ %op.hi = add <2 x i16> %to.hi, <i16 12, i16 12>
+ %result = insertelement <2 x i16> %op.hi, i16 %load_lo, i32 0
+ ret <2 x i16> %result
+}
+
+; The volatile operations aren't put on the same chain
+; GCN-LABEL: {{^}}chain_hi_to_lo_group_other_dep_multi_chain:
+; GFX900: ds_read_u16 v1, v0 offset:2
+; GFX900-NEXT: ds_read_u16_d16_hi v0, v0
+; GFX900-NEXT: v_mov_b32_e32 [[MASK:v[0-9]+]], 0xffff
+; GFX900-NEXT: s_waitcnt lgkmcnt(0)
+; GFX900-NEXT: v_pk_add_u16 v0, v0, 12 op_sel_hi:[1,0]
+; GFX900-NEXT: v_bfi_b32 v0, [[MASK]], v1, v0
+; GFX900-NEXT: s_setpc_b64
+define <2 x i16> @chain_hi_to_lo_group_other_dep_multi_chain(i16 addrspace(3)* %ptr) {
+bb:
+ %gep_lo = getelementptr inbounds i16, i16 addrspace(3)* %ptr, i64 1
+ %load_lo = load volatile i16, i16 addrspace(3)* %gep_lo
+ %gep_hi = getelementptr inbounds i16, i16 addrspace(3)* %ptr, i64 0
+ %load_hi = load volatile i16, i16 addrspace(3)* %gep_hi
+ %to.hi = insertelement <2 x i16> undef, i16 %load_hi, i32 1
+ %op.hi = add <2 x i16> %to.hi, <i16 12, i16 12>
+ %result = insertelement <2 x i16> %op.hi, i16 %load_lo, i32 0
+ ret <2 x i16> %result
+}
+
+; GCN-LABEL: {{^}}chain_hi_to_lo_private_other_dep:
+; GFX900: buffer_load_short_d16_hi v1, v0, s[0:3], s4 offen
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: v_pk_add_u16 v1, v1, 12 op_sel_hi:[1,0]
+; GFX900-NEXT: buffer_load_short_d16 v1, v0, s[0:3], s4 offen offset:2
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v0, v1
+; GFX900-NEXT: s_setpc_b64
+define <2 x i16> @chain_hi_to_lo_private_other_dep(i16 addrspace(5)* %ptr) {
+bb:
+ %gep_lo = getelementptr inbounds i16, i16 addrspace(5)* %ptr, i64 1
+ %load_lo = load i16, i16 addrspace(5)* %gep_lo
+ %gep_hi = getelementptr inbounds i16, i16 addrspace(5)* %ptr, i64 0
+ %load_hi = load i16, i16 addrspace(5)* %gep_hi
+ %to.hi = insertelement <2 x i16> undef, i16 %load_hi, i32 1
+ %op.hi = add <2 x i16> %to.hi, <i16 12, i16 12>
+ %result = insertelement <2 x i16> %op.hi, i16 %load_lo, i32 0
+ ret <2 x i16> %result
+}
+
+; GCN-LABEL: {{^}}chain_hi_to_lo_global_other_dep:
+; GFX900: global_load_ushort v2, v[0:1], off offset:2
+; GFX900-NEXT: global_load_short_d16_hi v0, v[0:1], off
+; GFX900-NEXT: v_mov_b32_e32 [[MASK:v[0-9]+]], 0xffff
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: v_pk_add_u16 v0, v0, 12 op_sel_hi:[1,0]
+; GFX900-NEXT: v_bfi_b32 v0, [[MASK]], v2, v0
+; GFX900-NEXT: s_setpc_b64
+define <2 x i16> @chain_hi_to_lo_global_other_dep(i16 addrspace(1)* %ptr) {
+bb:
+ %gep_lo = getelementptr inbounds i16, i16 addrspace(1)* %ptr, i64 1
+ %load_lo = load volatile i16, i16 addrspace(1)* %gep_lo
+ %gep_hi = getelementptr inbounds i16, i16 addrspace(1)* %ptr, i64 0
+ %load_hi = load volatile i16, i16 addrspace(1)* %gep_hi
+ %to.hi = insertelement <2 x i16> undef, i16 %load_hi, i32 1
+ %op.hi = add <2 x i16> %to.hi, <i16 12, i16 12>
+ %result = insertelement <2 x i16> %op.hi, i16 %load_lo, i32 0
+ ret <2 x i16> %result
+}
+
+; GCN-LABEL: {{^}}chain_hi_to_lo_flat_other_dep:
+; GFX900: flat_load_ushort v2, v[0:1] offset:2
+; GFX900-NEXT: flat_load_short_d16_hi v0, v[0:1]
+; GFX900-NEXT: v_mov_b32_e32 [[MASK:v[0-9]+]], 0xffff
+; GFX900-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_pk_add_u16 v0, v0, 12 op_sel_hi:[1,0]
+; GFX900-NEXT: v_bfi_b32 v0, v1, v2, v0
+; GFX900-NEXT: s_setpc_b64
+define <2 x i16> @chain_hi_to_lo_flat_other_dep(i16 addrspace(0)* %ptr) {
+bb:
+ %gep_lo = getelementptr inbounds i16, i16 addrspace(0)* %ptr, i64 1
+ %load_lo = load volatile i16, i16 addrspace(0)* %gep_lo
+ %gep_hi = getelementptr inbounds i16, i16 addrspace(0)* %ptr, i64 0
+ %load_hi = load volatile i16, i16 addrspace(0)* %gep_hi
+ %to.hi = insertelement <2 x i16> undef, i16 %load_hi, i32 1
+ %op.hi = add <2 x i16> %to.hi, <i16 12, i16 12>
+ %result = insertelement <2 x i16> %op.hi, i16 %load_lo, i32 0
+ ret <2 x i16> %result
+}
+
+; GCN-LABEL: {{^}}chain_hi_to_lo_group_may_alias_store:
+; GFX900: v_mov_b32_e32 [[K:v[0-9]+]], 0x7b
+; GFX900-NEXT: ds_read_u16 v3, v0
+; GFX900-NEXT: ds_write_b16 v1, [[K]]
+; GFX900-NEXT: ds_read_u16 v0, v0 offset:2
+; GFX900-NEXT: s_waitcnt lgkmcnt(0)
+; GFX900-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX900-NEXT: v_lshl_or_b32 v0, v3, 16, v0
+; GFX900-NEXT: s_setpc_b64
+define <2 x i16> @chain_hi_to_lo_group_may_alias_store(i16 addrspace(3)* %ptr, i16 addrspace(3)* %may.alias) {
+bb:
+ %gep_lo = getelementptr inbounds i16, i16 addrspace(3)* %ptr, i64 1
+ %gep_hi = getelementptr inbounds i16, i16 addrspace(3)* %ptr, i64 0
+ %load_hi = load i16, i16 addrspace(3)* %gep_hi
+ store i16 123, i16 addrspace(3)* %may.alias
+ %load_lo = load i16, i16 addrspace(3)* %gep_lo
+
+ %to.hi = insertelement <2 x i16> undef, i16 %load_hi, i32 1
+ %result = insertelement <2 x i16> %to.hi, i16 %load_lo, i32 0
+ ret <2 x i16> %result
+}
diff --git a/llvm/test/CodeGen/AMDGPU/load-hi16.ll b/llvm/test/CodeGen/AMDGPU/load-hi16.ll
index ee5737c2065..357ce3d9a9a 100644
--- a/llvm/test/CodeGen/AMDGPU/load-hi16.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-hi16.ll
@@ -880,6 +880,21 @@ entry:
ret <2 x i16> %build1
}
+; FIXME: Remove and
+; GCN-LABEL: {{^}}load_local_v2i16_broadcast:
+; GCN: ds_read_u16 [[LOAD:v[0-9]+]]
+; GCN-NOT: ds_read
+; GFX9: v_and_b32_e32 [[AND:v[0-9]+]], 0xffff, [[LOAD]]
+; GFX9: v_lshl_or_b32 v0, [[LOAD]], 16, [[AND]]
+define <2 x i16> @load_local_v2i16_broadcast(i16 addrspace(3)* %in) #0 {
+entry:
+ %gep = getelementptr inbounds i16, i16 addrspace(3)* %in, i32 1
+ %load0 = load i16, i16 addrspace(3)* %in
+ %build0 = insertelement <2 x i16> undef, i16 %load0, i32 0
+ %build1 = insertelement <2 x i16> %build0, i16 %load0, i32 1
+ ret <2 x i16> %build1
+}
+
; GCN-LABEL: {{^}}load_local_lo_hi_v2i16_side_effect:
; GFX900: ds_read_u16 [[LOAD0:v[0-9]+]], v0
; GFX900: ds_write_b16
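
[Editor's note] In the load_local_v2i16_broadcast case above, both halves come from a single 16-bit load, so no d16 instruction applies; the selected v_and_b32 plus v_lshl_or_b32 pack the value into both halves instead. In scalar terms (sketch only):

#include <cassert>
#include <cstdint>

uint32_t broadcast_v2i16(uint16_t v) {
  uint32_t lo = uint32_t(v) & 0xffffu;  // v_and_b32_e32 0xffff, v
  return (uint32_t(v) << 16) | lo;      // v_lshl_or_b32 v, 16, lo
}

int main() {
  assert(broadcast_v2i16(0x1234) == 0x12341234u);
  return 0;
}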