author     David Stuttard <david.stuttard@amd.com>    2019-01-14 11:55:24 +0000
committer  David Stuttard <david.stuttard@amd.com>    2019-01-14 11:55:24 +0000
commit     f77079f892548efa3b34c16233d8779d25d92f58 (patch)
tree       1c3da48a6063defe55051d3955d1a524108ae682 /llvm/lib/Target/AMDGPU/SIISelLowering.cpp
parent     d1986d1b5a671bb73481f595a9d7e130b11f0d55 (diff)
[AMDGPU] Add support for TFE/LWE in image intrinsics. 2nd try
TFE and LWE support requires extra result registers that are written in the
event of a failure, so that the failure case can be detected.
The specific use-case that initiated these changes is sparse texture support.
This means that if image intrinsics are used with either option turned on, the
programmer must ensure that the return type can contain all of the expected
results. This can result in redundant registers since the vector size must be a
power-of-2.
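For instance (a hypothetical variant of the example shown further below), a
dmask of 7 selects only three channels, yet the data portion of the return
type must still be a power-of-2 vector, so the fourth element of the
<4 x float> is redundant padding:
  %v = call {<4 x float>,i32} @llvm.amdgcn.image.load.1d.v4f32i32.i32(i32 7,
      i32 %s, <8 x i32> %rsrc, i32 1, i32 0) ; dmask=7: only 3 channels used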
This change comes in roughly 6 parts:
1. Modify the instruction defs in tablegen to add new instruction variants that
can accommodate the extra return values.
2. Updates to lowerImage in SIISelLowering.cpp to accommodate setting TFE or LWE
(where the bulk of the work for these instruction types is now done); the
TexFailCtrl encoding is sketched just after this list.
3. Extra verification code to catch cases where intrinsics have been used but
insufficient return registers are provided.
4. Modification to the adjustWritemask optimisation to account for TFE/LWE being
enabled (extra registers must be maintained for the error return value).
5. An extra pass to zero-initialize the error value return - this is needed
because if the error does not occur, the register is not written and thus must
be zeroed before use. Also added a new (on by default) option to ensure ALL
return values are zero-initialized, which is required for sparse texture
support.
6. Disable the inst_combine optimization in the presence of tfe/lwe (a later
TODO is to re-enable this and handle it correctly).
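As a sketch of the TexFailCtrl operand handled in part 2 (the bit assignments
come from parseTexFail in the diff below; the call otherwise mirrors the
example further down):
  ; TexFailCtrl immediate: bit 0 enables TFE, bit 1 enables LWE
  %v = call {<4 x float>,i32} @llvm.amdgcn.image.load.1d.v4f32i32.i32(i32 15,
      i32 %s, <8 x i32> %rsrc, i32 3, i32 0) ; i32 3 = both tfe and lwe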
There's an additional fix now to avoid a dmask=0 instruction.
For an image intrinsic with tfe where all result channels except tfe were
unused, I was getting an image instruction with dmask=0 and only a single
vgpr result for tfe. That is incorrect because the hardware assumes there is
at least one vgpr result, plus the one for tfe. Fixed by forcing dmask to 1,
which gives the desired two-vgpr result with tfe in the second one.
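A hypothetical IR fragment that hits this case (only the error flag is
consumed, so every data channel is dead and dmask would otherwise have shrunk
to 0):
  %v = call {<4 x float>,i32} @llvm.amdgcn.image.load.1d.v4f32i32.i32(i32 15,
      i32 %s, <8 x i32> %rsrc, i32 1, i32 0)
  %v.err = extractvalue {<4 x float>, i32} %v, 1 ; data channels unused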
The TFE or LWE result is returned from the intrinsics using an aggregate
type. Look in the test code provided to see how this works, but in essence IR
code to invoke the intrinsic looks as follows:
%v = call {<4 x float>,i32} @llvm.amdgcn.image.load.1d.v4f32i32.i32(i32 15,
i32 %s, <8 x i32> %rsrc, i32 1, i32 0)
%v.vec = extractvalue {<4 x float>, i32} %v, 0
%v.err = extractvalue {<4 x float>, i32} %v, 1
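A consumer would then typically branch on the error word (a hypothetical
continuation; treating non-zero as a fetch failure, e.g. a non-resident page
in the sparse texture use-case, is an assumption here):
  %fail = icmp ne i32 %v.err, 0
  br i1 %fail, label %handle_fail, label %use_data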
This re-submit of the change also includes a slight modification in
SIISelLowering.cpp to work around a compiler bug on the powerpc_le
platform that caused a buildbot failure on a previous submission.
Differential revision: https://reviews.llvm.org/D48826
Change-Id: If222bc03642e76cf98059a6bef5d5bffeda38dda
Work around for ppcle compiler bug
Change-Id: Ie284cf24b2271215be1b9dc95b485fd15000e32b
llvm-svn: 351054
Diffstat (limited to 'llvm/lib/Target/AMDGPU/SIISelLowering.cpp')
-rw-r--r--  llvm/lib/Target/AMDGPU/SIISelLowering.cpp  342
1 file changed, 289 insertions(+), 53 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 6374792fee8..9e167378d09 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -216,6 +216,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
 
   setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v2f16, Custom);
   setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v4f16, Custom);
+  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v8f16, Custom);
   setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
 
   setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
@@ -813,6 +814,47 @@ unsigned SITargetLowering::getVectorTypeBreakdownForCallingConv(
     Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
 }
 
+static MVT memVTFromAggregate(Type *Ty) {
+  // Only limited forms of aggregate type currently expected.
+  assert(Ty->isStructTy() && "Expected struct type");
+
+
+  Type *ElementType = nullptr;
+  unsigned NumElts;
+  if (Ty->getContainedType(0)->isVectorTy()) {
+    VectorType *VecComponent = cast<VectorType>(Ty->getContainedType(0));
+    ElementType = VecComponent->getElementType();
+    NumElts = VecComponent->getNumElements();
+  } else {
+    ElementType = Ty->getContainedType(0);
+    NumElts = 1;
+  }
+
+  assert((Ty->getContainedType(1) && Ty->getContainedType(1)->isIntegerTy(32)) && "Expected int32 type");
+
+  // Calculate the size of the memVT type from the aggregate
+  unsigned Pow2Elts = 0;
+  unsigned ElementSize;
+  switch (ElementType->getTypeID()) {
+  default:
+    llvm_unreachable("Unknown type!");
+  case Type::IntegerTyID:
+    ElementSize = cast<IntegerType>(ElementType)->getBitWidth();
+    break;
+  case Type::HalfTyID:
+    ElementSize = 16;
+    break;
+  case Type::FloatTyID:
+    ElementSize = 32;
+    break;
+  }
+  unsigned AdditionalElts = ElementSize == 16 ? 2 : 1;
+  Pow2Elts = 1 << Log2_32_Ceil(NumElts + AdditionalElts);
+
+  return MVT::getVectorVT(MVT::getVT(ElementType, false),
+                          Pow2Elts);
+}
+
 bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
                                           const CallInst &CI,
                                           MachineFunction &MF,
@@ -840,7 +882,12 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
       Info.flags = MachineMemOperand::MODereferenceable;
       if (Attr.hasFnAttribute(Attribute::ReadOnly)) {
         Info.opc = ISD::INTRINSIC_W_CHAIN;
-        Info.memVT = MVT::getVT(CI.getType());
+        Info.memVT = MVT::getVT(CI.getType(), true);
+        if (Info.memVT == MVT::Other) {
+          // Some intrinsics return an aggregate type - special case to work out
+          // the correct memVT
+          Info.memVT = memVTFromAggregate(CI.getType());
+        }
         Info.flags |= MachineMemOperand::MOLoad;
       } else if (Attr.hasFnAttribute(Attribute::WriteOnly)) {
         Info.opc = ISD::INTRINSIC_VOID;
@@ -4613,6 +4660,109 @@ static bool parseCachePolicy(SDValue CachePolicy, SelectionDAG &DAG,
   return Value == 0;
 }
 
+// Re-construct the required return value for a image load intrinsic.
+// This is more complicated due to the optional use TexFailCtrl which means the required
+// return type is an aggregate
+static SDValue constructRetValue(SelectionDAG &DAG,
+                                 MachineSDNode *Result,
+                                 ArrayRef<EVT> ResultTypes,
+                                 bool IsTexFail, bool Unpacked, bool IsD16,
+                                 int DMaskPop, int NumVDataDwords,
+                                 const SDLoc &DL, LLVMContext &Context) {
+  // Determine the required return type. This is the same regardless of IsTexFail flag
+  EVT ReqRetVT = ResultTypes[0];
+  EVT ReqRetEltVT = ReqRetVT.isVector() ? ReqRetVT.getVectorElementType() : ReqRetVT;
+  int ReqRetNumElts = ReqRetVT.isVector() ? ReqRetVT.getVectorNumElements() : 1;
+  EVT AdjEltVT = Unpacked && IsD16 ? MVT::i32 : ReqRetEltVT;
+  EVT AdjVT = Unpacked ? ReqRetNumElts > 1 ? EVT::getVectorVT(Context, AdjEltVT, ReqRetNumElts)
+                                           : AdjEltVT
+                       : ReqRetVT;
+
+  // Extract data part of the result
+  // Bitcast the result to the same type as the required return type
+  int NumElts;
+  if (IsD16 && !Unpacked)
+    NumElts = NumVDataDwords << 1;
+  else
+    NumElts = NumVDataDwords;
+
+  EVT CastVT = NumElts > 1 ? EVT::getVectorVT(Context, AdjEltVT, NumElts)
+                           : AdjEltVT;
+
+  // Special case for v8f16. Rather than add support for this, use v4i32 to
+  // extract the data elements
+  bool V8F16Special = false;
+  if (CastVT == MVT::v8f16) {
+    CastVT = MVT::v4i32;
+    DMaskPop >>= 1;
+    ReqRetNumElts >>= 1;
+    V8F16Special = true;
+    AdjVT = MVT::v2i32;
+  }
+
+  SDValue N = SDValue(Result, 0);
+  SDValue CastRes = DAG.getNode(ISD::BITCAST, DL, CastVT, N);
+
+  // Iterate over the result
+  SmallVector<SDValue, 4> BVElts;
+
+  if (CastVT.isVector()) {
+    DAG.ExtractVectorElements(CastRes, BVElts, 0, DMaskPop);
+  } else {
+    BVElts.push_back(CastRes);
+  }
+  int ExtraElts = ReqRetNumElts - DMaskPop;
+  while(ExtraElts--)
+    BVElts.push_back(DAG.getUNDEF(AdjEltVT));
+
+  SDValue PreTFCRes;
+  if (ReqRetNumElts > 1) {
+    SDValue NewVec = DAG.getBuildVector(AdjVT, DL, BVElts);
+    if (IsD16 && Unpacked)
+      PreTFCRes = adjustLoadValueTypeImpl(NewVec, ReqRetVT, DL, DAG, Unpacked);
+    else
+      PreTFCRes = NewVec;
+  } else {
+    PreTFCRes = BVElts[0];
+  }
+
+  if (V8F16Special)
+    PreTFCRes = DAG.getNode(ISD::BITCAST, DL, MVT::v4f16, PreTFCRes);
+
+  if (!IsTexFail) {
+    if (Result->getNumValues() > 1)
+      return DAG.getMergeValues({PreTFCRes, SDValue(Result, 1)}, DL);
+    else
+      return PreTFCRes;
+  }
+
+  // Extract the TexFail result and insert into aggregate return
+  SmallVector<SDValue, 1> TFCElt;
+  DAG.ExtractVectorElements(N, TFCElt, DMaskPop, 1);
+  SDValue TFCRes = DAG.getNode(ISD::BITCAST, DL, ResultTypes[1], TFCElt[0]);
+  return DAG.getMergeValues({PreTFCRes, TFCRes, SDValue(Result, 1)}, DL);
+}
+
+static bool parseTexFail(SDValue TexFailCtrl, SelectionDAG &DAG, SDValue *TFE,
+                         SDValue *LWE, bool &IsTexFail) {
+  auto TexFailCtrlConst = dyn_cast<ConstantSDNode>(TexFailCtrl.getNode());
+  if (!TexFailCtrlConst)
+    return false;
+
+  uint64_t Value = TexFailCtrlConst->getZExtValue();
+  if (Value) {
+    IsTexFail = true;
+  }
+
+  SDLoc DL(TexFailCtrlConst);
+  *TFE = DAG.getTargetConstant((Value & 0x1) ? 1 : 0, DL, MVT::i32);
+  Value &= ~(uint64_t)0x1;
+  *LWE = DAG.getTargetConstant((Value & 0x2) ? 1 : 0, DL, MVT::i32);
+  Value &= ~(uint64_t)0x2;
+
+  return Value == 0;
+}
+
 SDValue SITargetLowering::lowerImage(SDValue Op,
                                      const AMDGPU::ImageDimIntrinsicInfo *Intr,
                                      SelectionDAG &DAG) const {
@@ -4626,13 +4776,17 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
       AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode);
   unsigned IntrOpcode = Intr->BaseOpcode;
 
-  SmallVector<EVT, 2> ResultTypes(Op->value_begin(), Op->value_end());
+  SmallVector<EVT, 3> ResultTypes(Op->value_begin(), Op->value_end());
+  SmallVector<EVT, 3> OrigResultTypes(Op->value_begin(), Op->value_end());
   bool IsD16 = false;
   bool IsA16 = false;
   SDValue VData;
   int NumVDataDwords;
+  bool AdjustRetType = false;
+
   unsigned AddrIdx; // Index of first address argument
   unsigned DMask;
+  unsigned DMaskLanes = 0;
 
   if (BaseOpcode->Atomic) {
     VData = Op.getOperand(2);
@@ -4655,7 +4809,12 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
       AddrIdx = 3;
     }
   } else {
-    unsigned DMaskIdx;
+    unsigned DMaskIdx = BaseOpcode->Store ? 3 : isa<MemSDNode>(Op) ? 2 : 1;
+    auto DMaskConst = dyn_cast<ConstantSDNode>(Op.getOperand(DMaskIdx));
+    if (!DMaskConst)
+      return Op;
+    DMask = DMaskConst->getZExtValue();
+    DMaskLanes = BaseOpcode->Gather4 ? 4 : countPopulation(DMask);
 
     if (BaseOpcode->Store) {
      VData = Op.getOperand(2);
@@ -4671,37 +4830,32 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
      }
 
      NumVDataDwords = (VData.getValueType().getSizeInBits() + 31) / 32;
-      DMaskIdx = 3;
    } else {
-      MVT LoadVT = Op.getSimpleValueType();
+      // Work out the num dwords based on the dmask popcount and underlying type
+      // and whether packing is supported.
+      MVT LoadVT = ResultTypes[0].getSimpleVT();
      if (LoadVT.getScalarType() == MVT::f16) {
        if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS ||
            !BaseOpcode->HasD16)
          return Op; // D16 is unsupported for this instruction
 
        IsD16 = true;
-        if (LoadVT.isVector() && Subtarget->hasUnpackedD16VMem())
-          ResultTypes[0] = (LoadVT == MVT::v2f16) ? MVT::v2i32 : MVT::v4i32;
      }
 
-      NumVDataDwords = (ResultTypes[0].getSizeInBits() + 31) / 32;
-      DMaskIdx = isa<MemSDNode>(Op) ? 2 : 1;
-    }
+      // Confirm that the return type is large enough for the dmask specified
+      if ((LoadVT.isVector() && LoadVT.getVectorNumElements() < DMaskLanes) ||
+          (!LoadVT.isVector() && DMaskLanes > 1))
+        return Op;
 
-    auto DMaskConst = dyn_cast<ConstantSDNode>(Op.getOperand(DMaskIdx));
-    if (!DMaskConst)
-      return Op;
+      if (IsD16 && !Subtarget->hasUnpackedD16VMem())
+        NumVDataDwords = (DMaskLanes + 1) / 2;
+      else
+        NumVDataDwords = DMaskLanes;
 
-    AddrIdx = DMaskIdx + 1;
-    DMask = DMaskConst->getZExtValue();
-    if (!DMask && !BaseOpcode->Store) {
-      // Eliminate no-op loads. Stores with dmask == 0 are *not* no-op: they
-      // store the channels' default values.
-      SDValue Undef = DAG.getUNDEF(Op.getValueType());
-      if (isa<MemSDNode>(Op))
-        return DAG.getMergeValues({Undef, Op.getOperand(0)}, DL);
-      return Undef;
+      AdjustRetType = true;
    }
+
+    AddrIdx = DMaskIdx + 1;
  }
 
  unsigned NumGradients = BaseOpcode->Gradients ? DimInfo->NumGradients : 0;
@@ -4780,11 +4934,53 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
    CtrlIdx = AddrIdx + NumVAddrs + 3;
  }
 
+  SDValue TFE;
+  SDValue LWE;
  SDValue TexFail = Op.getOperand(CtrlIdx);
-  auto TexFailConst = dyn_cast<ConstantSDNode>(TexFail.getNode());
-  if (!TexFailConst || TexFailConst->getZExtValue() != 0)
+  bool IsTexFail = false;
+  if (!parseTexFail(TexFail, DAG, &TFE, &LWE, IsTexFail))
    return Op;
 
+  if (IsTexFail) {
+    if (!DMaskLanes) {
+      // Expecting to get an error flag since TFC is on - and dmask is 0
+      // Force dmask to be at least 1 otherwise the instruction will fail
+      DMask = 0x1;
+      DMaskLanes = 1;
+      NumVDataDwords = 1;
+    }
+    NumVDataDwords += 1;
+    AdjustRetType = true;
+  }
+
+  // Has something earlier tagged that the return type needs adjusting
+  // This happens if the instruction is a load or has set TexFailCtrl flags
+  if (AdjustRetType) {
+    // NumVDataDwords reflects the true number of dwords required in the return type
+    if (DMaskLanes == 0 && !BaseOpcode->Store) {
+      // This is a no-op load. This can be eliminated
+      SDValue Undef = DAG.getUNDEF(Op.getValueType());
+      if (isa<MemSDNode>(Op))
+        return DAG.getMergeValues({Undef, Op.getOperand(0)}, DL);
+      return Undef;
+    }
+
+    // Have to use a power of 2 number of dwords
+    NumVDataDwords = 1 << Log2_32_Ceil(NumVDataDwords);
+
+    EVT NewVT = NumVDataDwords > 1 ?
+                  EVT::getVectorVT(*DAG.getContext(), MVT::f32, NumVDataDwords)
+                : MVT::f32;
+
+    ResultTypes[0] = NewVT;
+    if (ResultTypes.size() == 3) {
+      // Original result was aggregate type used for TexFailCtrl results
+      // The actual instruction returns as a vector type which has now been
+      // created. Remove the aggregate result.
+      ResultTypes.erase(&ResultTypes[1]);
+    }
+  }
+
  SDValue GLC;
  SDValue SLC;
  if (BaseOpcode->Atomic) {
@@ -4809,8 +5005,8 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
  Ops.push_back(SLC);
  Ops.push_back(IsA16 &&  // a16 or r128
                ST->hasFeature(AMDGPU::FeatureR128A16) ? True : False);
-  Ops.push_back(False); // tfe
-  Ops.push_back(False); // lwe
+  Ops.push_back(TFE); // tfe
+  Ops.push_back(LWE); // lwe
  Ops.push_back(DimInfo->DA ? True : False);
  if (BaseOpcode->HasD16)
    Ops.push_back(IsD16 ? True : False);
@@ -4838,11 +5034,12 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
    SmallVector<SDValue, 1> Elt;
    DAG.ExtractVectorElements(SDValue(NewNode, 0), Elt, 0, 1);
    return DAG.getMergeValues({Elt[0], SDValue(NewNode, 1)}, DL);
-  } else if (IsD16 && !BaseOpcode->Store) {
-    MVT LoadVT = Op.getSimpleValueType();
-    SDValue Adjusted = adjustLoadValueTypeImpl(
-        SDValue(NewNode, 0), LoadVT, DL, DAG, Subtarget->hasUnpackedD16VMem());
-    return DAG.getMergeValues({Adjusted, SDValue(NewNode, 1)}, DL);
+  } else if (!BaseOpcode->Store) {
+    return constructRetValue(DAG, NewNode,
+                             OrigResultTypes, IsTexFail,
+                             Subtarget->hasUnpackedD16VMem(), IsD16,
+                             DMaskLanes, NumVDataDwords, DL,
+                             *DAG.getContext());
  }
 
  return SDValue(NewNode, 0);
@@ -8753,6 +8950,7 @@ static unsigned SubIdx2Lane(unsigned Idx) {
  case AMDGPU::sub1: return 1;
  case AMDGPU::sub2: return 2;
  case AMDGPU::sub3: return 3;
+  case AMDGPU::sub4: return 4; // Possible with TFE/LWE
  }
 }
 
@@ -8766,11 +8964,16 @@ SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node,
  if (D16Idx >= 0 && Node->getConstantOperandVal(D16Idx))
    return Node; // not implemented for D16
 
-  SDNode *Users[4] = { nullptr };
+  SDNode *Users[5] = { nullptr };
  unsigned Lane = 0;
  unsigned DmaskIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::dmask) - 1;
  unsigned OldDmask = Node->getConstantOperandVal(DmaskIdx);
  unsigned NewDmask = 0;
+  unsigned TFEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::tfe) - 1;
+  unsigned LWEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::lwe) - 1;
+  bool UsesTFC = (Node->getConstantOperandVal(TFEIdx) ||
+                  Node->getConstantOperandVal(LWEIdx)) ? 1 : 0;
+  unsigned TFCLane = 0;
  bool HasChain = Node->getNumValues() > 1;
 
  if (OldDmask == 0) {
@@ -8778,6 +8981,12 @@ SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node,
    return Node;
  }
 
+  unsigned OldBitsSet = countPopulation(OldDmask);
+  // Work out which is the TFE/LWE lane if that is enabled.
+  if (UsesTFC) {
+    TFCLane = OldBitsSet;
+  }
+
  // Try to figure out the used register components
  for (SDNode::use_iterator I = Node->use_begin(), E = Node->use_end();
       I != E; ++I) {
@@ -8797,28 +9006,49 @@ SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node,
    // set, etc.
    Lane = SubIdx2Lane(I->getConstantOperandVal(1));
 
-    // Set which texture component corresponds to the lane.
-    unsigned Comp;
-    for (unsigned i = 0, Dmask = OldDmask; (i <= Lane) && (Dmask != 0); i++) {
-      Comp = countTrailingZeros(Dmask);
-      Dmask &= ~(1 << Comp);
-    }
+    // Check if the use is for the TFE/LWE generated result at VGPRn+1.
+    if (UsesTFC && Lane == TFCLane) {
+      Users[Lane] = *I;
+    } else {
+      // Set which texture component corresponds to the lane.
+      unsigned Comp;
+      for (unsigned i = 0, Dmask = OldDmask; (i <= Lane) && (Dmask != 0); i++) {
+        Comp = countTrailingZeros(Dmask);
+        Dmask &= ~(1 << Comp);
+      }
 
-    // Abort if we have more than one user per component
-    if (Users[Lane])
-      return Node;
+      // Abort if we have more than one user per component.
+      if (Users[Lane])
+        return Node;
 
-    Users[Lane] = *I;
-    NewDmask |= 1 << Comp;
+      Users[Lane] = *I;
+      NewDmask |= 1 << Comp;
+    }
  }
 
+  // Don't allow 0 dmask, as hardware assumes one channel enabled.
+  bool NoChannels = !NewDmask;
+  if (NoChannels) {
+    // If the original dmask has one channel - then nothing to do
+    if (OldBitsSet == 1)
+      return Node;
+    // Use an arbitrary dmask - required for the instruction to work
+    NewDmask = 1;
+  }
  // Abort if there's no change
  if (NewDmask == OldDmask)
    return Node;
 
  unsigned BitsSet = countPopulation(NewDmask);
-  int NewOpcode = AMDGPU::getMaskedMIMGOp(Node->getMachineOpcode(), BitsSet);
+  // Check for TFE or LWE - increase the number of channels by one to account
+  // for the extra return value
+  // This will need adjustment for D16 if this is also included in
+  // adjustWriteMask (this function) but at present D16 are excluded.
+  unsigned NewChannels = BitsSet + UsesTFC;
+
+  int NewOpcode =
+      AMDGPU::getMaskedMIMGOp(Node->getMachineOpcode(), NewChannels);
  assert(NewOpcode != -1 &&
         NewOpcode != static_cast<int>(Node->getMachineOpcode()) &&
         "failed to find equivalent MIMG op");
@@ -8831,8 +9061,9 @@ SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node,
 
  MVT SVT = Node->getValueType(0).getVectorElementType().getSimpleVT();
 
-  MVT ResultVT = BitsSet == 1 ?
-    SVT : MVT::getVectorVT(SVT, BitsSet == 3 ? 4 : BitsSet);
+  MVT ResultVT = NewChannels == 1 ?
+    SVT : MVT::getVectorVT(SVT, NewChannels == 3 ? 4 :
+                           NewChannels == 5 ? 8 : NewChannels);
  SDVTList NewVTList = HasChain ?
    DAG.getVTList(ResultVT, MVT::Other) : DAG.getVTList(ResultVT);
 
@@ -8846,7 +9077,7 @@ SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node,
    DAG.ReplaceAllUsesOfValueWith(SDValue(Node, 1), SDValue(NewNode, 1));
  }
 
-  if (BitsSet == 1) {
+  if (NewChannels == 1) {
    assert(Node->hasNUsesOfValue(1, 0));
    SDNode *Copy = DAG.getMachineNode(TargetOpcode::COPY,
                                      SDLoc(Node), Users[Lane]->getValueType(0),
@@ -8856,19 +9087,24 @@ SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node,
  }
 
  // Update the users of the node with the new indices
-  for (unsigned i = 0, Idx = AMDGPU::sub0; i < 4; ++i) {
+  for (unsigned i = 0, Idx = AMDGPU::sub0; i < 5; ++i) {
    SDNode *User = Users[i];
-    if (!User)
-      continue;
-
-    SDValue Op = DAG.getTargetConstant(Idx, SDLoc(User), MVT::i32);
-    DAG.UpdateNodeOperands(User, SDValue(NewNode, 0), Op);
+    if (!User) {
+      // Handle the special case of NoChannels. We set NewDmask to 1 above, but
+      // Users[0] is still nullptr because channel 0 doesn't really have a use.
+      if (i || !NoChannels)
+        continue;
+    } else {
+      SDValue Op = DAG.getTargetConstant(Idx, SDLoc(User), MVT::i32);
+      DAG.UpdateNodeOperands(User, SDValue(NewNode, 0), Op);
+    }
 
    switch (Idx) {
    default: break;
    case AMDGPU::sub0: Idx = AMDGPU::sub1; break;
    case AMDGPU::sub1: Idx = AMDGPU::sub2; break;
    case AMDGPU::sub2: Idx = AMDGPU::sub3; break;
+    case AMDGPU::sub3: Idx = AMDGPU::sub4; break;
    }
  }