| author | David Stuttard <david.stuttard@amd.com> | 2018-11-29 20:14:17 +0000 |
|---|---|---|
| committer | David Stuttard <david.stuttard@amd.com> | 2018-11-29 20:14:17 +0000 |
| commit | c6603861d8bad3054ed137b140742eb15abcd3ce (patch) | |
| tree | c663eeb366bb7493cc0160c29fb8e9e31951b8e7 | |
| parent | eba2365f23db0cae29e9a187ec16bb64e49be5d6 (diff) | |
| download | bcm5719-llvm-c6603861d8bad3054ed137b140742eb15abcd3ce.tar.gz bcm5719-llvm-c6603861d8bad3054ed137b140742eb15abcd3ce.zip | |
Revert r347871 "Fix: Add support for TFE/LWE in image intrinsic"
Also revert fix r347876
One of the buildbots was reporting a failure in some relevant tests that I can't
repro or explain at present, so reverting until I can isolate the problem.
llvm-svn: 347911
20 files changed, 76 insertions, 1274 deletions
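For context on what is being reverted: TFE (texture fail enable) and LWE (LOD warning enable) are MIMG instruction bits that make the hardware write an extra status dword after the image data when an access fails. The reverted patch exposed them through the intrinsics' final TexFailCtrl operand (bit 0 selects TFE, bit 1 selects LWE, per the removed parseTexFail helper), changing the return type to a {data, i32} aggregate. A minimal IR sketch of that interface, adapted from the deleted tests in the diff below (the function name here is illustrative):

```llvm
; Aggregate-returning variant introduced by the reverted patch: the i32
; member is the TFE/LWE error word written after the texel data.
declare {<4 x float>, i32} @llvm.amdgcn.image.load.1d.v4f32i32.i32(i32, i32, <8 x i32>, i32, i32)

define amdgpu_ps <4 x float> @tfe_load_sketch(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s) {
main_body:
  ; Operands: dmask = 15 (all four channels), coordinate %s, resource,
  ; texfailctrl = 1 (TFE; 2 would request LWE), glc/slc = 0.
  %v = call {<4 x float>, i32} @llvm.amdgcn.image.load.1d.v4f32i32.i32(i32 15, i32 %s, <8 x i32> %rsrc, i32 1, i32 0)
  %v.vec = extractvalue {<4 x float>, i32} %v, 0
  %v.err = extractvalue {<4 x float>, i32} %v, 1
  store i32 %v.err, i32 addrspace(1)* %out, align 4
  ret <4 x float> %v.vec
}
```

The revert returns the in-tree intrinsics to their plain vector-returning form until the buildbot failure is understood.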
```diff
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 963a10b1916..67e7da7797a 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -590,7 +590,7 @@ class AMDGPUDimSampleProfile<string opmod,
                              AMDGPUDimProps dim,
                              AMDGPUSampleVariant sample> : AMDGPUDimProfile<opmod, dim> {
   let IsSample = 1;
-  let RetTypes = [llvm_any_ty];
+  let RetTypes = [llvm_anyfloat_ty];
   let ExtraAddrArgs = sample.ExtraAddrArgs;
   let Gradients = sample.Gradients;
   let LodClampMip = sample.LodOrClamp;
@@ -683,11 +683,11 @@ defset list<AMDGPUImageDimIntrinsic> AMDGPUImageDimIntrinsics = {
   }

   defm int_amdgcn_image_load
-    : AMDGPUImageDimIntrinsicsAll<"LOAD", [llvm_any_ty], [], [IntrReadMem],
+    : AMDGPUImageDimIntrinsicsAll<"LOAD", [llvm_anyfloat_ty], [], [IntrReadMem],
                                   [SDNPMemOperand]>,
       AMDGPUImageDMaskIntrinsic;
   defm int_amdgcn_image_load_mip
-    : AMDGPUImageDimIntrinsicsNoMsaa<"LOAD_MIP", [llvm_any_ty], [],
+    : AMDGPUImageDimIntrinsicsNoMsaa<"LOAD_MIP", [llvm_anyfloat_ty], [],
                                      [IntrReadMem], [SDNPMemOperand], 1>,
       AMDGPUImageDMaskIntrinsic;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index ea64b125b3b..07ae2bee49b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -42,7 +42,6 @@ FunctionPass *createSIFoldOperandsPass();
 FunctionPass *createSIPeepholeSDWAPass();
 FunctionPass *createSILowerI1CopiesPass();
 FunctionPass *createSIFixupVectorISelPass();
-FunctionPass *createSIAddIMGInitPass();
 FunctionPass *createSIShrinkInstructionsPass();
 FunctionPass *createSILoadStoreOptimizerPass();
 FunctionPass *createSIWholeQuadModePass();
@@ -154,9 +153,6 @@ extern char &AMDGPUSimplifyLibCallsID;
 void initializeAMDGPUUseNativeCallsPass(PassRegistry &);
 extern char &AMDGPUUseNativeCallsID;

-void initializeSIAddIMGInitPass(PassRegistry &);
-extern char &SIAddIMGInitID;
-
 void initializeAMDGPUPerfHintAnalysisPass(PassRegistry &);
 extern char &AMDGPUPerfHintAnalysisID;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index 8f20c407ab8..23470c7a4d2 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -367,16 +367,6 @@ def FeatureEnableDS128 : SubtargetFeature<"enable-ds128",
   "Use ds_{read|write}_b128"
 >;

-// Sparse texture support requires that all result registers are zeroed when
-// PRTStrictNull is set to true. This feature is turned on for all architectures
-// but is enabled as a feature in case there are situations where PRTStrictNull
-// is disabled by the driver.
-def FeatureEnablePRTStrictNull : SubtargetFeature<"enable-prt-strict-null",
-  "EnablePRTStrictNull",
-  "true",
-  "Enable zeroing of result registers for sparse texture fetches"
->;
-
 // Unless +-flat-for-global is specified, turn on FlatForGlobal for
 // all OS-es on VI and newer hardware to avoid assertion failures due
 // to missing ADDR64 variants of MUBUF instructions.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
index 1754ead2538..f1acd72b03a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -74,9 +74,6 @@ GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
   // We want to be able to turn these off, but making this a subtarget feature
   // for SI has the unhelpful behavior that it unsets everything else if you
   // disable it.
-  //
-  // Similarly we want enable-prt-strict-null to be on by default and not to
-  // unset everything else if it is disabled

   SmallString<256> FullFS("+promote-alloca,+dx10-clamp,+load-store-opt,");
@@ -92,8 +89,6 @@ GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
     FullFS += "-fp32-denormals,";
   }

-  FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS
-
   FullFS += FS;

   ParseSubtargetFeatures(GPU, FullFS);
@@ -180,7 +175,6 @@ GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
     EnableUnsafeDSOffsetFolding(false),
     EnableSIScheduler(false),
     EnableDS128(false),
-    EnablePRTStrictNull(false),
     DumpCode(false),

     FP64(false),
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
index f6b176fe604..886aca42b6c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -326,7 +326,6 @@ protected:
   bool EnableUnsafeDSOffsetFolding;
   bool EnableSIScheduler;
   bool EnableDS128;
-  bool EnablePRTStrictNull;
   bool DumpCode;

   // Subtarget statically properties set by tablegen
@@ -577,12 +576,6 @@ public:
     return getGeneration() < AMDGPUSubtarget::GFX9;
   }

-  /// \returns If target requires PRT Struct NULL support (zero result registers
-  /// for sparse texture support).
-  bool usePRTStrictNull() const {
-    return EnablePRTStrictNull;
-  }
-
   bool hasAutoWaitcntBeforeBarrier() const {
     return AutoWaitcntBeforeBarrier;
   }
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index c2e2129f5de..2198ba8d6c0 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -815,7 +815,6 @@ bool GCNPassConfig::addInstSelector() {
   addPass(&SIFixSGPRCopiesID);
   addPass(createSILowerI1CopiesPass());
   addPass(createSIFixupVectorISelPass());
-  addPass(createSIAddIMGInitPass());
   return false;
 }
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
index dd1b8532aae..bb1096bc1de 100644
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -93,7 +93,6 @@ add_llvm_target(AMDGPUCodeGen
   R600OptimizeVectorRegisters.cpp
   R600Packetizer.cpp
   R600RegisterInfo.cpp
-  SIAddIMGInit.cpp
   SIAnnotateControlFlow.cpp
   SIDebuggerInsertNops.cpp
   SIFixSGPRCopies.cpp
diff --git a/llvm/lib/Target/AMDGPU/MIMGInstructions.td b/llvm/lib/Target/AMDGPU/MIMGInstructions.td
index 1c68dbd78e7..1462682e761 100644
--- a/llvm/lib/Target/AMDGPU/MIMGInstructions.td
+++ b/llvm/lib/Target/AMDGPU/MIMGInstructions.td
@@ -29,7 +29,6 @@ class MIMGBaseOpcode {
   bit Atomic = 0;
   bit AtomicX2 = 0; // (f)cmpswap
   bit Sampler = 0;
-  bit Gather4 = 0;
   bits<8> NumExtraArgs = 0;
   bit Gradients = 0;
   bit Coordinates = 1;
@@ -44,7 +43,7 @@ def MIMGBaseOpcode : GenericEnum {
 def MIMGBaseOpcodesTable : GenericTable {
   let FilterClass = "MIMGBaseOpcode";
   let CppTypeName = "MIMGBaseOpcodeInfo";
-  let Fields = ["BaseOpcode", "Store", "Atomic", "AtomicX2", "Sampler", "Gather4",
+  let Fields = ["BaseOpcode", "Store", "Atomic", "AtomicX2", "Sampler",
                 "NumExtraArgs", "Gradients", "Coordinates", "LodOrClampOrMip",
                 "HasD16"];
   GenericEnum TypeOf_BaseOpcode = MIMGBaseOpcode;
@@ -180,8 +179,6 @@ multiclass MIMG_NoSampler <bits<7> op, string asm, bit has_d16, bit mip = 0,
     defm _V3 : MIMG_NoSampler_Src_Helper <op, asm, VReg_96, 0>;
     let VDataDwords = 4 in
     defm _V4 : MIMG_NoSampler_Src_Helper <op, asm, VReg_128, 0>;
-    let VDataDwords = 8 in
-    defm _V8 : MIMG_NoSampler_Src_Helper <op, asm, VReg_256, 0>;
   }
 }
@@ -414,8 +411,6 @@ multiclass MIMG_Sampler <bits<7> op, AMDGPUSampleVariant sample, bit wqm = 0,
     defm _V3 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_96>;
     let VDataDwords = 4 in
     defm _V4 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_128>;
-    let VDataDwords = 8 in
-    defm _V8 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_256>;
   }
 }
@@ -426,7 +421,6 @@ multiclass MIMG_Gather <bits<7> op, AMDGPUSampleVariant sample, bit wqm = 0,
                         string asm = "image_gather4"#sample.LowerCaseMod> {
   def "" : MIMG_Sampler_BaseOpcode<sample> {
     let HasD16 = 1;
-    let Gather4 = 1;
   }

   let BaseOpcode = !cast<MIMGBaseOpcode>(NAME), WQM = wqm,
@@ -435,8 +429,6 @@ multiclass MIMG_Gather <bits<7> op, AMDGPUSampleVariant sample, bit wqm = 0,
     defm _V2 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_64>; /* for packed D16 only */
     let VDataDwords = 4 in
     defm _V4 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_128, 1>;
-    let VDataDwords = 8 in
-    defm _V8 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_256>;
   }
 }
diff --git a/llvm/lib/Target/AMDGPU/SIAddIMGInit.cpp b/llvm/lib/Target/AMDGPU/SIAddIMGInit.cpp
deleted file mode 100644
index 69cafef4a35..00000000000
--- a/llvm/lib/Target/AMDGPU/SIAddIMGInit.cpp
+++ /dev/null
@@ -1,181 +0,0 @@
-//===-- SIAddIMGInit.cpp - Add any required IMG inits ---------------------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-/// \file
-/// Any MIMG instructions that use tfe or lwe require an initialization of the
-/// result register that will be written in the case of a memory access failure
-/// The required code is also added to tie this init code to the result of the
-/// img instruction
-///
-//===----------------------------------------------------------------------===//
-//
-
-#include "AMDGPU.h"
-#include "AMDGPUSubtarget.h"
-#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
-#include "SIInstrInfo.h"
-#include "llvm/CodeGen/MachineFunctionPass.h"
-#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/IR/Function.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Target/TargetMachine.h"
-
-#define DEBUG_TYPE "si-img-init"
-
-using namespace llvm;
-
-namespace {
-
-class SIAddIMGInit : public MachineFunctionPass {
-public:
-  static char ID;
-
-public:
-  SIAddIMGInit() : MachineFunctionPass(ID) {
-    initializeSIAddIMGInitPass(*PassRegistry::getPassRegistry());
-  }
-
-  bool runOnMachineFunction(MachineFunction &MF) override;
-
-  void getAnalysisUsage(AnalysisUsage &AU) const override {
-    AU.setPreservesCFG();
-    MachineFunctionPass::getAnalysisUsage(AU);
-  }
-};
-
-} // End anonymous namespace.
-
-INITIALIZE_PASS(SIAddIMGInit, DEBUG_TYPE, "SI Add IMG Init", false, false)
-
-char SIAddIMGInit::ID = 0;
-
-char &llvm::SIAddIMGInitID = SIAddIMGInit::ID;
-
-FunctionPass *llvm::createSIAddIMGInitPass() { return new SIAddIMGInit(); }
-
-bool SIAddIMGInit::runOnMachineFunction(MachineFunction &MF) {
-  MachineRegisterInfo &MRI = MF.getRegInfo();
-  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
-  const SIInstrInfo *TII = ST.getInstrInfo();
-  const SIRegisterInfo *RI = ST.getRegisterInfo();
-  bool Changed = false;
-
-  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); BI != BE;
-       ++BI) {
-    MachineBasicBlock &MBB = *BI;
-    MachineBasicBlock::iterator I, Next;
-    for (I = MBB.begin(); I != MBB.end(); I = Next) {
-      Next = std::next(I);
-      MachineInstr &MI = *I;
-
-      auto Opcode = MI.getOpcode();
-      if (TII->isMIMG(Opcode) && !MI.mayStore()) {
-        MachineOperand *TFE = TII->getNamedOperand(MI, AMDGPU::OpName::tfe);
-        MachineOperand *LWE = TII->getNamedOperand(MI, AMDGPU::OpName::lwe);
-        MachineOperand *D16 = TII->getNamedOperand(MI, AMDGPU::OpName::d16);
-
-        // Check for instructions that don't have tfe or lwe fields
-        // There shouldn't be any at this point.
-        assert( (TFE && LWE) && "Expected tfe and lwe operands in instruction");
-
-        unsigned TFEVal = TFE->getImm();
-        unsigned LWEVal = LWE->getImm();
-        unsigned D16Val = D16 ? D16->getImm() : 0;
-
-        if (TFEVal || LWEVal) {
-          // At least one of TFE or LWE are non-zero
-          // We have to insert a suitable initialization of the result value and
-          // tie this to the dest of the image instruction.
-
-          const DebugLoc &DL = MI.getDebugLoc();
-
-          int DstIdx =
-              AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata);
-
-          // Calculate which dword we have to initialize to 0.
-          MachineOperand *MO_Dmask =
-              TII->getNamedOperand(MI, AMDGPU::OpName::dmask);
-
-          // check that dmask operand is found.
-          assert(MO_Dmask && "Expected dmask operand in instruction");
-
-          unsigned dmask = MO_Dmask->getImm();
-          // Determine the number of active lanes taking into account the
-          // Gather4 special case
-          unsigned ActiveLanes =
-              TII->isGather4(Opcode) ? 4 : countPopulation(dmask);
-
-          // Subreg indices are counted from 1
-          // When D16 then we want next whole VGPR after write data.
-          static_assert(AMDGPU::sub0 == 1 && AMDGPU::sub4 == 5, "Subreg indices different from expected");
-
-          bool Packed = !ST.hasUnpackedD16VMem();
-
-          unsigned InitIdx =
-              D16Val && Packed ? ((ActiveLanes + 1) >> 1) + 1 : ActiveLanes + 1;
-
-          // Abandon attempt if the dst size isn't large enough
-          // - this is in fact an error but this is picked up elsewhere and
-          // reported correctly.
-          uint32_t DstSize =
-              RI->getRegSizeInBits(*TII->getOpRegClass(MI, DstIdx)) / 32;
-          if (DstSize < InitIdx)
-            continue;
-
-          // Create a register for the intialization value.
-          unsigned PrevDst =
-              MRI.createVirtualRegister(TII->getOpRegClass(MI, DstIdx));
-          unsigned NewDst = 0; // Final initialized value will be in here
-
-          // If PRTStrictNull feature is enabled (the default) then initialize
-          // all the result registers to 0, otherwise just the error indication
-          // register (VGPRn+1)
-          unsigned SizeLeft = ST.usePRTStrictNull() ? InitIdx : 1;
-          unsigned CurrIdx = ST.usePRTStrictNull() ? 1 : InitIdx;
-
-          if (DstSize == 1) {
-            // In this case we can just initialize the result directly
-            BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), PrevDst)
-                .addImm(0);
-            NewDst = PrevDst;
-          } else {
-            BuildMI(MBB, MI, DL, TII->get(AMDGPU::IMPLICIT_DEF), PrevDst);
-            for (; SizeLeft; SizeLeft--, CurrIdx++) {
-              NewDst =
-                  MRI.createVirtualRegister(TII->getOpRegClass(MI, DstIdx));
-              // Initialize dword
-              unsigned SubReg =
-                  MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
-              BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), SubReg)
-                  .addImm(0);
-              // Insert into the super-reg
-              BuildMI(MBB, I, DL, TII->get(TargetOpcode::INSERT_SUBREG), NewDst)
-                  .addReg(PrevDst)
-                  .addReg(SubReg)
-                  .addImm(CurrIdx);
-
-              PrevDst = NewDst;
-            }
-          }
-
-          // Add as an implicit operand
-          MachineInstrBuilder(MF, MI).addReg(NewDst, RegState::Implicit);
-
-          // Tie the just added implicit operand to the dst
-          MI.tieOperands(DstIdx, MI.getNumOperands() - 1);
-
-          Changed = true;
-        }
-      }
-    }
-  }
-
-  return Changed;
-}
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 1f53af1b505..9f5198042e4 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -216,7 +216,6 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
   setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v2f16, Custom);
   setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v4f16, Custom);
-  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v8f16, Custom);
   setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);

   setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
@@ -814,47 +813,6 @@ unsigned SITargetLowering::getVectorTypeBreakdownForCallingConv(
     Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
 }

-static MVT memVTFromAggregate(Type *Ty) {
-  // Only limited forms of aggregate type currently expected.
-  assert(Ty->isStructTy() && "Expected struct type");
-
-
-  Type *ElementType = nullptr;
-  unsigned NumElts;
-  if (Ty->getContainedType(0)->isVectorTy()) {
-    VectorType *VecComponent = cast<VectorType>(Ty->getContainedType(0));
-    ElementType = VecComponent->getElementType();
-    NumElts = VecComponent->getNumElements();
-  } else {
-    ElementType = Ty->getContainedType(0);
-    NumElts = 1;
-  }
-
-  assert(Ty->getContainedType(1)->isIntegerTy(32) && "Expected int32 type");
-
-  // Calculate the size of the memVT type from the aggregate
-  unsigned Pow2Elts = 0;
-  unsigned ElementSize;
-  switch (ElementType->getTypeID()) {
-  default:
-    llvm_unreachable("Unknown type!");
-  case Type::IntegerTyID:
-    ElementSize = cast<IntegerType>(ElementType)->getBitWidth();
-    break;
-  case Type::HalfTyID:
-    ElementSize = 16;
-    break;
-  case Type::FloatTyID:
-    ElementSize = 32;
-    break;
-  }
-  unsigned AdditionalElts = ElementSize == 16 ? 2 : 1;
-  Pow2Elts = 1 << Log2_32_Ceil(NumElts + AdditionalElts);
-
-  return MVT::getVectorVT(MVT::getVT(ElementType, false),
-                          Pow2Elts);
-}
-
 bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
                                           const CallInst &CI,
                                           MachineFunction &MF,
@@ -882,12 +840,7 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
       Info.flags = MachineMemOperand::MODereferenceable;
       if (Attr.hasFnAttribute(Attribute::ReadOnly)) {
         Info.opc = ISD::INTRINSIC_W_CHAIN;
-        Info.memVT = MVT::getVT(CI.getType(), true);
-        if (Info.memVT == MVT::Other) {
-          // Some intrinsics return an aggregate type - special case to work out
-          // the correct memVT
-          Info.memVT = memVTFromAggregate(CI.getType());
-        }
+        Info.memVT = MVT::getVT(CI.getType());
         Info.flags |= MachineMemOperand::MOLoad;
       } else if (Attr.hasFnAttribute(Attribute::WriteOnly)) {
         Info.opc = ISD::INTRINSIC_VOID;
@@ -4660,109 +4613,6 @@ static bool parseCachePolicy(SDValue CachePolicy, SelectionDAG &DAG,
   return Value == 0;
 }

-// Re-construct the required return value for a image load intrinsic.
-// This is more complicated due to the optional use TexFailCtrl which means the required
-// return type is an aggregate
-static SDValue constructRetValue(SelectionDAG &DAG,
-                                 MachineSDNode *Result,
-                                 ArrayRef<EVT> ResultTypes,
-                                 bool IsTexFail, bool Unpacked, bool IsD16,
-                                 int DMaskPop, int NumVDataDwords,
-                                 const SDLoc &DL, LLVMContext &Context) {
-  // Determine the required return type. This is the same regardless of IsTexFail flag
-  EVT ReqRetVT = ResultTypes[0];
-  EVT ReqRetEltVT = ReqRetVT.isVector() ? ReqRetVT.getVectorElementType() : ReqRetVT;
-  int ReqRetNumElts = ReqRetVT.isVector() ? ReqRetVT.getVectorNumElements() : 1;
-  EVT AdjEltVT = Unpacked && IsD16 ? MVT::i32 : ReqRetEltVT;
-  EVT AdjVT = Unpacked ? ReqRetNumElts > 1 ? EVT::getVectorVT(Context, AdjEltVT, ReqRetNumElts)
-                                           : AdjEltVT
-                       : ReqRetVT;
-
-  // Extract data part of the result
-  // Bitcast the result to the same type as the required return type
-  int NumElts;
-  if (IsD16 && !Unpacked)
-    NumElts = NumVDataDwords << 1;
-  else
-    NumElts = NumVDataDwords;
-
-  EVT CastVT = NumElts > 1 ? EVT::getVectorVT(Context, AdjEltVT, NumElts)
-                           : AdjEltVT;
-
-  // Special case for v8f16. Rather than add support for this, use v4i32 to
-  // extract the data elements
-  bool V8F16Special = false;
-  if (CastVT == MVT::v8f16) {
-    CastVT = MVT::v4i32;
-    DMaskPop >>= 1;
-    ReqRetNumElts >>= 1;
-    V8F16Special = true;
-    AdjVT = MVT::v2i32;
-  }
-
-  SDValue N = SDValue(Result, 0);
-  SDValue CastRes = DAG.getNode(ISD::BITCAST, DL, CastVT, N);
-
-  // Iterate over the result
-  SmallVector<SDValue, 4> BVElts;
-
-  if (CastVT.isVector()) {
-    DAG.ExtractVectorElements(CastRes, BVElts, 0, DMaskPop);
-  } else {
-    BVElts.push_back(CastRes);
-  }
-  int ExtraElts = ReqRetNumElts - DMaskPop;
-  while(ExtraElts--)
-    BVElts.push_back(DAG.getUNDEF(AdjEltVT));
-
-  SDValue PreTFCRes;
-  if (ReqRetNumElts > 1) {
-    SDValue NewVec = DAG.getBuildVector(AdjVT, DL, BVElts);
-    if (IsD16 && Unpacked)
-      PreTFCRes = adjustLoadValueTypeImpl(NewVec, ReqRetVT, DL, DAG, Unpacked);
-    else
-      PreTFCRes = NewVec;
-  } else {
-    PreTFCRes = BVElts[0];
-  }
-
-  if (V8F16Special)
-    PreTFCRes = DAG.getNode(ISD::BITCAST, DL, MVT::v4f16, PreTFCRes);
-
-  if (!IsTexFail) {
-    if (Result->getNumValues() > 1)
-      return DAG.getMergeValues({PreTFCRes, SDValue(Result, 1)}, DL);
-    else
-      return PreTFCRes;
-  }
-
-  // Extract the TexFail result and insert into aggregate return
-  SmallVector<SDValue, 1> TFCElt;
-  DAG.ExtractVectorElements(N, TFCElt, DMaskPop, 1);
-  SDValue TFCRes = DAG.getNode(ISD::BITCAST, DL, ResultTypes[1], TFCElt[0]);
-  return DAG.getMergeValues({PreTFCRes, TFCRes, SDValue(Result, 1)}, DL);
-}
-
-static bool parseTexFail(SDValue TexFailCtrl, SelectionDAG &DAG, SDValue *TFE,
-                         SDValue *LWE, bool &IsTexFail) {
-  auto TexFailCtrlConst = dyn_cast<ConstantSDNode>(TexFailCtrl.getNode());
-  if (!TexFailCtrlConst)
-    return false;
-
-  uint64_t Value = TexFailCtrlConst->getZExtValue();
-  if (Value) {
-    IsTexFail = true;
-  }
-
-  SDLoc DL(TexFailCtrlConst);
-  *TFE = DAG.getTargetConstant((Value & 0x1) ? 1 : 0, DL, MVT::i32);
-  Value &= ~(uint64_t)0x1;
-  *LWE = DAG.getTargetConstant((Value & 0x2) ? 1 : 0, DL, MVT::i32);
-  Value &= ~(uint64_t)0x2;
-
-  return Value == 0;
-}
-
 SDValue SITargetLowering::lowerImage(SDValue Op,
                                      const AMDGPU::ImageDimIntrinsicInfo *Intr,
                                      SelectionDAG &DAG) const {
@@ -4776,17 +4626,13 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
       AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode);
   unsigned IntrOpcode = Intr->BaseOpcode;

-  SmallVector<EVT, 3> ResultTypes(Op->value_begin(), Op->value_end());
-  SmallVector<EVT, 3> OrigResultTypes(Op->value_begin(), Op->value_end());
+  SmallVector<EVT, 2> ResultTypes(Op->value_begin(), Op->value_end());
   bool IsD16 = false;
   bool IsA16 = false;
   SDValue VData;
   int NumVDataDwords;
-  bool AdjustRetType = false;
-
   unsigned AddrIdx; // Index of first address argument
   unsigned DMask;
-  unsigned DMaskLanes = 0;

   if (BaseOpcode->Atomic) {
     VData = Op.getOperand(2);
@@ -4809,12 +4655,7 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
       AddrIdx = 3;
     }
   } else {
-    unsigned DMaskIdx = BaseOpcode->Store ? 3 : isa<MemSDNode>(Op) ? 2 : 1;
-    auto DMaskConst = dyn_cast<ConstantSDNode>(Op.getOperand(DMaskIdx));
-    if (!DMaskConst)
-      return Op;
-    DMask = DMaskConst->getZExtValue();
-    DMaskLanes = BaseOpcode->Gather4 ? 4 : countPopulation(DMask);
+    unsigned DMaskIdx;

     if (BaseOpcode->Store) {
       VData = Op.getOperand(2);
@@ -4830,32 +4671,37 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
       }

       NumVDataDwords = (VData.getValueType().getSizeInBits() + 31) / 32;
+      DMaskIdx = 3;
     } else {
-      // Work out the num dwords based on the dmask popcount and underlying type
-      // and whether packing is supported.
-      MVT LoadVT = ResultTypes[0].getSimpleVT();
+      MVT LoadVT = Op.getSimpleValueType();
       if (LoadVT.getScalarType() == MVT::f16) {
         if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS ||
             !BaseOpcode->HasD16)
           return Op; // D16 is unsupported for this instruction

         IsD16 = true;
+        if (LoadVT.isVector() && Subtarget->hasUnpackedD16VMem())
+          ResultTypes[0] = (LoadVT == MVT::v2f16) ? MVT::v2i32 : MVT::v4i32;
       }

-      // Confirm that the return type is large enough for the dmask specified
-      if ((LoadVT.isVector() && LoadVT.getVectorNumElements() < DMaskLanes) ||
-          (!LoadVT.isVector() && DMaskLanes > 1))
-        return Op;
-
-      if (IsD16 && !Subtarget->hasUnpackedD16VMem())
-        NumVDataDwords = (DMaskLanes + 1) / 2;
-      else
-        NumVDataDwords = DMaskLanes;
-
-      AdjustRetType = true;
+      NumVDataDwords = (ResultTypes[0].getSizeInBits() + 31) / 32;
+      DMaskIdx = isa<MemSDNode>(Op) ? 2 : 1;
     }
+
+    auto DMaskConst = dyn_cast<ConstantSDNode>(Op.getOperand(DMaskIdx));
+    if (!DMaskConst)
+      return Op;
+
+    AddrIdx = DMaskIdx + 1;
+    DMask = DMaskConst->getZExtValue();
+    if (!DMask && !BaseOpcode->Store) {
+      // Eliminate no-op loads. Stores with dmask == 0 are *not* no-op: they
+      // store the channels' default values.
+      SDValue Undef = DAG.getUNDEF(Op.getValueType());
+      if (isa<MemSDNode>(Op))
+        return DAG.getMergeValues({Undef, Op.getOperand(0)}, DL);
+      return Undef;
+    }
   }

   unsigned NumGradients = BaseOpcode->Gradients ? DimInfo->NumGradients : 0;
@@ -4934,53 +4780,11 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
     CtrlIdx = AddrIdx + NumVAddrs + 3;
   }

-  SDValue TFE;
-  SDValue LWE;
   SDValue TexFail = Op.getOperand(CtrlIdx);
-  bool IsTexFail = false;
-  if (!parseTexFail(TexFail, DAG, &TFE, &LWE, IsTexFail))
+  auto TexFailConst = dyn_cast<ConstantSDNode>(TexFail.getNode());
+  if (!TexFailConst || TexFailConst->getZExtValue() != 0)
     return Op;

-  if (IsTexFail) {
-    if (!NumVDataDwords) {
-      // Expecting to get an error flag since TFC is on - and dmask is 0
-      // Force dmask to be at least 1 otherwise the instruction will fail
-      DMask = 0x1;
-      DMaskLanes = 1;
-      NumVDataDwords = 1;
-    }
-    NumVDataDwords += 1;
-    AdjustRetType = true;
-  }
-
-  // Has something earlier tagged that the return type needs adjusting
-  // This happens if the instruction is a load or has set TexFailCtrl flags
-  if (AdjustRetType) {
-    // NumVDataDwords reflects the true number of dwords required in the return type
-    if (NumVDataDwords == 0 && !BaseOpcode->Store) {
-      // This is a no-op load. This can be eliminated
-      SDValue Undef = DAG.getUNDEF(Op.getValueType());
-      if (isa<MemSDNode>(Op))
-        return DAG.getMergeValues({Undef, Op.getOperand(0)}, DL);
-      return Undef;
-    }
-
-    // Have to use a power of 2 number of dwords
-    NumVDataDwords = 1 << Log2_32_Ceil(NumVDataDwords);
-
-    EVT NewVT = NumVDataDwords > 1 ?
-                  EVT::getVectorVT(*DAG.getContext(), MVT::f32, NumVDataDwords)
-                : MVT::f32;
-
-    ResultTypes[0] = NewVT;
-    if (ResultTypes.size() == 3) {
-      // Original result was aggregate type used for TexFailCtrl results
-      // The actual instruction returns as a vector type which has now been
-      // created. Remove the aggregate result.
-      ResultTypes.erase(&ResultTypes[1]);
-    }
-  }
-
   SDValue GLC;
   SDValue SLC;
   if (BaseOpcode->Atomic) {
@@ -5005,8 +4809,8 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
   Ops.push_back(SLC);
   Ops.push_back(IsA16 &&  // a16 or r128
                 ST->hasFeature(AMDGPU::FeatureR128A16) ? True : False);
-  Ops.push_back(TFE); // tfe
-  Ops.push_back(LWE); // lwe
+  Ops.push_back(False); // tfe
+  Ops.push_back(False); // lwe
   Ops.push_back(DimInfo->DA ? True : False);
   if (BaseOpcode->HasD16)
     Ops.push_back(IsD16 ? True : False);
@@ -5034,12 +4838,11 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
     SmallVector<SDValue, 1> Elt;
     DAG.ExtractVectorElements(SDValue(NewNode, 0), Elt, 0, 1);
     return DAG.getMergeValues({Elt[0], SDValue(NewNode, 1)}, DL);
-  } else if (!BaseOpcode->Store) {
-    return constructRetValue(DAG, NewNode,
-                             OrigResultTypes, IsTexFail,
-                             Subtarget->hasUnpackedD16VMem(), IsD16,
-                             DMaskLanes, NumVDataDwords, DL,
-                             *DAG.getContext());
+  } else if (IsD16 && !BaseOpcode->Store) {
+    MVT LoadVT = Op.getSimpleValueType();
+    SDValue Adjusted = adjustLoadValueTypeImpl(
+        SDValue(NewNode, 0), LoadVT, DL, DAG, Subtarget->hasUnpackedD16VMem());
+    return DAG.getMergeValues({Adjusted, SDValue(NewNode, 1)}, DL);
   }

   return SDValue(NewNode, 0);
@@ -8969,7 +8772,6 @@ static unsigned SubIdx2Lane(unsigned Idx) {
   case AMDGPU::sub1: return 1;
   case AMDGPU::sub2: return 2;
   case AMDGPU::sub3: return 3;
-  case AMDGPU::sub4: return 4; // Possible with TFE/LWE
   }
 }
@@ -8983,16 +8785,11 @@ SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node,
   if (D16Idx >= 0 && Node->getConstantOperandVal(D16Idx))
     return Node; // not implemented for D16

-  SDNode *Users[5] = { nullptr };
+  SDNode *Users[4] = { nullptr };
   unsigned Lane = 0;
   unsigned DmaskIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::dmask) - 1;
   unsigned OldDmask = Node->getConstantOperandVal(DmaskIdx);
   unsigned NewDmask = 0;
-  unsigned TFEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::tfe) - 1;
-  unsigned LWEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::lwe) - 1;
-  bool UsesTFC = (Node->getConstantOperandVal(TFEIdx) ||
-                  Node->getConstantOperandVal(LWEIdx)) ? 1 : 0;
-  unsigned TFCLane = 0;
   bool HasChain = Node->getNumValues() > 1;

   if (OldDmask == 0) {
@@ -9000,12 +8797,6 @@ SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node,
     return Node;
   }

-  unsigned OldBitsSet = countPopulation(OldDmask);
-  // Work out which is the TFE/LWE lane if that is enabled.
-  if (UsesTFC) {
-    TFCLane = OldBitsSet;
-  }
-
   // Try to figure out the used register components
   for (SDNode::use_iterator I = Node->use_begin(), E = Node->use_end();
        I != E; ++I) {
@@ -9025,49 +8816,28 @@ SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node,
     // set, etc.
     Lane = SubIdx2Lane(I->getConstantOperandVal(1));

-    // Check if the use is for the TFE/LWE generated result at VGPRn+1.
-    if (UsesTFC && Lane == TFCLane) {
-      Users[Lane] = *I;
-    } else {
-      // Set which texture component corresponds to the lane.
-      unsigned Comp;
-      for (unsigned i = 0, Dmask = OldDmask; (i <= Lane) && (Dmask != 0); i++) {
-        Comp = countTrailingZeros(Dmask);
-        Dmask &= ~(1 << Comp);
-      }
-
-      // Abort if we have more than one user per component.
-      if (Users[Lane])
-        return Node;
-
-      Users[Lane] = *I;
-      NewDmask |= 1 << Comp;
+    // Set which texture component corresponds to the lane.
+    unsigned Comp;
+    for (unsigned i = 0, Dmask = OldDmask; (i <= Lane) && (Dmask != 0); i++) {
+      Comp = countTrailingZeros(Dmask);
+      Dmask &= ~(1 << Comp);
     }
-  }
-  // Don't allow 0 dmask, as hardware assumes one channel enabled.
-  bool NoChannels = !NewDmask;
-  if (NoChannels) {
-    // If the original dmask has one channel - then nothing to do
-    if (OldBitsSet == 1)
+    // Abort if we have more than one user per component
+    if (Users[Lane])
       return Node;
-    // Use an arbitrary dmask - required for the instruction to work
-    NewDmask = 1;
+
+    Users[Lane] = *I;
+    NewDmask |= 1 << Comp;
   }
+
   // Abort if there's no change
   if (NewDmask == OldDmask)
     return Node;

   unsigned BitsSet = countPopulation(NewDmask);

-  // Check for TFE or LWE - increase the number of channels by one to account
-  // for the extra return value
-  // This will need adjustment for D16 if this is also included in
-  // adjustWriteMask (this function) but at present D16 are excluded.
-  unsigned NewChannels = BitsSet + UsesTFC;
-
-  int NewOpcode =
-      AMDGPU::getMaskedMIMGOp(Node->getMachineOpcode(), NewChannels);
+  int NewOpcode = AMDGPU::getMaskedMIMGOp(Node->getMachineOpcode(), BitsSet);
   assert(NewOpcode != -1 &&
          NewOpcode != static_cast<int>(Node->getMachineOpcode()) &&
          "failed to find equivalent MIMG op");
@@ -9080,9 +8850,8 @@ SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node,

   MVT SVT = Node->getValueType(0).getVectorElementType().getSimpleVT();

-  MVT ResultVT = NewChannels == 1 ?
-    SVT : MVT::getVectorVT(SVT, NewChannels == 3 ? 4 :
-                           NewChannels == 5 ? 8 : NewChannels);
+  MVT ResultVT = BitsSet == 1 ?
+    SVT : MVT::getVectorVT(SVT, BitsSet == 3 ? 4 : BitsSet);
   SDVTList NewVTList = HasChain ?
     DAG.getVTList(ResultVT, MVT::Other) : DAG.getVTList(ResultVT);

@@ -9096,7 +8865,7 @@ SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node,
     DAG.ReplaceAllUsesOfValueWith(SDValue(Node, 1), SDValue(NewNode, 1));
   }

-  if (NewChannels == 1) {
+  if (BitsSet == 1) {
     assert(Node->hasNUsesOfValue(1, 0));
     SDNode *Copy = DAG.getMachineNode(TargetOpcode::COPY,
                                       SDLoc(Node), Users[Lane]->getValueType(0),
@@ -9106,24 +8875,19 @@ SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node,
   }

   // Update the users of the node with the new indices
-  for (unsigned i = 0, Idx = AMDGPU::sub0; i < 5; ++i) {
+  for (unsigned i = 0, Idx = AMDGPU::sub0; i < 4; ++i) {
     SDNode *User = Users[i];
-    if (!User) {
-      // Handle the special case of NoChannels. We set NewDmask to 1 above, but
-      // Users[0] is still nullptr because channel 0 doesn't really have a use.
-      if (i || !NoChannels)
-        continue;
-    } else {
-      SDValue Op = DAG.getTargetConstant(Idx, SDLoc(User), MVT::i32);
-      DAG.UpdateNodeOperands(User, SDValue(NewNode, 0), Op);
-    }
+    if (!User)
+      continue;
+
+    SDValue Op = DAG.getTargetConstant(Idx, SDLoc(User), MVT::i32);
+    DAG.UpdateNodeOperands(User, SDValue(NewNode, 0), Op);

     switch (Idx) {
     default: break;
     case AMDGPU::sub0: Idx = AMDGPU::sub1; break;
     case AMDGPU::sub1: Idx = AMDGPU::sub2; break;
     case AMDGPU::sub2: Idx = AMDGPU::sub3; break;
-    case AMDGPU::sub3: Idx = AMDGPU::sub4; break;
     }
   }
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 5d2ca05ec5b..580ceed8b8d 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -2968,42 +2968,6 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
     }
   }

-  // Verify MIMG
-  if (isMIMG(MI.getOpcode()) && !MI.mayStore()) {
-    // Ensure that the return type used is large enough for all the options
-    // being used TFE/LWE require an extra result register.
-    const MachineOperand *DMask = getNamedOperand(MI, AMDGPU::OpName::dmask);
-    if (DMask) {
-      uint64_t DMaskImm = DMask->getImm();
-      uint32_t RegCount =
-          isGather4(MI.getOpcode()) ? 4 : countPopulation(DMaskImm);
-      const MachineOperand *TFE = getNamedOperand(MI, AMDGPU::OpName::tfe);
-      const MachineOperand *LWE = getNamedOperand(MI, AMDGPU::OpName::lwe);
-      const MachineOperand *D16 = getNamedOperand(MI, AMDGPU::OpName::d16);
-
-      // Adjust for packed 16 bit values
-      if (D16 && D16->getImm() && !ST.hasUnpackedD16VMem())
-        RegCount >>= 1;
-
-      // Adjust if using LWE or TFE
-      if ((LWE && LWE->getImm()) || (TFE && TFE->getImm()))
-        RegCount += 1;
-
-      const uint32_t DstIdx =
-          AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata);
-      const MachineOperand &Dst = MI.getOperand(DstIdx);
-      if (Dst.isReg()) {
-        const TargetRegisterClass *DstRC = getOpRegClass(MI, DstIdx);
-        uint32_t DstSize = RI.getRegSizeInBits(*DstRC) / 32;
-        if (RegCount > DstSize) {
-          ErrInfo = "MIMG instruction returns too many registers for dst "
-                    "register class";
-          return false;
-        }
-      }
-    }
-  }
-
   // Verify VOP*. Ignore multiple sgpr operands on writelane.
   if (Desc.getOpcode() != AMDGPU::V_WRITELANE_B32
       && (isVOP1(MI) || isVOP2(MI) || isVOP3(MI) || isVOPC(MI) || isSDWA(MI))) {
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index 2a0416e45cf..4bf16f59621 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -184,7 +184,6 @@ struct MIMGBaseOpcodeInfo {
   bool Atomic;
   bool AtomicX2;
   bool Sampler;
-  bool Gather4;

   uint8_t NumExtraArgs;
   bool Gradients;
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
index 512e1a89b77..07e21225450 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
+++ b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
@@ -802,8 +802,7 @@ private:
   Value *simplifyAMDGCNMemoryIntrinsicDemanded(IntrinsicInst *II,
                                                APInt DemandedElts,
-                                               int DmaskIdx = -1,
-                                               int TFCIdx = -1);
+                                               int DmaskIdx = -1);

   Value *SimplifyDemandedVectorElts(Value *V, APInt DemandedElts,
                                     APInt &UndefElts, unsigned Depth = 0);
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
index 68d980fca9d..a193dde1c39 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
@@ -969,24 +969,11 @@ InstCombiner::simplifyShrShlDemandedBits(Instruction *Shr, const APInt &ShrOp1,
 /// Implement SimplifyDemandedVectorElts for amdgcn buffer and image intrinsics.
 Value *InstCombiner::simplifyAMDGCNMemoryIntrinsicDemanded(IntrinsicInst *II,
                                                            APInt DemandedElts,
-                                                           int DMaskIdx,
-                                                           int TFCIdx) {
+                                                           int DMaskIdx) {
   unsigned VWidth = II->getType()->getVectorNumElements();
   if (VWidth == 1)
     return nullptr;

-  // Need to change to new instruction format
-  ConstantInt *TFC = nullptr;
-  bool TFELWEEnabled = false;
-  if (TFCIdx > 0) {
-    TFC = dyn_cast<ConstantInt>(II->getArgOperand(TFCIdx));
-    TFELWEEnabled = TFC->getZExtValue() & 0x1  // TFE
-                    || TFC->getZExtValue() & 0x2; // LWE
-  }
-
-  if (TFELWEEnabled)
-    return nullptr; // TFE not yet supported
-
   ConstantInt *NewDMask = nullptr;

   if (DMaskIdx < 0) {
@@ -1635,8 +1622,7 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts,
       return simplifyAMDGCNMemoryIntrinsicDemanded(II, DemandedElts);
     default: {
       if (getAMDGPUImageDMaskIntrinsic(II->getIntrinsicID()))
-        return simplifyAMDGCNMemoryIntrinsicDemanded(
-            II, DemandedElts, 0, II->getNumArgOperands() - 2);
+        return simplifyAMDGCNMemoryIntrinsicDemanded(II, DemandedElts, 0);

       break;
     }
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll
index b297acab36c..bf93ffa937a 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll
@@ -1,7 +1,6 @@
-; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI,SIVI,PRT %s
-; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI,SIVI,PRT %s
-; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX900,PRT %s
-; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-enable-prt-strict-null -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX900,NOPRT %s
+; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI %s
+; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN %s

 ; GCN-LABEL: {{^}}load_1d:
 ; GCN: image_load v[0:3], v0, s[0:7] dmask:0xf unorm{{$}}
@@ -11,52 +10,6 @@ main_body:
   ret <4 x float> %v
 }

-; GCN-LABEL: {{^}}load_1d_tfe:
-; PRT: v_mov_b32_e32 v0, 0
-; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
-; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
-; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
-; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
-; NOPRT: v_mov_b32_e32 v4, 0
-; NOPRT-NOT: v_mov_b32_e32 v0
-; NOPRT-NOT: v_mov_b32_e32 v1
-; NOPRT-NOT: v_mov_b32_e32 v2
-; NOPRT-NOT: v_mov_b32_e32 v3
-; GCN: image_load v[0:7], v{{[0-9]+}}, s[0:7] dmask:0xf unorm tfe{{$}}
-; SIVI: buffer_store_dword v4, off, s[8:11], 0
-; GFX900: global_store_dword v[{{[0-9]+:[0-9]+}}], v4
-define amdgpu_ps <4 x float> @load_1d_tfe(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s) {
-main_body:
-  %v = call {<4 x float>,i32} @llvm.amdgcn.image.load.1d.v4f32i32.i32(i32 15, i32 %s, <8 x i32> %rsrc, i32 1, i32 0)
-  %v.vec = extractvalue {<4 x float>, i32} %v, 0
-  %v.err = extractvalue {<4 x float>, i32} %v, 1
-  store i32 %v.err, i32 addrspace(1)* %out, align 4
-  ret <4 x float> %v.vec
-}
-
-; GCN-LABEL: {{^}}load_1d_lwe:
-; PRT: v_mov_b32_e32 v0, 0
-; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
-; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
-; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
-; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
-; NOPRT: v_mov_b32_e32 v4, 0
-; NOPRT-NOT: v_mov_b32_e32 v0
-; NOPRT-NOT: v_mov_b32_e32 v1
-; NOPRT-NOT: v_mov_b32_e32 v2
-; NOPRT-NOT: v_mov_b32_e32 v3
-; GCN: image_load v[0:7], v{{[0-9]+}}, s[0:7] dmask:0xf unorm lwe{{$}}
-; SIVI: buffer_store_dword v4, off, s[8:11], 0
-; GFX900: global_store_dword v[{{[0-9]+:[0-9]+}}], v4
-define amdgpu_ps <4 x float> @load_1d_lwe(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s) {
-main_body:
-  %v = call {<4 x float>, i32} @llvm.amdgcn.image.load.1d.v4f32i32.i32(i32 15, i32 %s, <8 x i32> %rsrc, i32 2, i32 0)
-  %v.vec = extractvalue {<4 x float>, i32} %v, 0
-  %v.err = extractvalue {<4 x float>, i32} %v, 1
-  store i32 %v.err, i32 addrspace(1)* %out, align 4
-  ret <4 x float> %v.vec
-}
-
 ; GCN-LABEL: {{^}}load_2d:
 ; GCN: image_load v[0:3], v[0:1], s[0:7] dmask:0xf unorm{{$}}
 define amdgpu_ps <4 x float> @load_2d(<8 x i32> inreg %rsrc, i32 %s, i32 %t) {
@@ -65,29 +18,6 @@ main_body:
   ret <4 x float> %v
 }

-; GCN-LABEL: {{^}}load_2d_tfe:
-; PRT: v_mov_b32_e32 v0, 0
-; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
-; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
-; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
-; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
-; NOPRT: v_mov_b32_e32 v4, 0
-; NOPRT-NOT: v_mov_b32_e32 v0
-; NOPRT-NOT: v_mov_b32_e32 v1
-; NOPRT-NOT: v_mov_b32_e32 v2
-; NOPRT-NOT: v_mov_b32_e32 v3
-; GCN: image_load v[0:7], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0xf unorm tfe{{$}}
-; SIVI: buffer_store_dword v4, off, s[8:11], 0
-; GFX900: global_store_dword v[{{[0-9]+:[0-9]+}}], v4
-define amdgpu_ps <4 x float> @load_2d_tfe(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s, i32 %t) {
-main_body:
-  %v = call {<4 x float>,i32} @llvm.amdgcn.image.load.2d.v4f32i32.i32(i32 15, i32 %s, i32 %t, <8 x i32> %rsrc, i32 1, i32 0)
-  %v.vec = extractvalue {<4 x float>, i32} %v, 0
-  %v.err = extractvalue {<4 x float>, i32} %v, 1
-  store i32 %v.err, i32 addrspace(1)* %out, align 4
-  ret <4 x float> %v.vec
-}
-
 ; GCN-LABEL: {{^}}load_3d:
 ; GCN: image_load v[0:3], v[0:3], s[0:7] dmask:0xf unorm{{$}}
 define amdgpu_ps <4 x float> @load_3d(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %r) {
@@ -96,29 +26,6 @@ main_body:
   ret <4 x float> %v
 }

-; GCN-LABEL: {{^}}load_3d_tfe_lwe:
-; PRT: v_mov_b32_e32 v0, 0
-; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
-; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
-; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
-; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
-; NOPRT: v_mov_b32_e32 v4, 0
-; NOPRT-NOT: v_mov_b32_e32 v0
-; NOPRT-NOT: v_mov_b32_e32 v1
-; NOPRT-NOT: v_mov_b32_e32 v2
-; NOPRT-NOT: v_mov_b32_e32 v3
-; GCN: image_load v[0:7], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0xf unorm tfe lwe{{$}}
-; SIVI: buffer_store_dword v4, off, s[8:11], 0
-; GFX900: global_store_dword v[{{[0-9]+:[0-9]+}}], v4
-define amdgpu_ps <4 x float> @load_3d_tfe_lwe(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s, i32 %t, i32 %r) {
-main_body:
-  %v = call {<4 x float>,i32} @llvm.amdgcn.image.load.3d.v4f32i32.i32(i32 15, i32 %s, i32 %t, i32 %r, <8 x i32> %rsrc, i32 3, i32 0)
-  %v.vec = extractvalue {<4 x float>, i32} %v, 0
-  %v.err = extractvalue {<4 x float>, i32} %v, 1
-  store i32 %v.err, i32 addrspace(1)* %out, align 4
-  ret <4 x float> %v.vec
-}
-
 ; GCN-LABEL: {{^}}load_cube:
 ; GCN: image_load v[0:3], v[0:3], s[0:7] dmask:0xf unorm da{{$}}
 define amdgpu_ps <4 x float> @load_cube(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %slice) {
@@ -127,29 +34,6 @@ main_body:
   ret <4 x float> %v
 }

-; GCN-LABEL: {{^}}load_cube_lwe:
-; PRT: v_mov_b32_e32 v0, 0
-; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
-; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
-; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
-; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
-; NOPRT: v_mov_b32_e32 v4, 0
-; NOPRT-NOT: v_mov_b32_e32 v0
-; NOPRT-NOT: v_mov_b32_e32 v1
-; NOPRT-NOT: v_mov_b32_e32 v2
-; NOPRT-NOT: v_mov_b32_e32 v3
-; GCN: image_load v[0:7], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0xf unorm lwe da{{$}}
-; SIVI: buffer_store_dword v4, off, s[8:11], 0
-; GFX900: global_store_dword v[{{[0-9]+:[0-9]+}}], v4
-define amdgpu_ps <4 x float> @load_cube_lwe(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s, i32 %t, i32 %slice) {
-main_body:
-  %v = call {<4 x float>,i32} @llvm.amdgcn.image.load.cube.v4f32i32.i32(i32 15, i32 %s, i32 %t, i32 %slice, <8 x i32> %rsrc, i32 2, i32 0)
-  %v.vec = extractvalue {<4 x float>, i32} %v, 0
-  %v.err = extractvalue {<4 x float>, i32} %v, 1
-  store i32 %v.err, i32 addrspace(1)* %out, align 4
-  ret <4 x float> %v.vec
-}
-
 ; GCN-LABEL: {{^}}load_1darray:
 ; GCN: image_load v[0:3], v[0:1], s[0:7] dmask:0xf unorm da{{$}}
 define amdgpu_ps <4 x float> @load_1darray(<8 x i32> inreg %rsrc, i32 %s, i32 %slice) {
@@ -158,29 +42,6 @@ main_body:
   ret <4 x float> %v
 }

-; GCN-LABEL: {{^}}load_1darray_tfe:
-; PRT: v_mov_b32_e32 v0, 0
-; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
-; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
-; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
-; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
-; NOPRT: v_mov_b32_e32 v4, 0
-; NOPRT-NOT: v_mov_b32_e32 v0
-; NOPRT-NOT: v_mov_b32_e32 v1
-; NOPRT-NOT: v_mov_b32_e32 v2
-; NOPRT-NOT: v_mov_b32_e32 v3
-; GCN: image_load v[0:7], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0xf unorm tfe da{{$}}
-; SIVI: buffer_store_dword v4, off, s[8:11], 0
-; GFX900: global_store_dword v[{{[0-9]+:[0-9]+}}], v4
-define amdgpu_ps <4 x float> @load_1darray_tfe(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s, i32 %slice) {
-main_body:
-  %v = call {<4 x float>,i32} @llvm.amdgcn.image.load.1darray.v4f32i32.i32(i32 15, i32 %s, i32 %slice, <8 x i32> %rsrc, i32 1, i32 0)
-  %v.vec = extractvalue {<4 x float>, i32} %v, 0
-  %v.err = extractvalue {<4 x float>, i32} %v, 1
-  store i32 %v.err, i32 addrspace(1)* %out, align 4
-  ret <4 x float> %v.vec
-}
-
 ; GCN-LABEL: {{^}}load_2darray:
 ; GCN: image_load v[0:3], v[0:3], s[0:7] dmask:0xf unorm da{{$}}
 define amdgpu_ps <4 x float> @load_2darray(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %slice) {
@@ -189,29 +50,6 @@ main_body:
   ret <4 x float> %v
 }

-; GCN-LABEL: {{^}}load_2darray_lwe:
-; PRT: v_mov_b32_e32 v0, 0
-; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
-; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
-; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
-; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
-; NOPRT: v_mov_b32_e32 v4, 0
-; NOPRT-NOT: v_mov_b32_e32 v0
-; NOPRT-NOT: v_mov_b32_e32 v1
-; NOPRT-NOT: v_mov_b32_e32 v2
-; NOPRT-NOT: v_mov_b32_e32 v3
-; GCN: image_load v[0:7], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0xf unorm lwe da{{$}}
-; SIVI: buffer_store_dword v4, off, s[8:11], 0
-; GFX900: global_store_dword v[{{[0-9]+:[0-9]+}}], v4
-define amdgpu_ps <4 x float> @load_2darray_lwe(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s, i32 %t, i32 %slice) {
-main_body:
-  %v = call {<4 x float>,i32} @llvm.amdgcn.image.load.2darray.v4f32i32.i32(i32 15, i32 %s, i32 %t, i32 %slice, <8 x i32> %rsrc, i32 2, i32 0)
-  %v.vec = extractvalue {<4 x float>, i32} %v, 0
-  %v.err = extractvalue {<4 x float>, i32} %v, 1
-  store i32 %v.err, i32 addrspace(1)* %out, align 4
-  ret <4 x float> %v.vec
-}
-
 ; GCN-LABEL: {{^}}load_2dmsaa:
 ; GCN: image_load v[0:3], v[0:3], s[0:7] dmask:0xf unorm{{$}}
 define amdgpu_ps <4 x float> @load_2dmsaa(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %fragid) {
@@ -220,29 +58,6 @@ main_body:
   ret <4 x float> %v
 }

-; GCN-LABEL: {{^}}load_2dmsaa_both:
-; PRT: v_mov_b32_e32 v0, 0
-; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
-; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
-; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
-; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
-; NOPRT: v_mov_b32_e32 v4, 0
-; NOPRT-NOT: v_mov_b32_e32 v0
-; NOPRT-NOT: v_mov_b32_e32 v1
-; NOPRT-NOT: v_mov_b32_e32 v2
-; NOPRT-NOT: v_mov_b32_e32 v3
-; GCN: image_load v[0:7], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0xf unorm tfe lwe{{$}}
-; SIVI: buffer_store_dword v4, off, s[8:11], 0
-; GFX900: global_store_dword v[{{[0-9]+:[0-9]+}}], v4
-define amdgpu_ps <4 x float> @load_2dmsaa_both(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s, i32 %t, i32 %fragid) {
-main_body:
-  %v = call {<4 x float>,i32} @llvm.amdgcn.image.load.2dmsaa.v4f32i32.i32(i32 15, i32 %s, i32 %t, i32 %fragid, <8 x i32> %rsrc, i32 3, i32 0)
-  %v.vec = extractvalue {<4 x float>, i32} %v, 0
-  %v.err = extractvalue {<4 x float>, i32} %v, 1
-  store i32 %v.err, i32 addrspace(1)* %out, align 4
-  ret <4 x float> %v.vec
-}
-
 ; GCN-LABEL: {{^}}load_2darraymsaa:
 ; GCN: image_load v[0:3], v[0:3], s[0:7] dmask:0xf unorm da{{$}}
 define amdgpu_ps <4 x float> @load_2darraymsaa(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %slice, i32 %fragid) {
@@ -251,29 +66,6 @@ main_body:
   ret <4 x float> %v
 }

-; GCN-LABEL: {{^}}load_2darraymsaa_tfe:
-; PRT: v_mov_b32_e32 v0, 0
-; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
-; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
-; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
-; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
-; NOPRT: v_mov_b32_e32 v4, 0
-; NOPRT-NOT: v_mov_b32_e32 v0
-; NOPRT-NOT: v_mov_b32_e32 v1
-; NOPRT-NOT: v_mov_b32_e32 v2
-; NOPRT-NOT: v_mov_b32_e32 v3
-; GCN: image_load v[0:7], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0xf unorm tfe da{{$}}
-; SIVI: buffer_store_dword v4, off, s[8:11], 0
-; GFX900: global_store_dword v[{{[0-9]+:[0-9]+}}], v4
-define amdgpu_ps <4 x float> @load_2darraymsaa_tfe(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s, i32 %t, i32 %slice, i32 %fragid) {
-main_body:
-  %v = call {<4 x float>,i32} @llvm.amdgcn.image.load.2darraymsaa.v4f32i32.i32(i32 15, i32 %s, i32 %t, i32 %slice, i32 %fragid, <8 x i32> %rsrc, i32 1, i32 0)
-  %v.vec = extractvalue {<4 x float>, i32} %v, 0
-  %v.err = extractvalue {<4 x float>, i32} %v, 1
-  store i32 %v.err, i32 addrspace(1)* %out, align 4
-  ret <4 x float> %v.vec
-}
-
 ; GCN-LABEL: {{^}}load_mip_1d:
 ; GCN: image_load_mip v[0:3], v[0:1], s[0:7] dmask:0xf unorm{{$}}
 define amdgpu_ps <4 x float> @load_mip_1d(<8 x i32> inreg %rsrc, i32 %s, i32 %mip) {
@@ -282,29 +74,6 @@ main_body:
   ret <4 x float> %v
 }

-; GCN-LABEL: {{^}}load_mip_1d_lwe:
-; PRT: v_mov_b32_e32 v0, 0
-; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
-; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
-; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
-; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
-; NOPRT: v_mov_b32_e32 v4, 0
-; NOPRT-NOT: v_mov_b32_e32 v0
-; NOPRT-NOT: v_mov_b32_e32 v1
-; NOPRT-NOT: v_mov_b32_e32 v2
-; NOPRT-NOT: v_mov_b32_e32 v3
-; GCN: image_load_mip v[0:7], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0xf unorm lwe{{$}}
-; SIVI: buffer_store_dword v4, off, s[8:11], 0
-; GFX900: global_store_dword v[{{[0-9]+:[0-9]+}}], v4
-define amdgpu_ps <4 x float> @load_mip_1d_lwe(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s, i32 %mip) {
-main_body:
-  %v = call {<4 x float>,i32} @llvm.amdgcn.image.load.mip.1d.v4f32i32.i32(i32 15, i32 %s, i32 %mip, <8 x i32> %rsrc, i32 2, i32 0)
-  %v.vec = extractvalue {<4 x float>, i32} %v, 0
-  %v.err = extractvalue {<4 x float>, i32} %v, 1
-  store i32 %v.err, i32 addrspace(1)* %out, align 4
-  ret <4 x float> %v.vec
-}
-
 ; GCN-LABEL: {{^}}load_mip_2d:
 ; GCN: image_load_mip v[0:3], v[0:3], s[0:7] dmask:0xf unorm{{$}}
 define amdgpu_ps <4 x float> @load_mip_2d(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %mip) {
@@ -313,191 +82,6 @@ main_body:
   ret <4 x float> %v
 }

-; GCN-LABEL: {{^}}load_mip_2d_tfe:
-; PRT: v_mov_b32_e32 v0, 0
-; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
-; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
-; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
-; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
-; NOPRT: v_mov_b32_e32 v4, 0
-; NOPRT-NOT: v_mov_b32_e32 v0
-; NOPRT-NOT: v_mov_b32_e32 v1
-; NOPRT-NOT: v_mov_b32_e32 v2
-; NOPRT-NOT: v_mov_b32_e32 v3
-; GCN: image_load_mip v[0:7], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0xf unorm tfe{{$}}
-; SIVI: buffer_store_dword v4, off, s[8:11], 0
-; GFX900: global_store_dword v[{{[0-9]+:[0-9]+}}], v4
-define amdgpu_ps <4 x float> @load_mip_2d_tfe(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s, i32 %t, i32 %mip) {
-main_body:
-  %v = call {<4 x float>,i32} @llvm.amdgcn.image.load.mip.2d.v4f32i32.i32(i32 15, i32 %s, i32 %t, i32 %mip, <8 x i32> %rsrc, i32 1, i32 0)
-  %v.vec = extractvalue {<4 x float>, i32} %v, 0
-  %v.err = extractvalue {<4 x float>, i32} %v, 1
-  store i32 %v.err, i32 addrspace(1)* %out, align 4
-  ret <4 x float> %v.vec
-}
-
-; Make sure that error flag is returned even with dmask 0
-; GCN-LABEL: {{^}}load_1d_V2_tfe_dmask0:
-; GCN: v_mov_b32_e32 v1, 0
-; PRT-DAG: v_mov_b32_e32 v2, v1
-; PRT: image_load v[1:2], v0, s[0:7] dmask:0x1 unorm tfe{{$}}
-; NOPRT-NOT: v_mov_b32_e32 v1
-; NOPRT: image_load v[0:1], v0, s[0:7] dmask:0x1 unorm tfe{{$}}
-define amdgpu_ps float @load_1d_V2_tfe_dmask0(<8 x i32> inreg %rsrc, i32 %s) {
-main_body:
-  %v = call {<2 x float>,i32} @llvm.amdgcn.image.load.1d.v2f32i32.i32(i32 0, i32 %s, <8 x i32> %rsrc, i32 1, i32 0)
-  %v.err = extractvalue {<2 x float>, i32} %v, 1
-  %vv = bitcast i32 %v.err to float
-  ret float %vv
-}
-
-; GCN-LABEL: {{^}}load_1d_V1_tfe_dmask0:
-; GCN: v_mov_b32_e32 v1, 0
-; PRT-DAG: v_mov_b32_e32 v2, v1
-; PRT: image_load v[1:2], v0, s[0:7] dmask:0x1 unorm tfe{{$}}
-; NOPRT-NOT: v_mov_b32_e32 v1
-; NOPRT: image_load v[0:1], v0, s[0:7] dmask:0x1 unorm tfe{{$}}
-define amdgpu_ps float @load_1d_V1_tfe_dmask0(<8 x i32> inreg %rsrc, i32 %s) {
-main_body:
-  %v = call {float,i32} @llvm.amdgcn.image.load.1d.f32i32.i32(i32 0, i32 %s, <8 x i32> %rsrc, i32 1, i32 0)
-  %v.err = extractvalue {float, i32} %v, 1
-  %vv = bitcast i32 %v.err to float
-  ret float %vv
-}
-
-; GCN-LABEL: {{^}}load_mip_2d_tfe_dmask0:
-; GCN: v_mov_b32_e32 v3, 0
-; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v3
-; PRT: image_load_mip v[3:4], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0x1 unorm tfe{{$}}
-; NOPRT-NOT: v_mov_b32_e32 v2
-; NOPRT: image_load_mip v[2:3], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0x1 unorm tfe{{$}}
-define amdgpu_ps float @load_mip_2d_tfe_dmask0(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %mip) {
-main_body:
-  %v = call {<4 x float>,i32} @llvm.amdgcn.image.load.mip.2d.v4f32i32.i32(i32 0, i32 %s, i32 %t, i32 %mip, <8 x i32> %rsrc, i32 1, i32 0)
-  %v.err = extractvalue {<4 x float>, i32} %v, 1
-  %vv = bitcast i32 %v.err to float
-  ret float %vv
-}
-
-; Do not make dmask 0 even if no result (other than tfe) is used.
-; GCN-LABEL: {{^}}load_mip_2d_tfe_nouse:
-; GCN: v_mov_b32_e32 v3, 0
-; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v3
-; PRT: image_load_mip v[3:4], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0x1 unorm tfe{{$}}
-; NOPRT-NOT: v_mov_b32_e32 v2
-; NOPRT: image_load_mip v[2:3], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0x1 unorm tfe{{$}}
-define amdgpu_ps float @load_mip_2d_tfe_nouse(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %mip) {
-main_body:
-  %v = call {<4 x float>,i32} @llvm.amdgcn.image.load.mip.2d.v4f32i32.i32(i32 15, i32 %s, i32 %t, i32 %mip, <8 x i32> %rsrc, i32 1, i32 0)
-  %v.err = extractvalue {<4 x float>, i32} %v, 1
-  %vv = bitcast i32 %v.err to float
-  ret float %vv
-}
-
-; GCN-LABEL: {{^}}load_mip_2d_tfe_nouse_V2:
-; GCN: v_mov_b32_e32 v3, 0
-; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v3
-; PRT: image_load_mip v[3:4], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0x1 unorm tfe{{$}}
-; NOPRT-NOT: v_mov_b32_e32 v2
-; NOPRT: image_load_mip v[2:3], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0x1 unorm tfe{{$}}
-define amdgpu_ps float @load_mip_2d_tfe_nouse_V2(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %mip) {
-main_body:
-  %v = call {<2 x float>,i32} @llvm.amdgcn.image.load.mip.2d.v2f32i32.i32(i32 6, i32 %s, i32 %t, i32 %mip, <8 x i32> %rsrc, i32 1, i32 0)
-  %v.err = extractvalue {<2 x float>, i32} %v, 1
-  %vv = bitcast i32 %v.err to float
-  ret float %vv
-}
-
-; GCN-LABEL: {{^}}load_mip_2d_tfe_nouse_V1:
-; GCN: v_mov_b32_e32 v3, 0
-; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v3
-; PRT: image_load_mip v[3:4], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0x2 unorm tfe{{$}}
-; NOPRT-NOT: v_mov_b32_e32 v2
-; NOPRT: image_load_mip v[2:3], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0x2 unorm tfe{{$}}
-define amdgpu_ps float @load_mip_2d_tfe_nouse_V1(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %mip) {
-main_body:
-  %v = call {float, i32} @llvm.amdgcn.image.load.mip.2d.f32i32.i32(i32 2, i32 %s, i32 %t, i32 %mip, <8 x i32> %rsrc, i32 1, i32 0)
-  %v.err = extractvalue {float, i32} %v, 1
-  %vv = bitcast i32 %v.err to float
-  ret float %vv
-}
-
-; Check for dmask being materially smaller than return type
-; GCN-LABEL: {{^}}load_1d_tfe_V4_dmask3:
-; PRT: v_mov_b32_e32 v0, 0
-; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
-; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
-; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
-; NOPRT: v_mov_b32_e32 v3, 0
-; NOPRT-NOT: v_mov_b32_e32 v0
-; NOPRT-NOT: v_mov_b32_e32 v1
-; NOPRT-NOT: v_mov_b32_e32 v2
-; GCN: image_load v[0:3], v{{[0-9]+}}, s[0:7] dmask:0x7 unorm tfe{{$}}
-; SIVI: buffer_store_dword v3, off, s[8:11], 0
-; GFX900: global_store_dword v[{{[0-9]+:[0-9]+}}], v3
-define amdgpu_ps <4 x float> @load_1d_tfe_V4_dmask3(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s) {
-main_body:
-  %v = call {<4 x float>,i32} @llvm.amdgcn.image.load.1d.v4f32i32.i32(i32 7, i32 %s, <8 x i32> %rsrc, i32 1, i32 0)
-  %v.vec = extractvalue {<4 x float>, i32} %v, 0
-  %v.err = extractvalue {<4 x float>, i32} %v, 1
-  store i32 %v.err, i32 addrspace(1)* %out, align 4
-  ret <4 x float> %v.vec
-}
-
-; GCN-LABEL: {{^}}load_1d_tfe_V4_dmask2:
-; PRT: v_mov_b32_e32 v0, 0
-; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
-; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
-; NOPRT: v_mov_b32_e32 v2, 0
-; NOPRT-NOT: v_mov_b32_e32 v0
-; NOPRT-NOT: v_mov_b32_e32 v1
-; GCN: image_load v[0:3], v{{[0-9]+}}, s[0:7] dmask:0x6 unorm tfe{{$}}
-; SIVI: buffer_store_dword v2, off, s[8:11], 0
-; GFX900: global_store_dword v[{{[0-9]+:[0-9]+}}], v2
-define amdgpu_ps <4 x float> @load_1d_tfe_V4_dmask2(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s) {
-main_body:
-  %v = call {<4 x float>,i32} @llvm.amdgcn.image.load.1d.v4f32i32.i32(i32 6, i32 %s, <8 x i32> %rsrc, i32 1, i32 0)
-  %v.vec = extractvalue {<4 x float>, i32} %v, 0
-  %v.err = extractvalue {<4 x float>, i32} %v, 1
-  store i32 %v.err, i32 addrspace(1)* %out, align 4
-  ret <4 x float> %v.vec
-}
-
-; GCN-LABEL: {{^}}load_1d_tfe_V4_dmask1:
-; PRT: v_mov_b32_e32 v0, 0
-; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
-; NOPRT: v_mov_b32_e32 v1, 0
-; NOPRT-NOT: v_mov_b32_e32 v0
-; GCN: image_load v[0:1], v{{[0-9]+}}, s[0:7] dmask:0x8 unorm tfe{{$}}
-; SIVI: buffer_store_dword v1, off, s[8:11], 0
-; GFX900: global_store_dword v[{{[0-9]+:[0-9]+}}], v1
-define amdgpu_ps <4 x float> @load_1d_tfe_V4_dmask1(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s) {
-main_body:
-  %v = call {<4 x float>,i32} @llvm.amdgcn.image.load.1d.v4f32i32.i32(i32 8, i32 %s, <8 x i32> %rsrc, i32 1, i32 0)
-  %v.vec = extractvalue {<4 x float>, i32} %v, 0
-  %v.err = extractvalue {<4 x float>, i32} %v, 1
-  store i32 %v.err, i32 addrspace(1)* %out, align 4
-  ret <4 x float> %v.vec
-}
-
-; GCN-LABEL: {{^}}load_1d_tfe_V2_dmask1:
-; PRT: v_mov_b32_e32 v0, 0
-; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
-; NOPRT: v_mov_b32_e32 v1, 0
-; NOPRT-NOT: v_mov_b32_e32 v0
-; GCN: image_load v[0:1], v{{[0-9]+}}, s[0:7] dmask:0x8 unorm tfe{{$}}
-; SIVI: buffer_store_dword v1, off, s[8:11], 0
-; GFX900: global_store_dword v[{{[0-9]+:[0-9]+}}], v1
-define amdgpu_ps <2 x float> @load_1d_tfe_V2_dmask1(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s) {
-main_body:
-  %v = call {<2 x float>,i32} @llvm.amdgcn.image.load.1d.v2f32i32.i32(i32 8, i32 %s, <8 x i32> %rsrc, i32 1, i32 0)
-  %v.vec = extractvalue {<2 x float>, i32} %v, 0
-  %v.err = extractvalue {<2 x float>, i32} %v, 1
-  store i32 %v.err, i32 addrspace(1)* %out, align 4
-  ret <2 x float> %v.vec
-}
-
-
 ; GCN-LABEL: {{^}}load_mip_3d:
 ; GCN: image_load_mip v[0:3], v[0:3], s[0:7] dmask:0xf unorm{{$}}
 define amdgpu_ps <4 x float> @load_mip_3d(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %r, i32 %mip) {
@@ -820,37 +404,23 @@ define amdgpu_ps float @image_load_mmo(<8 x i32> inreg %rsrc, float addrspace(3)
   store float 0.000000e+00, float addrspace(3)* %lds
   %c0 = extractelement <2 x i32> %c, i32 0
   %c1 = extractelement <2 x i32> %c, i32 1
-  %tex = call float @llvm.amdgcn.image.load.2d.f32.i32(i32 1, i32 %c0, i32 %c1, <8 x i32> %rsrc, i32 0, i32 0)
+  %tex = call float @llvm.amdgcn.image.load.2d.f32.i32(i32 15, i32 %c0, i32 %c1, <8 x i32> %rsrc, i32 0, i32 0)
   %tmp2 = getelementptr float, float addrspace(3)* %lds, i32 4
   store float 0.000000e+00, float addrspace(3)* %tmp2
   ret float %tex
 }

 declare <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i32(i32, i32, <8 x i32>, i32, i32) #1
-declare {float,i32} @llvm.amdgcn.image.load.1d.f32i32.i32(i32, i32, <8 x i32>, i32, i32) #1
-declare {<2 x float>,i32} @llvm.amdgcn.image.load.1d.v2f32i32.i32(i32, i32, <8 x i32>, i32, i32) #1
-declare {<4 x float>,i32} @llvm.amdgcn.image.load.1d.v4f32i32.i32(i32, i32, <8 x i32>, i32, i32) #1
 declare <4 x float> @llvm.amdgcn.image.load.2d.v4f32.i32(i32, i32, i32, <8 x i32>, i32, i32) #1
-declare {<4 x float>,i32} @llvm.amdgcn.image.load.2d.v4f32i32.i32(i32, i32, i32, <8 x i32>, i32, i32) #1
 declare <4 x float> @llvm.amdgcn.image.load.3d.v4f32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #1
-declare {<4 x float>,i32} @llvm.amdgcn.image.load.3d.v4f32i32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #1
 declare <4 x float> @llvm.amdgcn.image.load.cube.v4f32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #1
-declare {<4 x float>,i32} @llvm.amdgcn.image.load.cube.v4f32i32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #1
 declare <4 x float> @llvm.amdgcn.image.load.1darray.v4f32.i32(i32, i32, i32, <8 x i32>, i32, i32) #1
-declare {<4 x float>,i32} @llvm.amdgcn.image.load.1darray.v4f32i32.i32(i32, i32, i32, <8 x i32>, i32, i32) #1
 declare <4 x float> @llvm.amdgcn.image.load.2darray.v4f32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #1
-declare {<4 x float>,i32} @llvm.amdgcn.image.load.2darray.v4f32i32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #1
 declare <4 x float> @llvm.amdgcn.image.load.2dmsaa.v4f32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #1
-declare {<4 x float>,i32} @llvm.amdgcn.image.load.2dmsaa.v4f32i32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #1
 declare <4 x float> @llvm.amdgcn.image.load.2darraymsaa.v4f32.i32(i32, i32, i32, i32, i32, <8 x i32>, i32, i32) #1
-declare {<4 x float>,i32} @llvm.amdgcn.image.load.2darraymsaa.v4f32i32.i32(i32, i32, i32, i32, i32, <8 x i32>, i32, i32) #1
 declare <4 x float> @llvm.amdgcn.image.load.mip.1d.v4f32.i32(i32, i32, i32, <8 x i32>, i32, i32) #1
 declare <4 x float> @llvm.amdgcn.image.load.mip.2d.v4f32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #1
-declare {<4 x float>,i32} @llvm.amdgcn.image.load.mip.1d.v4f32i32.i32(i32, i32, i32, <8 x i32>, i32, i32) #1
-declare {<4 x float>,i32} @llvm.amdgcn.image.load.mip.2d.v4f32i32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #1
-declare {<2 x float>,i32} @llvm.amdgcn.image.load.mip.2d.v2f32i32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #1
-declare {float,i32} @llvm.amdgcn.image.load.mip.2d.f32i32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #1
 declare <4 x float> @llvm.amdgcn.image.load.mip.3d.v4f32.i32(i32, i32, i32, i32, i32, <8 x i32>, i32, i32) #1
 declare <4 x float> @llvm.amdgcn.image.load.mip.cube.v4f32.i32(i32, i32, i32, i32, i32, <8 x i32>, i32, i32) #1
 declare <4 x float> @llvm.amdgcn.image.load.mip.1darray.v4f32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #1
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.load.a16.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.load.a16.d16.ll
index fd2c6e796c8..1fbfccb0e39 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.load.a16.d16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.load.a16.d16.ll
@@ -1,7 +1,7 @@
 ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN %s

 ; GCN-LABEL: {{^}}load.f16.1d:
-; GCN: image_load v0, v0, s[0:7] dmask:0x1 unorm a16 d16
+; GCN: image_load v[0:1], v0, s[0:7] dmask:0x1 unorm a16 d16
 define amdgpu_ps <4 x half> @load.f16.1d(<8 x i32> inreg %rsrc, <2 x i16> %coords) {
 main_body:
   %x = extractelement <2 x i16> %coords, i32 0
@@ -10,7 +10,7 @@ main_body:
 }

 ; GCN-LABEL: {{^}}load.v2f16.1d:
-; GCN: image_load v0, v0, s[0:7] dmask:0x3 unorm a16 d16
+; GCN: image_load v[0:1], v0, s[0:7] dmask:0x3 unorm a16 d16
 define amdgpu_ps <4 x half> @load.v2f16.1d(<8 x i32> inreg %rsrc, <2 x i16> %coords) {
 main_body:
   %x = extractelement <2 x i16> %coords, i32 0
@@ -37,7 +37,7 @@ main_body:
 }

 ; GCN-LABEL: {{^}}load.f16.2d:
-; GCN: image_load v0, v0, s[0:7] dmask:0x1 unorm a16 d16
+; GCN: image_load v[0:1], v0, s[0:7] dmask:0x1 unorm a16 d16
 define amdgpu_ps <4 x half> @load.f16.2d(<8 x i32> inreg %rsrc, <2 x i16> %coords) {
 main_body:
   %x = extractelement <2 x i16> %coords, i32 0
   %y = extractelement <2 x i16> %coords, i32 1
@@ -47,7 +47,7 @@ main_body:
 }

 ; GCN-LABEL: {{^}}load.v2f16.2d:
-; GCN: image_load v0, v0, s[0:7] dmask:0x3 unorm a16 d16
+; GCN: image_load v[0:1], v0, s[0:7] dmask:0x3 unorm a16 d16
 define amdgpu_ps <4 x half> @load.v2f16.2d(<8 x
```
i32> inreg %rsrc, <2 x i16> %coords) { main_body: %x = extractelement <2 x i16> %coords, i32 0 @@ -77,7 +77,7 @@ main_body: } ; GCN-LABEL: {{^}}load.f16.3d: -; GCN: image_load v0, v[0:1], s[0:7] dmask:0x1 unorm a16 d16 +; GCN: image_load v[0:1], v[0:1], s[0:7] dmask:0x1 unorm a16 d16 define amdgpu_ps <4 x half> @load.f16.3d(<8 x i32> inreg %rsrc, <2 x i16> %coords_lo, <2 x i16> %coords_hi) { main_body: %x = extractelement <2 x i16> %coords_lo, i32 0 @@ -88,7 +88,7 @@ main_body: } ; GCN-LABEL: {{^}}load.v2f16.3d: -; GCN: image_load v0, v[0:1], s[0:7] dmask:0x3 unorm a16 d16 +; GCN: image_load v[0:1], v[0:1], s[0:7] dmask:0x3 unorm a16 d16 define amdgpu_ps <4 x half> @load.v2f16.3d(<8 x i32> inreg %rsrc, <2 x i16> %coords_lo, <2 x i16> %coords_hi) { main_body: %x = extractelement <2 x i16> %coords_lo, i32 0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.load.a16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.load.a16.ll index be579b84eb4..d857ae115a7 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.load.a16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.load.a16.ll @@ -1,7 +1,7 @@ ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN %s ; GCN-LABEL: {{^}}load.f32.1d: -; GCN: image_load v0, v0, s[0:7] dmask:0x1 unorm a16 +; GCN: image_load v[0:3], v0, s[0:7] dmask:0x1 unorm a16 define amdgpu_ps <4 x float> @load.f32.1d(<8 x i32> inreg %rsrc, <2 x i16> %coords) { main_body: %x = extractelement <2 x i16> %coords, i32 0 @@ -10,7 +10,7 @@ main_body: } ; GCN-LABEL: {{^}}load.v2f32.1d: -; GCN: image_load v[0:1], v0, s[0:7] dmask:0x3 unorm a16 +; GCN: image_load v[0:3], v0, s[0:7] dmask:0x3 unorm a16 define amdgpu_ps <4 x float> @load.v2f32.1d(<8 x i32> inreg %rsrc, <2 x i16> %coords) { main_body: %x = extractelement <2 x i16> %coords, i32 0 @@ -37,7 +37,7 @@ main_body: } ; GCN-LABEL: {{^}}load.f32.2d: -; GCN: image_load v0, v0, s[0:7] dmask:0x1 unorm a16 +; GCN: image_load v[0:3], v0, s[0:7] dmask:0x1 unorm a16 define amdgpu_ps <4 x float> @load.f32.2d(<8 x i32> inreg %rsrc, <2 x i16> %coords) { main_body: %x = extractelement <2 x i16> %coords, i32 0 @@ -47,7 +47,7 @@ main_body: } ; GCN-LABEL: {{^}}load.v2f32.2d: -; GCN: image_load v[0:1], v0, s[0:7] dmask:0x3 unorm a16 +; GCN: image_load v[0:3], v0, s[0:7] dmask:0x3 unorm a16 define amdgpu_ps <4 x float> @load.v2f32.2d(<8 x i32> inreg %rsrc, <2 x i16> %coords) { main_body: %x = extractelement <2 x i16> %coords, i32 0 @@ -77,7 +77,7 @@ main_body: } ; GCN-LABEL: {{^}}load.f32.3d: -; GCN: image_load v0, v[0:1], s[0:7] dmask:0x1 unorm a16 +; GCN: image_load v[0:3], v[0:1], s[0:7] dmask:0x1 unorm a16 define amdgpu_ps <4 x float> @load.f32.3d(<8 x i32> inreg %rsrc, <2 x i16> %coords_lo, <2 x i16> %coords_hi) { main_body: %x = extractelement <2 x i16> %coords_lo, i32 0 @@ -88,7 +88,7 @@ main_body: } ; GCN-LABEL: {{^}}load.v2f32.3d: -; GCN: image_load v[0:1], v[0:1], s[0:7] dmask:0x3 unorm a16 +; GCN: image_load v[0:3], v[0:1], s[0:7] dmask:0x3 unorm a16 define amdgpu_ps <4 x float> @load.v2f32.3d(<8 x i32> inreg %rsrc, <2 x i16> %coords_lo, <2 x i16> %coords_hi) { main_body: %x = extractelement <2 x i16> %coords_lo, i32 0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.d16.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.d16.dim.ll index b6260f4af83..9619304e1aa 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.d16.dim.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.d16.dim.ll @@ -10,19 +10,6 @@ main_body: ret half %tex } -; GCN-LABEL: 
{{^}}image_sample_2d_f16_tfe: -; GCN: v_mov_b32_e32 v{{[0-9]+}}, 0 -; PACKED: image_sample v[2:3], v[0:1], s[0:7], s[8:11] dmask:0x1 tfe d16{{$}} -; UNPACKED: image_sample v[2:3], v[0:1], s[0:7], s[8:11] dmask:0x1 tfe d16{{$}} -define amdgpu_ps half @image_sample_2d_f16_tfe(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t, i32 addrspace(1)* inreg %out) { -main_body: - %tex = call {half,i32} @llvm.amdgcn.image.sample.2d.f16i32.f32(i32 1, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 1, i32 0) - %tex.vec = extractvalue {half, i32} %tex, 0 - %tex.err = extractvalue {half, i32} %tex, 1 - store i32 %tex.err, i32 addrspace(1)* %out, align 4 - ret half %tex.vec -} - ; GCN-LABEL: {{^}}image_sample_c_d_1d_v2f16: ; UNPACKED: image_sample_c_d v[0:1], v[0:3], s[0:7], s[8:11] dmask:0x3 d16{{$}} ; PACKED: image_sample_c_d v0, v[0:3], s[0:7], s[8:11] dmask:0x3 d16{{$}} @@ -33,22 +20,6 @@ main_body: ret float %r } -; GCN-LABEL: {{^}}image_sample_c_d_1d_v2f16_tfe: -; GCN: v_mov_b32_e32 v{{[0-9]+}}, 0 -; UNPACKED: image_sample_c_d v[{{[0-9]+:[0-9]+}}], v[0:3], s[0:7], s[8:11] dmask:0x3 tfe d16{{$}} -; PACKED: image_sample_c_d v[{{[0-9]+:[0-9]+}}], v[0:3], s[0:7], s[8:11] dmask:0x3 tfe d16{{$}} -define amdgpu_ps <2 x float> @image_sample_c_d_1d_v2f16_tfe(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %dsdh, float %dsdv, float %s) { -main_body: - %tex = call {<2 x half>,i32} @llvm.amdgcn.image.sample.c.d.1d.v2f16i32.f32.f32(i32 3, float %zcompare, float %dsdh, float %dsdv, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 1, i32 0) - %tex.vec = extractvalue {<2 x half>, i32} %tex, 0 - %tex.err = extractvalue {<2 x half>, i32} %tex, 1 - %tex.vecf = bitcast <2 x half> %tex.vec to float - %r.0 = insertelement <2 x float> undef, float %tex.vecf, i32 0 - %tex.errf = bitcast i32 %tex.err to float - %r = insertelement <2 x float> %r.0, float %tex.errf, i32 1 - ret <2 x float> %r -} - ; GCN-LABEL: {{^}}image_sample_b_2d_v4f16: ; UNPACKED: image_sample_b v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf d16{{$}} ; PACKED: image_sample_b v[0:1], v[0:3], s[0:7], s[8:11] dmask:0xf d16{{$}} @@ -59,33 +30,9 @@ main_body: ret <2 x float> %r } -; GCN-LABEL: {{^}}image_sample_b_2d_v4f16_tfe: -; GCN: v_mov_b32_e32 v{{[0-9]+}}, 0 -; UNPACKED: image_sample_b v[{{[0-9]+:[0-9]+}}], v[0:3], s[0:7], s[8:11] dmask:0xf tfe d16{{$}} -; PACKED: image_sample_b v[{{[0-9]+:[0-9]+}}], v[0:3], s[0:7], s[8:11] dmask:0xf tfe d16{{$}} -define amdgpu_ps <4 x float> @image_sample_b_2d_v4f16_tfe(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %s, float %t) { -main_body: - %tex = call {<4 x half>,i32} @llvm.amdgcn.image.sample.b.2d.v4f16i32.f32.f32(i32 15, float %bias, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 1, i32 0) - %tex.vec = extractvalue {<4 x half>, i32} %tex, 0 - %tex.err = extractvalue {<4 x half>, i32} %tex, 1 - %tex.vecf = bitcast <4 x half> %tex.vec to <2 x float> - %tex.vecf.0 = extractelement <2 x float> %tex.vecf, i32 0 - %tex.vecf.1 = extractelement <2 x float> %tex.vecf, i32 1 - %r.0 = insertelement <4 x float> undef, float %tex.vecf.0, i32 0 - %r.1 = insertelement <4 x float> %r.0, float %tex.vecf.1, i32 1 - %tex.errf = bitcast i32 %tex.err to float - %r = insertelement <4 x float> %r.1, float %tex.errf, i32 2 - ret <4 x float> %r -} - declare half @llvm.amdgcn.image.sample.2d.f16.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 -declare {half,i32} @llvm.amdgcn.image.sample.2d.f16i32.f32(i32, float, float, <8 x 
i32>, <4 x i32>, i1, i32, i32) #1 -declare <4 x half> @llvm.amdgcn.image.sample.2d.v4f16.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 -declare {<2 x half>,i32} @llvm.amdgcn.image.sample.2d.v2f16i32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 declare <2 x half> @llvm.amdgcn.image.sample.c.d.1d.v2f16.f32.f32(i32, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 -declare {<2 x half>,i32} @llvm.amdgcn.image.sample.c.d.1d.v2f16i32.f32.f32(i32, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 declare <4 x half> @llvm.amdgcn.image.sample.b.2d.v4f16.f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 -declare {<4 x half>,i32} @llvm.amdgcn.image.sample.b.2d.v4f16i32.f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 attributes #0 = { nounwind } attributes #1 = { nounwind readonly } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll index 2ee69ac6e8b..65f4b46d0ae 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll @@ -9,162 +9,6 @@ main_body: ret <4 x float> %v } -; GCN-LABEL: {{^}}sample_1d_tfe: -; GCN: v_mov_b32_e32 v0, 0 -; GCN: v_mov_b32_e32 v1, v0 -; GCN: v_mov_b32_e32 v2, v0 -; GCN: v_mov_b32_e32 v3, v0 -; GCN: v_mov_b32_e32 v4, v0 -; GCN: image_sample v[0:7], v5, s[0:7], s[8:11] dmask:0xf tfe{{$}} -define amdgpu_ps <4 x float> @sample_1d_tfe(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 addrspace(1)* inreg %out, float %s) { -main_body: - %v = call {<4 x float>,i32} @llvm.amdgcn.image.sample.1d.v4f32i32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 1, i32 0) - %v.vec = extractvalue {<4 x float>, i32} %v, 0 - %v.err = extractvalue {<4 x float>, i32} %v, 1 - store i32 %v.err, i32 addrspace(1)* %out, align 4 - ret <4 x float> %v.vec -} - -; GCN-LABEL: {{^}}sample_1d_tfe_adjust_writemask_1: -; GCN: v_mov_b32_e32 v0, 0 -; GCN: v_mov_b32_e32 v1, v0 -; GCN: image_sample v[0:1], v2, s[0:7], s[8:11] dmask:0x1 tfe{{$}} -define amdgpu_ps <2 x float> @sample_1d_tfe_adjust_writemask_1(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 addrspace(1)* inreg %out, float %s) { -main_body: - %v = call {<4 x float>,i32} @llvm.amdgcn.image.sample.1d.v4f32i32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 1, i32 0) - %res.vec = extractvalue {<4 x float>,i32} %v, 0 - %res.f = extractelement <4 x float> %res.vec, i32 0 - %res.err = extractvalue {<4 x float>,i32} %v, 1 - %res.errf = bitcast i32 %res.err to float - %res.tmp = insertelement <2 x float> undef, float %res.f, i32 0 - %res = insertelement <2 x float> %res.tmp, float %res.errf, i32 1 - ret <2 x float> %res -} - -; GCN-LABEL: {{^}}sample_1d_tfe_adjust_writemask_2: -; GCN: v_mov_b32_e32 v0, 0 -; GCN: v_mov_b32_e32 v1, v0 -; GCN: image_sample v[0:1], v2, s[0:7], s[8:11] dmask:0x2 tfe{{$}} -define amdgpu_ps <2 x float> @sample_1d_tfe_adjust_writemask_2(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s) { -main_body: - %v = call {<4 x float>,i32} @llvm.amdgcn.image.sample.1d.v4f32i32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 1, i32 0) - %res.vec = extractvalue {<4 x float>,i32} %v, 0 - %res.f = extractelement <4 x float> %res.vec, i32 1 - %res.err = extractvalue {<4 x float>,i32} %v, 1 - %res.errf = bitcast i32 %res.err to float - %res.tmp = insertelement <2 x float> undef, float %res.f, i32 0 - %res = insertelement <2 x 
float> %res.tmp, float %res.errf, i32 1 - ret <2 x float> %res -} - -; GCN-LABEL: {{^}}sample_1d_tfe_adjust_writemask_3: -; GCN: v_mov_b32_e32 v0, 0 -; GCN: v_mov_b32_e32 v1, v0 -; GCN: image_sample v[0:1], v2, s[0:7], s[8:11] dmask:0x4 tfe{{$}} -define amdgpu_ps <2 x float> @sample_1d_tfe_adjust_writemask_3(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s) { -main_body: - %v = call {<4 x float>,i32} @llvm.amdgcn.image.sample.1d.v4f32i32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 1, i32 0) - %res.vec = extractvalue {<4 x float>,i32} %v, 0 - %res.f = extractelement <4 x float> %res.vec, i32 2 - %res.err = extractvalue {<4 x float>,i32} %v, 1 - %res.errf = bitcast i32 %res.err to float - %res.tmp = insertelement <2 x float> undef, float %res.f, i32 0 - %res = insertelement <2 x float> %res.tmp, float %res.errf, i32 1 - ret <2 x float> %res -} - -; GCN-LABEL: {{^}}sample_1d_tfe_adjust_writemask_4: -; GCN: v_mov_b32_e32 v0, 0 -; GCN: v_mov_b32_e32 v1, v0 -; GCN: image_sample v[0:1], v2, s[0:7], s[8:11] dmask:0x8 tfe{{$}} -define amdgpu_ps <2 x float> @sample_1d_tfe_adjust_writemask_4(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s) { -main_body: - %v = call {<4 x float>,i32} @llvm.amdgcn.image.sample.1d.v4f32i32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 1, i32 0) - %res.vec = extractvalue {<4 x float>,i32} %v, 0 - %res.f = extractelement <4 x float> %res.vec, i32 3 - %res.err = extractvalue {<4 x float>,i32} %v, 1 - %res.errf = bitcast i32 %res.err to float - %res.tmp = insertelement <2 x float> undef, float %res.f, i32 0 - %res = insertelement <2 x float> %res.tmp, float %res.errf, i32 1 - ret <2 x float> %res -} - -; GCN-LABEL: {{^}}sample_1d_tfe_adjust_writemask_12: -; GCN: v_mov_b32_e32 v0, 0 -; GCN: v_mov_b32_e32 v1, v0 -; GCN: v_mov_b32_e32 v2, v0 -; GCN: image_sample v[0:2], v3, s[0:7], s[8:11] dmask:0x3 tfe{{$}} -define amdgpu_ps <4 x float> @sample_1d_tfe_adjust_writemask_12(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s) { -main_body: - %v = call {<4 x float>,i32} @llvm.amdgcn.image.sample.1d.v4f32i32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 1, i32 0) - %res.vec = extractvalue {<4 x float>,i32} %v, 0 - %res.f1 = extractelement <4 x float> %res.vec, i32 0 - %res.f2 = extractelement <4 x float> %res.vec, i32 1 - %res.err = extractvalue {<4 x float>,i32} %v, 1 - %res.errf = bitcast i32 %res.err to float - %res.tmp1 = insertelement <4 x float> undef, float %res.f1, i32 0 - %res.tmp2 = insertelement <4 x float> %res.tmp1, float %res.f2, i32 1 - %res = insertelement <4 x float> %res.tmp2, float %res.errf, i32 2 - ret <4 x float> %res -} - -; GCN-LABEL: {{^}}sample_1d_tfe_adjust_writemask_24: -; GCN: v_mov_b32_e32 v0, 0 -; GCN: v_mov_b32_e32 v1, v0 -; GCN: v_mov_b32_e32 v2, v0 -; GCN: image_sample v[0:2], v3, s[0:7], s[8:11] dmask:0xa tfe{{$}} -define amdgpu_ps <4 x float> @sample_1d_tfe_adjust_writemask_24(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s) { -main_body: - %v = call {<4 x float>,i32} @llvm.amdgcn.image.sample.1d.v4f32i32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 1, i32 0) - %res.vec = extractvalue {<4 x float>,i32} %v, 0 - %res.f1 = extractelement <4 x float> %res.vec, i32 1 - %res.f2 = extractelement <4 x float> %res.vec, i32 3 - %res.err = extractvalue {<4 x float>,i32} %v, 1 - %res.errf = bitcast i32 %res.err to float - %res.tmp1 = insertelement <4 x float> undef, float %res.f1, i32 0 - %res.tmp2 = insertelement <4 x float> %res.tmp1, float %res.f2, 
i32 1 - %res = insertelement <4 x float> %res.tmp2, float %res.errf, i32 2 - ret <4 x float> %res -} - -; GCN-LABEL: {{^}}sample_1d_tfe_adjust_writemask_134: -; GCN: v_mov_b32_e32 v0, 0 -; GCN: v_mov_b32_e32 v1, v0 -; GCN: v_mov_b32_e32 v2, v0 -; GCN: v_mov_b32_e32 v3, v0 -; GCN: image_sample v[0:3], v4, s[0:7], s[8:11] dmask:0xd tfe{{$}} -define amdgpu_ps <4 x float> @sample_1d_tfe_adjust_writemask_134(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s) { -main_body: - %v = call {<4 x float>,i32} @llvm.amdgcn.image.sample.1d.v4f32i32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 1, i32 0) - %res.vec = extractvalue {<4 x float>,i32} %v, 0 - %res.f1 = extractelement <4 x float> %res.vec, i32 0 - %res.f2 = extractelement <4 x float> %res.vec, i32 2 - %res.f3 = extractelement <4 x float> %res.vec, i32 3 - %res.err = extractvalue {<4 x float>,i32} %v, 1 - %res.errf = bitcast i32 %res.err to float - %res.tmp1 = insertelement <4 x float> undef, float %res.f1, i32 0 - %res.tmp2 = insertelement <4 x float> %res.tmp1, float %res.f2, i32 1 - %res.tmp3 = insertelement <4 x float> %res.tmp2, float %res.f3, i32 2 - %res = insertelement <4 x float> %res.tmp3, float %res.errf, i32 3 - ret <4 x float> %res -} - -; GCN-LABEL: {{^}}sample_1d_lwe: -; GCN: v_mov_b32_e32 v0, 0 -; GCN: v_mov_b32_e32 v1, v0 -; GCN: v_mov_b32_e32 v2, v0 -; GCN: v_mov_b32_e32 v3, v0 -; GCN: v_mov_b32_e32 v4, v0 -; GCN: image_sample v[0:7], v5, s[0:7], s[8:11] dmask:0xf lwe{{$}} -define amdgpu_ps <4 x float> @sample_1d_lwe(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 addrspace(1)* inreg %out, float %s) { -main_body: - %v = call {<4 x float>,i32} @llvm.amdgcn.image.sample.1d.v4f32i32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 2, i32 0) - %v.vec = extractvalue {<4 x float>, i32} %v, 0 - %v.err = extractvalue {<4 x float>, i32} %v, 1 - store i32 %v.err, i32 addrspace(1)* %out, align 4 - ret <4 x float> %v.vec -} - ; GCN-LABEL: {{^}}sample_2d: ; GCN: image_sample v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf{{$}} define amdgpu_ps <4 x float> @sample_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t) { @@ -517,17 +361,6 @@ main_body: ret float %v } -; GCN-LABEL: {{^}}sample_c_d_o_2darray_V1_tfe: -; GCN: image_sample_c_d_o v[9:10], v[0:15], s[0:7], s[8:11] dmask:0x4 tfe da{{$}} -define amdgpu_ps float @sample_c_d_o_2darray_V1_tfe(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, float %slice, i32 addrspace(1)* inreg %out) { -main_body: - %v = call {float,i32} @llvm.amdgcn.image.sample.c.d.o.2darray.f32i32.f32.f32(i32 4, i32 %offset, float %zcompare, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, float %slice, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 1, i32 0) - %v.vec = extractvalue {float, i32} %v, 0 - %v.err = extractvalue {float, i32} %v, 1 - store i32 %v.err, i32 addrspace(1)* %out, align 4 - ret float %v.vec -} - ; GCN-LABEL: {{^}}sample_c_d_o_2darray_V2: ; GCN: image_sample_c_d_o v[0:1], v[0:15], s[0:7], s[8:11] dmask:0x6 da{{$}} define amdgpu_ps <2 x float> @sample_c_d_o_2darray_V2(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, float %slice) { @@ -536,22 +369,6 @@ main_body: ret <2 x float> %v } -; GCN-LABEL: {{^}}sample_c_d_o_2darray_V2_tfe: -; GCN: image_sample_c_d_o v[9:12], v[0:15], s[0:7], s[8:11] dmask:0x6 tfe da{{$}} -define amdgpu_ps 
<4 x float> @sample_c_d_o_2darray_V2_tfe(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, float %slice) { -main_body: - %v = call {<2 x float>, i32} @llvm.amdgcn.image.sample.c.d.o.2darray.v2f32i32.f32.f32(i32 6, i32 %offset, float %zcompare, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, float %slice, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 1, i32 0) - %v.vec = extractvalue {<2 x float>, i32} %v, 0 - %v.f1 = extractelement <2 x float> %v.vec, i32 0 - %v.f2 = extractelement <2 x float> %v.vec, i32 1 - %v.err = extractvalue {<2 x float>, i32} %v, 1 - %v.errf = bitcast i32 %v.err to float - %res.0 = insertelement <4 x float> undef, float %v.f1, i32 0 - %res.1 = insertelement <4 x float> %res.0, float %v.f2, i32 1 - %res.2 = insertelement <4 x float> %res.1, float %v.errf, i32 2 - ret <4 x float> %res.2 -} - ; GCN-LABEL: {{^}}sample_1d_unorm: ; GCN: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf unorm{{$}} define amdgpu_ps <4 x float> @sample_1d_unorm(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s) { @@ -674,7 +491,6 @@ main_body: } declare <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 -declare {<4 x float>,i32} @llvm.amdgcn.image.sample.1d.v4f32i32.f32(i32, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 declare <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 declare <4 x float> @llvm.amdgcn.image.sample.3d.v4f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 declare <4 x float> @llvm.amdgcn.image.sample.cube.v4f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 @@ -726,9 +542,7 @@ declare <4 x float> @llvm.amdgcn.image.sample.c.lz.1d.v4f32.f32(i32, float, floa declare <4 x float> @llvm.amdgcn.image.sample.c.lz.2d.v4f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 declare float @llvm.amdgcn.image.sample.c.d.o.2darray.f32.f32.f32(i32, i32, float, float, float, float, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 -declare {float, i32} @llvm.amdgcn.image.sample.c.d.o.2darray.f32i32.f32.f32(i32, i32, float, float, float, float, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 declare <2 x float> @llvm.amdgcn.image.sample.c.d.o.2darray.v2f32.f32.f32(i32, i32, float, float, float, float, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 -declare {<2 x float>, i32} @llvm.amdgcn.image.sample.c.d.o.2darray.v2f32i32.f32.f32(i32, i32, float, float, float, float, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 attributes #0 = { nounwind } attributes #1 = { nounwind readonly } diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-demanded-vector-elts.ll b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-demanded-vector-elts.ll index 45a61ca0ac4..af34a3fd371 100644 --- a/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-demanded-vector-elts.ll +++ b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-demanded-vector-elts.ll @@ -328,28 +328,6 @@ define amdgpu_ps float @extract_elt0_image_sample_1d_v4f32_f32(float %vaddr, <8 ret float %elt0 } -; Check that the intrinsic remains unchanged in the presence of TFE or LWE -; CHECK-LABEL: @extract_elt0_image_sample_1d_v4f32_f32_tfe( -; CHECK-NEXT: %data = call { <4 x float>, i32 } @llvm.amdgcn.image.sample.1d.sl_v4f32i32s.f32(i32 15, float %vaddr, <8 x i32> %sampler, <4 x i32> 
%rsrc, i1 false, i32 1, i32 0) -; CHECK: ret float %elt0 -define amdgpu_ps float @extract_elt0_image_sample_1d_v4f32_f32_tfe(float %vaddr, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 { - %data = call {<4 x float>,i32} @llvm.amdgcn.image.sample.1d.sl_v4f32i32s.f32(i32 15, float %vaddr, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 1, i32 0) - %data.vec = extractvalue {<4 x float>,i32} %data, 0 - %elt0 = extractelement <4 x float> %data.vec, i32 0 - ret float %elt0 -} - -; Check that the intrinsic remains unchanged in the presence of TFE or LWE -; CHECK-LABEL: @extract_elt0_image_sample_1d_v4f32_f32_lwe( -; CHECK-NEXT: %data = call { <4 x float>, i32 } @llvm.amdgcn.image.sample.1d.sl_v4f32i32s.f32(i32 15, float %vaddr, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 2, i32 0) -; CHECK: ret float %elt0 -define amdgpu_ps float @extract_elt0_image_sample_1d_v4f32_f32_lwe(float %vaddr, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 { - %data = call {<4 x float>,i32} @llvm.amdgcn.image.sample.1d.sl_v4f32i32s.f32(i32 15, float %vaddr, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 2, i32 0) - %data.vec = extractvalue {<4 x float>,i32} %data, 0 - %elt0 = extractelement <4 x float> %data.vec, i32 0 - ret float %elt0 -} - ; CHECK-LABEL: @extract_elt0_image_sample_2d_v4f32_f32( ; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.2d.f32.f32(i32 1, float %s, float %t, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) ; CHECK-NEXT: ret float %data @@ -528,7 +506,6 @@ define amdgpu_ps <3 x float> @extract_elt0_elt1_elt2_dmask_1111_image_sample_1d_ } declare <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 -declare {<4 x float>,i32} @llvm.amdgcn.image.sample.1d.sl_v4f32i32s.f32(i32, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 declare <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 declare <4 x float> @llvm.amdgcn.image.sample.3d.v4f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 declare <4 x float> @llvm.amdgcn.image.sample.1darray.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 |