author     David Stuttard <david.stuttard@amd.com>   2018-11-29 20:14:17 +0000
committer  David Stuttard <david.stuttard@amd.com>   2018-11-29 20:14:17 +0000
commit     c6603861d8bad3054ed137b140742eb15abcd3ce (patch)
tree       c663eeb366bb7493cc0160c29fb8e9e31951b8e7 /llvm/lib/Target
parent     eba2365f23db0cae29e9a187ec16bb64e49be5d6 (diff)
Revert r347871 "Fix: Add support for TFE/LWE in image intrinsic"
Also revert fix r347876. One of the buildbots was reporting a failure in some relevant tests that I can't repro or explain at present, so reverting until I can isolate.

llvm-svn: 347911
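For context on what is being reverted, here is a minimal standalone sketch of the return-size accounting the TFE/LWE support added, distilled from the reverted SIISelLowering.cpp hunks below. This is illustrative C++20, not LLVM API code; the function name and parameters are invented for the example.

```cpp
#include <bit>

// Sketch of the reverted dword accounting: data channels come from the
// dmask popcount (gather4 always reads 4), packed D16 halves the data
// dwords, TFE/LWE appends one extra dword for the error flag, and the
// total is rounded up to a power of two to pick the register class.
unsigned numVDataDwords(unsigned DMask, bool IsGather4, bool IsD16,
                        bool PackedD16, bool UsesTFEorLWE) {
  unsigned Lanes = IsGather4 ? 4 : std::popcount(DMask);
  if (UsesTFEorLWE && Lanes == 0)
    Lanes = 1; // TFE with dmask == 0 still needs one data dword
  unsigned Dwords = (IsD16 && PackedD16) ? (Lanes + 1) / 2 : Lanes;
  if (UsesTFEorLWE)
    Dwords += 1; // the extra dword carries the TFE/LWE flag
  return std::bit_ceil(Dwords); // e.g. 5 dwords round up to 8
}
```

So, for example, dmask = 0xf with TFE gives 4 + 1 = 5 dwords, rounded up to 8, which is why the VReg_256 (_V8) MIMG variants and the v8f16 special case in the hunks below existed.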
Diffstat (limited to 'llvm/lib/Target')
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPU.h                 |   4
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPU.td                |  10
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp      |   6
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h        |   7
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp  |   1
-rw-r--r--  llvm/lib/Target/AMDGPU/CMakeLists.txt           |   1
-rw-r--r--  llvm/lib/Target/AMDGPU/MIMGInstructions.td      |  10
-rw-r--r--  llvm/lib/Target/AMDGPU/SIAddIMGInit.cpp         | 181
-rw-r--r--  llvm/lib/Target/AMDGPU/SIISelLowering.cpp       | 342
-rw-r--r--  llvm/lib/Target/AMDGPU/SIInstrInfo.cpp          |  36
-rw-r--r--  llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h   |   1
11 files changed, 54 insertions(+), 545 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index ea64b125b3b..07ae2bee49b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -42,7 +42,6 @@ FunctionPass *createSIFoldOperandsPass();
FunctionPass *createSIPeepholeSDWAPass();
FunctionPass *createSILowerI1CopiesPass();
FunctionPass *createSIFixupVectorISelPass();
-FunctionPass *createSIAddIMGInitPass();
FunctionPass *createSIShrinkInstructionsPass();
FunctionPass *createSILoadStoreOptimizerPass();
FunctionPass *createSIWholeQuadModePass();
@@ -154,9 +153,6 @@ extern char &AMDGPUSimplifyLibCallsID;
void initializeAMDGPUUseNativeCallsPass(PassRegistry &);
extern char &AMDGPUUseNativeCallsID;
-void initializeSIAddIMGInitPass(PassRegistry &);
-extern char &SIAddIMGInitID;
-
void initializeAMDGPUPerfHintAnalysisPass(PassRegistry &);
extern char &AMDGPUPerfHintAnalysisID;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index 8f20c407ab8..23470c7a4d2 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -367,16 +367,6 @@ def FeatureEnableDS128 : SubtargetFeature<"enable-ds128",
"Use ds_{read|write}_b128"
>;
-// Sparse texture support requires that all result registers are zeroed when
-// PRTStrictNull is set to true. This feature is turned on for all architectures
-// but is enabled as a feature in case there are situations where PRTStrictNull
-// is disabled by the driver.
-def FeatureEnablePRTStrictNull : SubtargetFeature<"enable-prt-strict-null",
- "EnablePRTStrictNull",
- "true",
- "Enable zeroing of result registers for sparse texture fetches"
->;
-
// Unless +-flat-for-global is specified, turn on FlatForGlobal for
// all OS-es on VI and newer hardware to avoid assertion failures due
// to missing ADDR64 variants of MUBUF instructions.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
index 1754ead2538..f1acd72b03a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -74,9 +74,6 @@ GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
// We want to be able to turn these off, but making this a subtarget feature
// for SI has the unhelpful behavior that it unsets everything else if you
// disable it.
- //
- // Similarly we want enable-prt-strict-null to be on by default and not to
- // unset everything else if it is disabled
SmallString<256> FullFS("+promote-alloca,+dx10-clamp,+load-store-opt,");
@@ -92,8 +89,6 @@ GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
FullFS += "-fp32-denormals,";
}
- FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS
-
FullFS += FS;
ParseSubtargetFeatures(GPU, FullFS);
@@ -180,7 +175,6 @@ GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
EnableUnsafeDSOffsetFolding(false),
EnableSIScheduler(false),
EnableDS128(false),
- EnablePRTStrictNull(false),
DumpCode(false),
FP64(false),
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
index f6b176fe604..886aca42b6c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -326,7 +326,6 @@ protected:
bool EnableUnsafeDSOffsetFolding;
bool EnableSIScheduler;
bool EnableDS128;
- bool EnablePRTStrictNull;
bool DumpCode;
// Subtarget statically properties set by tablegen
@@ -577,12 +576,6 @@ public:
return getGeneration() < AMDGPUSubtarget::GFX9;
}
- /// \returns If target requires PRT Struct NULL support (zero result registers
- /// for sparse texture support).
- bool usePRTStrictNull() const {
- return EnablePRTStrictNull;
- }
-
bool hasAutoWaitcntBeforeBarrier() const {
return AutoWaitcntBeforeBarrier;
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index c2e2129f5de..2198ba8d6c0 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -815,7 +815,6 @@ bool GCNPassConfig::addInstSelector() {
addPass(&SIFixSGPRCopiesID);
addPass(createSILowerI1CopiesPass());
addPass(createSIFixupVectorISelPass());
- addPass(createSIAddIMGInitPass());
return false;
}
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
index dd1b8532aae..bb1096bc1de 100644
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -93,7 +93,6 @@ add_llvm_target(AMDGPUCodeGen
R600OptimizeVectorRegisters.cpp
R600Packetizer.cpp
R600RegisterInfo.cpp
- SIAddIMGInit.cpp
SIAnnotateControlFlow.cpp
SIDebuggerInsertNops.cpp
SIFixSGPRCopies.cpp
diff --git a/llvm/lib/Target/AMDGPU/MIMGInstructions.td b/llvm/lib/Target/AMDGPU/MIMGInstructions.td
index 1c68dbd78e7..1462682e761 100644
--- a/llvm/lib/Target/AMDGPU/MIMGInstructions.td
+++ b/llvm/lib/Target/AMDGPU/MIMGInstructions.td
@@ -29,7 +29,6 @@ class MIMGBaseOpcode {
bit Atomic = 0;
bit AtomicX2 = 0; // (f)cmpswap
bit Sampler = 0;
- bit Gather4 = 0;
bits<8> NumExtraArgs = 0;
bit Gradients = 0;
bit Coordinates = 1;
@@ -44,7 +43,7 @@ def MIMGBaseOpcode : GenericEnum {
def MIMGBaseOpcodesTable : GenericTable {
let FilterClass = "MIMGBaseOpcode";
let CppTypeName = "MIMGBaseOpcodeInfo";
- let Fields = ["BaseOpcode", "Store", "Atomic", "AtomicX2", "Sampler", "Gather4",
+ let Fields = ["BaseOpcode", "Store", "Atomic", "AtomicX2", "Sampler",
"NumExtraArgs", "Gradients", "Coordinates", "LodOrClampOrMip",
"HasD16"];
GenericEnum TypeOf_BaseOpcode = MIMGBaseOpcode;
@@ -180,8 +179,6 @@ multiclass MIMG_NoSampler <bits<7> op, string asm, bit has_d16, bit mip = 0,
defm _V3 : MIMG_NoSampler_Src_Helper <op, asm, VReg_96, 0>;
let VDataDwords = 4 in
defm _V4 : MIMG_NoSampler_Src_Helper <op, asm, VReg_128, 0>;
- let VDataDwords = 8 in
- defm _V8 : MIMG_NoSampler_Src_Helper <op, asm, VReg_256, 0>;
}
}
@@ -414,8 +411,6 @@ multiclass MIMG_Sampler <bits<7> op, AMDGPUSampleVariant sample, bit wqm = 0,
defm _V3 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_96>;
let VDataDwords = 4 in
defm _V4 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_128>;
- let VDataDwords = 8 in
- defm _V8 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_256>;
}
}
@@ -426,7 +421,6 @@ multiclass MIMG_Gather <bits<7> op, AMDGPUSampleVariant sample, bit wqm = 0,
string asm = "image_gather4"#sample.LowerCaseMod> {
def "" : MIMG_Sampler_BaseOpcode<sample> {
let HasD16 = 1;
- let Gather4 = 1;
}
let BaseOpcode = !cast<MIMGBaseOpcode>(NAME), WQM = wqm,
@@ -435,8 +429,6 @@ multiclass MIMG_Gather <bits<7> op, AMDGPUSampleVariant sample, bit wqm = 0,
defm _V2 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_64>; /* for packed D16 only */
let VDataDwords = 4 in
defm _V4 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_128, 1>;
- let VDataDwords = 8 in
- defm _V8 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_256>;
}
}
diff --git a/llvm/lib/Target/AMDGPU/SIAddIMGInit.cpp b/llvm/lib/Target/AMDGPU/SIAddIMGInit.cpp
deleted file mode 100644
index 69cafef4a35..00000000000
--- a/llvm/lib/Target/AMDGPU/SIAddIMGInit.cpp
+++ /dev/null
@@ -1,181 +0,0 @@
-//===-- SIAddIMGInit.cpp - Add any required IMG inits ---------------------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-/// \file
-/// Any MIMG instructions that use tfe or lwe require an initialization of the
-/// result register that will be written in the case of a memory access failure
-/// The required code is also added to tie this init code to the result of the
-/// img instruction
-///
-//===----------------------------------------------------------------------===//
-//
-
-#include "AMDGPU.h"
-#include "AMDGPUSubtarget.h"
-#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
-#include "SIInstrInfo.h"
-#include "llvm/CodeGen/MachineFunctionPass.h"
-#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/IR/Function.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Target/TargetMachine.h"
-
-#define DEBUG_TYPE "si-img-init"
-
-using namespace llvm;
-
-namespace {
-
-class SIAddIMGInit : public MachineFunctionPass {
-public:
- static char ID;
-
-public:
- SIAddIMGInit() : MachineFunctionPass(ID) {
- initializeSIAddIMGInitPass(*PassRegistry::getPassRegistry());
- }
-
- bool runOnMachineFunction(MachineFunction &MF) override;
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.setPreservesCFG();
- MachineFunctionPass::getAnalysisUsage(AU);
- }
-};
-
-} // End anonymous namespace.
-
-INITIALIZE_PASS(SIAddIMGInit, DEBUG_TYPE, "SI Add IMG Init", false, false)
-
-char SIAddIMGInit::ID = 0;
-
-char &llvm::SIAddIMGInitID = SIAddIMGInit::ID;
-
-FunctionPass *llvm::createSIAddIMGInitPass() { return new SIAddIMGInit(); }
-
-bool SIAddIMGInit::runOnMachineFunction(MachineFunction &MF) {
- MachineRegisterInfo &MRI = MF.getRegInfo();
- const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
- const SIInstrInfo *TII = ST.getInstrInfo();
- const SIRegisterInfo *RI = ST.getRegisterInfo();
- bool Changed = false;
-
- for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); BI != BE;
- ++BI) {
- MachineBasicBlock &MBB = *BI;
- MachineBasicBlock::iterator I, Next;
- for (I = MBB.begin(); I != MBB.end(); I = Next) {
- Next = std::next(I);
- MachineInstr &MI = *I;
-
- auto Opcode = MI.getOpcode();
- if (TII->isMIMG(Opcode) && !MI.mayStore()) {
- MachineOperand *TFE = TII->getNamedOperand(MI, AMDGPU::OpName::tfe);
- MachineOperand *LWE = TII->getNamedOperand(MI, AMDGPU::OpName::lwe);
- MachineOperand *D16 = TII->getNamedOperand(MI, AMDGPU::OpName::d16);
-
- // Check for instructions that don't have tfe or lwe fields
- // There shouldn't be any at this point.
- assert( (TFE && LWE) && "Expected tfe and lwe operands in instruction");
-
- unsigned TFEVal = TFE->getImm();
- unsigned LWEVal = LWE->getImm();
- unsigned D16Val = D16 ? D16->getImm() : 0;
-
- if (TFEVal || LWEVal) {
- // At least one of TFE or LWE are non-zero
- // We have to insert a suitable initialization of the result value and
- // tie this to the dest of the image instruction.
-
- const DebugLoc &DL = MI.getDebugLoc();
-
- int DstIdx =
- AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata);
-
- // Calculate which dword we have to initialize to 0.
- MachineOperand *MO_Dmask =
- TII->getNamedOperand(MI, AMDGPU::OpName::dmask);
-
- // check that dmask operand is found.
- assert(MO_Dmask && "Expected dmask operand in instruction");
-
- unsigned dmask = MO_Dmask->getImm();
- // Determine the number of active lanes taking into account the
- // Gather4 special case
- unsigned ActiveLanes =
- TII->isGather4(Opcode) ? 4 : countPopulation(dmask);
-
- // Subreg indices are counted from 1
- // When D16 then we want next whole VGPR after write data.
- static_assert(AMDGPU::sub0 == 1 && AMDGPU::sub4 == 5, "Subreg indices different from expected");
-
- bool Packed = !ST.hasUnpackedD16VMem();
-
- unsigned InitIdx =
- D16Val && Packed ? ((ActiveLanes + 1) >> 1) + 1 : ActiveLanes + 1;
-
- // Abandon attempt if the dst size isn't large enough
- // - this is in fact an error but this is picked up elsewhere and
- // reported correctly.
- uint32_t DstSize =
- RI->getRegSizeInBits(*TII->getOpRegClass(MI, DstIdx)) / 32;
- if (DstSize < InitIdx)
- continue;
-
- // Create a register for the intialization value.
- unsigned PrevDst =
- MRI.createVirtualRegister(TII->getOpRegClass(MI, DstIdx));
- unsigned NewDst = 0; // Final initialized value will be in here
-
- // If PRTStrictNull feature is enabled (the default) then initialize
- // all the result registers to 0, otherwise just the error indication
- // register (VGPRn+1)
- unsigned SizeLeft = ST.usePRTStrictNull() ? InitIdx : 1;
- unsigned CurrIdx = ST.usePRTStrictNull() ? 1 : InitIdx;
-
- if (DstSize == 1) {
- // In this case we can just initialize the result directly
- BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), PrevDst)
- .addImm(0);
- NewDst = PrevDst;
- } else {
- BuildMI(MBB, MI, DL, TII->get(AMDGPU::IMPLICIT_DEF), PrevDst);
- for (; SizeLeft; SizeLeft--, CurrIdx++) {
- NewDst =
- MRI.createVirtualRegister(TII->getOpRegClass(MI, DstIdx));
- // Initialize dword
- unsigned SubReg =
- MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
- BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), SubReg)
- .addImm(0);
- // Insert into the super-reg
- BuildMI(MBB, I, DL, TII->get(TargetOpcode::INSERT_SUBREG), NewDst)
- .addReg(PrevDst)
- .addReg(SubReg)
- .addImm(CurrIdx);
-
- PrevDst = NewDst;
- }
- }
-
- // Add as an implicit operand
- MachineInstrBuilder(MF, MI).addReg(NewDst, RegState::Implicit);
-
- // Tie the just added implicit operand to the dst
- MI.tieOperands(DstIdx, MI.getNumOperands() - 1);
-
- Changed = true;
- }
- }
- }
- }
-
- return Changed;
-}
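A worked restatement of the index arithmetic in the pass deleted above, as a standalone C++ sketch with an invented function name, only to make the 1-based subreg math concrete:

```cpp
// From the deleted SIAddIMGInit.cpp: subreg indices count from 1, and the
// TFE/LWE error flag lives in the first whole VGPR past the write data,
// so packed D16 halves the data footprint before the +1 for the flag.
unsigned initIdx(unsigned ActiveLanes, bool D16, bool PackedD16) {
  return (D16 && PackedD16) ? ((ActiveLanes + 1) >> 1) + 1
                            : ActiveLanes + 1;
}
// e.g. dmask = 0b0111: initIdx(3, false, false) == 4 (flag in the 4th
// dword), while packed D16 gives initIdx(3, true, true) == 3.
```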
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 1f53af1b505..9f5198042e4 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -216,7 +216,6 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v2f16, Custom);
setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v4f16, Custom);
- setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v8f16, Custom);
setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
@@ -814,47 +813,6 @@ unsigned SITargetLowering::getVectorTypeBreakdownForCallingConv(
Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
}
-static MVT memVTFromAggregate(Type *Ty) {
- // Only limited forms of aggregate type currently expected.
- assert(Ty->isStructTy() && "Expected struct type");
-
-
- Type *ElementType = nullptr;
- unsigned NumElts;
- if (Ty->getContainedType(0)->isVectorTy()) {
- VectorType *VecComponent = cast<VectorType>(Ty->getContainedType(0));
- ElementType = VecComponent->getElementType();
- NumElts = VecComponent->getNumElements();
- } else {
- ElementType = Ty->getContainedType(0);
- NumElts = 1;
- }
-
- assert(Ty->getContainedType(1)->isIntegerTy(32) && "Expected int32 type");
-
- // Calculate the size of the memVT type from the aggregate
- unsigned Pow2Elts = 0;
- unsigned ElementSize;
- switch (ElementType->getTypeID()) {
- default:
- llvm_unreachable("Unknown type!");
- case Type::IntegerTyID:
- ElementSize = cast<IntegerType>(ElementType)->getBitWidth();
- break;
- case Type::HalfTyID:
- ElementSize = 16;
- break;
- case Type::FloatTyID:
- ElementSize = 32;
- break;
- }
- unsigned AdditionalElts = ElementSize == 16 ? 2 : 1;
- Pow2Elts = 1 << Log2_32_Ceil(NumElts + AdditionalElts);
-
- return MVT::getVectorVT(MVT::getVT(ElementType, false),
- Pow2Elts);
-}
-
bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
const CallInst &CI,
MachineFunction &MF,
@@ -882,12 +840,7 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.flags = MachineMemOperand::MODereferenceable;
if (Attr.hasFnAttribute(Attribute::ReadOnly)) {
Info.opc = ISD::INTRINSIC_W_CHAIN;
- Info.memVT = MVT::getVT(CI.getType(), true);
- if (Info.memVT == MVT::Other) {
- // Some intrinsics return an aggregate type - special case to work out
- // the correct memVT
- Info.memVT = memVTFromAggregate(CI.getType());
- }
+ Info.memVT = MVT::getVT(CI.getType());
Info.flags |= MachineMemOperand::MOLoad;
} else if (Attr.hasFnAttribute(Attribute::WriteOnly)) {
Info.opc = ISD::INTRINSIC_VOID;
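To make the memVTFromAggregate sizing deleted above concrete, here is a hedged standalone sketch of its element-count rule, with an invented helper name (requires C++20 <bit>):

```cpp
#include <bit>

// The deleted memVTFromAggregate sized the memory type for an aggregate
// return {<N x elt>, i32}: 16-bit elements need two extra slots to cover
// the i32 TFE flag, 32-bit elements need one, and the sum is rounded up
// to a power of two.
unsigned memVTNumElts(unsigned NumElts, unsigned ElementSizeBits) {
  unsigned AdditionalElts = ElementSizeBits == 16 ? 2 : 1;
  return std::bit_ceil(NumElts + AdditionalElts);
}
// e.g. {<4 x half>, i32} -> bit_ceil(4 + 2) == 8 elements, i.e. v8f16,
// the type whose special handling is removed elsewhere in this revert.
```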
@@ -4660,109 +4613,6 @@ static bool parseCachePolicy(SDValue CachePolicy, SelectionDAG &DAG,
return Value == 0;
}
-// Re-construct the required return value for a image load intrinsic.
-// This is more complicated due to the optional use TexFailCtrl which means the required
-// return type is an aggregate
-static SDValue constructRetValue(SelectionDAG &DAG,
- MachineSDNode *Result,
- ArrayRef<EVT> ResultTypes,
- bool IsTexFail, bool Unpacked, bool IsD16,
- int DMaskPop, int NumVDataDwords,
- const SDLoc &DL, LLVMContext &Context) {
- // Determine the required return type. This is the same regardless of IsTexFail flag
- EVT ReqRetVT = ResultTypes[0];
- EVT ReqRetEltVT = ReqRetVT.isVector() ? ReqRetVT.getVectorElementType() : ReqRetVT;
- int ReqRetNumElts = ReqRetVT.isVector() ? ReqRetVT.getVectorNumElements() : 1;
- EVT AdjEltVT = Unpacked && IsD16 ? MVT::i32 : ReqRetEltVT;
- EVT AdjVT = Unpacked ? ReqRetNumElts > 1 ? EVT::getVectorVT(Context, AdjEltVT, ReqRetNumElts)
- : AdjEltVT
- : ReqRetVT;
-
- // Extract data part of the result
- // Bitcast the result to the same type as the required return type
- int NumElts;
- if (IsD16 && !Unpacked)
- NumElts = NumVDataDwords << 1;
- else
- NumElts = NumVDataDwords;
-
- EVT CastVT = NumElts > 1 ? EVT::getVectorVT(Context, AdjEltVT, NumElts)
- : AdjEltVT;
-
- // Special case for v8f16. Rather than add support for this, use v4i32 to
- // extract the data elements
- bool V8F16Special = false;
- if (CastVT == MVT::v8f16) {
- CastVT = MVT::v4i32;
- DMaskPop >>= 1;
- ReqRetNumElts >>= 1;
- V8F16Special = true;
- AdjVT = MVT::v2i32;
- }
-
- SDValue N = SDValue(Result, 0);
- SDValue CastRes = DAG.getNode(ISD::BITCAST, DL, CastVT, N);
-
- // Iterate over the result
- SmallVector<SDValue, 4> BVElts;
-
- if (CastVT.isVector()) {
- DAG.ExtractVectorElements(CastRes, BVElts, 0, DMaskPop);
- } else {
- BVElts.push_back(CastRes);
- }
- int ExtraElts = ReqRetNumElts - DMaskPop;
- while(ExtraElts--)
- BVElts.push_back(DAG.getUNDEF(AdjEltVT));
-
- SDValue PreTFCRes;
- if (ReqRetNumElts > 1) {
- SDValue NewVec = DAG.getBuildVector(AdjVT, DL, BVElts);
- if (IsD16 && Unpacked)
- PreTFCRes = adjustLoadValueTypeImpl(NewVec, ReqRetVT, DL, DAG, Unpacked);
- else
- PreTFCRes = NewVec;
- } else {
- PreTFCRes = BVElts[0];
- }
-
- if (V8F16Special)
- PreTFCRes = DAG.getNode(ISD::BITCAST, DL, MVT::v4f16, PreTFCRes);
-
- if (!IsTexFail) {
- if (Result->getNumValues() > 1)
- return DAG.getMergeValues({PreTFCRes, SDValue(Result, 1)}, DL);
- else
- return PreTFCRes;
- }
-
- // Extract the TexFail result and insert into aggregate return
- SmallVector<SDValue, 1> TFCElt;
- DAG.ExtractVectorElements(N, TFCElt, DMaskPop, 1);
- SDValue TFCRes = DAG.getNode(ISD::BITCAST, DL, ResultTypes[1], TFCElt[0]);
- return DAG.getMergeValues({PreTFCRes, TFCRes, SDValue(Result, 1)}, DL);
-}
-
-static bool parseTexFail(SDValue TexFailCtrl, SelectionDAG &DAG, SDValue *TFE,
- SDValue *LWE, bool &IsTexFail) {
- auto TexFailCtrlConst = dyn_cast<ConstantSDNode>(TexFailCtrl.getNode());
- if (!TexFailCtrlConst)
- return false;
-
- uint64_t Value = TexFailCtrlConst->getZExtValue();
- if (Value) {
- IsTexFail = true;
- }
-
- SDLoc DL(TexFailCtrlConst);
- *TFE = DAG.getTargetConstant((Value & 0x1) ? 1 : 0, DL, MVT::i32);
- Value &= ~(uint64_t)0x1;
- *LWE = DAG.getTargetConstant((Value & 0x2) ? 1 : 0, DL, MVT::i32);
- Value &= ~(uint64_t)0x2;
-
- return Value == 0;
-}
-
SDValue SITargetLowering::lowerImage(SDValue Op,
const AMDGPU::ImageDimIntrinsicInfo *Intr,
SelectionDAG &DAG) const {
@@ -4776,17 +4626,13 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode);
unsigned IntrOpcode = Intr->BaseOpcode;
- SmallVector<EVT, 3> ResultTypes(Op->value_begin(), Op->value_end());
- SmallVector<EVT, 3> OrigResultTypes(Op->value_begin(), Op->value_end());
+ SmallVector<EVT, 2> ResultTypes(Op->value_begin(), Op->value_end());
bool IsD16 = false;
bool IsA16 = false;
SDValue VData;
int NumVDataDwords;
- bool AdjustRetType = false;
-
unsigned AddrIdx; // Index of first address argument
unsigned DMask;
- unsigned DMaskLanes = 0;
if (BaseOpcode->Atomic) {
VData = Op.getOperand(2);
@@ -4809,12 +4655,7 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
AddrIdx = 3;
}
} else {
- unsigned DMaskIdx = BaseOpcode->Store ? 3 : isa<MemSDNode>(Op) ? 2 : 1;
- auto DMaskConst = dyn_cast<ConstantSDNode>(Op.getOperand(DMaskIdx));
- if (!DMaskConst)
- return Op;
- DMask = DMaskConst->getZExtValue();
- DMaskLanes = BaseOpcode->Gather4 ? 4 : countPopulation(DMask);
+ unsigned DMaskIdx;
if (BaseOpcode->Store) {
VData = Op.getOperand(2);
@@ -4830,32 +4671,37 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
}
NumVDataDwords = (VData.getValueType().getSizeInBits() + 31) / 32;
+ DMaskIdx = 3;
} else {
- // Work out the num dwords based on the dmask popcount and underlying type
- // and whether packing is supported.
- MVT LoadVT = ResultTypes[0].getSimpleVT();
+ MVT LoadVT = Op.getSimpleValueType();
if (LoadVT.getScalarType() == MVT::f16) {
if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS ||
!BaseOpcode->HasD16)
return Op; // D16 is unsupported for this instruction
IsD16 = true;
+ if (LoadVT.isVector() && Subtarget->hasUnpackedD16VMem())
+ ResultTypes[0] = (LoadVT == MVT::v2f16) ? MVT::v2i32 : MVT::v4i32;
}
- // Confirm that the return type is large enough for the dmask specified
- if ((LoadVT.isVector() && LoadVT.getVectorNumElements() < DMaskLanes) ||
- (!LoadVT.isVector() && DMaskLanes > 1))
- return Op;
-
- if (IsD16 && !Subtarget->hasUnpackedD16VMem())
- NumVDataDwords = (DMaskLanes + 1) / 2;
- else
- NumVDataDwords = DMaskLanes;
-
- AdjustRetType = true;
+ NumVDataDwords = (ResultTypes[0].getSizeInBits() + 31) / 32;
+ DMaskIdx = isa<MemSDNode>(Op) ? 2 : 1;
}
+ auto DMaskConst = dyn_cast<ConstantSDNode>(Op.getOperand(DMaskIdx));
+ if (!DMaskConst)
+ return Op;
+
AddrIdx = DMaskIdx + 1;
+ DMask = DMaskConst->getZExtValue();
+ if (!DMask && !BaseOpcode->Store) {
+ // Eliminate no-op loads. Stores with dmask == 0 are *not* no-op: they
+ // store the channels' default values.
+ SDValue Undef = DAG.getUNDEF(Op.getValueType());
+ if (isa<MemSDNode>(Op))
+ return DAG.getMergeValues({Undef, Op.getOperand(0)}, DL);
+ return Undef;
+ }
}
unsigned NumGradients = BaseOpcode->Gradients ? DimInfo->NumGradients : 0;
@@ -4934,53 +4780,11 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
CtrlIdx = AddrIdx + NumVAddrs + 3;
}
- SDValue TFE;
- SDValue LWE;
SDValue TexFail = Op.getOperand(CtrlIdx);
- bool IsTexFail = false;
- if (!parseTexFail(TexFail, DAG, &TFE, &LWE, IsTexFail))
+ auto TexFailConst = dyn_cast<ConstantSDNode>(TexFail.getNode());
+ if (!TexFailConst || TexFailConst->getZExtValue() != 0)
return Op;
- if (IsTexFail) {
- if (!NumVDataDwords) {
- // Expecting to get an error flag since TFC is on - and dmask is 0
- // Force dmask to be at least 1 otherwise the instruction will fail
- DMask = 0x1;
- DMaskLanes = 1;
- NumVDataDwords = 1;
- }
- NumVDataDwords += 1;
- AdjustRetType = true;
- }
-
- // Has something earlier tagged that the return type needs adjusting
- // This happens if the instruction is a load or has set TexFailCtrl flags
- if (AdjustRetType) {
- // NumVDataDwords reflects the true number of dwords required in the return type
- if (NumVDataDwords == 0 && !BaseOpcode->Store) {
- // This is a no-op load. This can be eliminated
- SDValue Undef = DAG.getUNDEF(Op.getValueType());
- if (isa<MemSDNode>(Op))
- return DAG.getMergeValues({Undef, Op.getOperand(0)}, DL);
- return Undef;
- }
-
- // Have to use a power of 2 number of dwords
- NumVDataDwords = 1 << Log2_32_Ceil(NumVDataDwords);
-
- EVT NewVT = NumVDataDwords > 1 ?
- EVT::getVectorVT(*DAG.getContext(), MVT::f32, NumVDataDwords)
- : MVT::f32;
-
- ResultTypes[0] = NewVT;
- if (ResultTypes.size() == 3) {
- // Original result was aggregate type used for TexFailCtrl results
- // The actual instruction returns as a vector type which has now been
- // created. Remove the aggregate result.
- ResultTypes.erase(&ResultTypes[1]);
- }
- }
-
SDValue GLC;
SDValue SLC;
if (BaseOpcode->Atomic) {
@@ -5005,8 +4809,8 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
Ops.push_back(SLC);
Ops.push_back(IsA16 && // a16 or r128
ST->hasFeature(AMDGPU::FeatureR128A16) ? True : False);
- Ops.push_back(TFE); // tfe
- Ops.push_back(LWE); // lwe
+ Ops.push_back(False); // tfe
+ Ops.push_back(False); // lwe
Ops.push_back(DimInfo->DA ? True : False);
if (BaseOpcode->HasD16)
Ops.push_back(IsD16 ? True : False);
@@ -5034,12 +4838,11 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
SmallVector<SDValue, 1> Elt;
DAG.ExtractVectorElements(SDValue(NewNode, 0), Elt, 0, 1);
return DAG.getMergeValues({Elt[0], SDValue(NewNode, 1)}, DL);
- } else if (!BaseOpcode->Store) {
- return constructRetValue(DAG, NewNode,
- OrigResultTypes, IsTexFail,
- Subtarget->hasUnpackedD16VMem(), IsD16,
- DMaskLanes, NumVDataDwords, DL,
- *DAG.getContext());
+ } else if (IsD16 && !BaseOpcode->Store) {
+ MVT LoadVT = Op.getSimpleValueType();
+ SDValue Adjusted = adjustLoadValueTypeImpl(
+ SDValue(NewNode, 0), LoadVT, DL, DAG, Subtarget->hasUnpackedD16VMem());
+ return DAG.getMergeValues({Adjusted, SDValue(NewNode, 1)}, DL);
}
return SDValue(NewNode, 0);
@@ -8969,7 +8772,6 @@ static unsigned SubIdx2Lane(unsigned Idx) {
case AMDGPU::sub1: return 1;
case AMDGPU::sub2: return 2;
case AMDGPU::sub3: return 3;
- case AMDGPU::sub4: return 4; // Possible with TFE/LWE
}
}
@@ -8983,16 +8785,11 @@ SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node,
if (D16Idx >= 0 && Node->getConstantOperandVal(D16Idx))
return Node; // not implemented for D16
- SDNode *Users[5] = { nullptr };
+ SDNode *Users[4] = { nullptr };
unsigned Lane = 0;
unsigned DmaskIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::dmask) - 1;
unsigned OldDmask = Node->getConstantOperandVal(DmaskIdx);
unsigned NewDmask = 0;
- unsigned TFEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::tfe) - 1;
- unsigned LWEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::lwe) - 1;
- bool UsesTFC = (Node->getConstantOperandVal(TFEIdx) ||
- Node->getConstantOperandVal(LWEIdx)) ? 1 : 0;
- unsigned TFCLane = 0;
bool HasChain = Node->getNumValues() > 1;
if (OldDmask == 0) {
@@ -9000,12 +8797,6 @@ SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node,
return Node;
}
- unsigned OldBitsSet = countPopulation(OldDmask);
- // Work out which is the TFE/LWE lane if that is enabled.
- if (UsesTFC) {
- TFCLane = OldBitsSet;
- }
-
// Try to figure out the used register components
for (SDNode::use_iterator I = Node->use_begin(), E = Node->use_end();
I != E; ++I) {
@@ -9025,49 +8816,28 @@ SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node,
// set, etc.
Lane = SubIdx2Lane(I->getConstantOperandVal(1));
- // Check if the use is for the TFE/LWE generated result at VGPRn+1.
- if (UsesTFC && Lane == TFCLane) {
- Users[Lane] = *I;
- } else {
- // Set which texture component corresponds to the lane.
- unsigned Comp;
- for (unsigned i = 0, Dmask = OldDmask; (i <= Lane) && (Dmask != 0); i++) {
- Comp = countTrailingZeros(Dmask);
- Dmask &= ~(1 << Comp);
- }
-
- // Abort if we have more than one user per component.
- if (Users[Lane])
- return Node;
-
- Users[Lane] = *I;
- NewDmask |= 1 << Comp;
+ // Set which texture component corresponds to the lane.
+ unsigned Comp;
+ for (unsigned i = 0, Dmask = OldDmask; (i <= Lane) && (Dmask != 0); i++) {
+ Comp = countTrailingZeros(Dmask);
+ Dmask &= ~(1 << Comp);
}
- }
- // Don't allow 0 dmask, as hardware assumes one channel enabled.
- bool NoChannels = !NewDmask;
- if (NoChannels) {
- // If the original dmask has one channel - then nothing to do
- if (OldBitsSet == 1)
+ // Abort if we have more than one user per component
+ if (Users[Lane])
return Node;
- // Use an arbitrary dmask - required for the instruction to work
- NewDmask = 1;
+
+ Users[Lane] = *I;
+ NewDmask |= 1 << Comp;
}
+
// Abort if there's no change
if (NewDmask == OldDmask)
return Node;
unsigned BitsSet = countPopulation(NewDmask);
- // Check for TFE or LWE - increase the number of channels by one to account
- // for the extra return value
- // This will need adjustment for D16 if this is also included in
- // adjustWriteMask (this function) but at present D16 are excluded.
- unsigned NewChannels = BitsSet + UsesTFC;
-
- int NewOpcode =
- AMDGPU::getMaskedMIMGOp(Node->getMachineOpcode(), NewChannels);
+ int NewOpcode = AMDGPU::getMaskedMIMGOp(Node->getMachineOpcode(), BitsSet);
assert(NewOpcode != -1 &&
NewOpcode != static_cast<int>(Node->getMachineOpcode()) &&
"failed to find equivalent MIMG op");
@@ -9080,9 +8850,8 @@ SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node,
MVT SVT = Node->getValueType(0).getVectorElementType().getSimpleVT();
- MVT ResultVT = NewChannels == 1 ?
- SVT : MVT::getVectorVT(SVT, NewChannels == 3 ? 4 :
- NewChannels == 5 ? 8 : NewChannels);
+ MVT ResultVT = BitsSet == 1 ?
+ SVT : MVT::getVectorVT(SVT, BitsSet == 3 ? 4 : BitsSet);
SDVTList NewVTList = HasChain ?
DAG.getVTList(ResultVT, MVT::Other) : DAG.getVTList(ResultVT);
@@ -9096,7 +8865,7 @@ SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node,
DAG.ReplaceAllUsesOfValueWith(SDValue(Node, 1), SDValue(NewNode, 1));
}
- if (NewChannels == 1) {
+ if (BitsSet == 1) {
assert(Node->hasNUsesOfValue(1, 0));
SDNode *Copy = DAG.getMachineNode(TargetOpcode::COPY,
SDLoc(Node), Users[Lane]->getValueType(0),
@@ -9106,24 +8875,19 @@ SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node,
}
// Update the users of the node with the new indices
- for (unsigned i = 0, Idx = AMDGPU::sub0; i < 5; ++i) {
+ for (unsigned i = 0, Idx = AMDGPU::sub0; i < 4; ++i) {
SDNode *User = Users[i];
- if (!User) {
- // Handle the special case of NoChannels. We set NewDmask to 1 above, but
- // Users[0] is still nullptr because channel 0 doesn't really have a use.
- if (i || !NoChannels)
- continue;
- } else {
- SDValue Op = DAG.getTargetConstant(Idx, SDLoc(User), MVT::i32);
- DAG.UpdateNodeOperands(User, SDValue(NewNode, 0), Op);
- }
+ if (!User)
+ continue;
+
+ SDValue Op = DAG.getTargetConstant(Idx, SDLoc(User), MVT::i32);
+ DAG.UpdateNodeOperands(User, SDValue(NewNode, 0), Op);
switch (Idx) {
default: break;
case AMDGPU::sub0: Idx = AMDGPU::sub1; break;
case AMDGPU::sub1: Idx = AMDGPU::sub2; break;
case AMDGPU::sub2: Idx = AMDGPU::sub3; break;
- case AMDGPU::sub3: Idx = AMDGPU::sub4; break;
}
}
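A brief standalone restatement, with invented names, of the channel accounting the reverted adjustWritemask hunks above performed; a sketch under the assumptions visible in the deleted lines, not the SITargetLowering API:

```cpp
#include <bit>

// Sketch of the reverted adjustWritemask bookkeeping: the TFE/LWE flag
// occupies the lane right after the data lanes (index == popcount of the
// old dmask), a dmask shrunk to zero is forced back to one channel, and
// the result vector rounds odd counts up (3 -> 4 dwords, 5 -> 8).
unsigned newChannels(unsigned NewDmask, bool UsesTFC) {
  unsigned BitsSet = std::popcount(NewDmask);
  if (UsesTFC && BitsSet == 0)
    BitsSet = 1; // hardware assumes at least one channel is enabled
  return BitsSet + (UsesTFC ? 1u : 0u);
}
```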
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 5d2ca05ec5b..580ceed8b8d 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -2968,42 +2968,6 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
}
}
- // Verify MIMG
- if (isMIMG(MI.getOpcode()) && !MI.mayStore()) {
- // Ensure that the return type used is large enough for all the options
- // being used TFE/LWE require an extra result register.
- const MachineOperand *DMask = getNamedOperand(MI, AMDGPU::OpName::dmask);
- if (DMask) {
- uint64_t DMaskImm = DMask->getImm();
- uint32_t RegCount =
- isGather4(MI.getOpcode()) ? 4 : countPopulation(DMaskImm);
- const MachineOperand *TFE = getNamedOperand(MI, AMDGPU::OpName::tfe);
- const MachineOperand *LWE = getNamedOperand(MI, AMDGPU::OpName::lwe);
- const MachineOperand *D16 = getNamedOperand(MI, AMDGPU::OpName::d16);
-
- // Adjust for packed 16 bit values
- if (D16 && D16->getImm() && !ST.hasUnpackedD16VMem())
- RegCount >>= 1;
-
- // Adjust if using LWE or TFE
- if ((LWE && LWE->getImm()) || (TFE && TFE->getImm()))
- RegCount += 1;
-
- const uint32_t DstIdx =
- AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata);
- const MachineOperand &Dst = MI.getOperand(DstIdx);
- if (Dst.isReg()) {
- const TargetRegisterClass *DstRC = getOpRegClass(MI, DstIdx);
- uint32_t DstSize = RI.getRegSizeInBits(*DstRC) / 32;
- if (RegCount > DstSize) {
- ErrInfo = "MIMG instruction returns too many registers for dst "
- "register class";
- return false;
- }
- }
- }
- }
-
// Verify VOP*. Ignore multiple sgpr operands on writelane.
if (Desc.getOpcode() != AMDGPU::V_WRITELANE_B32
&& (isVOP1(MI) || isVOP2(MI) || isVOP3(MI) || isVOPC(MI) || isSDWA(MI))) {
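The MIMG verifier check deleted above reduces to a small predicate; this is an illustrative standalone version with invented names, not the SIInstrInfo API:

```cpp
#include <bit>

// Deleted verifier rule, restated: the dst register class must supply at
// least popcount(dmask) dwords (4 for gather4), halved for packed D16,
// plus one more when TFE or LWE asks for the error/LOD-warning result.
bool dstLargeEnough(unsigned DstDwords, unsigned DMask, bool Gather4,
                    bool PackedD16, bool TFEorLWE) {
  unsigned RegCount = Gather4 ? 4 : std::popcount(DMask);
  if (PackedD16)
    RegCount /= 2;
  if (TFEorLWE)
    RegCount += 1;
  return RegCount <= DstDwords;
}
```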
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index 2a0416e45cf..4bf16f59621 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -184,7 +184,6 @@ struct MIMGBaseOpcodeInfo {
bool Atomic;
bool AtomicX2;
bool Sampler;
- bool Gather4;
uint8_t NumExtraArgs;
bool Gradients;