Diffstat (limited to 'llvm/lib')
 llvm/lib/Target/R600/AMDGPUISelLowering.cpp | 111
 llvm/lib/Target/R600/AMDGPUISelLowering.h   |   6
 llvm/lib/Target/R600/AMDGPUInstrInfo.td     |   4
 llvm/lib/Target/R600/AMDGPUSubtarget.h      |   9
 llvm/lib/Target/R600/AMDILISelLowering.cpp  |  35
 llvm/lib/Target/R600/AMDILIntrinsics.td     |   4
 llvm/lib/Target/R600/R600ISelLowering.cpp   |   5
 llvm/lib/Target/R600/R600Instructions.td    |  17
 llvm/lib/Target/R600/SIInstructions.td      |  10
9 files changed, 154 insertions, 47 deletions
diff --git a/llvm/lib/Target/R600/AMDGPUISelLowering.cpp b/llvm/lib/Target/R600/AMDGPUISelLowering.cpp
index 4e4b12eacc9..ddf251f38bf 100644
--- a/llvm/lib/Target/R600/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/R600/AMDGPUISelLowering.cpp
@@ -211,6 +211,20 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) :
     setOperationAction(ISD::FSUB, VT, Expand);
     setOperationAction(ISD::SELECT, VT, Expand);
   }
+
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Custom);
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i1, Custom);
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i1, Custom);
+
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Custom);
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i8, Custom);
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i8, Custom);
+
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Custom);
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Custom);
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Custom);
+
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Custom);
 }
 
 //===----------------------------------------------------------------------===//
@@ -927,6 +941,101 @@ SDValue AMDGPUTargetLowering::LowerUINT_TO_FP(SDValue Op,
 }
 
+SDValue AMDGPUTargetLowering::ExpandSIGN_EXTEND_INREG(SDValue Op,
+                                                      unsigned BitsDiff,
+                                                      SelectionDAG &DAG) const {
+  MVT VT = Op.getSimpleValueType();
+  SDLoc DL(Op);
+  SDValue Shift = DAG.getConstant(BitsDiff, VT);
+  // Shift left by 'Shift' bits.
+  SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, Op.getOperand(0), Shift);
+  // Signed shift Right by 'Shift' bits.
+  return DAG.getNode(ISD::SRA, DL, VT, Shl, Shift);
+}
+
+SDValue AMDGPUTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
+                                                     SelectionDAG &DAG) const {
+  EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
+  MVT VT = Op.getSimpleValueType();
+  MVT ScalarVT = VT.getScalarType();
+
+  unsigned SrcBits = ExtraVT.getScalarType().getSizeInBits();
+  unsigned DestBits = ScalarVT.getSizeInBits();
+  unsigned BitsDiff = DestBits - SrcBits;
+
+  if (!Subtarget->hasBFE())
+    return ExpandSIGN_EXTEND_INREG(Op, BitsDiff, DAG);
+
+  SDValue Src = Op.getOperand(0);
+  if (VT.isVector()) {
+    SDLoc DL(Op);
+    // Need to scalarize this, and revisit each of the scalars later.
+    // TODO: Don't scalarize on Evergreen?
+    unsigned NElts = VT.getVectorNumElements();
+    SmallVector<SDValue, 8> Args;
+    ExtractVectorElements(Src, DAG, Args, 0, NElts);
+
+    SDValue VTOp = DAG.getValueType(ExtraVT.getScalarType());
+    for (unsigned I = 0; I < NElts; ++I)
+      Args[I] = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, ScalarVT, Args[I], VTOp);
+
+    return DAG.getNode(ISD::BUILD_VECTOR, DL, VT, Args.data(), Args.size());
+  }
+
+  if (SrcBits == 32) {
+    SDLoc DL(Op);
+
+    // If the source is 32-bits, this is really half of a 2-register pair, and
+    // we need to discard the unused half of the pair.
+    SDValue TruncSrc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Src);
+    return DAG.getNode(ISD::SIGN_EXTEND, DL, VT, TruncSrc);
+  }
+
+  unsigned NElts = VT.isVector() ? VT.getVectorNumElements() : 1;
+
+  // TODO: Match 64-bit BFE. SI has a 64-bit BFE, but it's scalar only so it
+  // might not be worth the effort, and will need to expand to shifts when
+  // fixing SGPR copies.
+  if (SrcBits < 32 && DestBits <= 32) {
+    SDLoc DL(Op);
+    MVT ExtVT = (NElts == 1) ? MVT::i32 : MVT::getVectorVT(MVT::i32, NElts);
+
+    if (DestBits != 32)
+      Src = DAG.getNode(ISD::ZERO_EXTEND, DL, ExtVT, Src);
+
+    // FIXME: This should use TargetConstant, but that hits assertions for
+    // Evergreen.
+    SDValue Ext = DAG.getNode(AMDGPUISD::BFE_I32, DL, ExtVT,
+                              Op.getOperand(0), // Operand
+                              DAG.getConstant(0, ExtVT), // Offset
+                              DAG.getConstant(SrcBits, ExtVT)); // Width
+
+    // Truncate to the original type if necessary.
+    if (ScalarVT == MVT::i32)
+      return Ext;
+    return DAG.getNode(ISD::TRUNCATE, DL, VT, Ext);
+  }
+
+  // For small types, extend to 32-bits first.
+  if (SrcBits < 32) {
+    SDLoc DL(Op);
+    MVT ExtVT = (NElts == 1) ? MVT::i32 : MVT::getVectorVT(MVT::i32, NElts);
+
+    SDValue TruncSrc = DAG.getNode(ISD::TRUNCATE, DL, ExtVT, Src);
+    SDValue Ext32 = DAG.getNode(AMDGPUISD::BFE_I32,
+                                DL,
+                                ExtVT,
+                                TruncSrc, // Operand
+                                DAG.getConstant(0, ExtVT), // Offset
+                                DAG.getConstant(SrcBits, ExtVT)); // Width
+
+    return DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Ext32);
+  }
+
+  // For everything else, use the standard bitshift expansion.
+  return ExpandSIGN_EXTEND_INREG(Op, BitsDiff, DAG);
+}
+
 //===----------------------------------------------------------------------===//
 // Helper functions
 //===----------------------------------------------------------------------===//
@@ -1019,6 +1128,8 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
   NODE_NAME_CASE(FMIN)
   NODE_NAME_CASE(SMIN)
   NODE_NAME_CASE(UMIN)
+  NODE_NAME_CASE(BFE_U32)
+  NODE_NAME_CASE(BFE_I32)
   NODE_NAME_CASE(URECIP)
   NODE_NAME_CASE(DOT4)
   NODE_NAME_CASE(EXPORT)
diff --git a/llvm/lib/Target/R600/AMDGPUISelLowering.h b/llvm/lib/Target/R600/AMDGPUISelLowering.h
index 2efb9c78a3e..2595c51d166 100644
--- a/llvm/lib/Target/R600/AMDGPUISelLowering.h
+++ b/llvm/lib/Target/R600/AMDGPUISelLowering.h
@@ -142,6 +142,10 @@ private:
   SDValue LowerSDIV24(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerSDIV32(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerSDIV64(SDValue Op, SelectionDAG &DAG) const;
+
+  SDValue ExpandSIGN_EXTEND_INREG(SDValue Op,
+                                  unsigned BitsDiff,
+                                  SelectionDAG &DAG) const;
   SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const;
   EVT genIntType(uint32_t size = 32, uint32_t numEle = 1) const;
   SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const;
@@ -171,6 +175,8 @@ enum {
   UMIN,
   URECIP,
   DOT4,
+  BFE_U32, // Extract range of bits with zero extension to 32-bits.
+  BFE_I32, // Extract range of bits with sign extension to 32-bits.
   TEXTURE_FETCH,
   EXPORT,
   CONST_ADDRESS,
diff --git a/llvm/lib/Target/R600/AMDGPUInstrInfo.td b/llvm/lib/Target/R600/AMDGPUInstrInfo.td
index fccede01ab9..2138bd23a36 100644
--- a/llvm/lib/Target/R600/AMDGPUInstrInfo.td
+++ b/llvm/lib/Target/R600/AMDGPUInstrInfo.td
@@ -86,3 +86,7 @@ def AMDGPUstore_mskor : SDNode<"AMDGPUISD::STORE_MSKOR",
 
 def AMDGPUround : SDNode<"ISD::FROUND",
                          SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisSameAs<0,1>]>>;
+
+def AMDGPUbfe_u32 : SDNode<"AMDGPUISD::BFE_U32", AMDGPUDTIntTernaryOp>;
+def AMDGPUbfe_i32 : SDNode<"AMDGPUISD::BFE_I32", AMDGPUDTIntTernaryOp>;
+
diff --git a/llvm/lib/Target/R600/AMDGPUSubtarget.h b/llvm/lib/Target/R600/AMDGPUSubtarget.h
index 7e7f4d0c004..8874d14c18c 100644
--- a/llvm/lib/Target/R600/AMDGPUSubtarget.h
+++ b/llvm/lib/Target/R600/AMDGPUSubtarget.h
@@ -68,6 +68,15 @@ public:
   enum Generation getGeneration() const;
   bool hasHWFP64() const;
   bool hasCaymanISA() const;
+
+  bool hasBFE() const {
+    return (getGeneration() >= EVERGREEN);
+  }
+
+  bool hasBFM() const {
+    return hasBFE();
+  }
+
   bool IsIRStructurizerEnabled() const;
   bool isIfCvtEnabled() const;
   unsigned getWavefrontSize() const;
diff --git a/llvm/lib/Target/R600/AMDILISelLowering.cpp b/llvm/lib/Target/R600/AMDILISelLowering.cpp
index 970787ef31e..5dfaad4c1c3 100644
--- a/llvm/lib/Target/R600/AMDILISelLowering.cpp
+++ b/llvm/lib/Target/R600/AMDILISelLowering.cpp
@@ -94,9 +94,6 @@ void AMDGPUTargetLowering::InitAMDILLowering() {
   for (unsigned int x  = 0; x < NumTypes; ++x) {
     MVT::SimpleValueType VT = (MVT::SimpleValueType)types[x];
-    //FIXME: SIGN_EXTEND_INREG is not meaningful for floating point types
-    // We cannot sextinreg, expand to shifts
-    setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Custom);
     setOperationAction(ISD::SUBE, VT, Expand);
     setOperationAction(ISD::SUBC, VT, Expand);
     setOperationAction(ISD::ADDE, VT, Expand);
@@ -191,14 +188,12 @@ void AMDGPUTargetLowering::InitAMDILLowering() {
   setOperationAction(ISD::UDIV, MVT::v4i8, Expand);
   setOperationAction(ISD::UDIV, MVT::v2i16, Expand);
   setOperationAction(ISD::UDIV, MVT::v4i16, Expand);
-  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Custom);
   setOperationAction(ISD::SUBC, MVT::Other, Expand);
   setOperationAction(ISD::ADDE, MVT::Other, Expand);
   setOperationAction(ISD::ADDC, MVT::Other, Expand);
   setOperationAction(ISD::BRCOND, MVT::Other, Custom);
   setOperationAction(ISD::BR_JT, MVT::Other, Expand);
   setOperationAction(ISD::BRIND, MVT::Other, Expand);
-  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Expand);
 
   // Use the default implementation.
@@ -322,36 +317,6 @@ AMDGPUTargetLowering::LowerSREM(SDValue Op, SelectionDAG &DAG) const {
   return DST;
 }
 
-SDValue
-AMDGPUTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const {
-  SDValue Data = Op.getOperand(0);
-  VTSDNode *BaseType = cast<VTSDNode>(Op.getOperand(1));
-  SDLoc DL(Op);
-  EVT DVT = Data.getValueType();
-  EVT BVT = BaseType->getVT();
-  unsigned baseBits = BVT.getScalarType().getSizeInBits();
-  unsigned srcBits = DVT.isSimple() ? DVT.getScalarType().getSizeInBits() : 1;
-  unsigned shiftBits = srcBits - baseBits;
-  if (srcBits < 32) {
-    // If the op is less than 32 bits, then it needs to extend to 32bits
-    // so it can properly keep the upper bits valid.
-    EVT IVT = genIntType(32, DVT.isVector() ? DVT.getVectorNumElements() : 1);
-    Data = DAG.getNode(ISD::ZERO_EXTEND, DL, IVT, Data);
-    shiftBits = 32 - baseBits;
-    DVT = IVT;
-  }
-  SDValue Shift = DAG.getConstant(shiftBits, DVT);
-  // Shift left by 'Shift' bits.
-  Data = DAG.getNode(ISD::SHL, DL, DVT, Data, Shift);
-  // Signed shift Right by 'Shift' bits.
-  Data = DAG.getNode(ISD::SRA, DL, DVT, Data, Shift);
-  if (srcBits < 32) {
-    // Once the sign extension is done, the op needs to be converted to
-    // its original type.
-    Data = DAG.getSExtOrTrunc(Data, DL, Op.getOperand(0).getValueType());
-  }
-  return Data;
-}
 
 EVT
 AMDGPUTargetLowering::genIntType(uint32_t size, uint32_t numEle) const {
   int iSize = (size * numEle);
diff --git a/llvm/lib/Target/R600/AMDILIntrinsics.td b/llvm/lib/Target/R600/AMDILIntrinsics.td
index 6ec3559af24..658deb5bc01 100644
--- a/llvm/lib/Target/R600/AMDILIntrinsics.td
+++ b/llvm/lib/Target/R600/AMDILIntrinsics.td
@@ -68,10 +68,6 @@ let TargetPrefix = "AMDIL", isTarget = 1 in {
 let TargetPrefix = "AMDIL", isTarget = 1 in {
   def int_AMDIL_abs : GCCBuiltin<"__amdil_abs">, UnaryIntInt;
 
-  def int_AMDIL_bit_extract_i32 : GCCBuiltin<"__amdil_ibit_extract">,
-          TernaryIntInt;
-  def int_AMDIL_bit_extract_u32 : GCCBuiltin<"__amdil_ubit_extract">,
-          TernaryIntInt;
   def int_AMDIL_bit_reverse_u32 : GCCBuiltin<"__amdil_ubit_reverse">,
           UnaryIntInt;
   def int_AMDIL_bit_count_i32 : GCCBuiltin<"__amdil_count_bits">,
diff --git a/llvm/lib/Target/R600/R600ISelLowering.cpp b/llvm/lib/Target/R600/R600ISelLowering.cpp
index 8c737125c85..4d15321fd02 100644
--- a/llvm/lib/Target/R600/R600ISelLowering.cpp
+++ b/llvm/lib/Target/R600/R600ISelLowering.cpp
@@ -1383,6 +1383,11 @@ SDValue R600TargetLowering::LowerFormalArguments(
     PointerType *PtrTy = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
                                                    AMDGPUAS::CONSTANT_BUFFER_0);
 
+    // i64 isn't a legal type, so the register type used ends up as i32, which
+    // isn't expected here. It attempts to create this sextload, but it ends up
+    // being invalid. Somehow this seems to work with i64 arguments, but breaks
+    // for <1 x i64>.
+
     // The first 36 bytes of the input buffer contains information about
     // thread group and global sizes.
     SDValue Arg = DAG.getExtLoad(ISD::SEXTLOAD, DL, VT, Chain,
diff --git a/llvm/lib/Target/R600/R600Instructions.td b/llvm/lib/Target/R600/R600Instructions.td
index 698ad4afe60..ae3d8747a4d 100644
--- a/llvm/lib/Target/R600/R600Instructions.td
+++ b/llvm/lib/Target/R600/R600Instructions.td
@@ -1517,15 +1517,20 @@ let Predicates = [isEGorCayman] in {
   // Example Usage:
   // (Offset, Width)
   //
-  // (0, 8)           = (Input << 24) >> 24  = (Input &  0xff)       >> 0
-  // (8, 8)           = (Input << 16) >> 24  = (Input &  0xffff)     >> 8
-  // (16,8)           = (Input <<  8) >> 24  = (Input &  0xffffff)   >> 16
-  // (24,8)           = (Input <<  0) >> 24  = (Input &  0xffffffff) >> 24
+  // (0, 8)  = (Input << 24) >> 24 = (Input &  0xff)       >> 0
+  // (8, 8)  = (Input << 16) >> 24 = (Input &  0xffff)     >> 8
+  // (16, 8) = (Input <<  8) >> 24 = (Input &  0xffffff)   >> 16
+  // (24, 8) = (Input <<  0) >> 24 = (Input &  0xffffffff) >> 24
   def BFE_UINT_eg : R600_3OP <0x4, "BFE_UINT",
-    [(set i32:$dst, (int_AMDIL_bit_extract_u32 i32:$src0, i32:$src1,
-                                               i32:$src2))],
+    [(set i32:$dst, (AMDGPUbfe_u32 i32:$src0, i32:$src1, i32:$src2))],
     VecALU
   >;
+
+  def BFE_INT_eg : R600_3OP <0x4, "BFE_INT",
+    [(set i32:$dst, (AMDGPUbfe_i32 i32:$src0, i32:$src1, i32:$src2))],
+    VecALU
+  >;
+
 // XXX: This pattern is broken, disabling for now.  See comment in
 // AMDGPUInstructions.td for more info.
 //  def : BFEPattern <BFE_UINT_eg>;
diff --git a/llvm/lib/Target/R600/SIInstructions.td b/llvm/lib/Target/R600/SIInstructions.td
index 9a18f7bc350..68b89a8c351 100644
--- a/llvm/lib/Target/R600/SIInstructions.td
+++ b/llvm/lib/Target/R600/SIInstructions.td
@@ -1074,8 +1074,14 @@ def V_CUBEID_F32 : VOP3_32 <0x00000144, "V_CUBEID_F32", []>;
 def V_CUBESC_F32 : VOP3_32 <0x00000145, "V_CUBESC_F32", []>;
 def V_CUBETC_F32 : VOP3_32 <0x00000146, "V_CUBETC_F32", []>;
 def V_CUBEMA_F32 : VOP3_32 <0x00000147, "V_CUBEMA_F32", []>;
-def V_BFE_U32 : VOP3_32 <0x00000148, "V_BFE_U32", []>;
-def V_BFE_I32 : VOP3_32 <0x00000149, "V_BFE_I32", []>;
+
+let neverHasSideEffects = 1, mayLoad = 0, mayStore = 0 in {
+def V_BFE_U32 : VOP3_32 <0x00000148, "V_BFE_U32",
+  [(set i32:$dst, (AMDGPUbfe_u32 i32:$src0, i32:$src1, i32:$src2))]>;
+def V_BFE_I32 : VOP3_32 <0x00000149, "V_BFE_I32",
+  [(set i32:$dst, (AMDGPUbfe_i32 i32:$src0, i32:$src1, i32:$src2))]>;
+}
+
 def V_BFI_B32 : VOP3_32 <0x0000014a, "V_BFI_B32", []>;
 defm : BFIPatterns <V_BFI_B32>;
 def V_FMA_F32 : VOP3_32 <0x0000014b, "V_FMA_F32",
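The fallback path above, ExpandSIGN_EXTEND_INREG, builds the classic shift pair: move the narrow field up to the top of the register with SHL, then arithmetic-shift it back down with SRA so the sign bit is replicated. A minimal standalone sketch of that arithmetic on a plain 32-bit value follows; the function name and the use of uint32_t are illustrative only and not part of the patch.

#include <cassert>
#include <cstdint>

// Sign-extend the low 'SrcBits' bits of 'Value' to a full 32-bit signed
// integer, mirroring the SHL/SRA pair that ExpandSIGN_EXTEND_INREG emits
// when no BFE instruction is available (BitsDiff = DestBits - SrcBits).
int32_t signExtendInReg(uint32_t Value, unsigned SrcBits) {
  assert(SrcBits > 0 && SrcBits <= 32 && "field width out of range");
  unsigned BitsDiff = 32 - SrcBits;
  // The right shift of a negative value is arithmetic here, matching SRA.
  return static_cast<int32_t>(Value << BitsDiff) >> BitsDiff;
}

For example, signExtendInReg(0xFF, 8) yields -1, while signExtendInReg(0x7F, 8) yields 127.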
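The new AMDGPUISD::BFE_U32/BFE_I32 nodes take (Operand, Offset, Width) and extract Width bits starting at bit Offset, zero- or sign-extending the field to 32 bits, as described by the (Offset, Width) table kept above BFE_UINT_eg. A reference-style sketch of that behavior is given below; it assumes Width is at least 1 and Offset + Width does not exceed 32, and does not model hardware edge cases such as a zero width.

#include <cstdint>

// bfe_u32: extract 'Width' bits of 'Input' starting at 'Offset', zero-extended.
uint32_t bfe_u32(uint32_t Input, unsigned Offset, unsigned Width) {
  uint32_t Mask = (Width < 32) ? ((1u << Width) - 1u) : ~0u;
  return (Input >> Offset) & Mask;
}

// bfe_i32: same field, but sign-extended from bit (Width - 1). The signed
// analogue of the first table row: bfe_i32(Input, 0, 8) equals
// (int32_t)(Input << 24) >> 24 with an arithmetic right shift.
int32_t bfe_i32(uint32_t Input, unsigned Offset, unsigned Width) {
  uint32_t Field = bfe_u32(Input, Offset, Width);
  uint32_t SignBit = 1u << (Width - 1);
  return static_cast<int32_t>((Field ^ SignBit) - SignBit);
}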

