summaryrefslogtreecommitdiffstats
path: root/llvm/lib
diff options
context:
space:
mode:
Diffstat (limited to 'llvm/lib')
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUInstructions.td7
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h4
-rw-r--r--llvm/lib/Target/AMDGPU/SIISelLowering.cpp33
-rw-r--r--llvm/lib/Target/AMDGPU/SIISelLowering.h2
-rw-r--r--llvm/lib/Target/AMDGPU/SIInstrInfo.td2
-rw-r--r--llvm/lib/Target/AMDGPU/SIInstructions.td21
-rw-r--r--llvm/lib/Target/AMDGPU/VOP3Instructions.td16
7 files changed, 52 insertions, 33 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
index ba2aed68fb8..c4ac3180453 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
@@ -664,9 +664,10 @@ class ROTRPattern <Instruction BIT_ALIGN> : Pat <
class IntMed3Pat<Instruction med3Inst,
SDPatternOperator max,
SDPatternOperator max_oneuse,
- SDPatternOperator min_oneuse> : Pat<
- (max (min_oneuse i32:$src0, i32:$src1),
- (min_oneuse (max_oneuse i32:$src0, i32:$src1), i32:$src2)),
+ SDPatternOperator min_oneuse,
+ ValueType vt = i32> : Pat<
+ (max (min_oneuse vt:$src0, vt:$src1),
+ (min_oneuse (max_oneuse vt:$src0, vt:$src1), vt:$src2)),
(med3Inst $src0, $src1, $src2)
>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
index 34ae4538428..b99c36ab225 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -276,6 +276,10 @@ public:
return (getGeneration() >= EVERGREEN);
}
+ bool hasMed3_16() const {
+ return getGeneration() >= GFX9;
+ }
+
bool hasCARRY() const {
return (getGeneration() >= EVERGREEN);
}
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 0959707ac9a..143a538c87d 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -4069,8 +4069,9 @@ static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) {
}
}
-static SDValue performIntMed3ImmCombine(SelectionDAG &DAG, const SDLoc &SL,
- SDValue Op0, SDValue Op1, bool Signed) {
+SDValue SITargetLowering::performIntMed3ImmCombine(
+ SelectionDAG &DAG, const SDLoc &SL,
+ SDValue Op0, SDValue Op1, bool Signed) const {
ConstantSDNode *K1 = dyn_cast<ConstantSDNode>(Op1);
if (!K1)
return SDValue();
@@ -4088,23 +4089,22 @@ static SDValue performIntMed3ImmCombine(SelectionDAG &DAG, const SDLoc &SL,
}
EVT VT = K0->getValueType(0);
+ unsigned Med3Opc = Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3;
+ if (VT == MVT::i32 || (VT == MVT::i16 && Subtarget->hasMed3_16())) {
+ return DAG.getNode(Med3Opc, SL, VT,
+ Op0.getOperand(0), SDValue(K0, 0), SDValue(K1, 0));
+ }
+ // If there isn't a 16-bit med3 operation, convert to 32-bit.
MVT NVT = MVT::i32;
unsigned ExtOp = Signed ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
- SDValue Tmp1, Tmp2, Tmp3;
- Tmp1 = DAG.getNode(ExtOp, SL, NVT, Op0->getOperand(0));
- Tmp2 = DAG.getNode(ExtOp, SL, NVT, Op0->getOperand(1));
- Tmp3 = DAG.getNode(ExtOp, SL, NVT, Op1);
+ SDValue Tmp1 = DAG.getNode(ExtOp, SL, NVT, Op0->getOperand(0));
+ SDValue Tmp2 = DAG.getNode(ExtOp, SL, NVT, Op0->getOperand(1));
+ SDValue Tmp3 = DAG.getNode(ExtOp, SL, NVT, Op1);
- if (VT == MVT::i16) {
- Tmp1 = DAG.getNode(Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3, SL, NVT,
- Tmp1, Tmp2, Tmp3);
-
- return DAG.getNode(ISD::TRUNCATE, SL, VT, Tmp1);
- } else
- return DAG.getNode(Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3, SL, VT,
- Op0.getOperand(0), SDValue(K0, 0), SDValue(K1, 0));
+ SDValue Med3 = DAG.getNode(Med3Opc, SL, NVT, Tmp1, Tmp2, Tmp3);
+ return DAG.getNode(ISD::TRUNCATE, SL, VT, Med3);
}
static bool isKnownNeverSNan(SelectionDAG &DAG, SDValue Op) {
@@ -4141,9 +4141,8 @@ SDValue SITargetLowering::performFPMed3ImmCombine(SelectionDAG &DAG,
return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Op0.getOperand(0));
}
- // No med3 for f16, but clamp is possible.
- // TODO: gfx9 has med3 f16
- if (VT == MVT::f16 || VT == MVT::f64)
+ // med3 for f16 is only available on gfx9+.
+ if (VT == MVT::f64 || (VT == MVT::f16 && !Subtarget->hasMed3_16()))
return SDValue();
// This isn't safe with signaling NaNs because in IEEE mode, min/max on a
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index 5bf39064033..984640dfdb0 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -86,6 +86,8 @@ class SITargetLowering final : public AMDGPUTargetLowering {
SDValue performFPMed3ImmCombine(SelectionDAG &DAG, const SDLoc &SL,
SDValue Op0, SDValue Op1) const;
+ SDValue performIntMed3ImmCombine(SelectionDAG &DAG, const SDLoc &SL,
+ SDValue Op0, SDValue Op1, bool Signed) const;
SDValue performMinMaxCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performFMed3Combine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performCvtPkRTZCombine(SDNode *N, DAGCombinerInfo &DCI) const;
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index 1fc3fa81f30..f47b11f9f46 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -1321,7 +1321,7 @@ def VOP_F16_F16_I16 : VOPProfile <[f16, f16, i16, untyped]>;
def VOP_F16_F16_I32 : VOPProfile <[f16, f16, i32, untyped]>;
def VOP_I16_I16_I16 : VOPProfile <[i32, i32, i32, untyped]>;
-def VOP_I16_I16_I16_I16 : VOPProfile <[i32, i32, i32, i32, untyped]>;
+def VOP_I16_I16_I16_I16 : VOPProfile <[i16, i16, i16, i16, untyped]>;
def VOP_F16_F16_F16_F16 : VOPProfile <[f16, f16, f16, f16, untyped]>;
def VOP_V2F16_V2F16_V2F16 : VOPProfile <[v2f16, v2f16, v2f16, untyped]>;
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 5ec3cc2102a..e74dbacbfb5 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -1216,6 +1216,14 @@ def : Pat <
// Miscellaneous Optimization Patterns
//============================================================================//
+// Undo sub x, c -> add x, -c canonicalization since c is more likely
+// an inline immediate than -c.
+// TODO: Also do for 64-bit.
+def : Pat<
+ (add i32:$src0, (i32 NegSubInlineConst32:$src1)),
+ (S_SUB_I32 $src0, NegSubInlineConst32:$src1)
+>;
+
def : SHA256MaPattern <V_BFI_B32, V_XOR_B32_e64>;
def : IntMed3Pat<V_MED3_I32, smax, smax_oneuse, smin_oneuse>;
@@ -1235,14 +1243,11 @@ class FPMed3Pat<ValueType vt,
def : FPMed3Pat<f32, V_MED3_F32>;
-
-// Undo sub x, c -> add x, -c canonicalization since c is more likely
-// an inline immediate than -c.
-// TODO: Also do for 64-bit.
-def : Pat<
- (add i32:$src0, (i32 NegSubInlineConst32:$src1)),
- (S_SUB_I32 $src0, NegSubInlineConst32:$src1)
->;
+let Predicates = [isGFX9] in {
+def : FPMed3Pat<f16, V_MED3_F16>;
+def : IntMed3Pat<V_MED3_I16, smax, smax_oneuse, smin_oneuse, i16>;
+def : IntMed3Pat<V_MED3_U16, umax, umax_oneuse, umin_oneuse, i16>;
+} // End Predicates = [isGFX9]
//============================================================================//
// Assembler aliases
diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
index 3ba748839ed..42ccd6d5e19 100644
--- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -258,8 +258,8 @@ def V_MAD_I16 : VOP3Inst <"v_mad_i16", VOP3_Profile<VOP_I16_I16_I16_I16>>;
let Predicates = [isVI] in {
-multiclass Tenary_i16_Pats <SDPatternOperator op1, SDPatternOperator op2,
- Instruction inst, SDPatternOperator op3> {
+multiclass Ternary_i16_Pats <SDPatternOperator op1, SDPatternOperator op2,
+ Instruction inst, SDPatternOperator op3> {
def : Pat<
(op2 (op1 i16:$src0, i16:$src1), i16:$src2),
(inst i16:$src0, i16:$src1, i16:$src2)
@@ -278,8 +278,8 @@ def : Pat<
>;
}
-defm: Tenary_i16_Pats<mul, add, V_MAD_U16, zext>;
-defm: Tenary_i16_Pats<mul, add, V_MAD_I16, sext>;
+defm: Ternary_i16_Pats<mul, add, V_MAD_U16, zext>;
+defm: Ternary_i16_Pats<mul, add, V_MAD_I16, sext>;
} // End Predicates = [isVI]
@@ -291,6 +291,10 @@ def V_ADD3_U32 : VOP3Inst <"v_add3_u32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
def V_LSHL_OR_B32 : VOP3Inst <"v_lshl_or_b32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
def V_AND_OR_B32 : VOP3Inst <"v_and_or_b32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
def V_OR3_B32 : VOP3Inst <"v_or3_b32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
+
+def V_MED3_F16 : VOP3Inst <"v_med3_f16", VOP3_Profile<VOP_F16_F16_F16_F16>, AMDGPUfmed3>;
+def V_MED3_I16 : VOP3Inst <"v_med3_i16", VOP3_Profile<VOP_I16_I16_I16_I16>, AMDGPUsmed3>;
+def V_MED3_U16 : VOP3Inst <"v_med3_u16", VOP3_Profile<VOP_I16_I16_I16_I16>, AMDGPUumed3>;
}
@@ -487,3 +491,7 @@ defm V_LSHL_OR_B32 : VOP3_Real_vi <0x200>;
defm V_AND_OR_B32 : VOP3_Real_vi <0x201>;
defm V_OR3_B32 : VOP3_Real_vi <0x202>;
defm V_PACK_B32_F16 : VOP3_Real_vi <0x2a0>;
+
+defm V_MED3_F16 : VOP3_Real_vi <0x1fa>;
+defm V_MED3_I16 : VOP3_Real_vi <0x1fb>;
+defm V_MED3_U16 : VOP3_Real_vi <0x1fc>;
OpenPOWER on IntegriCloud