| field | value | date |
|---|---|---|
| author | Craig Topper <craig.topper@intel.com> | 2017-11-25 18:32:43 +0000 |
| committer | Craig Topper <craig.topper@intel.com> | 2017-11-25 18:32:43 +0000 |
| commit | e485631cd148e18701254634a3003e6bb5797eb2 (patch) | |
| tree | 6e7e208068025363a98e57a88f3a7ede7f48069e /llvm/lib | |
| parent | ea37e201ec2f9c3d8b2c9bb37ff48cacdd992f55 (diff) | |
| download | bcm5719-llvm-e485631cd148e18701254634a3003e6bb5797eb2.tar.gz bcm5719-llvm-e485631cd148e18701254634a3003e6bb5797eb2.zip | |
[X86] Add separate intrinsics for scalar FMA4 instructions.
Summary:
These instructions zero the non-scalar part of the lower 128 bits, unlike the FMA3 instructions, which pass that part through unchanged.
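To make the difference concrete, here is a minimal sketch of the two scalar behaviors, written as reference models over plain SSE intrinsics (the `*_model` helpers are illustrative only, not the actual intrinsics, and they round twice where a true FMA rounds once):

```c
#include <immintrin.h>

/* Element 0 gets a*b+c in both models; they differ only in elements 1..3. */

/* FMA3-style scalar fmadd: elements 1..3 are passed through from the
   first source operand. */
__m128 fma3_ss_model(__m128 a, __m128 b, __m128 c) {
  float r0 = _mm_cvtss_f32(a) * _mm_cvtss_f32(b) + _mm_cvtss_f32(c);
  return _mm_move_ss(a, _mm_set_ss(r0));  /* element 0 = r0, rest from a */
}

/* FMA4-style scalar fmadd as described above: the non-scalar part of the
   low 128 bits is zeroed instead of passed through. */
__m128 fma4_ss_model(__m128 a, __m128 b, __m128 c) {
  float r0 = _mm_cvtss_f32(a) * _mm_cvtss_f32(b) + _mm_cvtss_f32(c);
  return _mm_set_ss(r0);                  /* _mm_set_ss zeroes elements 1..3 */
}
```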
I've only added fmadd because we should be able to derive all the other variants using operand negation in the intrinsic header, as we do for AVX512.
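As a hedged illustration of that header-level derivation (the wrapper name below is hypothetical and this is only an assumption about how such a header could look; `_mm_macc_ss` is the existing FMA4 scalar fmadd intrinsic and requires `-mfma4`):

```c
#include <x86intrin.h>

/* Hypothetical wrapper: fmsub(a, b, c) == fmadd(a, b, -c), so a scalar fmsub
   can be expressed by negating the addend and reusing the fmadd intrinsic. */
static inline __m128 msub_ss_via_fmadd(__m128 a, __m128 b, __m128 c) {
  __m128 neg_c = _mm_xor_ps(c, _mm_set1_ps(-0.0f)); /* flip the sign bits of c */
  return _mm_macc_ss(a, b, neg_c);
}
```

The same operand negation (of a and/or c) yields fnmadd and fnmsub, which is why only the fmadd form needs a dedicated backend intrinsic.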
In light of this behavior difference, which I hadn't noticed before, I think there are still some missed negate-folding opportunities for the FMA4 instructions.
I've split the tests so that the scalar cases can use different intrinsics for the two instruction sets. I just copied the tests, split the RUN lines, and changed out the scalar intrinsics.
fma4-fneg-combine.ll is a new test to make sure we negate the FMA4 intrinsics correctly, though there are a couple of TODOs in it.
Reviewers: RKSimon, spatel
Reviewed By: RKSimon
Subscribers: llvm-commits
Differential Revision: https://reviews.llvm.org/D39851
llvm-svn: 318984
Diffstat (limited to 'llvm/lib')
| mode | path | lines changed |
|---|---|---|
| -rw-r--r-- | llvm/lib/Target/X86/X86ISelLowering.cpp | 12 |
| -rw-r--r-- | llvm/lib/Target/X86/X86ISelLowering.h | 3 |
| -rw-r--r-- | llvm/lib/Target/X86/X86InstrFMA.td | 44 |
| -rw-r--r-- | llvm/lib/Target/X86/X86InstrFormats.td | 10 |
| -rw-r--r-- | llvm/lib/Target/X86/X86InstrFragmentsSIMD.td | 6 |
| -rw-r--r-- | llvm/lib/Target/X86/X86InstrInfo.td | 1 |
| -rw-r--r-- | llvm/lib/Target/X86/X86IntrinsicsInfo.h | 2 |
| -rw-r--r-- | llvm/lib/Target/X86/X86Subtarget.h | 2 |
8 files changed, 56 insertions, 24 deletions
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 9985b727c0e..2163efd30aa 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -25169,6 +25169,10 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
   case X86ISD::FNMADDS3_RND:       return "X86ISD::FNMADDS3_RND";
   case X86ISD::FMSUBS3_RND:        return "X86ISD::FMSUBS3_RND";
   case X86ISD::FNMSUBS3_RND:       return "X86ISD::FNMSUBS3_RND";
+  case X86ISD::FMADD4S:            return "X86ISD::FMADD4S";
+  case X86ISD::FNMADD4S:           return "X86ISD::FNMADD4S";
+  case X86ISD::FMSUB4S:            return "X86ISD::FMSUB4S";
+  case X86ISD::FNMSUB4S:           return "X86ISD::FNMSUB4S";
   case X86ISD::VPMADD52H:          return "X86ISD::VPMADD52H";
   case X86ISD::VPMADD52L:          return "X86ISD::VPMADD52L";
   case X86ISD::VRNDSCALE:          return "X86ISD::VRNDSCALE";
@@ -35724,6 +35728,13 @@ static SDValue combineFMA(SDNode *N, SelectionDAG &DAG,
     case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADDS3_RND; break;
     case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUBS3_RND; break;
     }
+  } else if (N->getOpcode() == X86ISD::FMADD4S) {
+    switch (NewOpcode) {
+    case ISD::FMA:       NewOpcode = X86ISD::FMADD4S; break;
+    case X86ISD::FMSUB:  NewOpcode = X86ISD::FMSUB4S; break;
+    case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADD4S; break;
+    case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUB4S; break;
+    }
   } else {
     llvm_unreachable("Unexpected opcode!");
   }
@@ -37092,6 +37103,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
   case X86ISD::FMADDS3_RND:
   case X86ISD::FMADDS1:
   case X86ISD::FMADDS3:
+  case X86ISD::FMADD4S:
   case ISD::FMA:            return combineFMA(N, DAG, Subtarget);
   case X86ISD::FMADDSUB_RND:
   case X86ISD::FMSUBADD_RND:
diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h
index 3c831001e9a..61b03be52a9 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -505,6 +505,9 @@ namespace llvm {
       FMADDSUB_RND,
       FMSUBADD_RND,
 
+      // FMA4 specific scalar intrinsics bits that zero the non-scalar bits.
+      FMADD4S, FNMADD4S, FMSUB4S, FNMSUB4S,
+
       // Scalar intrinsic FMA.
       FMADDS1, FMADDS3,
       FNMADDS1, FNMADDS3,
diff --git a/llvm/lib/Target/X86/X86InstrFMA.td b/llvm/lib/Target/X86/X86InstrFMA.td
index 753351b7dc4..dd6a61ddc3b 100644
--- a/llvm/lib/Target/X86/X86InstrFMA.td
+++ b/llvm/lib/Target/X86/X86InstrFMA.td
@@ -253,18 +253,18 @@ let Constraints = "$src1 = $dst", isCommutable = 1, isCodeGenOnly = 1,
     hasSideEffects = 0 in
 multiclass fma3s_rm_int<bits<8> opc, string OpcodeStr,
                         Operand memopr, RegisterClass RC> {
-  def r_Int : FMA3S<opc, MRMSrcReg, (outs RC:$dst),
-                    (ins RC:$src1, RC:$src2, RC:$src3),
-                    !strconcat(OpcodeStr,
-                               "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
-                    []>;
+  def r_Int : FMA3S_Int<opc, MRMSrcReg, (outs RC:$dst),
+                        (ins RC:$src1, RC:$src2, RC:$src3),
+                        !strconcat(OpcodeStr,
+                                   "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+                        []>;
 
   let mayLoad = 1 in
-  def m_Int : FMA3S<opc, MRMSrcMem, (outs RC:$dst),
-                    (ins RC:$src1, RC:$src2, memopr:$src3),
-                    !strconcat(OpcodeStr,
-                               "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
-                    []>;
+  def m_Int : FMA3S_Int<opc, MRMSrcMem, (outs RC:$dst),
+                        (ins RC:$src1, RC:$src2, memopr:$src3),
+                        !strconcat(OpcodeStr,
+                                   "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+                        []>;
 }
 
 // The FMA 213 form is created for lowering of scalar FMA intrinscis
@@ -385,20 +385,20 @@ let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in
 multiclass fma4s_int<bits<8> opc, string OpcodeStr, Operand memop,
                      ValueType VT, ComplexPattern mem_cpat, SDNode OpNode> {
 let isCodeGenOnly = 1 in {
-  def rr_Int : FMA4S<opc, MRMSrcRegOp4, (outs VR128:$dst),
+  def rr_Int : FMA4S_Int<opc, MRMSrcRegOp4, (outs VR128:$dst),
                (ins VR128:$src1, VR128:$src2, VR128:$src3),
                !strconcat(OpcodeStr,
                "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
                [(set VR128:$dst,
                  (VT (OpNode VR128:$src1, VR128:$src2, VR128:$src3)))]>, VEX_W,
                VEX_LIG;
-  def rm_Int : FMA4S<opc, MRMSrcMemOp4, (outs VR128:$dst),
+  def rm_Int : FMA4S_Int<opc, MRMSrcMemOp4, (outs VR128:$dst),
                (ins VR128:$src1, VR128:$src2, memop:$src3),
                !strconcat(OpcodeStr,
                "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
                [(set VR128:$dst, (VT (OpNode VR128:$src1, VR128:$src2,
                                   mem_cpat:$src3)))]>, VEX_W, VEX_LIG;
-  def mr_Int : FMA4S<opc, MRMSrcMem, (outs VR128:$dst),
+  def mr_Int : FMA4S_Int<opc, MRMSrcMem, (outs VR128:$dst),
                (ins VR128:$src1, memop:$src2, VR128:$src3),
                !strconcat(OpcodeStr,
                "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
@@ -406,7 +406,7 @@ let isCodeGenOnly = 1 in {
                  (VT (OpNode VR128:$src1, mem_cpat:$src2, VR128:$src3)))]>,
                VEX_LIG;
 let hasSideEffects = 0 in
-  def rr_Int_REV : FMA4S<opc, MRMSrcReg, (outs VR128:$dst),
+  def rr_Int_REV : FMA4S_Int<opc, MRMSrcReg, (outs VR128:$dst),
                (ins VR128:$src1, VR128:$src2, VR128:$src3),
                !strconcat(OpcodeStr,
                "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
@@ -476,18 +476,18 @@ let ExeDomain = SSEPackedSingle in {
   // Scalar Instructions
   defm VFMADDSS4  : fma4s<0x6A, "vfmaddss", FR32, f32mem, f32, X86Fmadd, loadf32>,
                     fma4s_int<0x6A, "vfmaddss", ssmem, v4f32, sse_load_f32,
-                              X86Fmadds1>;
+                              X86Fmadd4s>;
   defm VFMSUBSS4  : fma4s<0x6E, "vfmsubss", FR32, f32mem, f32, X86Fmsub, loadf32>,
                     fma4s_int<0x6E, "vfmsubss", ssmem, v4f32, sse_load_f32,
-                              X86Fmsubs1>;
+                              X86Fmsub4s>;
   defm VFNMADDSS4 : fma4s<0x7A, "vfnmaddss", FR32, f32mem, f32,
                           X86Fnmadd, loadf32>,
                     fma4s_int<0x7A, "vfnmaddss", ssmem, v4f32, sse_load_f32,
-                              X86Fnmadds1>;
+                              X86Fnmadd4s>;
   defm VFNMSUBSS4 : fma4s<0x7E, "vfnmsubss", FR32, f32mem, f32,
                           X86Fnmsub, loadf32>,
                     fma4s_int<0x7E, "vfnmsubss", ssmem, v4f32, sse_load_f32,
-                              X86Fnmsubs1>;
+                              X86Fnmsub4s>;
   // Packed Instructions
   defm VFMADDPS4    : fma4p<0x68, "vfmaddps", X86Fmadd, v4f32, v8f32,
                             loadv4f32, loadv8f32>;
@@ -507,18 +507,18 @@ let ExeDomain = SSEPackedDouble in {
   // Scalar Instructions
   defm VFMADDSD4  : fma4s<0x6B, "vfmaddsd", FR64, f64mem, f64, X86Fmadd, loadf64>,
                     fma4s_int<0x6B, "vfmaddsd", sdmem, v2f64, sse_load_f64,
-                              X86Fmadds1>;
+                              X86Fmadd4s>;
   defm VFMSUBSD4  : fma4s<0x6F, "vfmsubsd", FR64, f64mem, f64, X86Fmsub, loadf64>,
                     fma4s_int<0x6F, "vfmsubsd", sdmem, v2f64, sse_load_f64,
-                              X86Fmsubs1>;
+                              X86Fmsub4s>;
   defm VFNMADDSD4 : fma4s<0x7B, "vfnmaddsd", FR64, f64mem, f64,
                           X86Fnmadd, loadf64>,
                     fma4s_int<0x7B, "vfnmaddsd", sdmem, v2f64, sse_load_f64,
-                              X86Fnmadds1>;
+                              X86Fnmadd4s>;
   defm VFNMSUBSD4 : fma4s<0x7F, "vfnmsubsd", FR64, f64mem, f64,
                           X86Fnmsub, loadf64>,
                     fma4s_int<0x7F, "vfnmsubsd", sdmem, v2f64, sse_load_f64,
-                              X86Fnmsubs1>;
+                              X86Fnmsub4s>;
   // Packed Instructions
   defm VFMADDPD4    : fma4p<0x69, "vfmaddpd", X86Fmadd, v2f64, v4f64,
                             loadv2f64, loadv4f64>;
diff --git a/llvm/lib/Target/X86/X86InstrFormats.td b/llvm/lib/Target/X86/X86InstrFormats.td
index 371b5046ff3..2a6ed02fada 100644
--- a/llvm/lib/Target/X86/X86InstrFormats.td
+++ b/llvm/lib/Target/X86/X86InstrFormats.td
@@ -862,10 +862,14 @@ class PCLMULIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
 class FMA3<bits<8> o, Format F, dag outs, dag ins, string asm,
            list<dag>pattern, InstrItinClass itin = NoItinerary>
       : I<o, F, outs, ins, asm, pattern, itin>, T8PD,
-        VEX_4V, FMASC, Requires<[HasFMA, NoVLX]>;
+        VEX_4V, FMASC, Requires<[HasFMA, NoFMA4, NoVLX]>;
 class FMA3S<bits<8> o, Format F, dag outs, dag ins, string asm,
             list<dag>pattern, InstrItinClass itin = NoItinerary>
       : I<o, F, outs, ins, asm, pattern, itin>, T8PD,
+        VEX_4V, FMASC, Requires<[HasFMA, NoFMA4, NoAVX512]>;
+class FMA3S_Int<bits<8> o, Format F, dag outs, dag ins, string asm,
+                list<dag>pattern, InstrItinClass itin = NoItinerary>
+      : I<o, F, outs, ins, asm, pattern, itin>, T8PD,
         VEX_4V, FMASC, Requires<[HasFMA, NoAVX512]>;
 
 // FMA4 Instruction Templates
@@ -877,6 +881,10 @@ class FMA4S<bits<8> o, Format F, dag outs, dag ins, string asm,
             list<dag>pattern, InstrItinClass itin = NoItinerary>
       : Ii8Reg<o, F, outs, ins, asm, pattern, itin>, TAPD,
         VEX_4V, FMASC, Requires<[HasFMA4, NoAVX512]>;
+class FMA4S_Int<bits<8> o, Format F, dag outs, dag ins, string asm,
+                list<dag>pattern, InstrItinClass itin = NoItinerary>
+      : Ii8Reg<o, F, outs, ins, asm, pattern, itin>, TAPD,
+        VEX_4V, FMASC, Requires<[HasFMA4]>;
 
 // XOP 2, 3 and 4 Operand Instruction Template
 class IXOP<bits<8> o, Format F, dag outs, dag ins, string asm,
diff --git a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td
index e29c6b19bfd..b013d66a21d 100644
--- a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td
+++ b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td
@@ -506,6 +506,12 @@ def X86FnmsubRnd    : SDNode<"X86ISD::FNMSUB_RND",    SDTFmaRound, [SDNPCommutat
 def X86FmaddsubRnd  : SDNode<"X86ISD::FMADDSUB_RND",  SDTFmaRound, [SDNPCommutative]>;
 def X86FmsubaddRnd  : SDNode<"X86ISD::FMSUBADD_RND",  SDTFmaRound, [SDNPCommutative]>;
 
+// Scalar FMA4 intrinsics which zero the non-scalar bits.
+def X86Fmadd4s  : SDNode<"X86ISD::FMADD4S",  SDTFPTernaryOp, [SDNPCommutative]>;
+def X86Fnmadd4s : SDNode<"X86ISD::FNMADD4S", SDTFPTernaryOp, [SDNPCommutative]>;
+def X86Fmsub4s  : SDNode<"X86ISD::FMSUB4S",  SDTFPTernaryOp, [SDNPCommutative]>;
+def X86Fnmsub4s : SDNode<"X86ISD::FNMSUB4S", SDTFPTernaryOp, [SDNPCommutative]>;
+
 // Scalar FMA intrinsics with passthru bits in operand 1.
 def X86Fmadds1  : SDNode<"X86ISD::FMADDS1",  SDTFPTernaryOp>;
 def X86Fnmadds1 : SDNode<"X86ISD::FNMADDS1", SDTFPTernaryOp>;
diff --git a/llvm/lib/Target/X86/X86InstrInfo.td b/llvm/lib/Target/X86/X86InstrInfo.td
index 926080347e7..97d3e6dfb44 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.td
+++ b/llvm/lib/Target/X86/X86InstrInfo.td
@@ -850,6 +850,7 @@ def NoVLX_Or_NoVPCLMULQDQ :
 def HasVPCLMULQDQ : Predicate<"Subtarget->hasVPCLMULQDQ()">;
 def HasFMA       : Predicate<"Subtarget->hasFMA()">;
 def HasFMA4      : Predicate<"Subtarget->hasFMA4()">;
+def NoFMA4       : Predicate<"!Subtarget->hasFMA4()">;
 def HasXOP       : Predicate<"Subtarget->hasXOP()">;
 def HasTBM       : Predicate<"Subtarget->hasTBM()">;
 def NoTBM        : Predicate<"!Subtarget->hasTBM()">;
diff --git a/llvm/lib/Target/X86/X86IntrinsicsInfo.h b/llvm/lib/Target/X86/X86IntrinsicsInfo.h
index d611f02a249..598994d07ad 100644
--- a/llvm/lib/Target/X86/X86IntrinsicsInfo.h
+++ b/llvm/lib/Target/X86/X86IntrinsicsInfo.h
@@ -1593,6 +1593,8 @@ static const IntrinsicData  IntrinsicsWithoutChain[] = {
   X86_INTRINSIC_DATA(fma_vfnmsub_ps_256,   INTR_TYPE_3OP, X86ISD::FNMSUB, 0),
   X86_INTRINSIC_DATA(fma_vfnmsub_sd,       INTR_TYPE_3OP, X86ISD::FNMSUBS1, 0),
   X86_INTRINSIC_DATA(fma_vfnmsub_ss,       INTR_TYPE_3OP, X86ISD::FNMSUBS1, 0),
+  X86_INTRINSIC_DATA(fma4_vfmadd_sd,       INTR_TYPE_3OP, X86ISD::FMADD4S, 0),
+  X86_INTRINSIC_DATA(fma4_vfmadd_ss,       INTR_TYPE_3OP, X86ISD::FMADD4S, 0),
   X86_INTRINSIC_DATA(sse_cmp_ps,        INTR_TYPE_3OP, X86ISD::CMPP, 0),
   X86_INTRINSIC_DATA(sse_comieq_ss,     COMI, X86ISD::COMI, ISD::SETEQ),
   X86_INTRINSIC_DATA(sse_comige_ss,     COMI, X86ISD::COMI, ISD::SETGE),
diff --git a/llvm/lib/Target/X86/X86Subtarget.h b/llvm/lib/Target/X86/X86Subtarget.h
index faf88aba177..50e1a742a0f 100644
--- a/llvm/lib/Target/X86/X86Subtarget.h
+++ b/llvm/lib/Target/X86/X86Subtarget.h
@@ -482,7 +482,7 @@ public:
   bool hasVPCLMULQDQ() const { return HasVPCLMULQDQ; }
   // Prefer FMA4 to FMA - its better for commutation/memory folding and
   // has equal or better performance on all supported targets.
-  bool hasFMA() const { return HasFMA && !HasFMA4; }
+  bool hasFMA() const { return HasFMA; }
   bool hasFMA4() const { return HasFMA4; }
   bool hasAnyFMA() const { return hasFMA() || hasFMA4(); }
   bool hasXOP() const { return HasXOP; }

