path: root/llvm/lib
author    Craig Topper <craig.topper@intel.com>  2017-11-25 18:32:43 +0000
committer Craig Topper <craig.topper@intel.com>  2017-11-25 18:32:43 +0000
commit e485631cd148e18701254634a3003e6bb5797eb2 (patch)
tree   6e7e208068025363a98e57a88f3a7ede7f48069e /llvm/lib
parent ea37e201ec2f9c3d8b2c9bb37ff48cacdd992f55 (diff)
[X86] Add separate intrinsics for scalar FMA4 instructions.
Summary:
These instructions zero the non-scalar part of the lower 128 bits, which makes them different from the FMA3 instructions, which pass through the non-scalar part of the lower 128 bits.

I've only added fmadd because we should be able to derive all other variants using operand negation in the intrinsic header, like we do for AVX512.

I think there are still some missed negate folding opportunities with the FMA4 instructions in light of this behavior difference that I hadn't noticed before.

I've split the tests so that we can use different intrinsics for scalar testing between the two. I just copied the tests, split the RUN lines, and changed out the scalar intrinsics. fma4-fneg-combine.ll is a new test to make sure we negate the fma4 intrinsics correctly, though there are a couple of TODOs in it.

Reviewers: RKSimon, spatel

Reviewed By: RKSimon

Subscribers: llvm-commits

Differential Revision: https://reviews.llvm.org/D39851

llvm-svn: 318984
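A conceptual sketch of the behavior difference the summary describes, using plain C++ in place of the real vector types and intrinsics (illustration only, shown for 4 x float):

  struct V4 { float e[4]; };

  // FMA3 scalar form: the non-scalar elements of the lower 128 bits
  // pass through from the first operand.
  V4 fma3_ss(V4 a, V4 b, V4 c) {
    V4 r = a;                            // e[1..3] copied from a
    r.e[0] = a.e[0] * b.e[0] + c.e[0];
    return r;
  }

  // FMA4 scalar form: the non-scalar elements of the lower 128 bits
  // are zeroed, which is why it needs separate intrinsics and ISD nodes.
  V4 fma4_ss(V4 a, V4 b, V4 c) {
    V4 r = {{0.0f, 0.0f, 0.0f, 0.0f}};   // e[1..3] zeroed
    r.e[0] = a.e[0] * b.e[0] + c.e[0];
    return r;
  }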
Diffstat (limited to 'llvm/lib')
-rw-r--r--  llvm/lib/Target/X86/X86ISelLowering.cpp      | 12
-rw-r--r--  llvm/lib/Target/X86/X86ISelLowering.h        |  3
-rw-r--r--  llvm/lib/Target/X86/X86InstrFMA.td           | 44
-rw-r--r--  llvm/lib/Target/X86/X86InstrFormats.td       | 10
-rw-r--r--  llvm/lib/Target/X86/X86InstrFragmentsSIMD.td |  6
-rw-r--r--  llvm/lib/Target/X86/X86InstrInfo.td          |  1
-rw-r--r--  llvm/lib/Target/X86/X86IntrinsicsInfo.h      |  2
-rw-r--r--  llvm/lib/Target/X86/X86Subtarget.h           |  2
8 files changed, 56 insertions, 24 deletions
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 9985b727c0e..2163efd30aa 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -25169,6 +25169,10 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::FNMADDS3_RND: return "X86ISD::FNMADDS3_RND";
case X86ISD::FMSUBS3_RND: return "X86ISD::FMSUBS3_RND";
case X86ISD::FNMSUBS3_RND: return "X86ISD::FNMSUBS3_RND";
+ case X86ISD::FMADD4S: return "X86ISD::FMADD4S";
+ case X86ISD::FNMADD4S: return "X86ISD::FNMADD4S";
+ case X86ISD::FMSUB4S: return "X86ISD::FMSUB4S";
+ case X86ISD::FNMSUB4S: return "X86ISD::FNMSUB4S";
case X86ISD::VPMADD52H: return "X86ISD::VPMADD52H";
case X86ISD::VPMADD52L: return "X86ISD::VPMADD52L";
case X86ISD::VRNDSCALE: return "X86ISD::VRNDSCALE";
@@ -35724,6 +35728,13 @@ static SDValue combineFMA(SDNode *N, SelectionDAG &DAG,
case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADDS3_RND; break;
case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUBS3_RND; break;
}
+ } else if (N->getOpcode() == X86ISD::FMADD4S) {
+ switch (NewOpcode) {
+ case ISD::FMA: NewOpcode = X86ISD::FMADD4S; break;
+ case X86ISD::FMSUB: NewOpcode = X86ISD::FMSUB4S; break;
+ case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADD4S; break;
+ case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUB4S; break;
+ }
} else {
llvm_unreachable("Unexpected opcode!");
}
@@ -37092,6 +37103,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
case X86ISD::FMADDS3_RND:
case X86ISD::FMADDS1:
case X86ISD::FMADDS3:
+ case X86ISD::FMADD4S:
case ISD::FMA: return combineFMA(N, DAG, Subtarget);
case X86ISD::FMADDSUB_RND:
case X86ISD::FMSUBADD_RND:
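The combineFMA change above folds negated operands into the new 4S opcodes. The same algebra is what lets the intrinsic header derive the other scalar FMA4 variants from fmadd alone, as the summary notes. A minimal sketch of that derivation, with fmadd4s as a hypothetical stand-in for the hardware operation on the scalar element:

  float fmadd4s(float a, float b, float c)  { return a * b + c; }
  // Derived variants, by operand negation:
  float fmsub4s(float a, float b, float c)  { return fmadd4s(a, b, -c);  } //  a*b - c
  float fnmadd4s(float a, float b, float c) { return fmadd4s(-a, b, c);  } // -a*b + c
  float fnmsub4s(float a, float b, float c) { return fmadd4s(-a, b, -c); } // -a*b - c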
diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h
index 3c831001e9a..61b03be52a9 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -505,6 +505,9 @@ namespace llvm {
FMADDSUB_RND,
FMSUBADD_RND,
+ // FMA4-specific scalar intrinsic nodes that zero the non-scalar bits.
+ FMADD4S, FNMADD4S, FMSUB4S, FNMSUB4S,
+
// Scalar intrinsic FMA.
FMADDS1, FMADDS3,
FNMADDS1, FNMADDS3,
diff --git a/llvm/lib/Target/X86/X86InstrFMA.td b/llvm/lib/Target/X86/X86InstrFMA.td
index 753351b7dc4..dd6a61ddc3b 100644
--- a/llvm/lib/Target/X86/X86InstrFMA.td
+++ b/llvm/lib/Target/X86/X86InstrFMA.td
@@ -253,18 +253,18 @@ let Constraints = "$src1 = $dst", isCommutable = 1, isCodeGenOnly = 1,
hasSideEffects = 0 in
multiclass fma3s_rm_int<bits<8> opc, string OpcodeStr,
Operand memopr, RegisterClass RC> {
- def r_Int : FMA3S<opc, MRMSrcReg, (outs RC:$dst),
- (ins RC:$src1, RC:$src2, RC:$src3),
- !strconcat(OpcodeStr,
- "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
- []>;
+ def r_Int : FMA3S_Int<opc, MRMSrcReg, (outs RC:$dst),
+ (ins RC:$src1, RC:$src2, RC:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ []>;
let mayLoad = 1 in
- def m_Int : FMA3S<opc, MRMSrcMem, (outs RC:$dst),
- (ins RC:$src1, RC:$src2, memopr:$src3),
- !strconcat(OpcodeStr,
- "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
- []>;
+ def m_Int : FMA3S_Int<opc, MRMSrcMem, (outs RC:$dst),
+ (ins RC:$src1, RC:$src2, memopr:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ []>;
}
// The FMA 213 form is created for lowering of scalar FMA intrinsics
@@ -385,20 +385,20 @@ let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in
multiclass fma4s_int<bits<8> opc, string OpcodeStr, Operand memop,
ValueType VT, ComplexPattern mem_cpat, SDNode OpNode> {
let isCodeGenOnly = 1 in {
- def rr_Int : FMA4S<opc, MRMSrcRegOp4, (outs VR128:$dst),
+ def rr_Int : FMA4S_Int<opc, MRMSrcRegOp4, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2, VR128:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
[(set VR128:$dst,
(VT (OpNode VR128:$src1, VR128:$src2, VR128:$src3)))]>, VEX_W,
VEX_LIG;
- def rm_Int : FMA4S<opc, MRMSrcMemOp4, (outs VR128:$dst),
+ def rm_Int : FMA4S_Int<opc, MRMSrcMemOp4, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2, memop:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
[(set VR128:$dst, (VT (OpNode VR128:$src1, VR128:$src2,
mem_cpat:$src3)))]>, VEX_W, VEX_LIG;
- def mr_Int : FMA4S<opc, MRMSrcMem, (outs VR128:$dst),
+ def mr_Int : FMA4S_Int<opc, MRMSrcMem, (outs VR128:$dst),
(ins VR128:$src1, memop:$src2, VR128:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
@@ -406,7 +406,7 @@ let isCodeGenOnly = 1 in {
(VT (OpNode VR128:$src1, mem_cpat:$src2, VR128:$src3)))]>,
VEX_LIG;
let hasSideEffects = 0 in
- def rr_Int_REV : FMA4S<opc, MRMSrcReg, (outs VR128:$dst),
+ def rr_Int_REV : FMA4S_Int<opc, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2, VR128:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
@@ -476,18 +476,18 @@ let ExeDomain = SSEPackedSingle in {
// Scalar Instructions
defm VFMADDSS4 : fma4s<0x6A, "vfmaddss", FR32, f32mem, f32, X86Fmadd, loadf32>,
fma4s_int<0x6A, "vfmaddss", ssmem, v4f32, sse_load_f32,
- X86Fmadds1>;
+ X86Fmadd4s>;
defm VFMSUBSS4 : fma4s<0x6E, "vfmsubss", FR32, f32mem, f32, X86Fmsub, loadf32>,
fma4s_int<0x6E, "vfmsubss", ssmem, v4f32, sse_load_f32,
- X86Fmsubs1>;
+ X86Fmsub4s>;
defm VFNMADDSS4 : fma4s<0x7A, "vfnmaddss", FR32, f32mem, f32,
X86Fnmadd, loadf32>,
fma4s_int<0x7A, "vfnmaddss", ssmem, v4f32, sse_load_f32,
- X86Fnmadds1>;
+ X86Fnmadd4s>;
defm VFNMSUBSS4 : fma4s<0x7E, "vfnmsubss", FR32, f32mem, f32,
X86Fnmsub, loadf32>,
fma4s_int<0x7E, "vfnmsubss", ssmem, v4f32, sse_load_f32,
- X86Fnmsubs1>;
+ X86Fnmsub4s>;
// Packed Instructions
defm VFMADDPS4 : fma4p<0x68, "vfmaddps", X86Fmadd, v4f32, v8f32,
loadv4f32, loadv8f32>;
@@ -507,18 +507,18 @@ let ExeDomain = SSEPackedDouble in {
// Scalar Instructions
defm VFMADDSD4 : fma4s<0x6B, "vfmaddsd", FR64, f64mem, f64, X86Fmadd, loadf64>,
fma4s_int<0x6B, "vfmaddsd", sdmem, v2f64, sse_load_f64,
- X86Fmadds1>;
+ X86Fmadd4s>;
defm VFMSUBSD4 : fma4s<0x6F, "vfmsubsd", FR64, f64mem, f64, X86Fmsub, loadf64>,
fma4s_int<0x6F, "vfmsubsd", sdmem, v2f64, sse_load_f64,
- X86Fmsubs1>;
+ X86Fmsub4s>;
defm VFNMADDSD4 : fma4s<0x7B, "vfnmaddsd", FR64, f64mem, f64,
X86Fnmadd, loadf64>,
fma4s_int<0x7B, "vfnmaddsd", sdmem, v2f64, sse_load_f64,
- X86Fnmadds1>;
+ X86Fnmadd4s>;
defm VFNMSUBSD4 : fma4s<0x7F, "vfnmsubsd", FR64, f64mem, f64,
X86Fnmsub, loadf64>,
fma4s_int<0x7F, "vfnmsubsd", sdmem, v2f64, sse_load_f64,
- X86Fnmsubs1>;
+ X86Fnmsub4s>;
// Packed Instructions
defm VFMADDPD4 : fma4p<0x69, "vfmaddpd", X86Fmadd, v2f64, v4f64,
loadv2f64, loadv4f64>;
diff --git a/llvm/lib/Target/X86/X86InstrFormats.td b/llvm/lib/Target/X86/X86InstrFormats.td
index 371b5046ff3..2a6ed02fada 100644
--- a/llvm/lib/Target/X86/X86InstrFormats.td
+++ b/llvm/lib/Target/X86/X86InstrFormats.td
@@ -862,10 +862,14 @@ class PCLMULIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
class FMA3<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag>pattern, InstrItinClass itin = NoItinerary>
: I<o, F, outs, ins, asm, pattern, itin>, T8PD,
- VEX_4V, FMASC, Requires<[HasFMA, NoVLX]>;
+ VEX_4V, FMASC, Requires<[HasFMA, NoFMA4, NoVLX]>;
class FMA3S<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag>pattern, InstrItinClass itin = NoItinerary>
: I<o, F, outs, ins, asm, pattern, itin>, T8PD,
+ VEX_4V, FMASC, Requires<[HasFMA, NoFMA4, NoAVX512]>;
+class FMA3S_Int<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag>pattern, InstrItinClass itin = NoItinerary>
+ : I<o, F, outs, ins, asm, pattern, itin>, T8PD,
VEX_4V, FMASC, Requires<[HasFMA, NoAVX512]>;
// FMA4 Instruction Templates
@@ -877,6 +881,10 @@ class FMA4S<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag>pattern, InstrItinClass itin = NoItinerary>
: Ii8Reg<o, F, outs, ins, asm, pattern, itin>, TAPD,
VEX_4V, FMASC, Requires<[HasFMA4, NoAVX512]>;
+class FMA4S_Int<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag>pattern, InstrItinClass itin = NoItinerary>
+ : Ii8Reg<o, F, outs, ins, asm, pattern, itin>, TAPD,
+ VEX_4V, FMASC, Requires<[HasFMA4]>;
// XOP 2, 3 and 4 Operand Instruction Template
class IXOP<bits<8> o, Format F, dag outs, dag ins, string asm,
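The template split above encodes which instruction family is eligible when. A rough restatement of the Requires<> predicate lists as C++ feature checks (a sketch only; the rationale in the comments is inferred from the predicates, not stated in the patch):

  struct Features { bool FMA, FMA4, AVX512, VLX; };
  bool fma3Usable(Features f)     { return f.FMA && !f.FMA4 && !f.VLX; }
  bool fma3sUsable(Features f)    { return f.FMA && !f.FMA4 && !f.AVX512; }
  // FMA3S_Int omits NoFMA4: FMA4 cannot replicate the FMA3 intrinsics'
  // passthrough semantics, so the intrinsic form stays selectable.
  bool fma3sIntUsable(Features f) { return f.FMA && !f.AVX512; }
  bool fma4sUsable(Features f)    { return f.FMA4 && !f.AVX512; }
  // FMA4S_Int requires only HasFMA4, so the zeroing intrinsic form
  // remains available even alongside AVX512.
  bool fma4sIntUsable(Features f) { return f.FMA4; }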
diff --git a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td
index e29c6b19bfd..b013d66a21d 100644
--- a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td
+++ b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td
@@ -506,6 +506,12 @@ def X86FnmsubRnd : SDNode<"X86ISD::FNMSUB_RND", SDTFmaRound, [SDNPCommutat
def X86FmaddsubRnd : SDNode<"X86ISD::FMADDSUB_RND", SDTFmaRound, [SDNPCommutative]>;
def X86FmsubaddRnd : SDNode<"X86ISD::FMSUBADD_RND", SDTFmaRound, [SDNPCommutative]>;
+// Scalar FMA4 intrinsics which zero the non-scalar bits.
+def X86Fmadd4s : SDNode<"X86ISD::FMADD4S", SDTFPTernaryOp, [SDNPCommutative]>;
+def X86Fnmadd4s : SDNode<"X86ISD::FNMADD4S", SDTFPTernaryOp, [SDNPCommutative]>;
+def X86Fmsub4s : SDNode<"X86ISD::FMSUB4S", SDTFPTernaryOp, [SDNPCommutative]>;
+def X86Fnmsub4s : SDNode<"X86ISD::FNMSUB4S", SDTFPTernaryOp, [SDNPCommutative]>;
+
// Scalar FMA intrinsics with passthru bits in operand 1.
def X86Fmadds1 : SDNode<"X86ISD::FMADDS1", SDTFPTernaryOp>;
def X86Fnmadds1 : SDNode<"X86ISD::FNMADDS1", SDTFPTernaryOp>;
diff --git a/llvm/lib/Target/X86/X86InstrInfo.td b/llvm/lib/Target/X86/X86InstrInfo.td
index 926080347e7..97d3e6dfb44 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.td
+++ b/llvm/lib/Target/X86/X86InstrInfo.td
@@ -850,6 +850,7 @@ def NoVLX_Or_NoVPCLMULQDQ :
def HasVPCLMULQDQ : Predicate<"Subtarget->hasVPCLMULQDQ()">;
def HasFMA : Predicate<"Subtarget->hasFMA()">;
def HasFMA4 : Predicate<"Subtarget->hasFMA4()">;
+def NoFMA4 : Predicate<"!Subtarget->hasFMA4()">;
def HasXOP : Predicate<"Subtarget->hasXOP()">;
def HasTBM : Predicate<"Subtarget->hasTBM()">;
def NoTBM : Predicate<"!Subtarget->hasTBM()">;
diff --git a/llvm/lib/Target/X86/X86IntrinsicsInfo.h b/llvm/lib/Target/X86/X86IntrinsicsInfo.h
index d611f02a249..598994d07ad 100644
--- a/llvm/lib/Target/X86/X86IntrinsicsInfo.h
+++ b/llvm/lib/Target/X86/X86IntrinsicsInfo.h
@@ -1593,6 +1593,8 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(fma_vfnmsub_ps_256, INTR_TYPE_3OP, X86ISD::FNMSUB, 0),
X86_INTRINSIC_DATA(fma_vfnmsub_sd, INTR_TYPE_3OP, X86ISD::FNMSUBS1, 0),
X86_INTRINSIC_DATA(fma_vfnmsub_ss, INTR_TYPE_3OP, X86ISD::FNMSUBS1, 0),
+ X86_INTRINSIC_DATA(fma4_vfmadd_sd, INTR_TYPE_3OP, X86ISD::FMADD4S, 0),
+ X86_INTRINSIC_DATA(fma4_vfmadd_ss, INTR_TYPE_3OP, X86ISD::FMADD4S, 0),
X86_INTRINSIC_DATA(sse_cmp_ps, INTR_TYPE_3OP, X86ISD::CMPP, 0),
X86_INTRINSIC_DATA(sse_comieq_ss, COMI, X86ISD::COMI, ISD::SETEQ),
X86_INTRINSIC_DATA(sse_comige_ss, COMI, X86ISD::COMI, ISD::SETGE),
diff --git a/llvm/lib/Target/X86/X86Subtarget.h b/llvm/lib/Target/X86/X86Subtarget.h
index faf88aba177..50e1a742a0f 100644
--- a/llvm/lib/Target/X86/X86Subtarget.h
+++ b/llvm/lib/Target/X86/X86Subtarget.h
@@ -482,7 +482,7 @@ public:
bool hasVPCLMULQDQ() const { return HasVPCLMULQDQ; }
// Prefer FMA4 to FMA - it's better for commutation/memory folding and
// has equal or better performance on all supported targets.
- bool hasFMA() const { return HasFMA && !HasFMA4; }
+ bool hasFMA() const { return HasFMA; }
bool hasFMA4() const { return HasFMA4; }
bool hasAnyFMA() const { return hasFMA() || hasFMA4(); }
bool hasXOP() const { return HasXOP; }
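The subtarget change is the flip side of the new NoFMA4 predicate: hasFMA() now reports the feature bit directly, and the FMA4-over-FMA3 preference moves into the pattern predicates. A before/after sketch:

  // Before: the query itself hid FMA whenever FMA4 was present.
  bool hasFMA_old(bool HasFMA, bool HasFMA4) { return HasFMA && !HasFMA4; }
  // After: the query is plain; the per-pattern NoFMA4 predicate decides.
  bool hasFMA_new(bool HasFMA, bool /*HasFMA4*/) { return HasFMA; }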