author     Craig Topper <craig.topper@intel.com>    2018-07-06 07:14:41 +0000
committer  Craig Topper <craig.topper@intel.com>    2018-07-06 07:14:41 +0000
commit     c60e1807b3f099d67a3ed7ceb77dd4257c6d61b6
tree       36e8e27b9cd56b8c98100d06f58ff23d41210984
parent     7525edc89078f138c9dad6bdb3c5eed31f65c017
[X86] Remove FMA4 scalar intrinsics. Use llvm.fma intrinsic instead.
The intrinsics can be implemented with an f32/f64 llvm.fma intrinsic and an insert into a zero vector.
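For illustration, this is roughly the IR the auto-upgrader (see the AutoUpgrade.cpp hunk below) now emits for the old ss intrinsic; the value names here are made up for readability:

    ; Before: call to the removed FMA4 intrinsic
    %res = call <4 x float> @llvm.x86.fma4.vfmadd.ss(<4 x float> %a, <4 x float> %b, <4 x float> %c)

    ; After: scalar llvm.fma on element 0, inserted into a zero vector
    %a0 = extractelement <4 x float> %a, i64 0
    %b0 = extractelement <4 x float> %b, i64 0
    %c0 = extractelement <4 x float> %c, i64 0
    %fma = call float @llvm.fma.f32(float %a0, float %b0, float %c0)
    %res = insertelement <4 x float> zeroinitializer, float %fma, i64 0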
There are a couple of regressions here due to SelectionDAG not being able to pull an fneg through an extract_vector_elt. I'm not too worried about this, though, as InstCombine should be able to do it before we get to SelectionDAG.
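Concretely, after the upgrade a negated operand reaches isel as a vector-wide fsub whose lane 0 is then extracted (a hand-written sketch; %a0 and %b0 stand for the other extracted operands):

    %neg = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %c
    %c0 = extractelement <4 x float> %neg, i64 0
    %r = call float @llvm.fma.f32(float %a0, float %b0, float %c0)

Until the fneg is rewritten as a scalar negation of %c0, SelectionDAG keeps it as a separate vxorps instead of selecting vfmsubss, which is exactly what the updated fma4-fneg-combine.ll checks below show.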
llvm-svn: 336416
-rw-r--r--  llvm/include/llvm/IR/IntrinsicsX86.td         |  9
-rw-r--r--  llvm/lib/IR/AutoUpgrade.cpp                   | 16
-rw-r--r--  llvm/lib/Target/X86/X86ISelLowering.cpp       | 16
-rw-r--r--  llvm/lib/Target/X86/X86ISelLowering.h         |  3
-rw-r--r--  llvm/lib/Target/X86/X86InstrFMA.td            | 89
-rw-r--r--  llvm/lib/Target/X86/X86InstrFragmentsSIMD.td  |  6
-rw-r--r--  llvm/lib/Target/X86/X86IntrinsicsInfo.h       |  2
-rw-r--r--  llvm/test/CodeGen/X86/fma4-commute-x86.ll     | 12
-rw-r--r--  llvm/test/CodeGen/X86/fma4-fneg-combine.ll    | 28
-rw-r--r--  llvm/test/CodeGen/X86/fma4-scalar-memfold.ll  |  4
10 files changed, 105 insertions, 80 deletions
diff --git a/llvm/include/llvm/IR/IntrinsicsX86.td b/llvm/include/llvm/IR/IntrinsicsX86.td
index d019bed68db..aa25693a0a2 100644
--- a/llvm/include/llvm/IR/IntrinsicsX86.td
+++ b/llvm/include/llvm/IR/IntrinsicsX86.td
@@ -1903,15 +1903,6 @@ let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
 // FMA3 and FMA4
 let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
-  def int_x86_fma4_vfmadd_ss : GCCBuiltin<"__builtin_ia32_vfmaddss">,
-      Intrinsic<[llvm_v4f32_ty],
-                [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty],
-                [IntrNoMem]>;
-  def int_x86_fma4_vfmadd_sd : GCCBuiltin<"__builtin_ia32_vfmaddsd">,
-      Intrinsic<[llvm_v2f64_ty],
-                [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty],
-                [IntrNoMem]>;
-
   def int_x86_avx512_vfmadd_pd_512 :
       Intrinsic<[llvm_v8f64_ty],
                 [llvm_v8f64_ty, llvm_v8f64_ty, llvm_v8f64_ty, llvm_i32_ty],
diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp
index ac224b38d26..9c9e5570184 100644
--- a/llvm/lib/IR/AutoUpgrade.cpp
+++ b/llvm/lib/IR/AutoUpgrade.cpp
@@ -74,6 +74,7 @@ static bool ShouldUpgradeX86Intrinsic(Function *F, StringRef Name) {
   if (Name=="ssse3.pabs.b.128" || // Added in 6.0
       Name=="ssse3.pabs.w.128" || // Added in 6.0
       Name=="ssse3.pabs.d.128" || // Added in 6.0
+      Name.startswith("fma4.vfmadd.s") || // Added in 7.0
      Name.startswith("fma.vfmadd.") || // Added in 7.0
      Name.startswith("fma.vfmsub.") || // Added in 7.0
      Name.startswith("fma.vfmaddsub.") || // Added in 7.0
@@ -2790,6 +2791,21 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
       if (IsScalar)
         Rep = Builder.CreateInsertElement(CI->getArgOperand(0), Rep,
                                           (uint64_t)0);
+    } else if (IsX86 && Name.startswith("fma4.vfmadd.s")) {
+      Value *Ops[] = { CI->getArgOperand(0), CI->getArgOperand(1),
+                       CI->getArgOperand(2) };
+
+      Ops[0] = Builder.CreateExtractElement(Ops[0], (uint64_t)0);
+      Ops[1] = Builder.CreateExtractElement(Ops[1], (uint64_t)0);
+      Ops[2] = Builder.CreateExtractElement(Ops[2], (uint64_t)0);
+
+      Rep = Builder.CreateCall(Intrinsic::getDeclaration(CI->getModule(),
+                                                         Intrinsic::fma,
+                                                         Ops[0]->getType()),
+                               Ops);
+
+      Rep = Builder.CreateInsertElement(Constant::getNullValue(CI->getType()),
+                                        Rep, (uint64_t)0);
     } else if (IsX86 && (Name.startswith("avx512.mask.vfmadd.p") ||
                          Name.startswith("avx512.mask.vfnmadd.p") ||
                          Name.startswith("avx512.mask.vfnmsub.p") ||
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index dc16d06e037..8174215860b 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -26099,10 +26099,6 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
   case X86ISD::FNMADDS3_RND:       return "X86ISD::FNMADDS3_RND";
   case X86ISD::FMSUBS3_RND:        return "X86ISD::FMSUBS3_RND";
   case X86ISD::FNMSUBS3_RND:       return "X86ISD::FNMSUBS3_RND";
-  case X86ISD::FMADD4S:            return "X86ISD::FMADD4S";
-  case X86ISD::FNMADD4S:           return "X86ISD::FNMADD4S";
-  case X86ISD::FMSUB4S:            return "X86ISD::FMSUB4S";
-  case X86ISD::FNMSUB4S:           return "X86ISD::FNMSUB4S";
   case X86ISD::VPMADD52H:          return "X86ISD::VPMADD52H";
   case X86ISD::VPMADD52L:          return "X86ISD::VPMADD52L";
   case X86ISD::VRNDSCALE:          return "X86ISD::VRNDSCALE";
@@ -37709,28 +37705,24 @@ static unsigned negateFMAOpcode(unsigned Opcode, bool NegMul, bool NegAcc) {
     case X86ISD::FMADDS3:       Opcode = X86ISD::FNMADDS3;     break;
     case X86ISD::FMADDS1_RND:   Opcode = X86ISD::FNMADDS1_RND; break;
     case X86ISD::FMADDS3_RND:   Opcode = X86ISD::FNMADDS3_RND; break;
-    case X86ISD::FMADD4S:       Opcode = X86ISD::FNMADD4S;     break;
     case X86ISD::FMSUB:         Opcode = X86ISD::FNMSUB;       break;
     case X86ISD::FMSUB_RND:     Opcode = X86ISD::FNMSUB_RND;   break;
     case X86ISD::FMSUBS1:       Opcode = X86ISD::FNMSUBS1;     break;
     case X86ISD::FMSUBS3:       Opcode = X86ISD::FNMSUBS3;     break;
     case X86ISD::FMSUBS1_RND:   Opcode = X86ISD::FNMSUBS1_RND; break;
     case X86ISD::FMSUBS3_RND:   Opcode = X86ISD::FNMSUBS3_RND; break;
-    case X86ISD::FMSUB4S:       Opcode = X86ISD::FNMSUB4S;     break;
     case X86ISD::FNMADD:        Opcode = ISD::FMA;             break;
     case X86ISD::FNMADD_RND:    Opcode = X86ISD::FMADD_RND;    break;
     case X86ISD::FNMADDS1:      Opcode = X86ISD::FMADDS1;      break;
     case X86ISD::FNMADDS3:      Opcode = X86ISD::FMADDS3;      break;
     case X86ISD::FNMADDS1_RND:  Opcode = X86ISD::FMADDS1_RND;  break;
     case X86ISD::FNMADDS3_RND:  Opcode = X86ISD::FMADDS3_RND;  break;
-    case X86ISD::FNMADD4S:      Opcode = X86ISD::FMADD4S;      break;
     case X86ISD::FNMSUB:        Opcode = X86ISD::FMSUB;        break;
     case X86ISD::FNMSUB_RND:    Opcode = X86ISD::FMSUB_RND;    break;
     case X86ISD::FNMSUBS1:      Opcode = X86ISD::FMSUBS1;      break;
     case X86ISD::FNMSUBS3:      Opcode = X86ISD::FMSUBS3;      break;
     case X86ISD::FNMSUBS1_RND:  Opcode = X86ISD::FMSUBS1_RND;  break;
     case X86ISD::FNMSUBS3_RND:  Opcode = X86ISD::FMSUBS3_RND;  break;
-    case X86ISD::FNMSUB4S:      Opcode = X86ISD::FMSUB4S;      break;
     }
   }
@@ -37743,28 +37735,24 @@ static unsigned negateFMAOpcode(unsigned Opcode, bool NegMul, bool NegAcc) {
     case X86ISD::FMADDS3:       Opcode = X86ISD::FMSUBS3;      break;
     case X86ISD::FMADDS1_RND:   Opcode = X86ISD::FMSUBS1_RND;  break;
     case X86ISD::FMADDS3_RND:   Opcode = X86ISD::FMSUBS3_RND;  break;
-    case X86ISD::FMADD4S:       Opcode = X86ISD::FMSUB4S;      break;
     case X86ISD::FMSUB:         Opcode = ISD::FMA;             break;
     case X86ISD::FMSUB_RND:     Opcode = X86ISD::FMADD_RND;    break;
     case X86ISD::FMSUBS1:       Opcode = X86ISD::FMADDS1;      break;
     case X86ISD::FMSUBS3:       Opcode = X86ISD::FMADDS3;      break;
     case X86ISD::FMSUBS1_RND:   Opcode = X86ISD::FMADDS1_RND;  break;
     case X86ISD::FMSUBS3_RND:   Opcode = X86ISD::FMADDS3_RND;  break;
-    case X86ISD::FMSUB4S:       Opcode = X86ISD::FMADD4S;      break;
     case X86ISD::FNMADD:        Opcode = X86ISD::FNMSUB;       break;
     case X86ISD::FNMADD_RND:    Opcode = X86ISD::FNMSUB_RND;   break;
     case X86ISD::FNMADDS1:      Opcode = X86ISD::FNMSUBS1;     break;
     case X86ISD::FNMADDS3:      Opcode = X86ISD::FNMSUBS3;     break;
     case X86ISD::FNMADDS1_RND:  Opcode = X86ISD::FNMSUBS1_RND; break;
     case X86ISD::FNMADDS3_RND:  Opcode = X86ISD::FNMSUBS3_RND; break;
-    case X86ISD::FNMADD4S:      Opcode = X86ISD::FNMSUB4S;     break;
     case X86ISD::FNMSUB:        Opcode = X86ISD::FNMADD;       break;
     case X86ISD::FNMSUB_RND:    Opcode = X86ISD::FNMADD_RND;   break;
     case X86ISD::FNMSUBS1:      Opcode = X86ISD::FNMADDS1;     break;
     case X86ISD::FNMSUBS3:      Opcode = X86ISD::FNMADDS3;     break;
     case X86ISD::FNMSUBS1_RND:  Opcode = X86ISD::FNMADDS1_RND; break;
     case X86ISD::FNMSUBS3_RND:  Opcode = X86ISD::FNMADDS3_RND; break;
-    case X86ISD::FNMSUB4S:      Opcode = X86ISD::FNMADD4S;     break;
     }
   }
@@ -39447,28 +39435,24 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
   case X86ISD::FMADDS3_RND:
   case X86ISD::FMADDS1:
   case X86ISD::FMADDS3:
-  case X86ISD::FMADD4S:
   case X86ISD::FMSUB:
   case X86ISD::FMSUB_RND:
   case X86ISD::FMSUBS1_RND:
   case X86ISD::FMSUBS3_RND:
   case X86ISD::FMSUBS1:
   case X86ISD::FMSUBS3:
-  case X86ISD::FMSUB4S:
   case X86ISD::FNMADD:
   case X86ISD::FNMADD_RND:
   case X86ISD::FNMADDS1_RND:
   case X86ISD::FNMADDS3_RND:
   case X86ISD::FNMADDS1:
   case X86ISD::FNMADDS3:
-  case X86ISD::FNMADD4S:
   case X86ISD::FNMSUB:
   case X86ISD::FNMSUB_RND:
   case X86ISD::FNMSUBS1_RND:
   case X86ISD::FNMSUBS3_RND:
   case X86ISD::FNMSUBS1:
   case X86ISD::FNMSUBS3:
-  case X86ISD::FNMSUB4S:
   case ISD::FMA:            return combineFMA(N, DAG, Subtarget);
   case X86ISD::FMADDSUB_RND:
   case X86ISD::FMSUBADD_RND:
diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h
index 5f73b36d298..d7e33442181 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -501,9 +501,6 @@ namespace llvm {
       FMADDSUB_RND, FMSUBADD_RND,
 
-      // FMA4 specific scalar intrinsics bits that zero the non-scalar bits.
-      FMADD4S, FNMADD4S, FMSUB4S, FNMSUB4S,
-
       // Scalar intrinsic FMA.
       FMADDS1, FMADDS3,
       FNMADDS1, FNMADDS3,
diff --git a/llvm/lib/Target/X86/X86InstrFMA.td b/llvm/lib/Target/X86/X86InstrFMA.td
index 50d38d9f89d..f2cf8029172 100644
--- a/llvm/lib/Target/X86/X86InstrFMA.td
+++ b/llvm/lib/Target/X86/X86InstrFMA.td
@@ -372,7 +372,7 @@ multiclass scalar_fma_patterns<SDNode Op, string Prefix, string Suffix,
                 (Op RC:$src2,
                     (EltVT (extractelt (VT VR128:$src1), (iPTR 0))),
                     RC:$src3))))),
-      (!cast<I>(Prefix#"213"#Suffix#"r_Int")
+      (!cast<Instruction>(Prefix#"213"#Suffix#"r_Int")
       VR128:$src1, (COPY_TO_REGCLASS RC:$src2, VR128),
       (COPY_TO_REGCLASS RC:$src3, VR128))>;
 }
@@ -432,36 +432,32 @@ let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in
 }
 
 multiclass fma4s_int<bits<8> opc, string OpcodeStr, Operand memop,
-                     ValueType VT, ComplexPattern mem_cpat, SDNode OpNode,
-                     X86FoldableSchedWrite sched> {
-let isCodeGenOnly = 1 in {
+                     ValueType VT, X86FoldableSchedWrite sched> {
+let isCodeGenOnly = 1, hasSideEffects = 0 in {
   def rr_Int : FMA4S_Int<opc, MRMSrcRegOp4, (outs VR128:$dst),
                (ins VR128:$src1, VR128:$src2, VR128:$src3),
                !strconcat(OpcodeStr,
                "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
-               [(set VR128:$dst,
-                 (VT (OpNode VR128:$src1, VR128:$src2, VR128:$src3)))]>, VEX_W,
-               VEX_LIG, Sched<[sched]>;
+               []>, VEX_W, VEX_LIG, Sched<[sched]>;
+  let mayLoad = 1 in
   def rm_Int : FMA4S_Int<opc, MRMSrcMemOp4, (outs VR128:$dst),
                (ins VR128:$src1, VR128:$src2, memop:$src3),
                !strconcat(OpcodeStr,
                "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
-               [(set VR128:$dst, (VT (OpNode VR128:$src1, VR128:$src2,
-                                      mem_cpat:$src3)))]>, VEX_W, VEX_LIG,
+               []>, VEX_W, VEX_LIG,
               Sched<[sched.Folded, ReadAfterLd, ReadAfterLd]>;
+  let mayLoad = 1 in
   def mr_Int : FMA4S_Int<opc, MRMSrcMem, (outs VR128:$dst),
               (ins VR128:$src1, memop:$src2, VR128:$src3),
               !strconcat(OpcodeStr,
               "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
-               [(set VR128:$dst,
-                 (VT (OpNode VR128:$src1, mem_cpat:$src2, VR128:$src3)))]>,
+               []>,
               VEX_LIG, Sched<[sched.Folded, ReadAfterLd,
                               // memop:$src2
                               ReadDefault, ReadDefault, ReadDefault,
                               ReadDefault, ReadDefault,
                               // VR128::$src3
                               ReadAfterLd]>;
-let hasSideEffects = 0 in
  def rr_Int_REV : FMA4S_Int<opc, MRMSrcReg, (outs VR128:$dst),
               (ins VR128:$src1, VR128:$src2, VR128:$src3),
               !strconcat(OpcodeStr,
@@ -547,20 +543,20 @@ let ExeDomain = SSEPackedSingle in {
  // Scalar Instructions
  defm VFMADDSS4  : fma4s<0x6A, "vfmaddss", FR32, f32mem, f32, X86Fmadd, loadf32,
                          SchedWriteFMA.Scl>,
-                    fma4s_int<0x6A, "vfmaddss", ssmem, v4f32, sse_load_f32,
-                              X86Fmadd4s, SchedWriteFMA.Scl>;
+                    fma4s_int<0x6A, "vfmaddss", ssmem, v4f32,
+                              SchedWriteFMA.Scl>;
  defm VFMSUBSS4  : fma4s<0x6E, "vfmsubss", FR32, f32mem, f32, X86Fmsub, loadf32,
                          SchedWriteFMA.Scl>,
-                    fma4s_int<0x6E, "vfmsubss", ssmem, v4f32, sse_load_f32,
-                              X86Fmsub4s, SchedWriteFMA.Scl>;
+                    fma4s_int<0x6E, "vfmsubss", ssmem, v4f32,
+                              SchedWriteFMA.Scl>;
  defm VFNMADDSS4 : fma4s<0x7A, "vfnmaddss", FR32, f32mem, f32, X86Fnmadd, loadf32,
                          SchedWriteFMA.Scl>,
-                    fma4s_int<0x7A, "vfnmaddss", ssmem, v4f32, sse_load_f32,
-                              X86Fnmadd4s, SchedWriteFMA.Scl>;
+                    fma4s_int<0x7A, "vfnmaddss", ssmem, v4f32,
+                              SchedWriteFMA.Scl>;
  defm VFNMSUBSS4 : fma4s<0x7E, "vfnmsubss", FR32, f32mem, f32, X86Fnmsub, loadf32,
                          SchedWriteFMA.Scl>,
-                    fma4s_int<0x7E, "vfnmsubss", ssmem, v4f32, sse_load_f32,
-                              X86Fnmsub4s, SchedWriteFMA.Scl>;
+                    fma4s_int<0x7E, "vfnmsubss", ssmem, v4f32,
+                              SchedWriteFMA.Scl>;
  // Packed Instructions
  defm VFMADDPS4  : fma4p<0x68, "vfmaddps", X86Fmadd, v4f32, v8f32,
                          loadv4f32, loadv8f32, SchedWriteFMA>;
@@ -580,20 +576,20 @@ let ExeDomain = SSEPackedDouble in {
  // Scalar Instructions
  defm VFMADDSD4  : fma4s<0x6B, "vfmaddsd", FR64, f64mem, f64, X86Fmadd, loadf64,
                          SchedWriteFMA.Scl>,
-                    fma4s_int<0x6B, "vfmaddsd", sdmem, v2f64, sse_load_f64,
-                              X86Fmadd4s, SchedWriteFMA.Scl>;
+                    fma4s_int<0x6B, "vfmaddsd", sdmem, v2f64,
+                              SchedWriteFMA.Scl>;
  defm VFMSUBSD4  : fma4s<0x6F, "vfmsubsd", FR64, f64mem, f64, X86Fmsub, loadf64,
                          SchedWriteFMA.Scl>,
-                    fma4s_int<0x6F, "vfmsubsd", sdmem, v2f64, sse_load_f64,
-                              X86Fmsub4s, SchedWriteFMA.Scl>;
+                    fma4s_int<0x6F, "vfmsubsd", sdmem, v2f64,
+                              SchedWriteFMA.Scl>;
  defm VFNMADDSD4 : fma4s<0x7B, "vfnmaddsd", FR64, f64mem, f64, X86Fnmadd, loadf64,
                          SchedWriteFMA.Scl>,
-                    fma4s_int<0x7B, "vfnmaddsd", sdmem, v2f64, sse_load_f64,
-                              X86Fnmadd4s, SchedWriteFMA.Scl>;
+                    fma4s_int<0x7B, "vfnmaddsd", sdmem, v2f64,
+                              SchedWriteFMA.Scl>;
  defm VFNMSUBSD4 : fma4s<0x7F, "vfnmsubsd", FR64, f64mem, f64, X86Fnmsub, loadf64,
                          SchedWriteFMA.Scl>,
-                    fma4s_int<0x7F, "vfnmsubsd", sdmem, v2f64, sse_load_f64,
-                              X86Fnmsub4s, SchedWriteFMA.Scl>;
+                    fma4s_int<0x7F, "vfnmsubsd", sdmem, v2f64,
+                              SchedWriteFMA.Scl>;
  // Packed Instructions
  defm VFMADDPD4  : fma4p<0x69, "vfmaddpd", X86Fmadd, v2f64, v4f64,
                          loadv2f64, loadv4f64, SchedWriteFMA>;
@@ -609,3 +605,40 @@ let ExeDomain = SSEPackedDouble in {
                          loadv2f64, loadv4f64, SchedWriteFMA>;
 }
 
+multiclass scalar_fma4_patterns<SDNode Op, string Name,
+                                SDNode Move, ValueType VT, ValueType EltVT,
+                                RegisterClass RC, PatFrag mem_frag> {
+  let Predicates = [HasFMA4] in {
+    let AddedComplexity = 15 in
+    def : Pat<(VT (X86vzmovl (VT (scalar_to_vector
+                                  (Op RC:$src1, RC:$src2, RC:$src3))))),
+              (!cast<Instruction>(Name#"rr_Int")
+               (COPY_TO_REGCLASS RC:$src1, VR128),
+               (COPY_TO_REGCLASS RC:$src2, VR128),
+               (COPY_TO_REGCLASS RC:$src3, VR128))>;
+
+    def : Pat<(VT (X86vzmovl (VT (scalar_to_vector
+                                  (Op RC:$src1, RC:$src2,
+                                      (mem_frag addr:$src3)))))),
+              (!cast<Instruction>(Name#"rm_Int")
+               (COPY_TO_REGCLASS RC:$src1, VR128),
+               (COPY_TO_REGCLASS RC:$src2, VR128), addr:$src3)>;
+
+    def : Pat<(VT (X86vzmovl (VT (scalar_to_vector
+                                  (Op RC:$src1, (mem_frag addr:$src2),
+                                      RC:$src3))))),
+              (!cast<Instruction>(Name#"mr_Int")
+               (COPY_TO_REGCLASS RC:$src1, VR128), addr:$src2,
+               (COPY_TO_REGCLASS RC:$src3, VR128))>;
+  }
+}
+
+defm : scalar_fma4_patterns<X86Fmadd, "VFMADDSS4", X86Movss, v4f32, f32, FR32, loadf32>;
+defm : scalar_fma4_patterns<X86Fmsub, "VFMSUBSS4", X86Movss, v4f32, f32, FR32, loadf32>;
+defm : scalar_fma4_patterns<X86Fnmadd, "VFNMADDSS4", X86Movss, v4f32, f32, FR32, loadf32>;
+defm : scalar_fma4_patterns<X86Fnmsub, "VFNMSUBSS4", X86Movss, v4f32, f32, FR32, loadf32>;
+
+defm : scalar_fma4_patterns<X86Fmadd, "VFMADDSD4", X86Movsd, v2f64, f64, FR64, loadf64>;
+defm : scalar_fma4_patterns<X86Fmsub, "VFMSUBSD4", X86Movsd, v2f64, f64, FR64, loadf64>;
+defm : scalar_fma4_patterns<X86Fnmadd, "VFNMADDSD4", X86Movsd, v2f64, f64, FR64, loadf64>;
+defm : scalar_fma4_patterns<X86Fnmsub, "VFNMSUBSD4", X86Movsd, v2f64, f64, FR64, loadf64>;
diff --git a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td
index 824ad7191ec..cab47881bb4 100644
--- a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td
+++ b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td
@@ -483,12 +483,6 @@ def X86FnmsubRnd : SDNode<"X86ISD::FNMSUB_RND", SDTFmaRound, [SDNPCommutat
 def X86FmaddsubRnd : SDNode<"X86ISD::FMADDSUB_RND", SDTFmaRound, [SDNPCommutative]>;
 def X86FmsubaddRnd : SDNode<"X86ISD::FMSUBADD_RND", SDTFmaRound, [SDNPCommutative]>;
 
-// Scalar FMA4 intrinsics which zero the non-scalar bits.
-def X86Fmadd4s : SDNode<"X86ISD::FMADD4S", SDTFPTernaryOp, [SDNPCommutative]>;
-def X86Fnmadd4s : SDNode<"X86ISD::FNMADD4S", SDTFPTernaryOp, [SDNPCommutative]>;
-def X86Fmsub4s : SDNode<"X86ISD::FMSUB4S", SDTFPTernaryOp, [SDNPCommutative]>;
-def X86Fnmsub4s : SDNode<"X86ISD::FNMSUB4S", SDTFPTernaryOp, [SDNPCommutative]>;
-
 // Scalar FMA intrinsics with passthru bits in operand 1.
 def X86Fmadds1 : SDNode<"X86ISD::FMADDS1", SDTFPTernaryOp>;
 def X86Fnmadds1 : SDNode<"X86ISD::FNMADDS1", SDTFPTernaryOp>;
diff --git a/llvm/lib/Target/X86/X86IntrinsicsInfo.h b/llvm/lib/Target/X86/X86IntrinsicsInfo.h
index 0442759d584..9f712d848e2 100644
--- a/llvm/lib/Target/X86/X86IntrinsicsInfo.h
+++ b/llvm/lib/Target/X86/X86IntrinsicsInfo.h
@@ -1157,8 +1157,6 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
   X86_INTRINSIC_DATA(avx512_vpshrd_w_512, INTR_TYPE_3OP_IMM8, X86ISD::VSHRD, 0),
   X86_INTRINSIC_DATA(bmi_bextr_32, INTR_TYPE_2OP, X86ISD::BEXTR, 0),
   X86_INTRINSIC_DATA(bmi_bextr_64, INTR_TYPE_2OP, X86ISD::BEXTR, 0),
-  X86_INTRINSIC_DATA(fma4_vfmadd_sd, INTR_TYPE_3OP, X86ISD::FMADD4S, 0),
-  X86_INTRINSIC_DATA(fma4_vfmadd_ss, INTR_TYPE_3OP, X86ISD::FMADD4S, 0),
   X86_INTRINSIC_DATA(sse_cmp_ps, INTR_TYPE_3OP, X86ISD::CMPP, 0),
   X86_INTRINSIC_DATA(sse_comieq_ss, COMI, X86ISD::COMI, ISD::SETEQ),
   X86_INTRINSIC_DATA(sse_comige_ss, COMI, X86ISD::COMI, ISD::SETGE),
diff --git a/llvm/test/CodeGen/X86/fma4-commute-x86.ll b/llvm/test/CodeGen/X86/fma4-commute-x86.ll
index cfc6837e453..9a1724f1373 100644
--- a/llvm/test/CodeGen/X86/fma4-commute-x86.ll
+++ b/llvm/test/CodeGen/X86/fma4-commute-x86.ll
@@ -7,7 +7,7 @@ declare <4 x float> @llvm.x86.fma4.vfmadd.ss(<4 x float>, <4 x float>, <4 x floa
 define <4 x float> @test_x86_fmadd_baa_ss(<4 x float> %a, <4 x float> %b) #0 {
 ; FMA4-LABEL: test_x86_fmadd_baa_ss:
 ; FMA4:       # %bb.0:
-; FMA4-NEXT:    vmovaps (%rcx), %xmm0
+; FMA4-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; FMA4-NEXT:    vfmaddss %xmm0, (%rdx), %xmm0, %xmm0
 ; FMA4-NEXT:    retq
   %res = call <4 x float> @llvm.x86.fma4.vfmadd.ss(<4 x float> %b, <4 x float> %a, <4 x float> %a) nounwind
@@ -17,7 +17,7 @@ define <4 x float> @test_x86_fmadd_baa_ss(<4 x float> %a, <4 x float> %b) #0 {
 define <4 x float> @test_x86_fmadd_aba_ss(<4 x float> %a, <4 x float> %b) #0 {
 ; FMA4-LABEL: test_x86_fmadd_aba_ss:
 ; FMA4:       # %bb.0:
-; FMA4-NEXT:    vmovaps (%rcx), %xmm0
+; FMA4-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; FMA4-NEXT:    vfmaddss %xmm0, (%rdx), %xmm0, %xmm0
 ; FMA4-NEXT:    retq
   %res = call <4 x float> @llvm.x86.fma4.vfmadd.ss(<4 x float> %a, <4 x float> %b, <4 x float> %a) nounwind
@@ -27,7 +27,7 @@ define <4 x float> @test_x86_fmadd_aba_ss(<4 x float> %a, <4 x float> %b) #0 {
 define <4 x float> @test_x86_fmadd_bba_ss(<4 x float> %a, <4 x float> %b) #0 {
 ; FMA4-LABEL: test_x86_fmadd_bba_ss:
 ; FMA4:       # %bb.0:
-; FMA4-NEXT:    vmovaps (%rdx), %xmm0
+; FMA4-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; FMA4-NEXT:    vfmaddss (%rcx), %xmm0, %xmm0, %xmm0
 ; FMA4-NEXT:    retq
   %res = call <4 x float> @llvm.x86.fma4.vfmadd.ss(<4 x float> %b, <4 x float> %b, <4 x float> %a) nounwind
@@ -100,7 +100,7 @@ declare <2 x double> @llvm.x86.fma4.vfmadd.sd(<2 x double>, <2 x double>, <2 x d
 define <2 x double> @test_x86_fmadd_baa_sd(<2 x double> %a, <2 x double> %b) #0 {
 ; FMA4-LABEL: test_x86_fmadd_baa_sd:
 ; FMA4:       # %bb.0:
-; FMA4-NEXT:    vmovapd (%rcx), %xmm0
+; FMA4-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
 ; FMA4-NEXT:    vfmaddsd %xmm0, (%rdx), %xmm0, %xmm0
 ; FMA4-NEXT:    retq
   %res = call <2 x double> @llvm.x86.fma4.vfmadd.sd(<2 x double> %b, <2 x double> %a, <2 x double> %a) nounwind
@@ -110,7 +110,7 @@ define <2 x double> @test_x86_fmadd_baa_sd(<2 x double> %a, <2 x double> %b) #0
 define <2 x double> @test_x86_fmadd_aba_sd(<2 x double> %a, <2 x double> %b) #0 {
 ; FMA4-LABEL: test_x86_fmadd_aba_sd:
 ; FMA4:       # %bb.0:
-; FMA4-NEXT:    vmovapd (%rcx), %xmm0
+; FMA4-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
 ; FMA4-NEXT:    vfmaddsd %xmm0, (%rdx), %xmm0, %xmm0
 ; FMA4-NEXT:    retq
   %res = call <2 x double> @llvm.x86.fma4.vfmadd.sd(<2 x double> %a, <2 x double> %b, <2 x double> %a) nounwind
@@ -120,7 +120,7 @@ define <2 x double> @test_x86_fmadd_aba_sd(<2 x double> %a, <2 x double> %b) #0
 define <2 x double> @test_x86_fmadd_bba_sd(<2 x double> %a, <2 x double> %b) #0 {
 ; FMA4-LABEL: test_x86_fmadd_bba_sd:
 ; FMA4:       # %bb.0:
-; FMA4-NEXT:    vmovapd (%rdx), %xmm0
+; FMA4-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
 ; FMA4-NEXT:    vfmaddsd (%rcx), %xmm0, %xmm0, %xmm0
 ; FMA4-NEXT:    retq
   %res = call <2 x double> @llvm.x86.fma4.vfmadd.sd(<2 x double> %b, <2 x double> %b, <2 x double> %a) nounwind
diff --git a/llvm/test/CodeGen/X86/fma4-fneg-combine.ll b/llvm/test/CodeGen/X86/fma4-fneg-combine.ll
index 771162a2c99..f29908678a7 100644
--- a/llvm/test/CodeGen/X86/fma4-fneg-combine.ll
+++ b/llvm/test/CodeGen/X86/fma4-fneg-combine.ll
@@ -20,7 +20,8 @@ define <4 x float> @test1(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
 define <4 x float> @test2(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
 ; CHECK-LABEL: test2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vfmsubss %xmm2, %xmm1, %xmm0, %xmm0
+; CHECK-NEXT:    vxorps {{.*}}(%rip), %xmm2, %xmm2
+; CHECK-NEXT:    vfmaddss %xmm2, %xmm1, %xmm0, %xmm0
 ; CHECK-NEXT:    retq
   %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %c
   %res = tail call <4 x float> @llvm.x86.fma4.vfmadd.ss(<4 x float> %a, <4 x float> %b, <4 x float> %sub.i)
@@ -30,7 +31,8 @@ define <4 x float> @test2(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
 define <4 x float> @test3(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
 ; CHECK-LABEL: test3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vfnmaddss %xmm2, %xmm1, %xmm0, %xmm0
+; CHECK-NEXT:    vxorps {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-NEXT:    vfmaddss %xmm2, %xmm1, %xmm0, %xmm0
 ; CHECK-NEXT:    retq
   %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %b
   %res = tail call <4 x float> @llvm.x86.fma4.vfmadd.ss(<4 x float> %a, <4 x float> %sub.i, <4 x float> %c)
@@ -40,7 +42,8 @@ define <4 x float> @test3(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
 define <4 x float> @test4(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
 ; CHECK-LABEL: test4:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vfnmaddss %xmm2, %xmm1, %xmm0, %xmm0
+; CHECK-NEXT:    vxorps {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-NEXT:    vfmaddss %xmm2, %xmm1, %xmm0, %xmm0
 ; CHECK-NEXT:    retq
   %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %a
   %res = tail call <4 x float> @llvm.x86.fma4.vfmadd.ss(<4 x float> %sub.i, <4 x float> %b, <4 x float> %c)
@@ -50,7 +53,10 @@ define <4 x float> @test4(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
 define <4 x float> @test5(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
 ; CHECK-LABEL: test5:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vfnmsubss %xmm2, %xmm1, %xmm0, %xmm0
+; CHECK-NEXT:    vmovaps {{.*#+}} xmm3 = [-0.000000e+00,-0.000000e+00,-0.000000e+00,-0.000000e+00]
+; CHECK-NEXT:    vxorps %xmm3, %xmm0, %xmm0
+; CHECK-NEXT:    vxorps %xmm3, %xmm2, %xmm2
+; CHECK-NEXT:    vfmaddss %xmm2, %xmm1, %xmm0, %xmm0
 ; CHECK-NEXT:    retq
   %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %a
   %sub.i.2 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %c
@@ -72,7 +78,8 @@ define <2 x double> @test6(<2 x double> %a, <2 x double> %b, <2 x double> %c) {
 define <2 x double> @test7(<2 x double> %a, <2 x double> %b, <2 x double> %c) {
 ; CHECK-LABEL: test7:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vfmsubsd %xmm2, %xmm1, %xmm0, %xmm0
+; CHECK-NEXT:    vxorpd {{.*}}(%rip), %xmm2, %xmm2
+; CHECK-NEXT:    vfmaddsd %xmm2, %xmm1, %xmm0, %xmm0
 ; CHECK-NEXT:    retq
   %sub.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %c
   %res = tail call <2 x double> @llvm.x86.fma4.vfmadd.sd(<2 x double> %a, <2 x double> %b, <2 x double> %sub.i)
@@ -82,7 +89,8 @@ define <2 x double> @test7(<2 x double> %a, <2 x double> %b, <2 x double> %c) {
 define <2 x double> @test8(<2 x double> %a, <2 x double> %b, <2 x double> %c) {
 ; CHECK-LABEL: test8:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vfnmaddsd %xmm2, %xmm1, %xmm0, %xmm0
+; CHECK-NEXT:    vxorpd {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-NEXT:    vfmaddsd %xmm2, %xmm1, %xmm0, %xmm0
 ; CHECK-NEXT:    retq
   %sub.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %b
   %res = tail call <2 x double> @llvm.x86.fma4.vfmadd.sd(<2 x double> %a, <2 x double> %sub.i, <2 x double> %c)
@@ -92,7 +100,8 @@ define <2 x double> @test8(<2 x double> %a, <2 x double> %b, <2 x double> %c) {
 define <2 x double> @test9(<2 x double> %a, <2 x double> %b, <2 x double> %c) {
 ; CHECK-LABEL: test9:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vfnmaddsd %xmm2, %xmm1, %xmm0, %xmm0
+; CHECK-NEXT:    vxorpd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-NEXT:    vfmaddsd %xmm2, %xmm1, %xmm0, %xmm0
 ; CHECK-NEXT:    retq
   %sub.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %a
   %res = tail call <2 x double> @llvm.x86.fma4.vfmadd.sd(<2 x double> %sub.i, <2 x double> %b, <2 x double> %c)
@@ -102,7 +111,10 @@ define <2 x double> @test9(<2 x double> %a, <2 x double> %b, <2 x double> %c) {
 define <2 x double> @test10(<2 x double> %a, <2 x double> %b, <2 x double> %c) {
 ; CHECK-LABEL: test10:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vfnmsubsd %xmm2, %xmm1, %xmm0, %xmm0
+; CHECK-NEXT:    vmovapd {{.*#+}} xmm3 = [-0.000000e+00,-0.000000e+00]
+; CHECK-NEXT:    vxorpd %xmm3, %xmm0, %xmm0
+; CHECK-NEXT:    vxorpd %xmm3, %xmm2, %xmm2
+; CHECK-NEXT:    vfmaddsd %xmm2, %xmm1, %xmm0, %xmm0
 ; CHECK-NEXT:    retq
   %sub.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %a
   %sub.i.2 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %c
diff --git a/llvm/test/CodeGen/X86/fma4-scalar-memfold.ll b/llvm/test/CodeGen/X86/fma4-scalar-memfold.ll
index 204f6f99b16..5d32278f788 100644
--- a/llvm/test/CodeGen/X86/fma4-scalar-memfold.ll
+++ b/llvm/test/CodeGen/X86/fma4-scalar-memfold.ll
@@ -63,7 +63,7 @@ define void @fmadd_aab_sd(double* %a, double* %b) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
 ; CHECK-NEXT:    vfmaddsd (%rsi), %xmm0, %xmm0, %xmm0
-; CHECK-NEXT:    vmovlpd %xmm0, (%rdi)
+; CHECK-NEXT:    vmovsd %xmm0, (%rdi)
 ; CHECK-NEXT:    retq
   %a.val = load double, double* %a
   %av0 = insertelement <2 x double> undef, double %a.val, i32 0
@@ -85,7 +85,7 @@ define void @fmadd_aba_sd(double* %a, double* %b) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
 ; CHECK-NEXT:    vfmaddsd %xmm0, (%rsi), %xmm0, %xmm0
-; CHECK-NEXT:    vmovlpd %xmm0, (%rdi)
+; CHECK-NEXT:    vmovsd %xmm0, (%rdi)
 ; CHECK-NEXT:    retq
   %a.val = load double, double* %a
   %av0 = insertelement <2 x double> undef, double %a.val, i32 0