diff options
Diffstat (limited to 'clang/lib/CodeGen')
-rw-r--r-- | clang/lib/CodeGen/CGBuiltin.cpp | 75 |
1 files changed, 58 insertions, 17 deletions
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index 3ebf584ac8e..a0afb086536 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -8718,6 +8718,47 @@ static Value *EmitX86FMAExpr(CodeGenFunction &CGF, ArrayRef<Value *> Ops, return Res; } +static Value * +EmitScalarFMAExpr(CodeGenFunction &CGF, MutableArrayRef<Value *> Ops, + Value *Upper, bool ZeroMask = false, unsigned PTIdx = 0, + bool NegAcc = false) { + unsigned Rnd = 4; + if (Ops.size() > 4) + Rnd = cast<llvm::ConstantInt>(Ops[4])->getZExtValue(); + + if (NegAcc) + Ops[2] = CGF.Builder.CreateFNeg(Ops[2]); + + Ops[0] = CGF.Builder.CreateExtractElement(Ops[0], (uint64_t)0); + Ops[1] = CGF.Builder.CreateExtractElement(Ops[1], (uint64_t)0); + Ops[2] = CGF.Builder.CreateExtractElement(Ops[2], (uint64_t)0); + Value *Res; + if (Rnd != 4) { + Intrinsic::ID IID = Ops[0]->getType()->getPrimitiveSizeInBits() == 32 ? + Intrinsic::x86_avx512_vfmadd_f32 : + Intrinsic::x86_avx512_vfmadd_f64; + Res = CGF.Builder.CreateCall(CGF.CGM.getIntrinsic(IID), + {Ops[0], Ops[1], Ops[2], Ops[4]}); + } else { + Function *FMA = CGF.CGM.getIntrinsic(Intrinsic::fma, Ops[0]->getType()); + Res = CGF.Builder.CreateCall(FMA, Ops.slice(0, 3)); + } + // If we have more than 3 arguments, we need to do masking. + if (Ops.size() > 3) { + Value *PassThru = ZeroMask ? Constant::getNullValue(Res->getType()) + : Ops[PTIdx]; + + // If we negated the accumulator and it's the PassThru value we need to + // bypass the negate. Conveniently Upper should be the same thing in this + // case. 
+ if (NegAcc && PTIdx == 2) + PassThru = CGF.Builder.CreateExtractElement(Upper, (uint64_t)0); + + Res = EmitX86ScalarSelect(CGF, Ops[3], Res, PassThru); + } + return CGF.Builder.CreateInsertElement(Upper, Res, (uint64_t)0); +} + static Value *EmitX86Muldq(CodeGenFunction &CGF, bool IsSigned, ArrayRef<Value *> Ops) { llvm::Type *Ty = Ops[0]->getType(); @@ -9141,24 +9182,24 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID, return EmitX86ConvertToMask(*this, Ops[0]); case X86::BI__builtin_ia32_vfmaddss3: - case X86::BI__builtin_ia32_vfmaddsd3: { - Value *A = Builder.CreateExtractElement(Ops[0], (uint64_t)0); - Value *B = Builder.CreateExtractElement(Ops[1], (uint64_t)0); - Value *C = Builder.CreateExtractElement(Ops[2], (uint64_t)0); - Function *FMA = CGM.getIntrinsic(Intrinsic::fma, A->getType()); - Value *Res = Builder.CreateCall(FMA, {A, B, C} ); - return Builder.CreateInsertElement(Ops[0], Res, (uint64_t)0); - } + case X86::BI__builtin_ia32_vfmaddsd3: + case X86::BI__builtin_ia32_vfmaddss3_mask: + case X86::BI__builtin_ia32_vfmaddsd3_mask: + return EmitScalarFMAExpr(*this, Ops, Ops[0]); case X86::BI__builtin_ia32_vfmaddss: - case X86::BI__builtin_ia32_vfmaddsd: { - Value *A = Builder.CreateExtractElement(Ops[0], (uint64_t)0); - Value *B = Builder.CreateExtractElement(Ops[1], (uint64_t)0); - Value *C = Builder.CreateExtractElement(Ops[2], (uint64_t)0); - Function *FMA = CGM.getIntrinsic(Intrinsic::fma, A->getType()); - Value *Res = Builder.CreateCall(FMA, {A, B, C} ); - Value *Zero = Constant::getNullValue(Ops[0]->getType()); - return Builder.CreateInsertElement(Zero, Res, (uint64_t)0); - } + case X86::BI__builtin_ia32_vfmaddsd: + return EmitScalarFMAExpr(*this, Ops, + Constant::getNullValue(Ops[0]->getType())); + case X86::BI__builtin_ia32_vfmaddss3_maskz: + case X86::BI__builtin_ia32_vfmaddsd3_maskz: + return EmitScalarFMAExpr(*this, Ops, Ops[0], /*ZeroMask*/true); + case X86::BI__builtin_ia32_vfmaddss3_mask3: + case 
X86::BI__builtin_ia32_vfmaddsd3_mask3: + return EmitScalarFMAExpr(*this, Ops, Ops[2], /*ZeroMask*/false, 2); + case X86::BI__builtin_ia32_vfmsubss3_mask3: + case X86::BI__builtin_ia32_vfmsubsd3_mask3: + return EmitScalarFMAExpr(*this, Ops, Ops[2], /*ZeroMask*/false, 2, + /*NegAcc*/true); case X86::BI__builtin_ia32_vfmaddps: case X86::BI__builtin_ia32_vfmaddpd: case X86::BI__builtin_ia32_vfmaddps256: |