author    Sanjay Patel <spatel@rotateright.com>    2016-06-15 21:20:04 +0000
committer Sanjay Patel <spatel@rotateright.com>    2016-06-15 21:20:04 +0000
commit    280cfd1a690468577fdffa8f15321be42eae201a (patch)
tree      48f1412e6edfed1d7cf10c9f4f4f1fc2a44bda8d /clang/lib/CodeGen
parent    062c26f1f0133f8dd5fb657eb1fffef3c1274be4 (diff)
[x86] translate SSE packed FP comparison builtins to IR
As noted in the code comment, a potential follow-on would be to remove the builtins themselves. Other than ord/unord, this already works as expected, e.g.:

  typedef float v4sf __attribute__((__vector_size__(16)));
  v4sf fcmpgt(v4sf a, v4sf b) { return a > b; }

Differential Revision: http://reviews.llvm.org/D21268

llvm-svn: 272840
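Below is a small, hedged sketch (not part of the patch) of that follow-on idea: writing the packed comparisons with clang's vector extensions instead of builtins. The fcmpord/fcmpunord helpers are hypothetical; ordered/unordered have no direct operator, and the (a == a) spelling below is not guaranteed to fold into a single cmpps, which is why the TODO in the patch points at __builtin_isnan():

  /* Vector comparisons yield an integer vector of the same size; clang's
     lax vector conversions let it be returned as v4sf (a bitcast). */
  typedef float v4sf __attribute__((__vector_size__(16)));

  v4sf fcmpgt(v4sf a, v4sf b)    { return a > b; }  /* fcmp ogt + sext */

  /* Ordered: neither operand is NaN (x == x is false only for NaN). */
  v4sf fcmpord(v4sf a, v4sf b)   { return (a == a) & (b == b); }

  /* Unordered: at least one operand is NaN. */
  v4sf fcmpunord(v4sf a, v4sf b) { return (a != a) | (b != b); }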
Diffstat (limited to 'clang/lib/CodeGen')
-rw-r--r--  clang/lib/CodeGen/CGBuiltin.cpp | 198
1 file changed, 74 insertions(+), 124 deletions(-)
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index c504e2e252f..b6938d4cab4 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -6419,6 +6419,36 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID,
Ops.push_back(llvm::ConstantInt::get(getLLVMContext(), Result));
}
+ // These exist so that the builtin that takes an immediate can be bounds
+ // checked by clang to avoid passing bad immediates to the backend. Since
+ // AVX has a larger immediate than SSE we would need separate builtins to
+ // do the different bounds checking. Rather than create a clang specific
+ // SSE only builtin, this implements eight separate builtins to match gcc
+ // implementation.
+ auto getCmpIntrinsicCall = [this, &Ops](Intrinsic::ID ID, unsigned Imm) {
+ Ops.push_back(llvm::ConstantInt::get(Int8Ty, Imm));
+ llvm::Function *F = CGM.getIntrinsic(ID);
+ return Builder.CreateCall(F, Ops);
+ };
+
+ // For the vector forms of FP comparisons, translate the builtins directly to
+ // IR.
+ // TODO: The builtins could be removed if the SSE header files used vector
+ // extension comparisons directly (vector ordered/unordered may need
+ // additional support via __builtin_isnan()).
+ llvm::VectorType *V2F64 =
+ llvm::VectorType::get(llvm::Type::getDoubleTy(getLLVMContext()), 2);
+ llvm::VectorType *V4F32 =
+ llvm::VectorType::get(llvm::Type::getFloatTy(getLLVMContext()), 4);
+
+ auto getVectorFCmpIR = [this, &Ops](CmpInst::Predicate Pred,
+ llvm::VectorType *FPVecTy) {
+ Value *Cmp = Builder.CreateFCmp(Pred, Ops[0], Ops[1]);
+ llvm::VectorType *IntVecTy = llvm::VectorType::getInteger(FPVecTy);
+ Value *Sext = Builder.CreateSExt(Cmp, IntVecTy);
+ return Builder.CreateBitCast(Sext, FPVecTy);
+ };
+
switch (BuiltinID) {
default: return nullptr;
case X86::BI__builtin_cpu_supports: {
@@ -6857,154 +6887,74 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID,
Ops[0]);
return Builder.CreateExtractValue(Call, 1);
}
- // SSE comparison intrisics
+
+ // SSE packed comparison intrinsics
case X86::BI__builtin_ia32_cmpeqps:
+ return getVectorFCmpIR(CmpInst::FCMP_OEQ, V4F32);
case X86::BI__builtin_ia32_cmpltps:
+ return getVectorFCmpIR(CmpInst::FCMP_OLT, V4F32);
case X86::BI__builtin_ia32_cmpleps:
+ return getVectorFCmpIR(CmpInst::FCMP_OLE, V4F32);
case X86::BI__builtin_ia32_cmpunordps:
+ return getVectorFCmpIR(CmpInst::FCMP_UNO, V4F32);
case X86::BI__builtin_ia32_cmpneqps:
+ return getVectorFCmpIR(CmpInst::FCMP_UNE, V4F32);
case X86::BI__builtin_ia32_cmpnltps:
+ return getVectorFCmpIR(CmpInst::FCMP_UGE, V4F32);
case X86::BI__builtin_ia32_cmpnleps:
+ return getVectorFCmpIR(CmpInst::FCMP_UGT, V4F32);
case X86::BI__builtin_ia32_cmpordps:
- case X86::BI__builtin_ia32_cmpeqss:
- case X86::BI__builtin_ia32_cmpltss:
- case X86::BI__builtin_ia32_cmpless:
- case X86::BI__builtin_ia32_cmpunordss:
- case X86::BI__builtin_ia32_cmpneqss:
- case X86::BI__builtin_ia32_cmpnltss:
- case X86::BI__builtin_ia32_cmpnless:
- case X86::BI__builtin_ia32_cmpordss:
+ return getVectorFCmpIR(CmpInst::FCMP_ORD, V4F32);
case X86::BI__builtin_ia32_cmpeqpd:
+ return getVectorFCmpIR(CmpInst::FCMP_OEQ, V2F64);
case X86::BI__builtin_ia32_cmpltpd:
+ return getVectorFCmpIR(CmpInst::FCMP_OLT, V2F64);
case X86::BI__builtin_ia32_cmplepd:
+ return getVectorFCmpIR(CmpInst::FCMP_OLE, V2F64);
case X86::BI__builtin_ia32_cmpunordpd:
+ return getVectorFCmpIR(CmpInst::FCMP_UNO, V2F64);
case X86::BI__builtin_ia32_cmpneqpd:
+ return getVectorFCmpIR(CmpInst::FCMP_UNE, V2F64);
case X86::BI__builtin_ia32_cmpnltpd:
+ return getVectorFCmpIR(CmpInst::FCMP_UGE, V2F64);
case X86::BI__builtin_ia32_cmpnlepd:
+ return getVectorFCmpIR(CmpInst::FCMP_UGT, V2F64);
case X86::BI__builtin_ia32_cmpordpd:
+ return getVectorFCmpIR(CmpInst::FCMP_ORD, V2F64);
+
+ // SSE scalar comparison intrinsics
+ case X86::BI__builtin_ia32_cmpeqss:
+ return getCmpIntrinsicCall(Intrinsic::x86_sse_cmp_ss, 0);
+ case X86::BI__builtin_ia32_cmpltss:
+ return getCmpIntrinsicCall(Intrinsic::x86_sse_cmp_ss, 1);
+ case X86::BI__builtin_ia32_cmpless:
+ return getCmpIntrinsicCall(Intrinsic::x86_sse_cmp_ss, 2);
+ case X86::BI__builtin_ia32_cmpunordss:
+ return getCmpIntrinsicCall(Intrinsic::x86_sse_cmp_ss, 3);
+ case X86::BI__builtin_ia32_cmpneqss:
+ return getCmpIntrinsicCall(Intrinsic::x86_sse_cmp_ss, 4);
+ case X86::BI__builtin_ia32_cmpnltss:
+ return getCmpIntrinsicCall(Intrinsic::x86_sse_cmp_ss, 5);
+ case X86::BI__builtin_ia32_cmpnless:
+ return getCmpIntrinsicCall(Intrinsic::x86_sse_cmp_ss, 6);
+ case X86::BI__builtin_ia32_cmpordss:
+ return getCmpIntrinsicCall(Intrinsic::x86_sse_cmp_ss, 7);
case X86::BI__builtin_ia32_cmpeqsd:
+ return getCmpIntrinsicCall(Intrinsic::x86_sse2_cmp_sd, 0);
case X86::BI__builtin_ia32_cmpltsd:
+ return getCmpIntrinsicCall(Intrinsic::x86_sse2_cmp_sd, 1);
case X86::BI__builtin_ia32_cmplesd:
+ return getCmpIntrinsicCall(Intrinsic::x86_sse2_cmp_sd, 2);
case X86::BI__builtin_ia32_cmpunordsd:
+ return getCmpIntrinsicCall(Intrinsic::x86_sse2_cmp_sd, 3);
case X86::BI__builtin_ia32_cmpneqsd:
+ return getCmpIntrinsicCall(Intrinsic::x86_sse2_cmp_sd, 4);
case X86::BI__builtin_ia32_cmpnltsd:
+ return getCmpIntrinsicCall(Intrinsic::x86_sse2_cmp_sd, 5);
case X86::BI__builtin_ia32_cmpnlesd:
+ return getCmpIntrinsicCall(Intrinsic::x86_sse2_cmp_sd, 6);
case X86::BI__builtin_ia32_cmpordsd:
- // These exist so that the builtin that takes an immediate can be bounds
- // checked by clang to avoid passing bad immediates to the backend. Since
- // AVX has a larger immediate than SSE we would need separate builtins to
- // do the different bounds checking. Rather than create a clang specific
- // SSE only builtin, this implements eight separate builtins to match gcc
- // implementation.
-
- // Choose the immediate.
- unsigned Imm;
- switch (BuiltinID) {
- default: llvm_unreachable("Unsupported intrinsic!");
- case X86::BI__builtin_ia32_cmpeqps:
- case X86::BI__builtin_ia32_cmpeqss:
- case X86::BI__builtin_ia32_cmpeqpd:
- case X86::BI__builtin_ia32_cmpeqsd:
- Imm = 0;
- break;
- case X86::BI__builtin_ia32_cmpltps:
- case X86::BI__builtin_ia32_cmpltss:
- case X86::BI__builtin_ia32_cmpltpd:
- case X86::BI__builtin_ia32_cmpltsd:
- Imm = 1;
- break;
- case X86::BI__builtin_ia32_cmpleps:
- case X86::BI__builtin_ia32_cmpless:
- case X86::BI__builtin_ia32_cmplepd:
- case X86::BI__builtin_ia32_cmplesd:
- Imm = 2;
- break;
- case X86::BI__builtin_ia32_cmpunordps:
- case X86::BI__builtin_ia32_cmpunordss:
- case X86::BI__builtin_ia32_cmpunordpd:
- case X86::BI__builtin_ia32_cmpunordsd:
- Imm = 3;
- break;
- case X86::BI__builtin_ia32_cmpneqps:
- case X86::BI__builtin_ia32_cmpneqss:
- case X86::BI__builtin_ia32_cmpneqpd:
- case X86::BI__builtin_ia32_cmpneqsd:
- Imm = 4;
- break;
- case X86::BI__builtin_ia32_cmpnltps:
- case X86::BI__builtin_ia32_cmpnltss:
- case X86::BI__builtin_ia32_cmpnltpd:
- case X86::BI__builtin_ia32_cmpnltsd:
- Imm = 5;
- break;
- case X86::BI__builtin_ia32_cmpnleps:
- case X86::BI__builtin_ia32_cmpnless:
- case X86::BI__builtin_ia32_cmpnlepd:
- case X86::BI__builtin_ia32_cmpnlesd:
- Imm = 6;
- break;
- case X86::BI__builtin_ia32_cmpordps:
- case X86::BI__builtin_ia32_cmpordss:
- case X86::BI__builtin_ia32_cmpordpd:
- case X86::BI__builtin_ia32_cmpordsd:
- Imm = 7;
- break;
- }
-
- // Choose the intrinsic ID.
- const char *name;
- Intrinsic::ID ID;
- switch (BuiltinID) {
- default: llvm_unreachable("Unsupported intrinsic!");
- case X86::BI__builtin_ia32_cmpeqps:
- case X86::BI__builtin_ia32_cmpltps:
- case X86::BI__builtin_ia32_cmpleps:
- case X86::BI__builtin_ia32_cmpunordps:
- case X86::BI__builtin_ia32_cmpneqps:
- case X86::BI__builtin_ia32_cmpnltps:
- case X86::BI__builtin_ia32_cmpnleps:
- case X86::BI__builtin_ia32_cmpordps:
- name = "cmpps";
- ID = Intrinsic::x86_sse_cmp_ps;
- break;
- case X86::BI__builtin_ia32_cmpeqss:
- case X86::BI__builtin_ia32_cmpltss:
- case X86::BI__builtin_ia32_cmpless:
- case X86::BI__builtin_ia32_cmpunordss:
- case X86::BI__builtin_ia32_cmpneqss:
- case X86::BI__builtin_ia32_cmpnltss:
- case X86::BI__builtin_ia32_cmpnless:
- case X86::BI__builtin_ia32_cmpordss:
- name = "cmpss";
- ID = Intrinsic::x86_sse_cmp_ss;
- break;
- case X86::BI__builtin_ia32_cmpeqpd:
- case X86::BI__builtin_ia32_cmpltpd:
- case X86::BI__builtin_ia32_cmplepd:
- case X86::BI__builtin_ia32_cmpunordpd:
- case X86::BI__builtin_ia32_cmpneqpd:
- case X86::BI__builtin_ia32_cmpnltpd:
- case X86::BI__builtin_ia32_cmpnlepd:
- case X86::BI__builtin_ia32_cmpordpd:
- name = "cmppd";
- ID = Intrinsic::x86_sse2_cmp_pd;
- break;
- case X86::BI__builtin_ia32_cmpeqsd:
- case X86::BI__builtin_ia32_cmpltsd:
- case X86::BI__builtin_ia32_cmplesd:
- case X86::BI__builtin_ia32_cmpunordsd:
- case X86::BI__builtin_ia32_cmpneqsd:
- case X86::BI__builtin_ia32_cmpnltsd:
- case X86::BI__builtin_ia32_cmpnlesd:
- case X86::BI__builtin_ia32_cmpordsd:
- name = "cmpsd";
- ID = Intrinsic::x86_sse2_cmp_sd;
- break;
- }
-
- Ops.push_back(llvm::ConstantInt::get(Int8Ty, Imm));
- llvm::Function *F = CGM.getIntrinsic(ID);
- return Builder.CreateCall(F, Ops, name);
+ return getCmpIntrinsicCall(Intrinsic::x86_sse2_cmp_sd, 7);
}
}
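For reference, a minimal usage example (a hypothetical test, not from this commit) showing the effect of the change: with this patch, the packed builtin behind _mm_cmpeq_ps lowers to the compare/sign-extend/bitcast sequence built by getVectorFCmpIR, while the scalar form still calls the target intrinsic with an immediate:

  #include <xmmintrin.h>

  /* Packed: clang now emits roughly
       %cmp  = fcmp oeq <4 x float> %a, %b
       %sext = sext <4 x i1> %cmp to <4 x i32>
       %res  = bitcast <4 x i32> %sext to <4 x float>
     instead of a call to @llvm.x86.sse.cmp.ps. */
  __m128 packed_eq(__m128 a, __m128 b) { return _mm_cmpeq_ps(a, b); }

  /* Scalar: still a call to @llvm.x86.sse.cmp.ss with immediate 0. */
  __m128 scalar_eq(__m128 a, __m128 b) { return _mm_cmpeq_ss(a, b); }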