diff options
-rw-r--r-- | llvm/include/llvm/IR/IntrinsicsX86.td | 12 | ||||
-rw-r--r-- | llvm/lib/IR/AutoUpgrade.cpp | 15 | ||||
-rw-r--r-- | llvm/lib/Target/X86/X86ISelLowering.cpp | 10 | ||||
-rw-r--r-- | llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll | 97 | ||||
-rw-r--r-- | llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll | 45 | ||||
-rw-r--r-- | llvm/test/CodeGen/X86/avx512-intrinsics.ll | 28 |
6 files changed, 157 insertions, 50 deletions
diff --git a/llvm/include/llvm/IR/IntrinsicsX86.td b/llvm/include/llvm/IR/IntrinsicsX86.td index 1a3638c4293..22aa7318cc9 100644 --- a/llvm/include/llvm/IR/IntrinsicsX86.td +++ b/llvm/include/llvm/IR/IntrinsicsX86.td @@ -3723,18 +3723,6 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". //===----------------------------------------------------------------------===// // AVX512 -// Mask ops -let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". - // Mask instructions - // 16-bit mask - def int_x86_avx512_kortestz_w : GCCBuiltin<"__builtin_ia32_kortestzhi">, - Intrinsic<[llvm_i32_ty], [llvm_i16_ty, llvm_i16_ty], - [IntrNoMem]>; - def int_x86_avx512_kortestc_w : GCCBuiltin<"__builtin_ia32_kortestchi">, - Intrinsic<[llvm_i32_ty], [llvm_i16_ty, llvm_i16_ty], - [IntrNoMem]>; -} - // Conversion ops let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". def int_x86_avx512_cvttss2si : GCCBuiltin<"__builtin_ia32_vcvttss2si32">, diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp index b4647eff7bf..d60591d52c5 100644 --- a/llvm/lib/IR/AutoUpgrade.cpp +++ b/llvm/lib/IR/AutoUpgrade.cpp @@ -115,6 +115,8 @@ static bool ShouldUpgradeX86Intrinsic(Function *F, StringRef Name) { Name == "avx512.kor.w" || // Added in 7.0 Name == "avx512.kxor.w" || // Added in 7.0 Name == "avx512.kxnor.w" || // Added in 7.0 + Name == "avx512.kortestc.w" || // Added in 7.0 + Name == "avx512.kortestz.w" || // Added in 7.0 Name.startswith("avx512.mask.pshuf.b.") || // Added in 4.0 Name.startswith("avx2.pmax") || // Added in 3.9 Name.startswith("avx2.pmin") || // Added in 3.9 @@ -1156,6 +1158,19 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) { Rep = getX86MaskVec(Builder, CI->getArgOperand(0), 16); Rep = Builder.CreateNot(Rep); Rep = Builder.CreateBitCast(Rep, CI->getType()); + } else if (IsX86 && + (Name == "avx512.kortestz.w" || Name == "avx512.kortestc.w")) { + Value *LHS = getX86MaskVec(Builder, CI->getArgOperand(0), 16); + Value *RHS = getX86MaskVec(Builder, CI->getArgOperand(1), 16); + Rep = Builder.CreateOr(LHS, RHS); + Rep = Builder.CreateBitCast(Rep, Builder.getInt16Ty()); + Value *C; + if (Name[14] == 'c') + C = ConstantInt::getAllOnesValue(Builder.getInt16Ty()); + else + C = ConstantInt::getNullValue(Builder.getInt16Ty()); + Rep = Builder.CreateICmpEQ(Rep, C); + Rep = Builder.CreateZExt(Rep, Builder.getInt32Ty()); } else if (IsX86 && (Name == "sse.add.ss" || Name == "sse2.add.sd")) { Type *I32Ty = Type::getInt32Ty(C); Value *Elt0 = Builder.CreateExtractElement(CI->getArgOperand(0), diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 03d3228e44e..db315cc7a26 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -20561,16 +20561,6 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SDValue SetCC = getSETCC(X86CC, Test, dl, DAG); return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); } - case Intrinsic::x86_avx512_kortestz_w: - case Intrinsic::x86_avx512_kortestc_w: { - X86::CondCode X86CC = - (IntNo == Intrinsic::x86_avx512_kortestz_w) ? X86::COND_E : X86::COND_B; - SDValue LHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(1)); - SDValue RHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(2)); - SDValue Test = DAG.getNode(X86ISD::KORTEST, dl, MVT::i32, LHS, RHS); - SDValue SetCC = getSETCC(X86CC, Test, dl, DAG); - return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); - } case Intrinsic::x86_sse42_pcmpistria128: case Intrinsic::x86_sse42_pcmpestria128: diff --git a/llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll index e5dce2d7248..9b0e30103f4 100644 --- a/llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll +++ b/llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll @@ -55,6 +55,103 @@ entry: ret i16 %13 } +define i32 @test_mm512_kortestc(<8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__C, <8 x i64> %__D) { +; X32-LABEL: test_mm512_kortestc: +; X32: # %bb.0: # %entry +; X32-NEXT: pushl %ebp +; X32-NEXT: .cfi_def_cfa_offset 8 +; X32-NEXT: .cfi_offset %ebp, -8 +; X32-NEXT: movl %esp, %ebp +; X32-NEXT: .cfi_def_cfa_register %ebp +; X32-NEXT: andl $-64, %esp +; X32-NEXT: subl $64, %esp +; X32-NEXT: vpcmpneqd %zmm1, %zmm0, %k0 +; X32-NEXT: vpcmpneqd 8(%ebp), %zmm2, %k1 +; X32-NEXT: korw %k0, %k1, %k0 +; X32-NEXT: kmovw %k0, %eax +; X32-NEXT: cmpw $-1, %ax +; X32-NEXT: sete %al +; X32-NEXT: andb $1, %al +; X32-NEXT: movzbl %al, %eax +; X32-NEXT: movl %ebp, %esp +; X32-NEXT: popl %ebp +; X32-NEXT: vzeroupper +; X32-NEXT: retl +; +; X64-LABEL: test_mm512_kortestc: +; X64: # %bb.0: # %entry +; X64-NEXT: vpcmpneqd %zmm1, %zmm0, %k0 +; X64-NEXT: vpcmpneqd %zmm3, %zmm2, %k1 +; X64-NEXT: korw %k0, %k1, %k0 +; X64-NEXT: kmovw %k0, %eax +; X64-NEXT: cmpw $-1, %ax +; X64-NEXT: sete %al +; X64-NEXT: andb $1, %al +; X64-NEXT: movzbl %al, %eax +; X64-NEXT: vzeroupper +; X64-NEXT: retq +entry: + %0 = bitcast <8 x i64> %__A to <16 x i32> + %1 = bitcast <8 x i64> %__B to <16 x i32> + %2 = icmp ne <16 x i32> %0, %1 + %3 = bitcast <8 x i64> %__C to <16 x i32> + %4 = bitcast <8 x i64> %__D to <16 x i32> + %5 = icmp ne <16 x i32> %3, %4 + %6 = or <16 x i1> %5, %2 %7 = bitcast <16 x i1> %6 to i16 + %8 = icmp eq i16 %7, -1 + %9 = zext i1 %8 to i32 + ret i32 %9 +} + +define i32 @test_mm512_kortestz(<8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__C, <8 x i64> %__D) { +; X32-LABEL: test_mm512_kortestz: +; X32: # %bb.0: # %entry +; X32-NEXT: pushl %ebp +; X32-NEXT: .cfi_def_cfa_offset 8 +; X32-NEXT: .cfi_offset %ebp, -8 +; X32-NEXT: movl %esp, %ebp +; X32-NEXT: .cfi_def_cfa_register %ebp +; X32-NEXT: andl $-64, %esp +; X32-NEXT: subl $64, %esp +; X32-NEXT: vpcmpneqd %zmm1, %zmm0, %k0 +; X32-NEXT: vpcmpneqd 8(%ebp), %zmm2, %k1 +; X32-NEXT: korw %k0, %k1, %k0 +; X32-NEXT: kmovw %k0, %eax +; X32-NEXT: cmpw $0, %ax +; X32-NEXT: sete %al +; X32-NEXT: andb $1, %al +; X32-NEXT: movzbl %al, %eax +; X32-NEXT: movl %ebp, %esp +; X32-NEXT: popl %ebp +; X32-NEXT: vzeroupper +; X32-NEXT: retl +; +; X64-LABEL: test_mm512_kortestz: +; X64: # %bb.0: # %entry +; X64-NEXT: vpcmpneqd %zmm1, %zmm0, %k0 +; X64-NEXT: vpcmpneqd %zmm3, %zmm2, %k1 +; X64-NEXT: korw %k0, %k1, %k0 +; X64-NEXT: kmovw %k0, %eax +; X64-NEXT: cmpw $0, %ax +; X64-NEXT: sete %al +; X64-NEXT: andb $1, %al +; X64-NEXT: movzbl %al, %eax +; X64-NEXT: vzeroupper +; X64-NEXT: retq +entry: + %0 = bitcast <8 x i64> %__A to <16 x i32> + %1 = bitcast <8 x i64> %__B to <16 x i32> + %2 = icmp ne <16 x i32> %0, %1 + %3 = bitcast <8 x i64> %__C to <16 x i32> + %4 = bitcast <8 x i64> %__D to <16 x i32> + %5 = icmp ne <16 x i32> %3, %4 + %6 = or <16 x i1> %5, %2 + %7 = bitcast <16 x i1> %6 to i16 + %8 = icmp eq i16 %7, 0 + %9 = zext i1 %8 to i32 + ret i32 %9 +} + define <16 x float> @test_mm512_shuffle_f32x4(<16 x float> %__A, <16 x float> %__B) { ; X32-LABEL: test_mm512_shuffle_f32x4: ; X32: # %bb.0: # %entry diff --git a/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll index 307691061bf..642c82728cf 100644 --- a/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll +++ b/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll @@ -3832,3 +3832,48 @@ define i16 @test_kxor(i16 %a0, i16 %a1) { ret i16 %t2 } +declare i32 @llvm.x86.avx512.kortestz.w(i16, i16) nounwind readnone +define i32 @test_kortestz(<8 x i64> %A, <8 x i64> %B, <8 x i64> %C, <8 x i64> %D) { +; CHECK-LABEL: test_kortestz: +; CHECK: ## %bb.0: ## %entry +; CHECK-NEXT: vpcmpneqd %zmm1, %zmm0, %k0 +; CHECK-NEXT: vpcmpneqd %zmm3, %zmm2, %k1 +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: kortestw %k1, %k0 +; CHECK-NEXT: sete %al +; CHECK-NEXT: retq +entry: + %0 = bitcast <8 x i64> %A to <16 x i32> + %1 = bitcast <8 x i64> %B to <16 x i32> + %2 = icmp ne <16 x i32> %0, %1 + %3 = bitcast <8 x i64> %C to <16 x i32> + %4 = bitcast <8 x i64> %D to <16 x i32> + %5 = icmp ne <16 x i32> %3, %4 + %6 = bitcast <16 x i1> %2 to i16 + %7 = bitcast <16 x i1> %5 to i16 + %res = call i32 @llvm.x86.avx512.kortestz.w(i16 %6, i16 %7) + ret i32 %res +} + +declare i32 @llvm.x86.avx512.kortestc.w(i16, i16) nounwind readnone +define i32 @test_kortestc(<8 x i64> %A, <8 x i64> %B, <8 x i64> %C, <8 x i64> %D) { +; CHECK-LABEL: test_kortestc: +; CHECK: ## %bb.0: ## %entry +; CHECK-NEXT: vpcmpneqd %zmm1, %zmm0, %k0 +; CHECK-NEXT: vpcmpneqd %zmm3, %zmm2, %k1 +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: kortestw %k1, %k0 +; CHECK-NEXT: sete %al +; CHECK-NEXT: retq +entry: + %0 = bitcast <8 x i64> %A to <16 x i32> + %1 = bitcast <8 x i64> %B to <16 x i32> + %2 = icmp ne <16 x i32> %0, %1 + %3 = bitcast <8 x i64> %C to <16 x i32> + %4 = bitcast <8 x i64> %D to <16 x i32> + %5 = icmp ne <16 x i32> %3, %4 + %6 = bitcast <16 x i1> %2 to i16 + %7 = bitcast <16 x i1> %5 to i16 + %res = call i32 @llvm.x86.avx512.kortestz.w(i16 %6, i16 %7) + ret i32 %res +} diff --git a/llvm/test/CodeGen/X86/avx512-intrinsics.ll b/llvm/test/CodeGen/X86/avx512-intrinsics.ll index 6c608ecddf3..74e91c38fc9 100644 --- a/llvm/test/CodeGen/X86/avx512-intrinsics.ll +++ b/llvm/test/CodeGen/X86/avx512-intrinsics.ll @@ -1,34 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s -declare i32 @llvm.x86.avx512.kortestz.w(i16, i16) nounwind readnone -define i32 @test_kortestz(i16 %a0, i16 %a1) { -; CHECK-LABEL: test_kortestz: -; CHECK: ## %bb.0: -; CHECK-NEXT: kmovw %esi, %k0 -; CHECK-NEXT: kmovw %edi, %k1 -; CHECK-NEXT: xorl %eax, %eax -; CHECK-NEXT: kortestw %k0, %k1 -; CHECK-NEXT: sete %al -; CHECK-NEXT: retq - %res = call i32 @llvm.x86.avx512.kortestz.w(i16 %a0, i16 %a1) - ret i32 %res -} - -declare i32 @llvm.x86.avx512.kortestc.w(i16, i16) nounwind readnone -define i32 @test_kortestc(i16 %a0, i16 %a1) { -; CHECK-LABEL: test_kortestc: -; CHECK: ## %bb.0: -; CHECK-NEXT: kmovw %esi, %k0 -; CHECK-NEXT: kmovw %edi, %k1 -; CHECK-NEXT: xorl %eax, %eax -; CHECK-NEXT: kortestw %k0, %k1 -; CHECK-NEXT: setb %al -; CHECK-NEXT: retq - %res = call i32 @llvm.x86.avx512.kortestc.w(i16 %a0, i16 %a1) - ret i32 %res -} - define <16 x float> @test_rcp_ps_512(<16 x float> %a0) { ; CHECK-LABEL: test_rcp_ps_512: ; CHECK: ## %bb.0: |