| author | Jina Nahias <jina.nahias@intel.com> | 2017-09-19 11:03:06 +0000 | 
|---|---|---|
| committer | Jina Nahias <jina.nahias@intel.com> | 2017-09-19 11:03:06 +0000 | 
| commit | ccfb8d4fe87bbd8b6946395e2495e14ae716ee3d (patch) | |
| tree | 1bd7b2c3d5a8935b083559fb6f076ba3de08b581 | |
| parent | 3ad702a1ed5a20a4cb1e15f5940d8ecfe92533d6 (diff) | |
[x86] Lowering Mask Set1 intrinsics to LLVM IR
This patch, together with a matching clang patch (https://reviews.llvm.org/D37668), implements the lowering of X86 mask set1 intrinsics to IR.
Differential Revision: https://reviews.llvm.org/D37669
llvm-svn: 313625
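In practice, "lowering to IR" here means the masked broadcast-from-GPR intrinsics are no longer carried as opaque target intrinsics; they are expressed as a generic vector splat plus a select on the mask, a pattern the optimizer and instruction selector already understand. A minimal sketch of the transformation for the 512-bit dword case, using illustrative value names (%a, %passthru, %mask) but the exact splat-plus-select shape that appears in the fast-isel tests below:

```llvm
; Old form: one target intrinsic carries the scalar, the passthru vector
; and the write-mask.
%r.old = call <16 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.512(i32 %a, <16 x i32> %passthru, i16 %mask)

; New form: a generic splat of the scalar, then a select on the mask bits.
%v     = insertelement <16 x i32> undef, i32 %a, i32 0
%splat = shufflevector <16 x i32> %v, <16 x i32> undef, <16 x i32> zeroinitializer
%m     = bitcast i16 %mask to <16 x i1>
%r     = select <16 x i1> %m, <16 x i32> %splat, <16 x i32> %passthru
```

Instruction selection matches this pattern back to a single masked broadcast, e.g. `vpbroadcastd %esi, %zmm0 {%k1}`, as the X64 CHECK lines in the tests below verify.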
| file | lines changed |
|---|---|
| llvm/include/llvm/IR/IntrinsicsX86.td | 59 |
| llvm/lib/IR/AutoUpgrade.cpp | 7 |
| llvm/lib/Target/X86/X86IntrinsicsInfo.h | 24 |
| llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll | 104 |
| llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll | 40 |
| llvm/test/CodeGen/X86/avx512-intrinsics.ll | 36 |
| llvm/test/CodeGen/X86/avx512bw-intrinsics-fast-isel.ll | 1507 |
| llvm/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll | 62 |
| llvm/test/CodeGen/X86/avx512bw-intrinsics.ll | 65 |
| llvm/test/CodeGen/X86/avx512bwvl-intrinsics-fast-isel.ll | 189 |
| llvm/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll | 84 |
| llvm/test/CodeGen/X86/avx512bwvl-intrinsics.ll | 78 |
| llvm/test/CodeGen/X86/avx512vl-intrinsics-fast-isel.ll | 201 |
| llvm/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll | 92 |
| llvm/test/CodeGen/X86/avx512vl-intrinsics.ll | 79 |
15 files changed, 2282 insertions, 345 deletions
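The patch has three moving parts: the intrinsic definitions are deleted from IntrinsicsX86.td, their hand-written selection entries are deleted from X86IntrinsicsInfo.h, and AutoUpgrade.cpp gains a rule so that old bitcode which still calls the intrinsics is rewritten on load (the CreateVectorSplat followed by EmitX86Select sequence in the AutoUpgrade.cpp hunk below). As a hedged sketch of what that upgrade path emits, an old zeroing-masked call for the 512-bit qword case (illustrative names %a, %mask) comes out roughly as:

```llvm
; Old bitcode: zeroing form, i.e. the passthru operand is zeroinitializer.
%r.old = call <8 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.512(i64 %a, <8 x i64> zeroinitializer, i8 %mask)

; After auto-upgrade: splat the scalar, then select against the zero passthru.
%v     = insertelement <8 x i64> undef, i64 %a, i32 0
%splat = shufflevector <8 x i64> %v, <8 x i64> undef, <8 x i32> zeroinitializer
%m     = bitcast i8 %mask to <8 x i1>
%r     = select <8 x i1> %m, <8 x i64> %splat, <8 x i64> zeroinitializer
```

With an all-ones mask the select folds away entirely, which is why the upgrade tests call each intrinsic three times (mask -1, a live mask, and a live mask with a zero passthru) and expect a plain vpbroadcast, a merging {%k1} form, and a zeroing {%k1} {z} form respectively.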
diff --git a/llvm/include/llvm/IR/IntrinsicsX86.td b/llvm/include/llvm/IR/IntrinsicsX86.td
index fe630722755..a5f32d23cfc 100644
--- a/llvm/include/llvm/IR/IntrinsicsX86.td
+++ b/llvm/include/llvm/IR/IntrinsicsX86.td
@@ -1871,65 +1871,6 @@ let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
                           llvm_v32i8_ty], [IntrNoMem]>;
 }
-// Vector load with broadcast
-let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
-  def int_x86_avx512_mask_pbroadcast_b_gpr_128 :
-          GCCBuiltin<"__builtin_ia32_pbroadcastb128_gpr_mask">,
-          Intrinsic<[llvm_v16i8_ty],
-                    [llvm_i8_ty, llvm_v16i8_ty, llvm_i16_ty], [IntrNoMem]>;
-  def int_x86_avx512_mask_pbroadcast_b_gpr_256 :
-          GCCBuiltin<"__builtin_ia32_pbroadcastb256_gpr_mask">,
-          Intrinsic<[llvm_v32i8_ty],
-                    [llvm_i8_ty, llvm_v32i8_ty, llvm_i32_ty], [IntrNoMem]>;
-  def int_x86_avx512_mask_pbroadcast_b_gpr_512 :
-          GCCBuiltin<"__builtin_ia32_pbroadcastb512_gpr_mask">,
-          Intrinsic<[llvm_v64i8_ty],
-                    [llvm_i8_ty, llvm_v64i8_ty, llvm_i64_ty], [IntrNoMem]>;
-
-  def int_x86_avx512_mask_pbroadcast_w_gpr_128 :
-          GCCBuiltin<"__builtin_ia32_pbroadcastw128_gpr_mask">,
-          Intrinsic<[llvm_v8i16_ty],
-                    [llvm_i16_ty, llvm_v8i16_ty, llvm_i8_ty], [IntrNoMem]>;
-  def int_x86_avx512_mask_pbroadcast_w_gpr_256 :
-          GCCBuiltin<"__builtin_ia32_pbroadcastw256_gpr_mask">,
-          Intrinsic<[llvm_v16i16_ty],
-                    [llvm_i16_ty, llvm_v16i16_ty, llvm_i16_ty], [IntrNoMem]>;
-  def int_x86_avx512_mask_pbroadcast_w_gpr_512 :
-          GCCBuiltin<"__builtin_ia32_pbroadcastw512_gpr_mask">,
-          Intrinsic<[llvm_v32i16_ty],
-                    [llvm_i16_ty, llvm_v32i16_ty, llvm_i32_ty], [IntrNoMem]>;
-
-  def int_x86_avx512_mask_pbroadcast_d_gpr_128 :
-          GCCBuiltin<"__builtin_ia32_pbroadcastd128_gpr_mask">,
-          Intrinsic<[llvm_v4i32_ty],
-                    [llvm_i32_ty, llvm_v4i32_ty, llvm_i8_ty], [IntrNoMem]>;
-  def int_x86_avx512_mask_pbroadcast_d_gpr_256 :
-          GCCBuiltin<"__builtin_ia32_pbroadcastd256_gpr_mask">,
-          Intrinsic<[llvm_v8i32_ty],
-                    [llvm_i32_ty, llvm_v8i32_ty, llvm_i8_ty], [IntrNoMem]>;
-  def int_x86_avx512_mask_pbroadcast_d_gpr_512 :
-          GCCBuiltin<"__builtin_ia32_pbroadcastd512_gpr_mask">,
-          Intrinsic<[llvm_v16i32_ty],
-                    [llvm_i32_ty, llvm_v16i32_ty, llvm_i16_ty], [IntrNoMem]>;
-
-  def int_x86_avx512_mask_pbroadcast_q_gpr_128 :
-          GCCBuiltin<"__builtin_ia32_pbroadcastq128_gpr_mask">,
-          Intrinsic<[llvm_v2i64_ty],
-                    [llvm_i64_ty, llvm_v2i64_ty, llvm_i8_ty], [IntrNoMem]>;
-  def int_x86_avx512_mask_pbroadcast_q_gpr_256 :
-          GCCBuiltin<"__builtin_ia32_pbroadcastq256_gpr_mask">,
-          Intrinsic<[llvm_v4i64_ty],
-                    [llvm_i64_ty, llvm_v4i64_ty, llvm_i8_ty], [IntrNoMem]>;
-  def int_x86_avx512_mask_pbroadcast_q_gpr_512 :
-          GCCBuiltin<"__builtin_ia32_pbroadcastq512_gpr_mask">,
-          Intrinsic<[llvm_v8i64_ty],
-                    [llvm_i64_ty, llvm_v8i64_ty, llvm_i8_ty], [IntrNoMem]>;
-
-  def int_x86_avx512_mask_pbroadcast_q_mem_512 :
-          GCCBuiltin<"__builtin_ia32_pbroadcastq512_mem_mask">,
-          Intrinsic<[llvm_v8i64_ty],
-                    [llvm_i64_ty, llvm_v8i64_ty, llvm_i8_ty], [IntrNoMem]>;
-}
 // Vector permutation
 let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp
index 2e8a41e06e4..b1c8e74084f 100644
--- a/llvm/lib/IR/AutoUpgrade.cpp
+++ b/llvm/lib/IR/AutoUpgrade.cpp
@@ -77,6 +77,7 @@ static bool ShouldUpgradeX86Intrinsic(Function *F, StringRef Name) {
       Name=="ssse3.pabs.d.128" || // Added in 6.0
       Name.startswith("avx2.pabs.") || // Added in 6.0
       Name.startswith("avx512.mask.pabs.") || // Added in 6.0
+      Name.startswith("avx512.mask.pbroadcast") || // Added in 6.0
       Name.startswith("sse2.pcmpeq.") || // Added in 3.1
       Name.startswith("sse2.pcmpgt.") || // Added in 3.1
       Name.startswith("avx2.pcmpeq.") || // Added in 3.1
@@ -1031,6 +1032,12 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
       Rep = Builder.CreateICmp(CmpEq ? ICmpInst::ICMP_EQ : ICmpInst::ICMP_SGT,
                                CI->getArgOperand(0), CI->getArgOperand(1));
       Rep = Builder.CreateSExt(Rep, CI->getType(), "");
+    } else if (IsX86 && (Name.startswith("avx512.mask.pbroadcast"))){
+      unsigned NumElts =
+          CI->getArgOperand(1)->getType()->getVectorNumElements();
+      Rep = Builder.CreateVectorSplat(NumElts, CI->getArgOperand(0));
+      Rep = EmitX86Select(Builder, CI->getArgOperand(2), Rep,
+                          CI->getArgOperand(1));
     } else if (IsX86 && (Name == "sse.add.ss" || Name == "sse2.add.sd")) {
       Type *I32Ty = Type::getInt32Ty(C);
       Value *Elt0 = Builder.CreateExtractElement(CI->getArgOperand(0),
diff --git a/llvm/lib/Target/X86/X86IntrinsicsInfo.h b/llvm/lib/Target/X86/X86IntrinsicsInfo.h
index 4f1b6572875..d9d0b06c960 100644
--- a/llvm/lib/Target/X86/X86IntrinsicsInfo.h
+++ b/llvm/lib/Target/X86/X86IntrinsicsInfo.h
@@ -797,30 +797,6 @@ static const IntrinsicData  IntrinsicsWithoutChain[] = {
   X86_INTRINSIC_DATA(avx512_mask_paddus_w_128, INTR_TYPE_2OP_MASK, X86ISD::ADDUS, 0),
   X86_INTRINSIC_DATA(avx512_mask_paddus_w_256, INTR_TYPE_2OP_MASK, X86ISD::ADDUS, 0),
   X86_INTRINSIC_DATA(avx512_mask_paddus_w_512, INTR_TYPE_2OP_MASK, X86ISD::ADDUS, 0),
-  X86_INTRINSIC_DATA(avx512_mask_pbroadcast_b_gpr_128, INTR_TYPE_1OP_MASK,
-                     X86ISD::VBROADCAST, 0),
-  X86_INTRINSIC_DATA(avx512_mask_pbroadcast_b_gpr_256, INTR_TYPE_1OP_MASK,
-                     X86ISD::VBROADCAST, 0),
-  X86_INTRINSIC_DATA(avx512_mask_pbroadcast_b_gpr_512, INTR_TYPE_1OP_MASK,
-                     X86ISD::VBROADCAST, 0),
-  X86_INTRINSIC_DATA(avx512_mask_pbroadcast_d_gpr_128, INTR_TYPE_1OP_MASK,
-                     X86ISD::VBROADCAST, 0),
-  X86_INTRINSIC_DATA(avx512_mask_pbroadcast_d_gpr_256, INTR_TYPE_1OP_MASK,
-                     X86ISD::VBROADCAST, 0),
-  X86_INTRINSIC_DATA(avx512_mask_pbroadcast_d_gpr_512, INTR_TYPE_1OP_MASK,
-                     X86ISD::VBROADCAST, 0),
-  X86_INTRINSIC_DATA(avx512_mask_pbroadcast_q_gpr_128, INTR_TYPE_1OP_MASK,
-                     X86ISD::VBROADCAST, 0),
-  X86_INTRINSIC_DATA(avx512_mask_pbroadcast_q_gpr_256, INTR_TYPE_1OP_MASK,
-                     X86ISD::VBROADCAST, 0),
-  X86_INTRINSIC_DATA(avx512_mask_pbroadcast_q_gpr_512, INTR_TYPE_1OP_MASK,
-                     X86ISD::VBROADCAST, 0),
-  X86_INTRINSIC_DATA(avx512_mask_pbroadcast_w_gpr_128, INTR_TYPE_1OP_MASK,
-                     X86ISD::VBROADCAST, 0),
-  X86_INTRINSIC_DATA(avx512_mask_pbroadcast_w_gpr_256, INTR_TYPE_1OP_MASK,
-                     X86ISD::VBROADCAST, 0),
-  X86_INTRINSIC_DATA(avx512_mask_pbroadcast_w_gpr_512, INTR_TYPE_1OP_MASK,
-                     X86ISD::VBROADCAST, 0),
   X86_INTRINSIC_DATA(avx512_mask_permvar_df_256, VPERM_2OP_MASK,
                      X86ISD::VPERMV, 0),
   X86_INTRINSIC_DATA(avx512_mask_permvar_df_512, VPERM_2OP_MASK,
diff --git a/llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll
index 65e655434c4..deef8ba80b6 100644
--- a/llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll
+++ b/llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll
@@ -4,6 +4,110 @@
 ; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/avx512f-builtins.c
+define <8 x i64> @test_mm512_mask_set1_epi32(<8 x i64> %__O, i16 zeroext %__M, i32 %__A) {
+; X32-LABEL: test_mm512_mask_set1_epi32:
+; X32:       # BB#0: # %entry
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    kmovw %ecx, %k1
+; X32-NEXT:    vpbroadcastd %eax, %zmm0 {%k1}
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm512_mask_set1_epi32:
+; X64:       # BB#0: # %entry
+; X64-NEXT:    kmovw %edi, %k1
+; X64-NEXT:    vpbroadcastd %esi, %zmm0 {%k1}
+; X64-NEXT:    retq
+entry:
+  %vecinit.i.i = insertelement <16 x i32> undef, i32 %__A, i32 0
+  %vecinit15.i.i = shufflevector <16 x i32> %vecinit.i.i, <16 x i32> undef, <16 x i32> zeroinitializer
+  %0 = bitcast <8 x i64> %__O to <16 x i32>
+  %1 = bitcast i16 %__M to <16 x i1>
+  %2 = select <16 x i1> %1, <16 x i32> %vecinit15.i.i, <16 x i32> %0
+  %3 = bitcast <16 x i32> %2 to <8 x i64>
+  ret <8 x i64> %3
+}
+
+define <8 x i64> @test_mm512_maskz_set1_epi32(i16 zeroext %__M, i32 %__A)  {
+; X32-LABEL: test_mm512_maskz_set1_epi32:
+; X32:       # BB#0: # %entry
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    kmovw %ecx, %k1
+; X32-NEXT:    vpbroadcastd %eax, %zmm0 {%k1} {z}
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm512_maskz_set1_epi32:
+; X64:       # BB#0: # %entry
+; X64-NEXT:    kmovw %edi, %k1
+; X64-NEXT:    vpbroadcastd %esi, %zmm0 {%k1} {z}
+; X64-NEXT:    retq
+entry:
+  %vecinit.i.i = insertelement <16 x i32> undef, i32 %__A, i32 0
+  %vecinit15.i.i = shufflevector <16 x i32> %vecinit.i.i, <16 x i32> undef, <16 x i32> zeroinitializer
+  %0 = bitcast i16 %__M to <16 x i1>
+  %1 = select <16 x i1> %0, <16 x i32> %vecinit15.i.i, <16 x i32> zeroinitializer
+  %2 = bitcast <16 x i32> %1 to <8 x i64>
+  ret <8 x i64> %2
+}
+
+define <8 x i64> @test_mm512_mask_set1_epi64(<8 x i64> %__O, i8 zeroext %__M, i64 %__A) {
+; X32-LABEL: test_mm512_mask_set1_epi64:
+; X32:       # BB#0: # %entry
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
+; X32-NEXT:    vmovd %edx, %xmm1
+; X32-NEXT:    vpinsrd $1, %ecx, %xmm1, %xmm1
+; X32-NEXT:    vpinsrd $2, %edx, %xmm1, %xmm1
+; X32-NEXT:    vpinsrd $3, %ecx, %xmm1, %xmm1
+; X32-NEXT:    vinserti128 $1, %xmm1, %ymm1, %ymm1
+; X32-NEXT:    kmovw %eax, %k1
+; X32-NEXT:    vinserti64x4 $1, %ymm1, %zmm1, %zmm0 {%k1}
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm512_mask_set1_epi64:
+; X64:       # BB#0: # %entry
+; X64-NEXT:    kmovw %edi, %k1
+; X64-NEXT:    vpbroadcastq %rsi, %zmm0 {%k1}
+; X64-NEXT:    retq
+entry:
+  %vecinit.i.i = insertelement <8 x i64> undef, i64 %__A, i32 0
+  %vecinit7.i.i = shufflevector <8 x i64> %vecinit.i.i, <8 x i64> undef, <8 x i32> zeroinitializer
+  %0 = bitcast i8 %__M to <8 x i1>
+  %1 = select <8 x i1> %0, <8 x i64> %vecinit7.i.i, <8 x i64> %__O
+  ret <8 x i64> %1
+}
+
+define <8 x i64> @test_mm512_maskz_set1_epi64(i8 zeroext %__M, i64 %__A)  {
+; X32-LABEL: test_mm512_maskz_set1_epi64:
+; X32:       # BB#0: # %entry
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
+; X32-NEXT:    vmovd %edx, %xmm0
+; X32-NEXT:    vpinsrd $1, %ecx, %xmm0, %xmm0
+; X32-NEXT:    vpinsrd $2, %edx, %xmm0, %xmm0
+; X32-NEXT:    vpinsrd $3, %ecx, %xmm0, %xmm0
+; X32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
+; X32-NEXT:    kmovw %eax, %k1
+; X32-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0 {%k1} {z}
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm512_maskz_set1_epi64:
+; X64:       # BB#0: # %entry
+; X64-NEXT:    kmovw %edi, %k1
+; X64-NEXT:    vpbroadcastq %rsi, %zmm0 {%k1} {z}
+; X64-NEXT:    retq
+entry:
+  %vecinit.i.i = insertelement <8 x i64> undef, i64 %__A, i32 0
+  %vecinit7.i.i = shufflevector <8 x i64> %vecinit.i.i, <8 x i64> undef, <8 x i32> zeroinitializer
+  %0 = bitcast i8 %__M to <8 x i1>
+  %1 = select <8 x i1> %0, <8 x i64> %vecinit7.i.i, <8 x i64> zeroinitializer
+  ret <8 x i64> %1
+}
+
+
 define <8 x i64> @test_mm512_broadcastd_epi32(<2 x i64> %a0) {
 ; X32-LABEL: test_mm512_broadcastd_epi32:
 ; X32:       # BB#0:
diff --git a/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll
index f401cac82da..8d653c78182 100644
--- a/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll
+++ b/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll
@@ -1,6 +1,46 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s
+
+ define <16 x i32>@test_int_x86_avx512_mask_pbroadcastd_gpr_512(i32 %x0, <16 x i32> %x1, i16 %mask) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pbroadcastd_gpr_512:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vpbroadcastd %edi, %zmm1
+; CHECK-NEXT:    kmovw %esi, %k1
+; CHECK-NEXT:    vpbroadcastd %edi, %zmm0 {%k1}
+; CHECK-NEXT:    vpaddd %zmm0, %zmm1, %zmm0
+; CHECK-NEXT:    vpbroadcastd %edi, %zmm1 {%k1} {z}
+; CHECK-NEXT:    vpaddd %zmm0, %zmm1, %zmm0
+; CHECK-NEXT:    retq
+    %res = call <16 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.512(i32 %x0, <16 x i32> %x1, i16 -1)
+    %res1 = call <16 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.512(i32 %x0, <16 x i32> %x1, i16 %mask)
+    %res2 = call <16 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.512(i32 %x0, <16 x i32> zeroinitializer, i16 %mask)
+    %res3 = add <16 x i32> %res, %res1
+    %res4 = add <16 x i32> %res2, %res3
+    ret <16 x i32> %res4
+  }
+declare <16 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.512(i32, <16 x i32>, i16)
+
+
+define <8 x i64>@test_int_x86_avx512_mask_pbroadcastq_gpr_512(i64 %x0, <8 x i64> %x1, i8 %mask) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pbroadcastq_gpr_512:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vpbroadcastq %rdi, %zmm1
+; CHECK-NEXT:    kmovw %esi, %k1
+; CHECK-NEXT:    vpbroadcastq %rdi, %zmm0 {%k1}
+; CHECK-NEXT:    vpaddq %zmm0, %zmm1, %zmm0
+; CHECK-NEXT:    vpbroadcastq %rdi, %zmm1 {%k1} {z}
+; CHECK-NEXT:    vpaddq %zmm0, %zmm1, %zmm0
+; CHECK-NEXT:    retq
+   %res = call <8 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.512(i64 %x0, <8 x i64> %x1,i8 -1)
+   %res1 = call <8 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.512(i64 %x0, <8 x i64> %x1,i8 %mask)
+   %res2 = call <8 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.512(i64 %x0, <8 x i64> zeroinitializer,i8 %mask)
+   %res3 = add <8 x i64> %res, %res1
+   %res4 = add <8 x i64> %res2, %res3
+   ret <8 x i64> %res4 +} +declare <8 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.512(i64, <8 x i64>, i8) + +  declare <16 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.512(<4 x float>, <16 x float>, i16) nounwind readonly  define <16 x float> @test_x86_vbroadcast_ss_ps_512(<4 x float> %a0, <16 x float> %a1, i16 %mask ) { diff --git a/llvm/test/CodeGen/X86/avx512-intrinsics.ll b/llvm/test/CodeGen/X86/avx512-intrinsics.ll index ffbe5b29b48..4816e10de58 100644 --- a/llvm/test/CodeGen/X86/avx512-intrinsics.ll +++ b/llvm/test/CodeGen/X86/avx512-intrinsics.ll @@ -4171,44 +4171,8 @@ define i8@test_int_x86_avx512_ptestnm_q_512(<8 x i64> %x0, <8 x i64> %x1, i8 %x2    ret i8 %res2  } -define <16 x i32>@test_int_x86_avx512_mask_pbroadcastd_gpr_512(i32 %x0, <16 x i32> %x1, i16 %mask) { -; CHECK-LABEL: test_int_x86_avx512_mask_pbroadcastd_gpr_512: -; CHECK:       ## BB#0: -; CHECK-NEXT:    kmovw %esi, %k1 -; CHECK-NEXT:    vpbroadcastd %edi, %zmm1 {%k1} {z} -; CHECK-NEXT:    vpbroadcastd %edi, %zmm0 {%k1} -; CHECK-NEXT:    vpbroadcastd %edi, %zmm2 -; CHECK-NEXT:    vpaddd %zmm0, %zmm2, %zmm0 -; CHECK-NEXT:    vpaddd %zmm0, %zmm1, %zmm0 -; CHECK-NEXT:    retq -  %res = call <16 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.512(i32 %x0, <16 x i32> %x1, i16 -1) -  %res1 = call <16 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.512(i32 %x0, <16 x i32> %x1, i16 %mask) -  %res2 = call <16 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.512(i32 %x0, <16 x i32> zeroinitializer, i16 %mask) -  %res3 = add <16 x i32> %res, %res1 -  %res4 = add <16 x i32> %res2, %res3 -  ret <16 x i32> %res4 -} -declare <16 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.512(i32, <16 x i32>, i16) -define <8 x i64>@test_int_x86_avx512_mask_pbroadcastq_gpr_512(i64 %x0, <8 x i64> %x1, i8 %mask) { -; CHECK-LABEL: test_int_x86_avx512_mask_pbroadcastq_gpr_512: -; CHECK:       ## BB#0: -; CHECK-NEXT:    kmovw %esi, %k1 -; CHECK-NEXT:    vpbroadcastq %rdi, %zmm1 {%k1} {z} -; CHECK-NEXT:    vpbroadcastq %rdi, %zmm0 {%k1} -; CHECK-NEXT:    vpbroadcastq %rdi, %zmm2 -; CHECK-NEXT:    vpaddq %zmm0, %zmm2, %zmm0 -; CHECK-NEXT:    vpaddq %zmm0, %zmm1, %zmm0 -; CHECK-NEXT:    retq -  %res = call <8 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.512(i64 %x0, <8 x i64> %x1,i8 -1) -  %res1 = call <8 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.512(i64 %x0, <8 x i64> %x1,i8 %mask) -  %res2 = call <8 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.512(i64 %x0, <8 x i64> zeroinitializer,i8 %mask) -  %res3 = add <8 x i64> %res, %res1 -  %res4 = add <8 x i64> %res2, %res3 -  ret <8 x i64> %res4 -} -declare <8 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.512(i64, <8 x i64>, i8)  declare <2 x double> @llvm.x86.avx512.mask.vfmadd.sd(<2 x double>, <2 x double>, <2 x double>, i8, i32) diff --git a/llvm/test/CodeGen/X86/avx512bw-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/avx512bw-intrinsics-fast-isel.ll index 50a9076163e..184e152a9e4 100644 --- a/llvm/test/CodeGen/X86/avx512bw-intrinsics-fast-isel.ll +++ b/llvm/test/CodeGen/X86/avx512bw-intrinsics-fast-isel.ll @@ -4,6 +4,1513 @@  ; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/avx512bw-builtins.c +define <8 x i64> @test_mm512_mask_set1_epi8(<8 x i64> %__O, i64 %__M, i8 signext %__A)  { +; X32-LABEL: test_mm512_mask_set1_epi8: +; X32:       # BB#0: # %entry +; X32-NEXT:    pushl %ebx +; X32-NEXT:  .Lcfi0: +; X32-NEXT:    .cfi_def_cfa_offset 8 +; X32-NEXT:  .Lcfi1: +; X32-NEXT:    .cfi_offset %ebx, -8 +; X32-NEXT:    vmovdqa64 %zmm0, %zmm3 +; X32-NEXT:    movl 
{{[0-9]+}}(%esp), %eax +; X32-NEXT:    movl %eax, %ecx +; X32-NEXT:    andb $2, %cl +; X32-NEXT:    shrb %cl +; X32-NEXT:    kmovd %ecx, %k0 +; X32-NEXT:    vpmovm2b %k0, %zmm1 +; X32-NEXT:    vpsllw $8, %xmm1, %xmm1 +; X32-NEXT:    kmovd %eax, %k0 +; X32-NEXT:    vpmovm2b %k0, %zmm2 +; X32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; X32-NEXT:    vpblendvb %ymm0, %ymm2, %ymm1, %ymm1 +; X32-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm2[4,5,6,7] +; X32-NEXT:    vpmovb2m %zmm1, %k0 +; X32-NEXT:    vpmovm2b %k0, %zmm1 +; X32-NEXT:    movl %eax, %ecx +; X32-NEXT:    andb $15, %cl +; X32-NEXT:    movl %ecx, %edx +; X32-NEXT:    shrb $2, %dl +; X32-NEXT:    kmovd %edx, %k0 +; X32-NEXT:    vpmovm2b %k0, %zmm2 +; X32-NEXT:    vpbroadcastw %xmm2, %xmm2 +; X32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; X32-NEXT:    vpblendvb %ymm0, %ymm1, %ymm2, %ymm2 +; X32-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm2[0,1,2,3],zmm1[4,5,6,7] +; X32-NEXT:    vpmovb2m %zmm1, %k0 +; X32-NEXT:    vpmovm2b %k0, %zmm1 +; X32-NEXT:    shrb $3, %cl +; X32-NEXT:    kmovd %ecx, %k0 +; X32-NEXT:    vpmovm2b %k0, %zmm2 +; X32-NEXT:    vpslld $24, %xmm2, %xmm2 +; X32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; X32-NEXT:    vpblendvb %ymm0, %ymm1, %ymm2, %ymm2 +; X32-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm2[0,1,2,3],zmm1[4,5,6,7] +; X32-NEXT:    vpmovb2m %zmm1, %k0 +; X32-NEXT:    vpmovm2b %k0, %zmm1 +; X32-NEXT:    movl %eax, %ecx +; X32-NEXT:    shrb $4, %cl +; X32-NEXT:    kmovd %ecx, %k0 +; X32-NEXT:    vpmovm2b %k0, %zmm2 +; X32-NEXT:    vpbroadcastd %xmm2, %xmm2 +; X32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; X32-NEXT:    vpblendvb %ymm0, %ymm1, %ymm2, %ymm2 +; X32-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm2[0,1,2,3],zmm1[4,5,6,7] +; X32-NEXT:    vpmovb2m %zmm1, %k0 +; X32-NEXT:    vpmovm2b %k0, %zmm1 +; X32-NEXT:    movl %eax, %ecx +; X32-NEXT:    shrb $5, %cl +; X32-NEXT:    andb $1, %cl +; X32-NEXT:    kmovd %ecx, %k0 +; X32-NEXT:    vpmovm2b %k0, %zmm2 +; X32-NEXT:    vpsllq $40, %xmm2, %xmm2 +; X32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; X32-NEXT:    vpblendvb %ymm0, %ymm1, %ymm2, %ymm2 +; X32-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm2[0,1,2,3],zmm1[4,5,6,7] +; X32-NEXT:    vpmovb2m %zmm1, %k0 +; X32-NEXT:    vpmovm2b %k0, %zmm1 +; X32-NEXT:    movl %eax, %ecx +; X32-NEXT:    shrb $6, %cl +; X32-NEXT:    kmovd %ecx, %k0 +; X32-NEXT:    vpmovm2b %k0, %zmm2 +; X32-NEXT:    vpbroadcastw %xmm2, %xmm2 +; X32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; X32-NEXT:    vpblendvb %ymm0, %ymm1, %ymm2, %ymm2 +; X32-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm2[0,1,2,3],zmm1[4,5,6,7] +; X32-NEXT:    vpmovb2m %zmm1, %k0 +; X32-NEXT:    movl %eax, %ecx +; X32-NEXT:    shrb $7, %cl +; X32-NEXT:    kmovd %ecx, %k1 +; X32-NEXT:    vpmovm2b %k1, %zmm1 +; X32-NEXT:    vpsllq $56, %xmm1, %xmm1 +; X32-NEXT:    vpmovm2b %k0, %zmm2 +; 
X32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; X32-NEXT:    vpblendvb %ymm0, %ymm2, %ymm1, %ymm1 +; X32-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm2[4,5,6,7] +; X32-NEXT:    vpmovb2m %zmm1, %k0 +; X32-NEXT:    movb %ah, %cl +; X32-NEXT:    kmovd %ecx, %k1 +; X32-NEXT:    vpmovm2b %k1, %zmm1 +; X32-NEXT:    vpbroadcastq %xmm1, %xmm1 +; X32-NEXT:    vpmovm2b %k0, %zmm2 +; X32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; X32-NEXT:    vpblendvb %ymm0, %ymm2, %ymm1, %ymm1 +; X32-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm2[4,5,6,7] +; X32-NEXT:    vpmovb2m %zmm1, %k0 +; X32-NEXT:    andb $2, %cl +; X32-NEXT:    shrb %cl +; X32-NEXT:    kmovd %ecx, %k1 +; X32-NEXT:    vpmovm2b %k1, %zmm1 +; X32-NEXT:    vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6] +; X32-NEXT:    vpmovm2b %k0, %zmm2 +; X32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; X32-NEXT:    vpblendvb %ymm0, %ymm2, %ymm1, %ymm1 +; X32-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm2[4,5,6,7] +; X32-NEXT:    vpmovb2m %zmm1, %k0 +; X32-NEXT:    movb %ah, %cl +; X32-NEXT:    andb $15, %cl +; X32-NEXT:    movl %ecx, %edx +; X32-NEXT:    shrb $2, %dl +; X32-NEXT:    kmovd %edx, %k1 +; X32-NEXT:    vpmovm2b %k1, %zmm1 +; X32-NEXT:    vpbroadcastw %xmm1, %xmm1 +; X32-NEXT:    vpmovm2b %k0, %zmm2 +; X32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; X32-NEXT:    vpblendvb %ymm0, %ymm2, %ymm1, %ymm1 +; X32-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm2[4,5,6,7] +; X32-NEXT:    vpmovb2m %zmm1, %k0 +; X32-NEXT:    shrb $3, %cl +; X32-NEXT:    kmovd %ecx, %k1 +; X32-NEXT:    vpmovm2b %k1, %zmm1 +; X32-NEXT:    vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4] +; X32-NEXT:    vpmovm2b %k0, %zmm2 +; X32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; X32-NEXT:    vpblendvb %ymm0, %ymm2, %ymm1, %ymm1 +; X32-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm2[4,5,6,7] +; X32-NEXT:    vpmovb2m %zmm1, %k0 +; X32-NEXT:    movl %eax, %ecx +; X32-NEXT:    andl $61440, %ecx # imm = 0xF000 +; X32-NEXT:    shrl $12, %ecx +; X32-NEXT:    kmovd %ecx, %k1 +; X32-NEXT:    vpmovm2b %k1, %zmm1 +; X32-NEXT:    vpbroadcastd %xmm1, %xmm1 +; X32-NEXT:    vpmovm2b %k0, %zmm2 +; X32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; X32-NEXT:    vpblendvb %ymm0, %ymm2, %ymm1, %ymm1 +; X32-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm2[4,5,6,7] +; X32-NEXT:    vpmovb2m %zmm1, %k0 +; X32-NEXT:    movl %eax, %ecx +; X32-NEXT:    shrl $13, %ecx +; X32-NEXT:    andb $1, %cl +; X32-NEXT:    kmovd %ecx, %k1 +; X32-NEXT:    vpmovm2b %k1, %zmm1 +; X32-NEXT:    vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2] +; X32-NEXT:    vpmovm2b %k0, %zmm2 +; X32-NEXT:    vmovdqa {{.*#+}} ymm0 = 
[255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; X32-NEXT:    vpblendvb %ymm0, %ymm2, %ymm1, %ymm1 +; X32-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm2[4,5,6,7] +; X32-NEXT:    vpmovb2m %zmm1, %k0 +; X32-NEXT:    movl %eax, %ecx +; X32-NEXT:    andl $49152, %ecx # imm = 0xC000 +; X32-NEXT:    shrl $14, %ecx +; X32-NEXT:    kmovd %ecx, %k1 +; X32-NEXT:    vpmovm2b %k1, %zmm1 +; X32-NEXT:    vpbroadcastw %xmm1, %xmm1 +; X32-NEXT:    vpmovm2b %k0, %zmm2 +; X32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; X32-NEXT:    vpblendvb %ymm0, %ymm2, %ymm1, %ymm1 +; X32-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm2[4,5,6,7] +; X32-NEXT:    vpmovb2m %zmm1, %k0 +; X32-NEXT:    movl %eax, %ecx +; X32-NEXT:    andl $32768, %ecx # imm = 0x8000 +; X32-NEXT:    shrl $15, %ecx +; X32-NEXT:    kmovd %ecx, %k1 +; X32-NEXT:    vpmovm2b %k1, %zmm1 +; X32-NEXT:    vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0] +; X32-NEXT:    vpmovm2b %k0, %zmm2 +; X32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; X32-NEXT:    vpblendvb %ymm0, %ymm2, %ymm1, %ymm1 +; X32-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm2[4,5,6,7] +; X32-NEXT:    vpmovb2m %zmm1, %k0 +; X32-NEXT:    movl %eax, %ecx +; X32-NEXT:    shrl $16, %ecx +; X32-NEXT:    kmovd %ecx, %k1 +; X32-NEXT:    vpmovm2b %k1, %zmm1 +; X32-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1 +; X32-NEXT:    vpmovm2b %k0, %zmm2 +; X32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; X32-NEXT:    vpblendvb %ymm0, %ymm2, %ymm1, %ymm1 +; X32-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm2[4,5,6,7] +; X32-NEXT:    vpmovb2m %zmm1, %k0 +; X32-NEXT:    movl %ecx, %edx +; X32-NEXT:    andb $2, %dl +; X32-NEXT:    shrb %dl +; X32-NEXT:    kmovd %edx, %k1 +; X32-NEXT:    vpmovm2b %k1, %zmm1 +; X32-NEXT:    vpsllw $8, %xmm1, %xmm1 +; X32-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1 +; X32-NEXT:    vpmovm2b %k0, %zmm2 +; X32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; X32-NEXT:    vpblendvb %ymm0, %ymm2, %ymm1, %ymm1 +; X32-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm2[4,5,6,7] +; X32-NEXT:    vpmovb2m %zmm1, %k0 +; X32-NEXT:    movl %ecx, %edx +; X32-NEXT:    andb $15, %dl +; X32-NEXT:    movl %edx, %ebx +; X32-NEXT:    shrb $2, %bl +; X32-NEXT:    kmovd %ebx, %k1 +; X32-NEXT:    vpmovm2b %k1, %zmm1 +; X32-NEXT:    vpbroadcastw %xmm1, %xmm1 +; X32-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1 +; X32-NEXT:    vpmovm2b %k0, %zmm2 +; X32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255] +; X32-NEXT:    vpblendvb %ymm0, %ymm2, %ymm1, %ymm1 +; X32-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm2[4,5,6,7] +; X32-NEXT:    vpmovb2m %zmm1, %k0 +; X32-NEXT:    shrb $3, %dl +; X32-NEXT:    kmovd %edx, %k1 +; X32-NEXT:    vpmovm2b %k1, %zmm1 +; X32-NEXT:    vpslld $24, %xmm1, %xmm1 +; X32-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1 +; X32-NEXT:    
vpmovm2b %k0, %zmm2 +; X32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255] +; X32-NEXT:    vpblendvb %ymm0, %ymm2, %ymm1, %ymm1 +; X32-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm2[4,5,6,7] +; X32-NEXT:    vpmovb2m %zmm1, %k0 +; X32-NEXT:    movl %ecx, %edx +; X32-NEXT:    shrb $4, %dl +; X32-NEXT:    kmovd %edx, %k1 +; X32-NEXT:    vpmovm2b %k1, %zmm1 +; X32-NEXT:    vpbroadcastd %xmm1, %xmm1 +; X32-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1 +; X32-NEXT:    vpmovm2b %k0, %zmm2 +; X32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255] +; X32-NEXT:    vpblendvb %ymm0, %ymm2, %ymm1, %ymm1 +; X32-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm2[4,5,6,7] +; X32-NEXT:    vpmovb2m %zmm1, %k0 +; X32-NEXT:    movl %ecx, %edx +; X32-NEXT:    shrb $5, %dl +; X32-NEXT:    andb $1, %dl +; X32-NEXT:    kmovd %edx, %k1 +; X32-NEXT:    vpmovm2b %k1, %zmm1 +; X32-NEXT:    vpsllq $40, %xmm1, %xmm1 +; X32-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1 +; X32-NEXT:    vpmovm2b %k0, %zmm2 +; X32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255] +; X32-NEXT:    vpblendvb %ymm0, %ymm2, %ymm1, %ymm1 +; X32-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm2[4,5,6,7] +; X32-NEXT:    vpmovb2m %zmm1, %k0 +; X32-NEXT:    movl %ecx, %edx +; X32-NEXT:    shrb $6, %dl +; X32-NEXT:    kmovd %edx, %k1 +; X32-NEXT:    vpmovm2b %k1, %zmm1 +; X32-NEXT:    vpbroadcastw %xmm1, %xmm1 +; X32-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1 +; X32-NEXT:    vpmovm2b %k0, %zmm2 +; X32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255] +; X32-NEXT:    vpblendvb %ymm0, %ymm2, %ymm1, %ymm1 +; X32-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm2[4,5,6,7] +; X32-NEXT:    vpmovb2m %zmm1, %k0 +; X32-NEXT:    shrb $7, %cl +; X32-NEXT:    kmovd %ecx, %k1 +; X32-NEXT:    vpmovm2b %k1, %zmm1 +; X32-NEXT:    vpsllq $56, %xmm1, %xmm1 +; X32-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1 +; X32-NEXT:    vpmovm2b %k0, %zmm2 +; X32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255] +; X32-NEXT:    vpblendvb %ymm0, %ymm2, %ymm1, %ymm1 +; X32-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm2[4,5,6,7] +; X32-NEXT:    vpmovb2m %zmm1, %k0 +; X32-NEXT:    movl %eax, %ecx +; X32-NEXT:    shrl $24, %ecx +; X32-NEXT:    kmovd %ecx, %k1 +; X32-NEXT:    vpmovm2b %k1, %zmm1 +; X32-NEXT:    vpbroadcastq %xmm1, %ymm1 +; X32-NEXT:    vpmovm2b %k0, %zmm2 +; X32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255] +; X32-NEXT:    vpblendvb %ymm0, %ymm2, %ymm1, %ymm1 +; X32-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm2[4,5,6,7] +; X32-NEXT:    vpmovb2m %zmm1, %k0 +; X32-NEXT:    movl %ecx, %edx +; X32-NEXT:    andb $2, %dl +; X32-NEXT:    shrb %dl +; X32-NEXT:    kmovd %edx, %k1 +; X32-NEXT:    vpmovm2b %k1, %zmm1 +; X32-NEXT:    vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6] +; X32-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1 +; X32-NEXT:    
vpmovm2b %k0, %zmm2 +; X32-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255] +; X32-NEXT:    vpblendvb %ymm5, %ymm2, %ymm1, %ymm1 +; X32-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm2[4,5,6,7] +; X32-NEXT:    vpmovb2m %zmm1, %k0 +; X32-NEXT:    andb $15, %cl +; X32-NEXT:    movl %ecx, %edx +; X32-NEXT:    shrb $2, %dl +; X32-NEXT:    kmovd %edx, %k1 +; X32-NEXT:    vpmovm2b %k1, %zmm1 +; X32-NEXT:    vpbroadcastw %xmm1, %xmm1 +; X32-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1 +; X32-NEXT:    vpmovm2b %k0, %zmm4 +; X32-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255] +; X32-NEXT:    vpblendvb %ymm2, %ymm4, %ymm1, %ymm1 +; X32-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm4[4,5,6,7] +; X32-NEXT:    vpmovb2m %zmm1, %k0 +; X32-NEXT:    shrb $3, %cl +; X32-NEXT:    kmovd %ecx, %k1 +; X32-NEXT:    vpmovm2b %k1, %zmm1 +; X32-NEXT:    vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4] +; X32-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1 +; X32-NEXT:    vpmovm2b %k0, %zmm4 +; X32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255] +; X32-NEXT:    vpblendvb %ymm0, %ymm4, %ymm1, %ymm1 +; X32-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm4[4,5,6,7] +; X32-NEXT:    vpmovb2m %zmm1, %k0 +; X32-NEXT:    movl %eax, %ecx +; X32-NEXT:    shrl $28, %ecx +; X32-NEXT:    kmovd %ecx, %k1 +; X32-NEXT:    vpmovm2b %k1, %zmm1 +; X32-NEXT:    vpbroadcastd %xmm1, %xmm1 +; X32-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1 +; X32-NEXT:    vpmovm2b %k0, %zmm4 +; X32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255] +; X32-NEXT:    vpblendvb %ymm0, %ymm4, %ymm1, %ymm1 +; X32-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm4[4,5,6,7] +; X32-NEXT:    vpmovb2m %zmm1, %k0 +; X32-NEXT:    movl %eax, %ecx +; X32-NEXT:    shrl $29, %ecx +; X32-NEXT:    andb $1, %cl +; X32-NEXT:    kmovd %ecx, %k1 +; X32-NEXT:    vpmovm2b %k1, %zmm1 +; X32-NEXT:    vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2] +; X32-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1 +; X32-NEXT:    vpmovm2b %k0, %zmm0 +; X32-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255] +; X32-NEXT:    vpblendvb %ymm4, %ymm0, %ymm1, %ymm1 +; X32-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[4,5,6,7] +; X32-NEXT:    vpmovb2m %zmm0, %k0 +; X32-NEXT:    movl %eax, %ecx +; X32-NEXT:    shrl $30, %ecx +; X32-NEXT:    kmovd %ecx, %k1 +; X32-NEXT:    vpmovm2b %k1, %zmm0 +; X32-NEXT:    vpbroadcastw %xmm0, %xmm0 +; X32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm1 +; X32-NEXT:    vpmovm2b %k0, %zmm0 +; X32-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255] +; X32-NEXT:    vpblendvb %ymm4, %ymm0, %ymm1, %ymm1 +; X32-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[4,5,6,7] +; X32-NEXT:    vpmovb2m %zmm0, %k0 +; X32-NEXT:    shrl $31, %eax +; X32-NEXT:    kmovd %eax, %k1 +; X32-NEXT:    vpmovm2b %k1, 
%zmm0 +; X32-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0] +; X32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0 +; X32-NEXT:    vpmovm2b %k0, %zmm1 +; X32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0] +; X32-NEXT:    vpblendvb %ymm7, %ymm1, %ymm0, %ymm0 +; X32-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7] +; X32-NEXT:    vpmovb2m %zmm0, %k0 +; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax +; X32-NEXT:    kmovd %eax, %k1 +; X32-NEXT:    vpmovm2b %k0, %zmm0 +; X32-NEXT:    vextracti64x4 $1, %zmm0, %ymm1 +; X32-NEXT:    vpmovm2b %k1, %zmm7 +; X32-NEXT:    vmovdqa {{.*#+}} ymm6 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; X32-NEXT:    vpblendvb %ymm6, %ymm1, %ymm7, %ymm1 +; X32-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; X32-NEXT:    vpmovb2m %zmm0, %k0 +; X32-NEXT:    movl %eax, %ecx +; X32-NEXT:    andb $2, %cl +; X32-NEXT:    shrb %cl +; X32-NEXT:    kmovd %ecx, %k1 +; X32-NEXT:    vpmovm2b %k1, %zmm0 +; X32-NEXT:    vpsllw $8, %xmm0, %xmm0 +; X32-NEXT:    vpmovm2b %k0, %zmm1 +; X32-NEXT:    vextracti64x4 $1, %zmm1, %ymm6 +; X32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; X32-NEXT:    vpblendvb %ymm7, %ymm6, %ymm0, %ymm0 +; X32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; X32-NEXT:    vpmovb2m %zmm0, %k0 +; X32-NEXT:    movl %eax, %ecx +; X32-NEXT:    andb $15, %cl +; X32-NEXT:    movl %ecx, %edx +; X32-NEXT:    shrb $2, %dl +; X32-NEXT:    kmovd %edx, %k1 +; X32-NEXT:    vpmovm2b %k1, %zmm0 +; X32-NEXT:    vpbroadcastw %xmm0, %xmm0 +; X32-NEXT:    vpmovm2b %k0, %zmm1 +; X32-NEXT:    vextracti64x4 $1, %zmm1, %ymm6 +; X32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; X32-NEXT:    vpblendvb %ymm7, %ymm6, %ymm0, %ymm0 +; X32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; X32-NEXT:    vpmovb2m %zmm0, %k0 +; X32-NEXT:    shrb $3, %cl +; X32-NEXT:    kmovd %ecx, %k1 +; X32-NEXT:    vpmovm2b %k1, %zmm0 +; X32-NEXT:    vpslld $24, %xmm0, %xmm0 +; X32-NEXT:    vpmovm2b %k0, %zmm1 +; X32-NEXT:    vextracti64x4 $1, %zmm1, %ymm6 +; X32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; X32-NEXT:    vpblendvb %ymm7, %ymm6, %ymm0, %ymm0 +; X32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; X32-NEXT:    vpmovb2m %zmm0, %k0 +; X32-NEXT:    movl %eax, %ecx +; X32-NEXT:    shrb $4, %cl +; X32-NEXT:    kmovd %ecx, %k1 +; X32-NEXT:    vpmovm2b %k1, %zmm0 +; X32-NEXT:    vpbroadcastd %xmm0, %xmm0 +; X32-NEXT:    vpmovm2b %k0, %zmm1 +; X32-NEXT:    vextracti64x4 $1, %zmm1, %ymm6 +; X32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; X32-NEXT:    vpblendvb %ymm7, %ymm6, %ymm0, %ymm0 +; X32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; X32-NEXT:    vpmovb2m %zmm0, %k0 +; X32-NEXT:    movl %eax, %ecx +; X32-NEXT:    shrb $5, %cl +; X32-NEXT:    andb $1, %cl +; X32-NEXT:    kmovd %ecx, %k1 +; X32-NEXT:    vpmovm2b %k1, %zmm0 +; X32-NEXT:    vpsllq $40, %xmm0, 
%xmm0 +; X32-NEXT:    vpmovm2b %k0, %zmm1 +; X32-NEXT:    vextracti64x4 $1, %zmm1, %ymm6 +; X32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; X32-NEXT:    vpblendvb %ymm7, %ymm6, %ymm0, %ymm0 +; X32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; X32-NEXT:    vpmovb2m %zmm0, %k0 +; X32-NEXT:    movl %eax, %ecx +; X32-NEXT:    shrb $6, %cl +; X32-NEXT:    kmovd %ecx, %k1 +; X32-NEXT:    vpmovm2b %k1, %zmm0 +; X32-NEXT:    vpbroadcastw %xmm0, %xmm0 +; X32-NEXT:    vpmovm2b %k0, %zmm1 +; X32-NEXT:    vextracti64x4 $1, %zmm1, %ymm6 +; X32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; X32-NEXT:    vpblendvb %ymm7, %ymm6, %ymm0, %ymm0 +; X32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; X32-NEXT:    vpmovb2m %zmm0, %k0 +; X32-NEXT:    movl %eax, %ecx +; X32-NEXT:    shrb $7, %cl +; X32-NEXT:    kmovd %ecx, %k1 +; X32-NEXT:    vpmovm2b %k1, %zmm0 +; X32-NEXT:    vpsllq $56, %xmm0, %xmm0 +; X32-NEXT:    vpmovm2b %k0, %zmm1 +; X32-NEXT:    vextracti64x4 $1, %zmm1, %ymm6 +; X32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; X32-NEXT:    vpblendvb %ymm7, %ymm6, %ymm0, %ymm0 +; X32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; X32-NEXT:    vpmovb2m %zmm0, %k0 +; X32-NEXT:    movb %ah, %cl +; X32-NEXT:    kmovd %ecx, %k1 +; X32-NEXT:    vpmovm2b %k1, %zmm0 +; X32-NEXT:    vpbroadcastq %xmm0, %xmm0 +; X32-NEXT:    vpmovm2b %k0, %zmm1 +; X32-NEXT:    vextracti64x4 $1, %zmm1, %ymm6 +; X32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; X32-NEXT:    vpblendvb %ymm7, %ymm6, %ymm0, %ymm0 +; X32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; X32-NEXT:    vpmovb2m %zmm0, %k0 +; X32-NEXT:    andb $2, %cl +; X32-NEXT:    shrb %cl +; X32-NEXT:    kmovd %ecx, %k1 +; X32-NEXT:    vpmovm2b %k1, %zmm0 +; X32-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6] +; X32-NEXT:    vpmovm2b %k0, %zmm1 +; X32-NEXT:    vextracti64x4 $1, %zmm1, %ymm6 +; X32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; X32-NEXT:    vpblendvb %ymm7, %ymm6, %ymm0, %ymm0 +; X32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; X32-NEXT:    vpmovb2m %zmm0, %k0 +; X32-NEXT:    movb %ah, %cl +; X32-NEXT:    andb $15, %cl +; X32-NEXT:    movl %ecx, %edx +; X32-NEXT:    shrb $2, %dl +; X32-NEXT:    kmovd %edx, %k1 +; X32-NEXT:    vpmovm2b %k1, %zmm0 +; X32-NEXT:    vpbroadcastw %xmm0, %xmm0 +; X32-NEXT:    vpmovm2b %k0, %zmm1 +; X32-NEXT:    vextracti64x4 $1, %zmm1, %ymm6 +; X32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; X32-NEXT:    vpblendvb %ymm7, %ymm6, %ymm0, %ymm0 +; X32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; X32-NEXT:    vpmovb2m %zmm0, %k0 +; X32-NEXT:    shrb $3, %cl +; X32-NEXT:    kmovd %ecx, %k1 +; X32-NEXT:    vpmovm2b %k1, %zmm0 +; X32-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4] +; X32-NEXT:    vpmovm2b %k0, %zmm1 +; 
X32-NEXT:    vextracti64x4 $1, %zmm1, %ymm6 +; X32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; X32-NEXT:    vpblendvb %ymm7, %ymm6, %ymm0, %ymm0 +; X32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; X32-NEXT:    vpmovb2m %zmm0, %k0 +; X32-NEXT:    movl %eax, %ecx +; X32-NEXT:    andl $61440, %ecx # imm = 0xF000 +; X32-NEXT:    shrl $12, %ecx +; X32-NEXT:    kmovd %ecx, %k1 +; X32-NEXT:    vpmovm2b %k1, %zmm0 +; X32-NEXT:    vpbroadcastd %xmm0, %xmm0 +; X32-NEXT:    vpmovm2b %k0, %zmm1 +; X32-NEXT:    vextracti64x4 $1, %zmm1, %ymm6 +; X32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; X32-NEXT:    vpblendvb %ymm7, %ymm6, %ymm0, %ymm0 +; X32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; X32-NEXT:    vpmovb2m %zmm0, %k0 +; X32-NEXT:    movl %eax, %ecx +; X32-NEXT:    shrl $13, %ecx +; X32-NEXT:    andb $1, %cl +; X32-NEXT:    kmovd %ecx, %k1 +; X32-NEXT:    vpmovm2b %k1, %zmm0 +; X32-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2] +; X32-NEXT:    vpmovm2b %k0, %zmm1 +; X32-NEXT:    vextracti64x4 $1, %zmm1, %ymm6 +; X32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; X32-NEXT:    vpblendvb %ymm7, %ymm6, %ymm0, %ymm0 +; X32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; X32-NEXT:    vpmovb2m %zmm0, %k0 +; X32-NEXT:    movl %eax, %ecx +; X32-NEXT:    andl $49152, %ecx # imm = 0xC000 +; X32-NEXT:    shrl $14, %ecx +; X32-NEXT:    kmovd %ecx, %k1 +; X32-NEXT:    vpmovm2b %k1, %zmm0 +; X32-NEXT:    vpbroadcastw %xmm0, %xmm0 +; X32-NEXT:    vpmovm2b %k0, %zmm1 +; X32-NEXT:    vextracti64x4 $1, %zmm1, %ymm6 +; X32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; X32-NEXT:    vpblendvb %ymm7, %ymm6, %ymm0, %ymm0 +; X32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; X32-NEXT:    vpmovb2m %zmm0, %k0 +; X32-NEXT:    movl %eax, %ecx +; X32-NEXT:    andl $32768, %ecx # imm = 0x8000 +; X32-NEXT:    shrl $15, %ecx +; X32-NEXT:    kmovd %ecx, %k1 +; X32-NEXT:    vpmovm2b %k1, %zmm0 +; X32-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0] +; X32-NEXT:    vpmovm2b %k0, %zmm1 +; X32-NEXT:    vextracti64x4 $1, %zmm1, %ymm6 +; X32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; X32-NEXT:    vpblendvb %ymm7, %ymm6, %ymm0, %ymm0 +; X32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; X32-NEXT:    vpmovb2m %zmm0, %k0 +; X32-NEXT:    movl %eax, %ecx +; X32-NEXT:    shrl $16, %ecx +; X32-NEXT:    kmovd %ecx, %k1 +; X32-NEXT:    vpmovm2b %k1, %zmm0 +; X32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0 +; X32-NEXT:    vpmovm2b %k0, %zmm1 +; X32-NEXT:    vextracti64x4 $1, %zmm1, %ymm6 +; X32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; X32-NEXT:    vpblendvb %ymm7, %ymm6, %ymm0, %ymm0 +; X32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; X32-NEXT:    vpmovb2m %zmm0, %k0 +; X32-NEXT:    movl %ecx, 
%edx +; X32-NEXT:    andb $2, %dl +; X32-NEXT:    shrb %dl +; X32-NEXT:    kmovd %edx, %k1 +; X32-NEXT:    vpmovm2b %k1, %zmm0 +; X32-NEXT:    vpsllw $8, %xmm0, %xmm0 +; X32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0 +; X32-NEXT:    vpmovm2b %k0, %zmm1 +; X32-NEXT:    vextracti64x4 $1, %zmm1, %ymm6 +; X32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; X32-NEXT:    vpblendvb %ymm7, %ymm6, %ymm0, %ymm0 +; X32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; X32-NEXT:    vpmovb2m %zmm0, %k0 +; X32-NEXT:    movl %ecx, %edx +; X32-NEXT:    andb $15, %dl +; X32-NEXT:    movl %edx, %ebx +; X32-NEXT:    shrb $2, %bl +; X32-NEXT:    kmovd %ebx, %k1 +; X32-NEXT:    vpmovm2b %k1, %zmm0 +; X32-NEXT:    vpbroadcastw %xmm0, %xmm0 +; X32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0 +; X32-NEXT:    vpmovm2b %k0, %zmm1 +; X32-NEXT:    vextracti64x4 $1, %zmm1, %ymm6 +; X32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255] +; X32-NEXT:    vpblendvb %ymm7, %ymm6, %ymm0, %ymm0 +; X32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; X32-NEXT:    vpmovb2m %zmm0, %k0 +; X32-NEXT:    shrb $3, %dl +; X32-NEXT:    kmovd %edx, %k1 +; X32-NEXT:    vpmovm2b %k1, %zmm0 +; X32-NEXT:    vpslld $24, %xmm0, %xmm0 +; X32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0 +; X32-NEXT:    vpmovm2b %k0, %zmm1 +; X32-NEXT:    vextracti64x4 $1, %zmm1, %ymm6 +; X32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255] +; X32-NEXT:    vpblendvb %ymm7, %ymm6, %ymm0, %ymm0 +; X32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; X32-NEXT:    vpmovb2m %zmm0, %k0 +; X32-NEXT:    movl %ecx, %edx +; X32-NEXT:    shrb $4, %dl +; X32-NEXT:    kmovd %edx, %k1 +; X32-NEXT:    vpmovm2b %k1, %zmm0 +; X32-NEXT:    vpbroadcastd %xmm0, %xmm0 +; X32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0 +; X32-NEXT:    vpmovm2b %k0, %zmm1 +; X32-NEXT:    vextracti64x4 $1, %zmm1, %ymm6 +; X32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255] +; X32-NEXT:    vpblendvb %ymm7, %ymm6, %ymm0, %ymm0 +; X32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; X32-NEXT:    vpmovb2m %zmm0, %k0 +; X32-NEXT:    movl %ecx, %edx +; X32-NEXT:    shrb $5, %dl +; X32-NEXT:    andb $1, %dl +; X32-NEXT:    kmovd %edx, %k1 +; X32-NEXT:    vpmovm2b %k1, %zmm0 +; X32-NEXT:    vpsllq $40, %xmm0, %xmm0 +; X32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0 +; X32-NEXT:    vpmovm2b %k0, %zmm1 +; X32-NEXT:    vextracti64x4 $1, %zmm1, %ymm6 +; X32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255] +; X32-NEXT:    vpblendvb %ymm7, %ymm6, %ymm0, %ymm0 +; X32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; X32-NEXT:    vpmovb2m %zmm0, %k0 +; X32-NEXT:    movl %ecx, %edx +; X32-NEXT:    shrb $6, %dl +; X32-NEXT:    kmovd %edx, %k1 +; X32-NEXT:    vpmovm2b %k1, %zmm0 +; X32-NEXT:    vpbroadcastw %xmm0, %xmm0 +; X32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0 +; X32-NEXT:    vpmovm2b %k0, %zmm1 +; X32-NEXT:    vextracti64x4 $1, %zmm1, %ymm6 +; X32-NEXT:    vmovdqa {{.*#+}} ymm7 = 
[255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255] +; X32-NEXT:    vpblendvb %ymm7, %ymm6, %ymm0, %ymm0 +; X32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; X32-NEXT:    vpmovb2m %zmm0, %k0 +; X32-NEXT:    shrb $7, %cl +; X32-NEXT:    kmovd %ecx, %k1 +; X32-NEXT:    vpmovm2b %k1, %zmm0 +; X32-NEXT:    vpsllq $56, %xmm0, %xmm0 +; X32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0 +; X32-NEXT:    vpmovm2b %k0, %zmm1 +; X32-NEXT:    vextracti64x4 $1, %zmm1, %ymm6 +; X32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255] +; X32-NEXT:    vpblendvb %ymm7, %ymm6, %ymm0, %ymm0 +; X32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; X32-NEXT:    vpmovb2m %zmm0, %k0 +; X32-NEXT:    movl %eax, %ecx +; X32-NEXT:    shrl $24, %ecx +; X32-NEXT:    kmovd %ecx, %k1 +; X32-NEXT:    vpmovm2b %k1, %zmm0 +; X32-NEXT:    vpbroadcastq %xmm0, %ymm0 +; X32-NEXT:    vpmovm2b %k0, %zmm1 +; X32-NEXT:    vextracti64x4 $1, %zmm1, %ymm6 +; X32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255] +; X32-NEXT:    vpblendvb %ymm7, %ymm6, %ymm0, %ymm0 +; X32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; X32-NEXT:    vpmovb2m %zmm0, %k0 +; X32-NEXT:    movl %ecx, %edx +; X32-NEXT:    andb $2, %dl +; X32-NEXT:    shrb %dl +; X32-NEXT:    kmovd %edx, %k1 +; X32-NEXT:    vpmovm2b %k1, %zmm0 +; X32-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6] +; X32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0 +; X32-NEXT:    vpmovm2b %k0, %zmm1 +; X32-NEXT:    vextracti64x4 $1, %zmm1, %ymm6 +; X32-NEXT:    vpblendvb %ymm5, %ymm6, %ymm0, %ymm0 +; X32-NEXT:    andb $15, %cl +; X32-NEXT:    movl %ecx, %edx +; X32-NEXT:    shrb $2, %dl +; X32-NEXT:    kmovd %edx, %k0 +; X32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; X32-NEXT:    vpmovb2m %zmm0, %k1 +; X32-NEXT:    vpmovm2b %k1, %zmm0 +; X32-NEXT:    vpmovm2b %k0, %zmm1 +; X32-NEXT:    vpbroadcastw %xmm1, %xmm1 +; X32-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1 +; X32-NEXT:    vextracti64x4 $1, %zmm0, %ymm5 +; X32-NEXT:    vpblendvb %ymm2, %ymm5, %ymm1, %ymm2 +; X32-NEXT:    shrb $3, %cl +; X32-NEXT:    kmovd %ecx, %k0 +; X32-NEXT:    movl %eax, %ecx +; X32-NEXT:    shrl $29, %ecx +; X32-NEXT:    andb $1, %cl +; X32-NEXT:    kmovd %ecx, %k1 +; X32-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; X32-NEXT:    vpmovb2m %zmm0, %k2 +; X32-NEXT:    vpmovm2b %k2, %zmm0 +; X32-NEXT:    vextracti64x4 $1, %zmm0, %ymm1 +; X32-NEXT:    vpmovm2b %k0, %zmm2 +; X32-NEXT:    vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4] +; X32-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2 +; X32-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255] +; X32-NEXT:    vpblendvb %ymm5, %ymm1, %ymm2, %ymm1 +; X32-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; X32-NEXT:    vpmovb2m %zmm0, %k0 +; X32-NEXT:    vpmovm2b %k0, %zmm0 +; X32-NEXT:    vextracti64x4 $1, %zmm0, %ymm1 +; X32-NEXT:    movl %eax, %ecx +; X32-NEXT:    shrl $28, %ecx +; X32-NEXT:    kmovd %ecx, %k0 +; X32-NEXT:    vpmovm2b %k0, %zmm2 +; X32-NEXT:    vpbroadcastd %xmm2, %xmm2 +; X32-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2 +; X32-NEXT:    vmovdqa {{.*#+}} 
ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255] +; X32-NEXT:    vpblendvb %ymm5, %ymm1, %ymm2, %ymm1 +; X32-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; X32-NEXT:    vpmovb2m %zmm0, %k0 +; X32-NEXT:    vpmovm2b %k0, %zmm0 +; X32-NEXT:    vextracti64x4 $1, %zmm0, %ymm1 +; X32-NEXT:    vpmovm2b %k1, %zmm2 +; X32-NEXT:    vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2] +; X32-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2 +; X32-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255] +; X32-NEXT:    vpblendvb %ymm5, %ymm1, %ymm2, %ymm1 +; X32-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; X32-NEXT:    vpmovb2m %zmm0, %k0 +; X32-NEXT:    vpmovm2b %k0, %zmm0 +; X32-NEXT:    vextracti64x4 $1, %zmm0, %ymm1 +; X32-NEXT:    movl %eax, %ecx +; X32-NEXT:    shrl $30, %ecx +; X32-NEXT:    kmovd %ecx, %k0 +; X32-NEXT:    vpmovm2b %k0, %zmm2 +; X32-NEXT:    vpbroadcastw %xmm2, %xmm2 +; X32-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2 +; X32-NEXT:    vpblendvb %ymm4, %ymm1, %ymm2, %ymm1 +; X32-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; X32-NEXT:    vpmovb2m %zmm0, %k0 +; X32-NEXT:    shrl $31, %eax +; X32-NEXT:    kmovd %eax, %k1 +; X32-NEXT:    movb {{[0-9]+}}(%esp), %al +; X32-NEXT:    kshiftlq $1, %k0, %k0 +; X32-NEXT:    kshiftrq $1, %k0, %k0 +; X32-NEXT:    kshiftlq $63, %k1, %k1 +; X32-NEXT:    korq %k1, %k0, %k1 +; X32-NEXT:    vpbroadcastb %eax, %zmm3 {%k1} +; X32-NEXT:    vmovdqa64 %zmm3, %zmm0 +; X32-NEXT:    popl %ebx +; X32-NEXT:    retl +; +; X64-LABEL: test_mm512_mask_set1_epi8: +; X64:       # BB#0: # %entry +; X64-NEXT:    kmovq %rdi, %k1 +; X64-NEXT:    vpbroadcastb %esi, %zmm0 {%k1} +; X64-NEXT:    retq +  entry: +  %vecinit.i.i = insertelement <64 x i8> undef, i8 %__A, i32 0 +  %vecinit63.i.i = shufflevector <64 x i8> %vecinit.i.i, <64 x i8> undef, <64 x i32> zeroinitializer +  %0 = bitcast <8 x i64> %__O to <64 x i8> +  %1 = bitcast i64 %__M to <64 x i1> +  %2 = select <64 x i1> %1, <64 x i8> %vecinit63.i.i, <64 x i8> %0 +  %3 = bitcast <64 x i8> %2 to <8 x i64> +  ret <8 x i64> %3 +} + +define <8 x i64> @test_mm512_maskz_set1_epi8(i64 %__M, i8 signext %__A)  { +; X32-LABEL: test_mm512_maskz_set1_epi8: +; X32:       # BB#0: # %entry +; X32-NEXT:    pushl %ebx +; X32-NEXT:  .Lcfi2: +; X32-NEXT:    .cfi_def_cfa_offset 8 +; X32-NEXT:  .Lcfi3: +; X32-NEXT:    .cfi_offset %ebx, -8 +; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax +; X32-NEXT:    movl %eax, %ecx +; X32-NEXT:    andb $2, %cl +; X32-NEXT:    shrb %cl +; X32-NEXT:    kmovd %ecx, %k0 +; X32-NEXT:    vpmovm2b %k0, %zmm0 +; X32-NEXT:    vpsllw $8, %xmm0, %xmm0 +; X32-NEXT:    kmovd %eax, %k0 +; X32-NEXT:    vpmovm2b %k0, %zmm1 +; X32-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; X32-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; X32-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7] +; X32-NEXT:    vpmovb2m %zmm0, %k0 +; X32-NEXT:    vpmovm2b %k0, %zmm0 +; X32-NEXT:    movl %eax, %ecx +; X32-NEXT:    andb $15, %cl +; X32-NEXT:    movl %ecx, %edx +; X32-NEXT:    shrb $2, %dl +; X32-NEXT:    kmovd %edx, %k0 +; X32-NEXT:    vpmovm2b %k0, %zmm1 +; X32-NEXT:    vpbroadcastw %xmm1, %xmm1 +; X32-NEXT:    vmovdqa {{.*#+}} ymm2 = 
[255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; X32-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm1 +; X32-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[4,5,6,7] +; X32-NEXT:    vpmovb2m %zmm0, %k0 +; X32-NEXT:    vpmovm2b %k0, %zmm0 +; X32-NEXT:    shrb $3, %cl +; X32-NEXT:    kmovd %ecx, %k0 +; X32-NEXT:    vpmovm2b %k0, %zmm1 +; X32-NEXT:    vpslld $24, %xmm1, %xmm1 +; X32-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; X32-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm1 +; X32-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[4,5,6,7] +; X32-NEXT:    vpmovb2m %zmm0, %k0 +; X32-NEXT:    vpmovm2b %k0, %zmm0 +; X32-NEXT:    movl %eax, %ecx +; X32-NEXT:    shrb $4, %cl +; X32-NEXT:    kmovd %ecx, %k0 +; X32-NEXT:    vpmovm2b %k0, %zmm1 +; X32-NEXT:    vpbroadcastd %xmm1, %xmm1 +; X32-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; X32-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm1 +; X32-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[4,5,6,7] +; X32-NEXT:    vpmovb2m %zmm0, %k0 +; X32-NEXT:    vpmovm2b %k0, %zmm0 +; X32-NEXT:    movl %eax, %ecx +; X32-NEXT:    shrb $5, %cl +; X32-NEXT:    andb $1, %cl +; X32-NEXT:    kmovd %ecx, %k0 +; X32-NEXT:    vpmovm2b %k0, %zmm1 +; X32-NEXT:    vpsllq $40, %xmm1, %xmm1 +; X32-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; X32-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm1 +; X32-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[4,5,6,7] +; X32-NEXT:    vpmovb2m %zmm0, %k0 +; X32-NEXT:    vpmovm2b %k0, %zmm0 +; X32-NEXT:    movl %eax, %ecx +; X32-NEXT:    shrb $6, %cl +; X32-NEXT:    kmovd %ecx, %k0 +; X32-NEXT:    vpmovm2b %k0, %zmm1 +; X32-NEXT:    vpbroadcastw %xmm1, %xmm1 +; X32-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; X32-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm1 +; X32-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[4,5,6,7] +; X32-NEXT:    vpmovb2m %zmm0, %k0 +; X32-NEXT:    vpmovm2b %k0, %zmm0 +; X32-NEXT:    movl %eax, %ecx +; X32-NEXT:    shrb $7, %cl +; X32-NEXT:    kmovd %ecx, %k0 +; X32-NEXT:    vpmovm2b %k0, %zmm1 +; X32-NEXT:    vpsllq $56, %xmm1, %xmm1 +; X32-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; X32-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm1 +; X32-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[4,5,6,7] +; X32-NEXT:    vpmovb2m %zmm0, %k0 +; X32-NEXT:    movb %ah, %cl +; X32-NEXT:    kmovd %ecx, %k1 +; X32-NEXT:    vpmovm2b %k1, %zmm0 +; X32-NEXT:    vpbroadcastq %xmm0, %xmm0 +; X32-NEXT:    vpmovm2b %k0, %zmm1 +; X32-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; X32-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; X32-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7] +; X32-NEXT:    vpmovb2m %zmm0, %k0 +; X32-NEXT:    andb $2, %cl +; X32-NEXT:    shrb %cl +; X32-NEXT:    kmovd %ecx, %k1 +; 
X32-NEXT:    vpmovm2b %k1, %zmm0 +; X32-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6] +; X32-NEXT:    vpmovm2b %k0, %zmm1 +; X32-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; X32-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; X32-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7] +; X32-NEXT:    vpmovb2m %zmm0, %k0 +; X32-NEXT:    movb %ah, %cl +; X32-NEXT:    andb $15, %cl +; X32-NEXT:    movl %ecx, %edx +; X32-NEXT:    shrb $2, %dl +; X32-NEXT:    kmovd %edx, %k1 +; X32-NEXT:    vpmovm2b %k1, %zmm0 +; X32-NEXT:    vpbroadcastw %xmm0, %xmm0 +; X32-NEXT:    vpmovm2b %k0, %zmm1 +; X32-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; X32-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; X32-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7] +; X32-NEXT:    vpmovb2m %zmm0, %k0 +; X32-NEXT:    shrb $3, %cl +; X32-NEXT:    kmovd %ecx, %k1 +; X32-NEXT:    vpmovm2b %k1, %zmm0 +; X32-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4] +; X32-NEXT:    vpmovm2b %k0, %zmm1 +; X32-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; X32-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; X32-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7] +; X32-NEXT:    vpmovb2m %zmm0, %k0 +; X32-NEXT:    movl %eax, %ecx +; X32-NEXT:    andl $61440, %ecx # imm = 0xF000 +; X32-NEXT:    shrl $12, %ecx +; X32-NEXT:    kmovd %ecx, %k1 +; X32-NEXT:    vpmovm2b %k1, %zmm0 +; X32-NEXT:    vpbroadcastd %xmm0, %xmm0 +; X32-NEXT:    vpmovm2b %k0, %zmm1 +; X32-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; X32-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; X32-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7] +; X32-NEXT:    vpmovb2m %zmm0, %k0 +; X32-NEXT:    movl %eax, %ecx +; X32-NEXT:    shrl $13, %ecx +; X32-NEXT:    andb $1, %cl +; X32-NEXT:    kmovd %ecx, %k1 +; X32-NEXT:    vpmovm2b %k1, %zmm0 +; X32-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2] +; X32-NEXT:    vpmovm2b %k0, %zmm1 +; X32-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; X32-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; X32-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7] +; X32-NEXT:    vpmovb2m %zmm0, %k0 +; X32-NEXT:    movl %eax, %ecx +; X32-NEXT:    andl $49152, %ecx # imm = 0xC000 +; X32-NEXT:    shrl $14, %ecx +; X32-NEXT:    kmovd %ecx, %k1 +; X32-NEXT:    vpmovm2b %k1, %zmm0 +; X32-NEXT:    vpbroadcastw %xmm0, %xmm0 +; X32-NEXT:    vpmovm2b %k0, %zmm1 +; X32-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; X32-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; X32-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7] +; X32-NEXT:    vpmovb2m %zmm0, %k0 +; X32-NEXT:    movl %eax, %ecx +; X32-NEXT:    andl $32768, %ecx # imm 
= 0x8000 +; X32-NEXT:    shrl $15, %ecx +; X32-NEXT:    kmovd %ecx, %k1 +; X32-NEXT:    vpmovm2b %k1, %zmm0 +; X32-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0] +; X32-NEXT:    vpmovm2b %k0, %zmm1 +; X32-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; X32-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; X32-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7] +; X32-NEXT:    vpmovb2m %zmm0, %k0 +; X32-NEXT:    movl %eax, %ecx +; X32-NEXT:    shrl $16, %ecx +; X32-NEXT:    kmovd %ecx, %k1 +; X32-NEXT:    vpmovm2b %k1, %zmm0 +; X32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0 +; X32-NEXT:    vpmovm2b %k0, %zmm1 +; X32-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; X32-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; X32-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7] +; X32-NEXT:    vpmovb2m %zmm0, %k0 +; X32-NEXT:    movl %ecx, %edx +; X32-NEXT:    andb $2, %dl +; X32-NEXT:    shrb %dl +; X32-NEXT:    kmovd %edx, %k1 +; X32-NEXT:    vpmovm2b %k1, %zmm0 +; X32-NEXT:    vpsllw $8, %xmm0, %xmm0 +; X32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0 +; X32-NEXT:    vpmovm2b %k0, %zmm1 +; X32-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; X32-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; X32-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7] +; X32-NEXT:    vpmovb2m %zmm0, %k0 +; X32-NEXT:    movl %ecx, %edx +; X32-NEXT:    andb $15, %dl +; X32-NEXT:    movl %edx, %ebx +; X32-NEXT:    shrb $2, %bl +; X32-NEXT:    kmovd %ebx, %k1 +; X32-NEXT:    vpmovm2b %k1, %zmm0 +; X32-NEXT:    vpbroadcastw %xmm0, %xmm0 +; X32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0 +; X32-NEXT:    vpmovm2b %k0, %zmm1 +; X32-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255] +; X32-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; X32-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7] +; X32-NEXT:    vpmovb2m %zmm0, %k0 +; X32-NEXT:    shrb $3, %dl +; X32-NEXT:    kmovd %edx, %k1 +; X32-NEXT:    vpmovm2b %k1, %zmm0 +; X32-NEXT:    vpslld $24, %xmm0, %xmm0 +; X32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0 +; X32-NEXT:    vpmovm2b %k0, %zmm1 +; X32-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255] +; X32-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; X32-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7] +; X32-NEXT:    vpmovb2m %zmm0, %k0 +; X32-NEXT:    movl %ecx, %edx +; X32-NEXT:    shrb $4, %dl +; X32-NEXT:    kmovd %edx, %k1 +; X32-NEXT:    vpmovm2b %k1, %zmm0 +; X32-NEXT:    vpbroadcastd %xmm0, %xmm0 +; X32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0 +; X32-NEXT:    vpmovm2b %k0, %zmm1 +; X32-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255] +; X32-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; X32-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7] +; X32-NEXT:    vpmovb2m %zmm0, %k0 +; X32-NEXT: 
   movl %ecx, %edx +; X32-NEXT:    shrb $5, %dl +; X32-NEXT:    andb $1, %dl +; X32-NEXT:    kmovd %edx, %k1 +; X32-NEXT:    vpmovm2b %k1, %zmm0 +; X32-NEXT:    vpsllq $40, %xmm0, %xmm0 +; X32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0 +; X32-NEXT:    vpmovm2b %k0, %zmm1 +; X32-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255] +; X32-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; X32-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7] +; X32-NEXT:    vpmovb2m %zmm0, %k0 +; X32-NEXT:    movl %ecx, %edx +; X32-NEXT:    shrb $6, %dl +; X32-NEXT:    kmovd %edx, %k1 +; X32-NEXT:    vpmovm2b %k1, %zmm0 +; X32-NEXT:    vpbroadcastw %xmm0, %xmm0 +; X32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0 +; X32-NEXT:    vpmovm2b %k0, %zmm1 +; X32-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255] +; X32-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; X32-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7] +; X32-NEXT:    vpmovb2m %zmm0, %k0 +; X32-NEXT:    shrb $7, %cl +; X32-NEXT:    kmovd %ecx, %k1 +; X32-NEXT:    vpmovm2b %k1, %zmm0 +; X32-NEXT:    vpsllq $56, %xmm0, %xmm0 +; X32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0 +; X32-NEXT:    vpmovm2b %k0, %zmm1 +; X32-NEXT:    vmovdqa {{.*#+}} ymm6 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255] +; X32-NEXT:    vpblendvb %ymm6, %ymm1, %ymm0, %ymm0 +; X32-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7] +; X32-NEXT:    vpmovb2m %zmm0, %k0 +; X32-NEXT:    movl %eax, %ecx +; X32-NEXT:    shrl $24, %ecx +; X32-NEXT:    kmovd %ecx, %k1 +; X32-NEXT:    vpmovm2b %k1, %zmm0 +; X32-NEXT:    vpbroadcastq %xmm0, %ymm0 +; X32-NEXT:    vpmovm2b %k0, %zmm1 +; X32-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255] +; X32-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; X32-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7] +; X32-NEXT:    vpmovb2m %zmm0, %k0 +; X32-NEXT:    movl %ecx, %edx +; X32-NEXT:    andb $2, %dl +; X32-NEXT:    shrb %dl +; X32-NEXT:    kmovd %edx, %k1 +; X32-NEXT:    vpmovm2b %k1, %zmm0 +; X32-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6] +; X32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0 +; X32-NEXT:    vpmovm2b %k0, %zmm4 +; X32-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255] +; X32-NEXT:    vpblendvb %ymm5, %ymm4, %ymm0, %ymm0 +; X32-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm4[4,5,6,7] +; X32-NEXT:    vpmovb2m %zmm0, %k0 +; X32-NEXT:    andb $15, %cl +; X32-NEXT:    movl %ecx, %edx +; X32-NEXT:    shrb $2, %dl +; X32-NEXT:    kmovd %edx, %k1 +; X32-NEXT:    vpmovm2b %k1, %zmm0 +; X32-NEXT:    vpbroadcastw %xmm0, %xmm0 +; X32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0 +; X32-NEXT:    vpmovm2b %k0, %zmm4 +; X32-NEXT:    vmovdqa {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255] +; X32-NEXT:    vpblendvb %ymm1, %ymm4, %ymm0, %ymm0 +; X32-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm4[4,5,6,7] +; X32-NEXT:    
vpmovb2m %zmm0, %k0 +; X32-NEXT:    shrb $3, %cl +; X32-NEXT:    kmovd %ecx, %k1 +; X32-NEXT:    vpmovm2b %k1, %zmm0 +; X32-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4] +; X32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0 +; X32-NEXT:    vpmovm2b %k0, %zmm4 +; X32-NEXT:    vmovdqa {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255] +; X32-NEXT:    vpblendvb %ymm1, %ymm4, %ymm0, %ymm0 +; X32-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm4[4,5,6,7] +; X32-NEXT:    vpmovb2m %zmm0, %k0 +; X32-NEXT:    movl %eax, %ecx +; X32-NEXT:    shrl $28, %ecx +; X32-NEXT:    kmovd %ecx, %k1 +; X32-NEXT:    vpmovm2b %k1, %zmm0 +; X32-NEXT:    vpbroadcastd %xmm0, %xmm0 +; X32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0 +; X32-NEXT:    vpmovm2b %k0, %zmm4 +; X32-NEXT:    vmovdqa {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255] +; X32-NEXT:    vpblendvb %ymm1, %ymm4, %ymm0, %ymm0 +; X32-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm4[4,5,6,7] +; X32-NEXT:    vpmovb2m %zmm0, %k0 +; X32-NEXT:    movl %eax, %ecx +; X32-NEXT:    shrl $29, %ecx +; X32-NEXT:    andb $1, %cl +; X32-NEXT:    kmovd %ecx, %k1 +; X32-NEXT:    vpmovm2b %k1, %zmm0 +; X32-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2] +; X32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm1 +; X32-NEXT:    vpmovm2b %k0, %zmm0 +; X32-NEXT:    vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255] +; X32-NEXT:    vpblendvb %ymm3, %ymm0, %ymm1, %ymm1 +; X32-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[4,5,6,7] +; X32-NEXT:    vpmovb2m %zmm0, %k0 +; X32-NEXT:    movl %eax, %ecx +; X32-NEXT:    shrl $30, %ecx +; X32-NEXT:    kmovd %ecx, %k1 +; X32-NEXT:    vpmovm2b %k1, %zmm0 +; X32-NEXT:    vpbroadcastw %xmm0, %xmm0 +; X32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm1 +; X32-NEXT:    vpmovm2b %k0, %zmm0 +; X32-NEXT:    vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255] +; X32-NEXT:    vpblendvb %ymm3, %ymm0, %ymm1, %ymm1 +; X32-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[4,5,6,7] +; X32-NEXT:    vpmovb2m %zmm0, %k0 +; X32-NEXT:    shrl $31, %eax +; X32-NEXT:    kmovd %eax, %k1 +; X32-NEXT:    vpmovm2b %k1, %zmm0 +; X32-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0] +; X32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0 +; X32-NEXT:    vpmovm2b %k0, %zmm1 +; X32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0] +; X32-NEXT:    vpblendvb %ymm7, %ymm1, %ymm0, %ymm0 +; X32-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7] +; X32-NEXT:    vpmovb2m %zmm0, %k0 +; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax +; X32-NEXT:    kmovd %eax, %k1 +; X32-NEXT:    vpmovm2b %k0, %zmm0 +; X32-NEXT:    vextracti64x4 $1, %zmm0, %ymm1 +; X32-NEXT:    vpmovm2b %k1, %zmm7 +; X32-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; X32-NEXT:    vpblendvb %ymm4, 
%ymm1, %ymm7, %ymm1 +; X32-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; X32-NEXT:    vpmovb2m %zmm0, %k0 +; X32-NEXT:    movl %eax, %ecx +; X32-NEXT:    andb $2, %cl +; X32-NEXT:    shrb %cl +; X32-NEXT:    kmovd %ecx, %k1 +; X32-NEXT:    vpmovm2b %k1, %zmm0 +; X32-NEXT:    vpsllw $8, %xmm0, %xmm0 +; X32-NEXT:    vpmovm2b %k0, %zmm1 +; X32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4 +; X32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; X32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0 +; X32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; X32-NEXT:    vpmovb2m %zmm0, %k0 +; X32-NEXT:    movl %eax, %ecx +; X32-NEXT:    andb $15, %cl +; X32-NEXT:    movl %ecx, %edx +; X32-NEXT:    shrb $2, %dl +; X32-NEXT:    kmovd %edx, %k1 +; X32-NEXT:    vpmovm2b %k1, %zmm0 +; X32-NEXT:    vpbroadcastw %xmm0, %xmm0 +; X32-NEXT:    vpmovm2b %k0, %zmm1 +; X32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4 +; X32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; X32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0 +; X32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; X32-NEXT:    vpmovb2m %zmm0, %k0 +; X32-NEXT:    shrb $3, %cl +; X32-NEXT:    kmovd %ecx, %k1 +; X32-NEXT:    vpmovm2b %k1, %zmm0 +; X32-NEXT:    vpslld $24, %xmm0, %xmm0 +; X32-NEXT:    vpmovm2b %k0, %zmm1 +; X32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4 +; X32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; X32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0 +; X32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; X32-NEXT:    vpmovb2m %zmm0, %k0 +; X32-NEXT:    movl %eax, %ecx +; X32-NEXT:    shrb $4, %cl +; X32-NEXT:    kmovd %ecx, %k1 +; X32-NEXT:    vpmovm2b %k1, %zmm0 +; X32-NEXT:    vpbroadcastd %xmm0, %xmm0 +; X32-NEXT:    vpmovm2b %k0, %zmm1 +; X32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4 +; X32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; X32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0 +; X32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; X32-NEXT:    vpmovb2m %zmm0, %k0 +; X32-NEXT:    movl %eax, %ecx +; X32-NEXT:    shrb $5, %cl +; X32-NEXT:    andb $1, %cl +; X32-NEXT:    kmovd %ecx, %k1 +; X32-NEXT:    vpmovm2b %k1, %zmm0 +; X32-NEXT:    vpsllq $40, %xmm0, %xmm0 +; X32-NEXT:    vpmovm2b %k0, %zmm1 +; X32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4 +; X32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; X32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0 +; X32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; X32-NEXT:    vpmovb2m %zmm0, %k0 +; X32-NEXT:    movl %eax, %ecx +; X32-NEXT:    shrb $6, %cl +; X32-NEXT:    kmovd %ecx, %k1 +; X32-NEXT:    vpmovm2b %k1, %zmm0 +; X32-NEXT:    vpbroadcastw %xmm0, %xmm0 +; X32-NEXT:    vpmovm2b %k0, %zmm1 +; X32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4 +; X32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; X32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0 +; X32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; X32-NEXT:    
vpmovb2m %zmm0, %k0 +; X32-NEXT:    movl %eax, %ecx +; X32-NEXT:    shrb $7, %cl +; X32-NEXT:    kmovd %ecx, %k1 +; X32-NEXT:    vpmovm2b %k1, %zmm0 +; X32-NEXT:    vpsllq $56, %xmm0, %xmm0 +; X32-NEXT:    vpmovm2b %k0, %zmm1 +; X32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4 +; X32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; X32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0 +; X32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; X32-NEXT:    vpmovb2m %zmm0, %k0 +; X32-NEXT:    movb %ah, %cl +; X32-NEXT:    kmovd %ecx, %k1 +; X32-NEXT:    vpmovm2b %k1, %zmm0 +; X32-NEXT:    vpbroadcastq %xmm0, %xmm0 +; X32-NEXT:    vpmovm2b %k0, %zmm1 +; X32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4 +; X32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; X32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0 +; X32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; X32-NEXT:    vpmovb2m %zmm0, %k0 +; X32-NEXT:    andb $2, %cl +; X32-NEXT:    shrb %cl +; X32-NEXT:    kmovd %ecx, %k1 +; X32-NEXT:    vpmovm2b %k1, %zmm0 +; X32-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6] +; X32-NEXT:    vpmovm2b %k0, %zmm1 +; X32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4 +; X32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; X32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0 +; X32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; X32-NEXT:    vpmovb2m %zmm0, %k0 +; X32-NEXT:    movb %ah, %cl +; X32-NEXT:    andb $15, %cl +; X32-NEXT:    movl %ecx, %edx +; X32-NEXT:    shrb $2, %dl +; X32-NEXT:    kmovd %edx, %k1 +; X32-NEXT:    vpmovm2b %k1, %zmm0 +; X32-NEXT:    vpbroadcastw %xmm0, %xmm0 +; X32-NEXT:    vpmovm2b %k0, %zmm1 +; X32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4 +; X32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; X32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0 +; X32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; X32-NEXT:    vpmovb2m %zmm0, %k0 +; X32-NEXT:    shrb $3, %cl +; X32-NEXT:    kmovd %ecx, %k1 +; X32-NEXT:    vpmovm2b %k1, %zmm0 +; X32-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4] +; X32-NEXT:    vpmovm2b %k0, %zmm1 +; X32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4 +; X32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; X32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0 +; X32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; X32-NEXT:    vpmovb2m %zmm0, %k0 +; X32-NEXT:    movl %eax, %ecx +; X32-NEXT:    andl $61440, %ecx # imm = 0xF000 +; X32-NEXT:    shrl $12, %ecx +; X32-NEXT:    kmovd %ecx, %k1 +; X32-NEXT:    vpmovm2b %k1, %zmm0 +; X32-NEXT:    vpbroadcastd %xmm0, %xmm0 +; X32-NEXT:    vpmovm2b %k0, %zmm1 +; X32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4 +; X32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; X32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0 +; X32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; X32-NEXT:   
 vpmovb2m %zmm0, %k0 +; X32-NEXT:    movl %eax, %ecx +; X32-NEXT:    shrl $13, %ecx +; X32-NEXT:    andb $1, %cl +; X32-NEXT:    kmovd %ecx, %k1 +; X32-NEXT:    vpmovm2b %k1, %zmm0 +; X32-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2] +; X32-NEXT:    vpmovm2b %k0, %zmm1 +; X32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4 +; X32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; X32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0 +; X32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; X32-NEXT:    vpmovb2m %zmm0, %k0 +; X32-NEXT:    movl %eax, %ecx +; X32-NEXT:    andl $49152, %ecx # imm = 0xC000 +; X32-NEXT:    shrl $14, %ecx +; X32-NEXT:    kmovd %ecx, %k1 +; X32-NEXT:    vpmovm2b %k1, %zmm0 +; X32-NEXT:    vpbroadcastw %xmm0, %xmm0 +; X32-NEXT:    vpmovm2b %k0, %zmm1 +; X32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4 +; X32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; X32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0 +; X32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; X32-NEXT:    vpmovb2m %zmm0, %k0 +; X32-NEXT:    movl %eax, %ecx +; X32-NEXT:    andl $32768, %ecx # imm = 0x8000 +; X32-NEXT:    shrl $15, %ecx +; X32-NEXT:    kmovd %ecx, %k1 +; X32-NEXT:    vpmovm2b %k1, %zmm0 +; X32-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0] +; X32-NEXT:    vpmovm2b %k0, %zmm1 +; X32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4 +; X32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; X32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0 +; X32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; X32-NEXT:    vpmovb2m %zmm0, %k0 +; X32-NEXT:    movl %eax, %ecx +; X32-NEXT:    shrl $16, %ecx +; X32-NEXT:    kmovd %ecx, %k1 +; X32-NEXT:    vpmovm2b %k1, %zmm0 +; X32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0 +; X32-NEXT:    vpmovm2b %k0, %zmm1 +; X32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4 +; X32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; X32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0 +; X32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; X32-NEXT:    vpmovb2m %zmm0, %k0 +; X32-NEXT:    movl %ecx, %edx +; X32-NEXT:    andb $2, %dl +; X32-NEXT:    shrb %dl +; X32-NEXT:    kmovd %edx, %k1 +; X32-NEXT:    vpmovm2b %k1, %zmm0 +; X32-NEXT:    vpsllw $8, %xmm0, %xmm0 +; X32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0 +; X32-NEXT:    vpmovm2b %k0, %zmm1 +; X32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4 +; X32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; X32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0 +; X32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; X32-NEXT:    vpmovb2m %zmm0, %k0 +; X32-NEXT:    movl %ecx, %edx +; X32-NEXT:    andb $15, %dl +; X32-NEXT:    movl %edx, %ebx +; X32-NEXT:    shrb $2, %bl +; X32-NEXT:    kmovd %ebx, %k1 +; X32-NEXT:    vpmovm2b %k1, %zmm0 +; X32-NEXT:    vpbroadcastw %xmm0, %xmm0 +; X32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0 +; X32-NEXT:    vpmovm2b %k0, %zmm1 +; X32-NEXT:    
vextracti64x4 $1, %zmm1, %ymm4 +; X32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255] +; X32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0 +; X32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; X32-NEXT:    vpmovb2m %zmm0, %k0 +; X32-NEXT:    shrb $3, %dl +; X32-NEXT:    kmovd %edx, %k1 +; X32-NEXT:    vpmovm2b %k1, %zmm0 +; X32-NEXT:    vpslld $24, %xmm0, %xmm0 +; X32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0 +; X32-NEXT:    vpmovm2b %k0, %zmm1 +; X32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4 +; X32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255] +; X32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0 +; X32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; X32-NEXT:    vpmovb2m %zmm0, %k0 +; X32-NEXT:    movl %ecx, %edx +; X32-NEXT:    shrb $4, %dl +; X32-NEXT:    kmovd %edx, %k1 +; X32-NEXT:    vpmovm2b %k1, %zmm0 +; X32-NEXT:    vpbroadcastd %xmm0, %xmm0 +; X32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0 +; X32-NEXT:    vpmovm2b %k0, %zmm1 +; X32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4 +; X32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255] +; X32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0 +; X32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; X32-NEXT:    vpmovb2m %zmm0, %k0 +; X32-NEXT:    movl %ecx, %edx +; X32-NEXT:    shrb $5, %dl +; X32-NEXT:    andb $1, %dl +; X32-NEXT:    kmovd %edx, %k1 +; X32-NEXT:    vpmovm2b %k1, %zmm0 +; X32-NEXT:    vpsllq $40, %xmm0, %xmm0 +; X32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0 +; X32-NEXT:    vpmovm2b %k0, %zmm1 +; X32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4 +; X32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255] +; X32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0 +; X32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; X32-NEXT:    vpmovb2m %zmm0, %k0 +; X32-NEXT:    movl %ecx, %edx +; X32-NEXT:    shrb $6, %dl +; X32-NEXT:    kmovd %edx, %k1 +; X32-NEXT:    vpmovm2b %k1, %zmm0 +; X32-NEXT:    vpbroadcastw %xmm0, %xmm0 +; X32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0 +; X32-NEXT:    vpmovm2b %k0, %zmm1 +; X32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4 +; X32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255] +; X32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0 +; X32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; X32-NEXT:    vpmovb2m %zmm0, %k0 +; X32-NEXT:    shrb $7, %cl +; X32-NEXT:    kmovd %ecx, %k1 +; X32-NEXT:    vpmovm2b %k1, %zmm0 +; X32-NEXT:    vpsllq $56, %xmm0, %xmm0 +; X32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0 +; X32-NEXT:    vpmovm2b %k0, %zmm1 +; X32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4 +; X32-NEXT:    vpblendvb %ymm6, %ymm4, %ymm0, %ymm0 +; X32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; X32-NEXT:    vpmovb2m %zmm0, %k0 +; X32-NEXT:    movl %eax, %ecx +; X32-NEXT:    shrl $24, %ecx +; X32-NEXT:    kmovd %ecx, %k1 +; X32-NEXT:    vpmovm2b %k1, %zmm0 +; X32-NEXT:    vpbroadcastq %xmm0, %ymm0 +; X32-NEXT:    vpmovm2b %k0, %zmm1 +; X32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4 +; X32-NEXT:    vpblendvb %ymm2, %ymm4, %ymm0, %ymm0 +; X32-NEXT:    vinserti64x4 
$1, %ymm0, %zmm1, %zmm0 +; X32-NEXT:    vpmovb2m %zmm0, %k0 +; X32-NEXT:    vpmovm2b %k0, %zmm2 +; X32-NEXT:    movl %ecx, %edx +; X32-NEXT:    andb $2, %dl +; X32-NEXT:    shrb %dl +; X32-NEXT:    kmovd %edx, %k0 +; X32-NEXT:    vpmovm2b %k0, %zmm0 +; X32-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6] +; X32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0 +; X32-NEXT:    vextracti64x4 $1, %zmm2, %ymm1 +; X32-NEXT:    vpblendvb %ymm5, %ymm1, %ymm0, %ymm1 +; X32-NEXT:    andb $15, %cl +; X32-NEXT:    movl %ecx, %edx +; X32-NEXT:    shrb $2, %dl +; X32-NEXT:    kmovd %edx, %k0 +; X32-NEXT:    shrb $3, %cl +; X32-NEXT:    kmovd %ecx, %k1 +; X32-NEXT:    movl %eax, %ecx +; X32-NEXT:    shrl $29, %ecx +; X32-NEXT:    andb $1, %cl +; X32-NEXT:    kmovd %ecx, %k2 +; X32-NEXT:    vinserti64x4 $1, %ymm1, %zmm2, %zmm0 +; X32-NEXT:    vpmovb2m %zmm0, %k3 +; X32-NEXT:    vpmovm2b %k3, %zmm0 +; X32-NEXT:    vextracti64x4 $1, %zmm0, %ymm1 +; X32-NEXT:    vpmovm2b %k0, %zmm2 +; X32-NEXT:    vpbroadcastw %xmm2, %xmm2 +; X32-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2 +; X32-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255] +; X32-NEXT:    vpblendvb %ymm4, %ymm1, %ymm2, %ymm1 +; X32-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; X32-NEXT:    vpmovb2m %zmm0, %k0 +; X32-NEXT:    vpmovm2b %k0, %zmm0 +; X32-NEXT:    vextracti64x4 $1, %zmm0, %ymm1 +; X32-NEXT:    vpmovm2b %k1, %zmm2 +; X32-NEXT:    vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4] +; X32-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2 +; X32-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255] +; X32-NEXT:    vpblendvb %ymm4, %ymm1, %ymm2, %ymm1 +; X32-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; X32-NEXT:    vpmovb2m %zmm0, %k0 +; X32-NEXT:    vpmovm2b %k0, %zmm0 +; X32-NEXT:    vextracti64x4 $1, %zmm0, %ymm1 +; X32-NEXT:    movl %eax, %ecx +; X32-NEXT:    shrl $28, %ecx +; X32-NEXT:    kmovd %ecx, %k0 +; X32-NEXT:    vpmovm2b %k0, %zmm2 +; X32-NEXT:    vpbroadcastd %xmm2, %xmm2 +; X32-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2 +; X32-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255] +; X32-NEXT:    vpblendvb %ymm4, %ymm1, %ymm2, %ymm1 +; X32-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; X32-NEXT:    vpmovb2m %zmm0, %k0 +; X32-NEXT:    vpmovm2b %k0, %zmm0 +; X32-NEXT:    vextracti64x4 $1, %zmm0, %ymm1 +; X32-NEXT:    vpmovm2b %k2, %zmm2 +; X32-NEXT:    vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2] +; X32-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2 +; X32-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255] +; X32-NEXT:    vpblendvb %ymm4, %ymm1, %ymm2, %ymm1 +; X32-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; X32-NEXT:    vpmovb2m %zmm0, %k0 +; X32-NEXT:    vpmovm2b %k0, %zmm0 +; X32-NEXT:    vextracti64x4 $1, %zmm0, %ymm1 +; X32-NEXT:    movl %eax, %ecx +; X32-NEXT:    shrl $30, %ecx +; X32-NEXT:    kmovd %ecx, %k0 +; X32-NEXT:    vpmovm2b %k0, %zmm2 +; X32-NEXT:    vpbroadcastw %xmm2, %xmm2 +; X32-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2 +; 
X32-NEXT:    vpblendvb %ymm3, %ymm1, %ymm2, %ymm1
+; X32-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; X32-NEXT:    shrl $31, %eax
+; X32-NEXT:    kmovd %eax, %k0
+; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
+; X32-NEXT:    vpmovb2m %zmm0, %k1
+; X32-NEXT:    kshiftlq $1, %k1, %k1
+; X32-NEXT:    kshiftrq $1, %k1, %k1
+; X32-NEXT:    kshiftlq $63, %k0, %k0
+; X32-NEXT:    korq %k0, %k1, %k1
+; X32-NEXT:    vpbroadcastb %eax, %zmm0 {%k1} {z}
+; X32-NEXT:    popl %ebx
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm512_maskz_set1_epi8:
+; X64:       # BB#0: # %entry
+; X64-NEXT:    kmovq %rdi, %k1
+; X64-NEXT:    vpbroadcastb %esi, %zmm0 {%k1} {z}
+; X64-NEXT:    retq
+entry:
+  %vecinit.i.i = insertelement <64 x i8> undef, i8 %__A, i32 0
+  %vecinit63.i.i = shufflevector <64 x i8> %vecinit.i.i, <64 x i8> undef, <64 x i32> zeroinitializer
+  %0 = bitcast i64 %__M to <64 x i1>
+  %1 = select <64 x i1> %0, <64 x i8> %vecinit63.i.i, <64 x i8> zeroinitializer
+  %2 = bitcast <64 x i8> %1 to <8 x i64>
+  ret <8 x i64> %2
+}
+
+define <8 x i64> @test_mm512_mask_set1_epi16(<8 x i64> %__O, i32 %__M, i16 signext %__A) {
+; X32-LABEL: test_mm512_mask_set1_epi16:
+; X32:       # BB#0: # %entry
+; X32-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    kmovd %ecx, %k1
+; X32-NEXT:    vpbroadcastw %eax, %zmm0 {%k1}
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm512_mask_set1_epi16:
+; X64:       # BB#0: # %entry
+; X64-NEXT:    kmovd %edi, %k1
+; X64-NEXT:    vpbroadcastw %esi, %zmm0 {%k1}
+; X64-NEXT:    retq
+entry:
+  %vecinit.i.i = insertelement <32 x i16> undef, i16 %__A, i32 0
+  %vecinit31.i.i = shufflevector <32 x i16> %vecinit.i.i, <32 x i16> undef, <32 x i32> zeroinitializer
+  %0 = bitcast <8 x i64> %__O to <32 x i16>
+  %1 = bitcast i32 %__M to <32 x i1>
+  %2 = select <32 x i1> %1, <32 x i16> %vecinit31.i.i, <32 x i16> %0
+  %3 = bitcast <32 x i16> %2 to <8 x i64>
+  ret <8 x i64> %3
+}
+
+define <8 x i64> @test_mm512_maskz_set1_epi16(i32 %__M, i16 signext %__A) {
+; X32-LABEL: test_mm512_maskz_set1_epi16:
+; X32:       # BB#0: # %entry
+; X32-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    kmovd %ecx, %k1
+; X32-NEXT:    vpbroadcastw %eax, %zmm0 {%k1} {z}
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm512_maskz_set1_epi16:
+; X64:       # BB#0: # %entry
+; X64-NEXT:    kmovd %edi, %k1
+; X64-NEXT:    vpbroadcastw %esi, %zmm0 {%k1} {z}
+; X64-NEXT:    retq
+entry:
+  %vecinit.i.i = insertelement <32 x i16> undef, i16 %__A, i32 0
+  %vecinit31.i.i = shufflevector <32 x i16> %vecinit.i.i, <32 x i16> undef, <32 x i32> zeroinitializer
+  %0 = bitcast i32 %__M to <32 x i1>
+  %1 = select <32 x i1> %0, <32 x i16> %vecinit31.i.i, <32 x i16> zeroinitializer
+  %2 = bitcast <32 x i16> %1 to <8 x i64>
+  ret <8 x i64> %2
+}
+
 define <8 x i64> @test_mm512_broadcastb_epi8(<2 x i64> %a0) {
 ; X32-LABEL: test_mm512_broadcastb_epi8:
 ; X32:       # BB#0:
diff --git a/llvm/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll
index cb0061188ae..ec9481568b7 100644
--- a/llvm/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll
+++ b/llvm/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll
@@ -2,6 +2,68 @@
 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512BW
 ; RUN: llc < %s -mtriple=i386-unknown-linux-gnu -mcpu=knl -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512F-32
+
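The IR bodies in the fast-isel tests above are the whole story of this lowering: a masked set1 is no longer a call to a target intrinsic but plain IR, where the scalar is splatted with an insertelement/shufflevector pair and then chosen lane by lane against the passthru (or zero) vector under the bitcast mask. A minimal sketch of that pattern for the 512-bit byte case follows; the @splat_select_b_512 name and the %passthru parameter are illustrative only, not names from the patch:

; Splat i8 %val into all 64 lanes, then merge with %passthru under %mask.
define <64 x i8> @splat_select_b_512(i8 %val, <64 x i8> %passthru, i64 %mask) {
  %ins = insertelement <64 x i8> undef, i8 %val, i32 0
  %splat = shufflevector <64 x i8> %ins, <64 x i8> undef, <64 x i32> zeroinitializer
  %m = bitcast i64 %mask to <64 x i1>              ; one mask bit per byte lane
  %sel = select <64 x i1> %m, <64 x i8> %splat, <64 x i8> %passthru
  ret <64 x i8> %sel
}

Substituting zeroinitializer for %passthru gives the zeroing ({z}) variant. On an AVX512BW target the backend contracts the whole sequence into a single GPR-source vpbroadcastb under a %k-register write mask, which is exactly what the X64 CHECK lines above expect; the upgrade tests added below keep the removed llvm.x86.avx512.mask.pbroadcast.*.gpr.* intrinsics covered through AutoUpgrade.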
+declare <64 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.512(i8, <64 x i8>, i64)
+
+define <64 x i8>@test_int_x86_avx512_mask_pbroadcast_b_gpr_512(i8 %x0, <64 x i8> %x1, i64 %mask) {
+; AVX512BW-LABEL: test_int_x86_avx512_mask_pbroadcast_b_gpr_512:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    vpbroadcastb %edi, %zmm1
+; AVX512BW-NEXT:    kmovq %rsi, %k1
+; AVX512BW-NEXT:    vpbroadcastb %edi, %zmm0 {%k1}
+; AVX512BW-NEXT:    vpaddb %zmm0, %zmm1, %zmm0
+; AVX512BW-NEXT:    vpbroadcastb %edi, %zmm1 {%k1} {z}
+; AVX512BW-NEXT:    vpaddb %zmm0, %zmm1, %zmm0
+; AVX512BW-NEXT:    retq
+;
+; AVX512F-32-LABEL: test_int_x86_avx512_mask_pbroadcast_b_gpr_512:
+; AVX512F-32:       # BB#0:
+; AVX512F-32-NEXT:    movb {{[0-9]+}}(%esp), %al
+; AVX512F-32-NEXT:    vpbroadcastb %eax, %zmm1
+; AVX512F-32-NEXT:    kmovq {{[0-9]+}}(%esp), %k1
+; AVX512F-32-NEXT:    vpbroadcastb %eax, %zmm0 {%k1}
+; AVX512F-32-NEXT:    vpaddb %zmm0, %zmm1, %zmm0
+; AVX512F-32-NEXT:    vpbroadcastb %eax, %zmm1 {%k1} {z}
+; AVX512F-32-NEXT:    vpaddb %zmm0, %zmm1, %zmm0
+; AVX512F-32-NEXT:    retl
+  %res = call <64 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.512(i8 %x0, <64 x i8> %x1, i64 -1)
+  %res1 = call <64 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.512(i8 %x0, <64 x i8> %x1, i64 %mask)
+  %res2 = call <64 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.512(i8 %x0, <64 x i8> zeroinitializer, i64 %mask)
+  %res3 = add <64 x i8> %res, %res1
+  %res4 = add <64 x i8> %res2, %res3
+  ret <64 x i8> %res4
+}
+
+declare <32 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.512(i16, <32 x i16>, i32)
+define <32 x i16>@test_int_x86_avx512_mask_pbroadcast_w_gpr_512(i16 %x0, <32 x i16> %x1, i32 %mask) {
+; AVX512BW-LABEL: test_int_x86_avx512_mask_pbroadcast_w_gpr_512:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    vpbroadcastw %edi, %zmm1
+; AVX512BW-NEXT:    kmovd %esi, %k1
+; AVX512BW-NEXT:    vpbroadcastw %edi, %zmm0 {%k1}
+; AVX512BW-NEXT:    vpaddw %zmm0, %zmm1, %zmm0
+; AVX512BW-NEXT:    vpbroadcastw %edi, %zmm1 {%k1} {z}
+; AVX512BW-NEXT:    vpaddw %zmm0, %zmm1, %zmm0
+; AVX512BW-NEXT:    retq
+;
+; AVX512F-32-LABEL: test_int_x86_avx512_mask_pbroadcast_w_gpr_512:
+; AVX512F-32:       # BB#0:
+; AVX512F-32-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
+; AVX512F-32-NEXT:    vpbroadcastw %eax, %zmm1
+; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
+; AVX512F-32-NEXT:    vpbroadcastw %eax, %zmm0 {%k1}
+; AVX512F-32-NEXT:    vpaddw %zmm0, %zmm1, %zmm0
+; AVX512F-32-NEXT:    vpbroadcastw %eax, %zmm1 {%k1} {z}
+; AVX512F-32-NEXT:    vpaddw %zmm0, %zmm1, %zmm0
+; AVX512F-32-NEXT:    retl
+  %res = call <32 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.512(i16 %x0, <32 x i16> %x1, i32 -1)
+  %res1 = call <32 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.512(i16 %x0, <32 x i16> %x1, i32 %mask)
+  %res2 = call <32 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.512(i16 %x0, <32 x i16> zeroinitializer, i32 %mask)
+  %res3 = add <32 x i16> %res, %res1
+  %res4 = add <32 x i16> %res2, %res3
+  ret <32 x i16> %res4
+}
+
 declare void @llvm.x86.avx512.mask.storeu.b.512(i8*, <64 x i8>, i64)
 
 define void@test_int_x86_avx512_mask_storeu_b_512(i8* %ptr1, i8* %ptr2, <64 x i8> %x1, i64 %x2) {
diff --git a/llvm/test/CodeGen/X86/avx512bw-intrinsics.ll b/llvm/test/CodeGen/X86/avx512bw-intrinsics.ll
index e9962a370af..f499ae50f54 100644
--- a/llvm/test/CodeGen/X86/avx512bw-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/avx512bw-intrinsics.ll
@@ -1815,71 +1815,6 @@ define i32@test_int_x86_avx512_ptestnm_w_512(<32 x i16> %x0, <32 x
i16> %x1, i32    ret i32 %res2  } -declare <64 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.512(i8, <64 x i8>, i64) - -define <64 x i8>@test_int_x86_avx512_mask_pbroadcast_b_gpr_512(i8 %x0, <64 x i8> %x1, i64 %mask) { -; AVX512BW-LABEL: test_int_x86_avx512_mask_pbroadcast_b_gpr_512: -; AVX512BW:       ## BB#0: -; AVX512BW-NEXT:    kmovq %rsi, %k1 -; AVX512BW-NEXT:    vpbroadcastb %edi, %zmm1 {%k1} {z} -; AVX512BW-NEXT:    vpbroadcastb %edi, %zmm0 {%k1} -; AVX512BW-NEXT:    vpbroadcastb %edi, %zmm2 -; AVX512BW-NEXT:    vpaddb %zmm0, %zmm2, %zmm0 -; AVX512BW-NEXT:    vpaddb %zmm0, %zmm1, %zmm0 -; AVX512BW-NEXT:    retq -; -; AVX512F-32-LABEL: test_int_x86_avx512_mask_pbroadcast_b_gpr_512: -; AVX512F-32:       # BB#0: -; AVX512F-32-NEXT:    movb {{[0-9]+}}(%esp), %al -; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k0 -; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 -; AVX512F-32-NEXT:    kunpckdq %k0, %k1, %k1 -; AVX512F-32-NEXT:    vpbroadcastb %eax, %zmm1 {%k1} {z} -; AVX512F-32-NEXT:    vpbroadcastb %eax, %zmm0 {%k1} -; AVX512F-32-NEXT:    vpbroadcastb %eax, %zmm2 -; AVX512F-32-NEXT:    vpaddb %zmm0, %zmm2, %zmm0 -; AVX512F-32-NEXT:    vpaddb %zmm0, %zmm1, %zmm0 -; AVX512F-32-NEXT:    retl -  %res = call <64 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.512(i8 %x0, <64 x i8> %x1, i64 -1) -  %res1 = call <64 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.512(i8 %x0, <64 x i8> %x1, i64 %mask) -  %res2 = call <64 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.512(i8 %x0, <64 x i8> zeroinitializer, i64 %mask) -  %res3 = add <64 x i8> %res, %res1 -  %res4 = add <64 x i8> %res2, %res3 -  ret <64 x i8> %res4 -} - -declare <32 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.512(i16, <32 x i16>, i32) - -define <32 x i16>@test_int_x86_avx512_mask_pbroadcast_w_gpr_512(i16 %x0, <32 x i16> %x1, i32 %mask) { -; AVX512BW-LABEL: test_int_x86_avx512_mask_pbroadcast_w_gpr_512: -; AVX512BW:       ## BB#0: -; AVX512BW-NEXT:    kmovd %esi, %k1 -; AVX512BW-NEXT:    vpbroadcastw %edi, %zmm1 {%k1} {z} -; AVX512BW-NEXT:    vpbroadcastw %edi, %zmm0 {%k1} -; AVX512BW-NEXT:    vpbroadcastw %edi, %zmm2 -; AVX512BW-NEXT:    vpaddw %zmm0, %zmm2, %zmm0 -; AVX512BW-NEXT:    vpaddw %zmm0, %zmm1, %zmm0 -; AVX512BW-NEXT:    retq -; -; AVX512F-32-LABEL: test_int_x86_avx512_mask_pbroadcast_w_gpr_512: -; AVX512F-32:       # BB#0: -; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 -; AVX512F-32-NEXT:    movzwl {{[0-9]+}}(%esp), %eax -; AVX512F-32-NEXT:    vpbroadcastw %eax, %zmm1 {%k1} {z} -; AVX512F-32-NEXT:    vpbroadcastw %eax, %zmm0 {%k1} -; AVX512F-32-NEXT:    vpbroadcastw %eax, %zmm2 -; AVX512F-32-NEXT:    vpaddw %zmm0, %zmm2, %zmm0 -; AVX512F-32-NEXT:    vpaddw %zmm0, %zmm1, %zmm0 -; AVX512F-32-NEXT:    retl -  %res = call <32 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.512(i16 %x0, <32 x i16> %x1, i32 -1) -  %res1 = call <32 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.512(i16 %x0, <32 x i16> %x1, i32 %mask) -  %res2 = call <32 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.512(i16 %x0, <32 x i16> zeroinitializer, i32 %mask) -  %res3 = add <32 x i16> %res, %res1 -  %res4 = add <32 x i16> %res2, %res3 -  ret <32 x i16> %res4 -} - -  define <32 x i16> @test_x86_avx512_psll_w_512(<32 x i16> %a0, <8 x i16> %a1) {  ; AVX512BW-LABEL: test_x86_avx512_psll_w_512:  ; AVX512BW:       ## BB#0: diff --git a/llvm/test/CodeGen/X86/avx512bwvl-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/avx512bwvl-intrinsics-fast-isel.ll index a911f6465e6..4edfb539839 100644 --- a/llvm/test/CodeGen/X86/avx512bwvl-intrinsics-fast-isel.ll +++ 
b/llvm/test/CodeGen/X86/avx512bwvl-intrinsics-fast-isel.ll @@ -4,6 +4,195 @@  ; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/avx512vlbw-builtins.c +define <2 x i64> @test_mm_mask_set1_epi8(<2 x i64> %__O, i16 zeroext %__M, i8 signext %__A) local_unnamed_addr #0 { +; X32-LABEL: test_mm_mask_set1_epi8: +; X32:       # BB#0: # %entry +; X32-NEXT:    movb {{[0-9]+}}(%esp), %al +; X32-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx +; X32-NEXT:    kmovd %ecx, %k1 +; X32-NEXT:    vpbroadcastb %eax, %xmm0 {%k1} +; X32-NEXT:    retl +; +; X64-LABEL: test_mm_mask_set1_epi8: +; X64:       # BB#0: # %entry +; X64-NEXT:    kmovd %edi, %k1 +; X64-NEXT:    vpbroadcastb %esi, %xmm0 {%k1} +; X64-NEXT:    retq +entry: +  %vecinit.i.i = insertelement <16 x i8> undef, i8 %__A, i32 0 +  %vecinit15.i.i = shufflevector <16 x i8> %vecinit.i.i, <16 x i8> undef, <16 x i32> zeroinitializer +  %0 = bitcast <2 x i64> %__O to <16 x i8> +  %1 = bitcast i16 %__M to <16 x i1> +  %2 = select <16 x i1> %1, <16 x i8> %vecinit15.i.i, <16 x i8> %0 +  %3 = bitcast <16 x i8> %2 to <2 x i64> +  ret <2 x i64> %3 +} + +define <2 x i64> @test_mm_maskz_set1_epi8(i16 zeroext %__M, i8 signext %__A)  { +; X32-LABEL: test_mm_maskz_set1_epi8: +; X32:       # BB#0: # %entry +; X32-NEXT:    movb {{[0-9]+}}(%esp), %al +; X32-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx +; X32-NEXT:    kmovd %ecx, %k1 +; X32-NEXT:    vpbroadcastb %eax, %xmm0 {%k1} {z} +; X32-NEXT:    retl +; +; X64-LABEL: test_mm_maskz_set1_epi8: +; X64:       # BB#0: # %entry +; X64-NEXT:    kmovd %edi, %k1 +; X64-NEXT:    vpbroadcastb %esi, %xmm0 {%k1} {z} +; X64-NEXT:    retq +entry: +  %vecinit.i.i = insertelement <16 x i8> undef, i8 %__A, i32 0 +  %vecinit15.i.i = shufflevector <16 x i8> %vecinit.i.i, <16 x i8> undef, <16 x i32> zeroinitializer +  %0 = bitcast i16 %__M to <16 x i1> +  %1 = select <16 x i1> %0, <16 x i8> %vecinit15.i.i, <16 x i8> zeroinitializer +  %2 = bitcast <16 x i8> %1 to <2 x i64> +  ret <2 x i64> %2 +} + +define <4 x i64> @test_mm256_mask_set1_epi8(<4 x i64> %__O, i32 %__M, i8 signext %__A){ +; X32-LABEL: test_mm256_mask_set1_epi8: +; X32:       # BB#0: # %entry +; X32-NEXT:    movb {{[0-9]+}}(%esp), %al +; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT:    kmovd %ecx, %k1 +; X32-NEXT:    vpbroadcastb %eax, %ymm0 {%k1} +; X32-NEXT:    retl +; +; X64-LABEL: test_mm256_mask_set1_epi8: +; X64:       # BB#0: # %entry +; X64-NEXT:    kmovd %edi, %k1 +; X64-NEXT:    vpbroadcastb %esi, %ymm0 {%k1} +; X64-NEXT:    retq +entry: +  %vecinit.i.i = insertelement <32 x i8> undef, i8 %__A, i32 0 +  %vecinit31.i.i = shufflevector <32 x i8> %vecinit.i.i, <32 x i8> undef, <32 x i32> zeroinitializer +  %0 = bitcast <4 x i64> %__O to <32 x i8> +  %1 = bitcast i32 %__M to <32 x i1> +  %2 = select <32 x i1> %1, <32 x i8> %vecinit31.i.i, <32 x i8> %0 +  %3 = bitcast <32 x i8> %2 to <4 x i64> +  ret <4 x i64> %3 +} + +define <4 x i64> @test_mm256_maskz_set1_epi8(i32 %__M, i8 signext %__A)  { +; X32-LABEL: test_mm256_maskz_set1_epi8: +; X32:       # BB#0: # %entry +; X32-NEXT:    movb {{[0-9]+}}(%esp), %al +; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT:    kmovd %ecx, %k1 +; X32-NEXT:    vpbroadcastb %eax, %ymm0 {%k1} {z} +; X32-NEXT:    retl +; +; X64-LABEL: test_mm256_maskz_set1_epi8: +; X64:       # BB#0: # %entry +; X64-NEXT:    kmovd %edi, %k1 +; X64-NEXT:    vpbroadcastb %esi, %ymm0 {%k1} {z} +; X64-NEXT:    retq +entry: +  %vecinit.i.i = insertelement <32 x i8> undef, i8 %__A, i32 0 +  %vecinit31.i.i = shufflevector <32 x i8> 
%vecinit.i.i, <32 x i8> undef, <32 x i32> zeroinitializer +  %0 = bitcast i32 %__M to <32 x i1> +  %1 = select <32 x i1> %0, <32 x i8> %vecinit31.i.i, <32 x i8> zeroinitializer +  %2 = bitcast <32 x i8> %1 to <4 x i64> +  ret <4 x i64> %2 +} + +define <4 x i64> @test_mm256_mask_set1_epi16(<4 x i64> %__O, i16 zeroext %__M, i16 signext %__A)  { +; X32-LABEL: test_mm256_mask_set1_epi16: +; X32:       # BB#0: # %entry +; X32-NEXT:    movzwl {{[0-9]+}}(%esp), %eax +; X32-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx +; X32-NEXT:    kmovd %ecx, %k1 +; X32-NEXT:    vpbroadcastw %eax, %ymm0 {%k1} +; X32-NEXT:    retl +; +; X64-LABEL: test_mm256_mask_set1_epi16: +; X64:       # BB#0: # %entry +; X64-NEXT:    kmovd %edi, %k1 +; X64-NEXT:    vpbroadcastw %esi, %ymm0 {%k1} +; X64-NEXT:    retq +entry: +  %vecinit.i.i = insertelement <16 x i16> undef, i16 %__A, i32 0 +  %vecinit15.i.i = shufflevector <16 x i16> %vecinit.i.i, <16 x i16> undef, <16 x i32> zeroinitializer +  %0 = bitcast <4 x i64> %__O to <16 x i16> +  %1 = bitcast i16 %__M to <16 x i1> +  %2 = select <16 x i1> %1, <16 x i16> %vecinit15.i.i, <16 x i16> %0 +  %3 = bitcast <16 x i16> %2 to <4 x i64> +  ret <4 x i64> %3 +} + +define <4 x i64> @test_mm256_maskz_set1_epi16(i16 zeroext %__M, i16 signext %__A) { +; X32-LABEL: test_mm256_maskz_set1_epi16: +; X32:       # BB#0: # %entry +; X32-NEXT:    movzwl {{[0-9]+}}(%esp), %eax +; X32-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx +; X32-NEXT:    kmovd %ecx, %k1 +; X32-NEXT:    vpbroadcastw %eax, %ymm0 {%k1} {z} +; X32-NEXT:    retl +; +; X64-LABEL: test_mm256_maskz_set1_epi16: +; X64:       # BB#0: # %entry +; X64-NEXT:    kmovd %edi, %k1 +; X64-NEXT:    vpbroadcastw %esi, %ymm0 {%k1} {z} +; X64-NEXT:    retq +entry: +  %vecinit.i.i = insertelement <16 x i16> undef, i16 %__A, i32 0 +  %vecinit15.i.i = shufflevector <16 x i16> %vecinit.i.i, <16 x i16> undef, <16 x i32> zeroinitializer +  %0 = bitcast i16 %__M to <16 x i1> +  %1 = select <16 x i1> %0, <16 x i16> %vecinit15.i.i, <16 x i16> zeroinitializer +  %2 = bitcast <16 x i16> %1 to <4 x i64> +  ret <4 x i64> %2 +} + +define <2 x i64> @test_mm_mask_set1_epi16(<2 x i64> %__O, i8 zeroext %__M, i16 signext %__A) { +; X32-LABEL: test_mm_mask_set1_epi16: +; X32:       # BB#0: # %entry +; X32-NEXT:    movzwl {{[0-9]+}}(%esp), %eax +; X32-NEXT:    movb {{[0-9]+}}(%esp), %cl +; X32-NEXT:    kmovd %ecx, %k1 +; X32-NEXT:    vpbroadcastw %eax, %xmm0 {%k1} +; X32-NEXT:    retl +; +; X64-LABEL: test_mm_mask_set1_epi16: +; X64:       # BB#0: # %entry +; X64-NEXT:    kmovd %edi, %k1 +; X64-NEXT:    vpbroadcastw %esi, %xmm0 {%k1} +; X64-NEXT:    retq +entry: +  %vecinit.i.i = insertelement <8 x i16> undef, i16 %__A, i32 0 +  %vecinit7.i.i = shufflevector <8 x i16> %vecinit.i.i, <8 x i16> undef, <8 x i32> zeroinitializer +  %0 = bitcast <2 x i64> %__O to <8 x i16> +  %1 = bitcast i8 %__M to <8 x i1> +  %2 = select <8 x i1> %1, <8 x i16> %vecinit7.i.i, <8 x i16> %0 +  %3 = bitcast <8 x i16> %2 to <2 x i64> +  ret <2 x i64> %3 +} + +define <2 x i64> @test_mm_maskz_set1_epi16(i8 zeroext %__M, i16 signext %__A) { +; X32-LABEL: test_mm_maskz_set1_epi16: +; X32:       # BB#0: # %entry +; X32-NEXT:    movzwl {{[0-9]+}}(%esp), %eax +; X32-NEXT:    movb {{[0-9]+}}(%esp), %cl +; X32-NEXT:    kmovd %ecx, %k1 +; X32-NEXT:    vpbroadcastw %eax, %xmm0 {%k1} {z} +; X32-NEXT:    retl +; +; X64-LABEL: test_mm_maskz_set1_epi16: +; X64:       # BB#0: # %entry +; X64-NEXT:    kmovd %edi, %k1 +; X64-NEXT:    vpbroadcastw %esi, %xmm0 {%k1} {z} +; X64-NEXT:    retq +entry: +  %vecinit.i.i = 
insertelement <8 x i16> undef, i16 %__A, i32 0 +  %vecinit7.i.i = shufflevector <8 x i16> %vecinit.i.i, <8 x i16> undef, <8 x i32> zeroinitializer +  %0 = bitcast i8 %__M to <8 x i1> +  %1 = select <8 x i1> %0, <8 x i16> %vecinit7.i.i, <8 x i16> zeroinitializer +  %2 = bitcast <8 x i16> %1 to <2 x i64> +  ret <2 x i64> %2 +} + +  define <2 x i64> @test_mm_broadcastb_epi8(<2 x i64> %a0) {  ; X32-LABEL: test_mm_broadcastb_epi8:  ; X32:       # BB#0: diff --git a/llvm/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll index beac8b81bc5..16497701c90 100644 --- a/llvm/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll +++ b/llvm/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll @@ -1,6 +1,90 @@  ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py  ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512bw -mattr=+avx512vl --show-mc-encoding| FileCheck %s +declare <16 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.128(i8, <16 x i8>, i16) + +define <16 x i8>@test_int_x86_avx512_mask_pbroadcast_b_gpr_128(i8 %x0, <16 x i8> %x1, i16 %mask) { +; CHECK-LABEL: test_int_x86_avx512_mask_pbroadcast_b_gpr_128: +; CHECK:       ## BB#0: +; CHECK-NEXT:    vpbroadcastb %edi, %xmm1 ## encoding: [0x62,0xf2,0x7d,0x08,0x7a,0xcf] +; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] +; CHECK-NEXT:    vpbroadcastb %edi, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x7a,0xc7] +; CHECK-NEXT:    vpaddb %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfc,0xc0] +; CHECK-NEXT:    vpbroadcastb %edi, %xmm1 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x7a,0xcf] +; CHECK-NEXT:    vpaddb %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfc,0xc0] +; CHECK-NEXT:    retq ## encoding: [0xc3] +  %res = call <16 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.128(i8 %x0, <16 x i8> %x1, i16 -1) +  %res1 = call <16 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.128(i8 %x0, <16 x i8> %x1, i16 %mask) +  %res2 = call <16 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.128(i8 %x0, <16 x i8> zeroinitializer, i16 %mask) +  %res3 = add <16 x i8> %res, %res1 +  %res4 = add <16 x i8> %res2, %res3 +  ret <16 x i8> %res4 +} + + +declare <8 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.128(i16, <8 x i16>, i8) + +define <8 x i16>@test_int_x86_avx512_mask_pbroadcast_w_gpr_128(i16 %x0, <8 x i16> %x1, i8 %mask) { +; CHECK-LABEL: test_int_x86_avx512_mask_pbroadcast_w_gpr_128: +; CHECK:       ## BB#0: +; CHECK-NEXT:    vpbroadcastw %edi, %xmm1 ## encoding: [0x62,0xf2,0x7d,0x08,0x7b,0xcf] +; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] +; CHECK-NEXT:    vpbroadcastw %edi, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x7b,0xc7] +; CHECK-NEXT:    vpaddw %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfd,0xc0] +; CHECK-NEXT:    vpbroadcastw %edi, %xmm1 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x7b,0xcf] +; CHECK-NEXT:    vpaddw %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfd,0xc0] +; CHECK-NEXT:    retq ## encoding: [0xc3] +  %res = call <8 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.128(i16 %x0, <8 x i16> %x1, i8 -1) +  %res1 = call <8 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.128(i16 %x0, <8 x i16> %x1, i8 %mask) +  %res2 = call <8 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.128(i16 %x0, <8 x i16> zeroinitializer, i8 %mask) +  %res3 = add <8 x i16> %res, %res1 +  %res4 = add <8 x i16> %res2, %res3 +  ret <8 x i16> %res4 +} + + + declare 
<32 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.256(i8, <32 x i8>, i32)
+
+define <32 x i8>@test_int_x86_avx512_mask_pbroadcast_b_gpr_256(i8 %x0, <32 x i8> %x1, i32 %mask) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pbroadcast_b_gpr_256:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vpbroadcastb %edi, %ymm1 ## encoding: [0x62,0xf2,0x7d,0x28,0x7a,0xcf]
+; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
+; CHECK-NEXT:    vpbroadcastb %edi, %ymm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x7a,0xc7]
+; CHECK-NEXT:    vpaddb %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfc,0xc0]
+; CHECK-NEXT:    vpbroadcastb %edi, %ymm1 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x7a,0xcf]
+; CHECK-NEXT:    vpaddb %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfc,0xc0]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %res = call <32 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.256(i8 %x0, <32 x i8> %x1, i32 -1)
+  %res1 = call <32 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.256(i8 %x0, <32 x i8> %x1, i32 %mask)
+  %res2 = call <32 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.256(i8 %x0, <32 x i8> zeroinitializer, i32 %mask)
+  %res3 = add <32 x i8> %res, %res1
+  %res4 = add <32 x i8> %res2, %res3
+  ret <32 x i8> %res4
+}
+
+
+declare <16 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.256(i16, <16 x i16>, i16)
+
+define <16 x i16>@test_int_x86_avx512_mask_pbroadcast_w_gpr_256(i16 %x0, <16 x i16> %x1, i16 %mask) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pbroadcast_w_gpr_256:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vpbroadcastw %edi, %ymm1 ## encoding: [0x62,0xf2,0x7d,0x28,0x7b,0xcf]
+; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
+; CHECK-NEXT:    vpbroadcastw %edi, %ymm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x7b,0xc7]
+; CHECK-NEXT:    vpaddw %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc0]
+; CHECK-NEXT:    vpbroadcastw %edi, %ymm1 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x7b,0xcf]
+; CHECK-NEXT:    vpaddw %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc0]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %res = call <16 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.256(i16 %x0, <16 x i16> %x1, i16 -1)
+  %res1 = call <16 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.256(i16 %x0, <16 x i16> %x1, i16 %mask)
+  %res2 = call <16 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.256(i16 %x0, <16 x i16> zeroinitializer, i16 %mask)
+  %res3 = add <16 x i16> %res, %res1
+  %res4 = add <16 x i16> %res2, %res3
+  ret <16 x i16> %res4
+}
+
 declare <32 x i8> @llvm.x86.avx512.pbroadcastb.256(<16 x i8>, <32 x i8>, i32)
 
 define <32 x i8>@test_int_x86_avx512_pbroadcastb_256(<16 x i8> %x0, <32 x i8> %x1, i32 %mask) {
diff --git a/llvm/test/CodeGen/X86/avx512bwvl-intrinsics.ll b/llvm/test/CodeGen/X86/avx512bwvl-intrinsics.ll
index 68fb257eaca..e2472bc4433 100644
--- a/llvm/test/CodeGen/X86/avx512bwvl-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/avx512bwvl-intrinsics.ll
@@ -2665,82 +2665,4 @@ define i16@test_int_x86_avx512_ptestnm_w_256(<16 x i16> %x0, <16 x i16> %x1, i16
   ret i16 %res2
 }
-declare <32 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.256(i8, <32 x i8>, i32)
-define <32 x i8>@test_int_x86_avx512_mask_pbroadcast_b_gpr_256(i8 %x0, <32 x i8> %x1, i32 %mask) {
-; CHECK-LABEL: test_int_x86_avx512_mask_pbroadcast_b_gpr_256:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
-; CHECK-NEXT:    vpbroadcastb 
%edi, %ymm1 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x7a,0xcf] -; CHECK-NEXT:    vpbroadcastb %edi, %ymm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x7a,0xc7] -; CHECK-NEXT:    vpbroadcastb %edi, %ymm2 ## encoding: [0x62,0xf2,0x7d,0x28,0x7a,0xd7] -; CHECK-NEXT:    vpaddb %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfc,0xc0] -; CHECK-NEXT:    vpaddb %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfc,0xc0] -; CHECK-NEXT:    retq ## encoding: [0xc3] -  %res = call <32 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.256(i8 %x0, <32 x i8> %x1, i32 -1) -  %res1 = call <32 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.256(i8 %x0, <32 x i8> %x1, i32 %mask) -  %res2 = call <32 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.256(i8 %x0, <32 x i8> zeroinitializer, i32 %mask) -  %res3 = add <32 x i8> %res, %res1 -  %res4 = add <32 x i8> %res2, %res3 -  ret <32 x i8> %res4 -} - -declare <16 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.128(i8, <16 x i8>, i16) - -define <16 x i8>@test_int_x86_avx512_mask_pbroadcast_b_gpr_128(i8 %x0, <16 x i8> %x1, i16 %mask) { -; CHECK-LABEL: test_int_x86_avx512_mask_pbroadcast_b_gpr_128: -; CHECK:       ## BB#0: -; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] -; CHECK-NEXT:    vpbroadcastb %edi, %xmm1 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x7a,0xcf] -; CHECK-NEXT:    vpbroadcastb %edi, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x7a,0xc7] -; CHECK-NEXT:    vpbroadcastb %edi, %xmm2 ## encoding: [0x62,0xf2,0x7d,0x08,0x7a,0xd7] -; CHECK-NEXT:    vpaddb %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfc,0xc0] -; CHECK-NEXT:    vpaddb %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfc,0xc0] -; CHECK-NEXT:    retq ## encoding: [0xc3] -  %res = call <16 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.128(i8 %x0, <16 x i8> %x1, i16 -1) -  %res1 = call <16 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.128(i8 %x0, <16 x i8> %x1, i16 %mask) -  %res2 = call <16 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.128(i8 %x0, <16 x i8> zeroinitializer, i16 %mask) -  %res3 = add <16 x i8> %res, %res1 -  %res4 = add <16 x i8> %res2, %res3 -  ret <16 x i8> %res4 -} - -declare <16 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.256(i16, <16 x i16>, i16) - -define <16 x i16>@test_int_x86_avx512_mask_pbroadcast_w_gpr_256(i16 %x0, <16 x i16> %x1, i16 %mask) { -; CHECK-LABEL: test_int_x86_avx512_mask_pbroadcast_w_gpr_256: -; CHECK:       ## BB#0: -; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] -; CHECK-NEXT:    vpbroadcastw %edi, %ymm1 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x7b,0xcf] -; CHECK-NEXT:    vpbroadcastw %edi, %ymm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x7b,0xc7] -; CHECK-NEXT:    vpbroadcastw %edi, %ymm2 ## encoding: [0x62,0xf2,0x7d,0x28,0x7b,0xd7] -; CHECK-NEXT:    vpaddw %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc0] -; CHECK-NEXT:    vpaddw %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc0] -; CHECK-NEXT:    retq ## encoding: [0xc3] -  %res = call <16 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.256(i16 %x0, <16 x i16> %x1, i16 -1) -  %res1 = call <16 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.256(i16 %x0, <16 x i16> %x1, i16 %mask) -  %res2 = call <16 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.256(i16 %x0, <16 x i16> zeroinitializer, i16 %mask) -  %res3 = add <16 x i16> %res, %res1 -  %res4 = add <16 x i16> %res2, %res3 -  ret <16 x i16> %res4 -} - -declare <8 x i16> 
@llvm.x86.avx512.mask.pbroadcast.w.gpr.128(i16, <8 x i16>, i8) - -define <8 x i16>@test_int_x86_avx512_mask_pbroadcast_w_gpr_128(i16 %x0, <8 x i16> %x1, i8 %mask) { -; CHECK-LABEL: test_int_x86_avx512_mask_pbroadcast_w_gpr_128: -; CHECK:       ## BB#0: -; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] -; CHECK-NEXT:    vpbroadcastw %edi, %xmm1 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x7b,0xcf] -; CHECK-NEXT:    vpbroadcastw %edi, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x7b,0xc7] -; CHECK-NEXT:    vpbroadcastw %edi, %xmm2 ## encoding: [0x62,0xf2,0x7d,0x08,0x7b,0xd7] -; CHECK-NEXT:    vpaddw %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc0] -; CHECK-NEXT:    vpaddw %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfd,0xc0] -; CHECK-NEXT:    retq ## encoding: [0xc3] -  %res = call <8 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.128(i16 %x0, <8 x i16> %x1, i8 -1) -  %res1 = call <8 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.128(i16 %x0, <8 x i16> %x1, i8 %mask) -  %res2 = call <8 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.128(i16 %x0, <8 x i16> zeroinitializer, i8 %mask) -  %res3 = add <8 x i16> %res, %res1 -  %res4 = add <8 x i16> %res2, %res3 -  ret <8 x i16> %res4 -} diff --git a/llvm/test/CodeGen/X86/avx512vl-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/avx512vl-intrinsics-fast-isel.ll index f517e3b0d5d..7d7964d7bc9 100644 --- a/llvm/test/CodeGen/X86/avx512vl-intrinsics-fast-isel.ll +++ b/llvm/test/CodeGen/X86/avx512vl-intrinsics-fast-isel.ll @@ -4,6 +4,207 @@  ; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/avx512vl-builtins.c +define <2 x i64> @test_mm_mask_set1_epi32(<2 x i64> %__O, i8 zeroext %__M)  { +; X32-LABEL: test_mm_mask_set1_epi32: +; X32:       # BB#0: # %entry +; X32-NEXT:    movb {{[0-9]+}}(%esp), %al +; X32-NEXT:    kmovw %eax, %k1 +; X32-NEXT:    vpbroadcastd {{\.LCPI.*}}, %xmm0 {%k1} +; X32-NEXT:    retl +; +; X64-LABEL: test_mm_mask_set1_epi32: +; X64:       # BB#0: # %entry +; X64-NEXT:    kmovw %edi, %k1 +; X64-NEXT:    vpbroadcastd {{.*}}(%rip), %xmm0 {%k1} +; X64-NEXT:    retq +entry: +  %0 = bitcast <2 x i64> %__O to <4 x i32> +  %1 = bitcast i8 %__M to <8 x i1> +  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> +  %2 = select <4 x i1> %extract.i, <4 x i32> <i32 5, i32 5, i32 5, i32 5>, <4 x i32> %0 +  %3 = bitcast <4 x i32> %2 to <2 x i64> +  ret <2 x i64> %3 +} + +define <2 x i64> @test_mm_maskz_set1_epi32(i8 zeroext %__M) { +; X32-LABEL: test_mm_maskz_set1_epi32: +; X32:       # BB#0: # %entry +; X32-NEXT:    movb {{[0-9]+}}(%esp), %al +; X32-NEXT:    kmovw %eax, %k1 +; X32-NEXT:    vpbroadcastd {{\.LCPI.*}}, %xmm0 {%k1} {z} +; X32-NEXT:    retl +; +; X64-LABEL: test_mm_maskz_set1_epi32: +; X64:       # BB#0: # %entry +; X64-NEXT:    kmovw %edi, %k1 +; X64-NEXT:    vpbroadcastd {{.*}}(%rip), %xmm0 {%k1} {z} +; X64-NEXT:    retq +entry: +  %0 = bitcast i8 %__M to <8 x i1> +  %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> +  %1 = select <4 x i1> %extract.i, <4 x i32> <i32 5, i32 5, i32 5, i32 5>, <4 x i32> zeroinitializer +  %2 = bitcast <4 x i32> %1 to <2 x i64> +  ret <2 x i64> %2 +} + +define <4 x i64> @test_mm256_mask_set1_epi32(<4 x i64> %__O, i8 zeroext %__M)  { +; X32-LABEL: test_mm256_mask_set1_epi32: +; X32:       # BB#0: # %entry +; X32-NEXT:    movb {{[0-9]+}}(%esp), %al +; X32-NEXT:    kmovw %eax, %k1 +; X32-NEXT:    vpbroadcastd 
{{\.LCPI.*}}, %ymm0 {%k1} +; X32-NEXT:    retl +; +; X64-LABEL: test_mm256_mask_set1_epi32: +; X64:       # BB#0: # %entry +; X64-NEXT:    kmovw %edi, %k1 +; X64-NEXT:    vpbroadcastd {{.*}}(%rip), %ymm0 {%k1} +; X64-NEXT:    retq +entry: +  %0 = bitcast <4 x i64> %__O to <8 x i32> +  %1 = bitcast i8 %__M to <8 x i1> +  %2 = select <8 x i1> %1, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>, <8 x i32> %0 +  %3 = bitcast <8 x i32> %2 to <4 x i64> +  ret <4 x i64> %3 +} + +define <4 x i64> @test_mm256_maskz_set1_epi32(i8 zeroext %__M)  { +; X32-LABEL: test_mm256_maskz_set1_epi32: +; X32:       # BB#0: # %entry +; X32-NEXT:    movb {{[0-9]+}}(%esp), %al +; X32-NEXT:    kmovw %eax, %k1 +; X32-NEXT:    vpbroadcastd {{\.LCPI.*}}, %ymm0 {%k1} {z} +; X32-NEXT:    retl +; +; X64-LABEL: test_mm256_maskz_set1_epi32: +; X64:       # BB#0: # %entry +; X64-NEXT:    kmovw %edi, %k1 +; X64-NEXT:    vpbroadcastd {{.*}}(%rip), %ymm0 {%k1} {z} +; X64-NEXT:    retq +entry: +  %0 = bitcast i8 %__M to <8 x i1> +  %1 = select <8 x i1> %0, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>, <8 x i32> zeroinitializer +  %2 = bitcast <8 x i32> %1 to <4 x i64> +  ret <4 x i64> %2 +} + +define <2 x i64> @test_mm_mask_set1_epi64(<2 x i64> %__O, i8 zeroext %__M, i64 %__A)  { +; X32-LABEL: test_mm_mask_set1_epi64: +; X32:       # BB#0: # %entry +; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax +; X32-NEXT:    movb {{[0-9]+}}(%esp), %cl +; X32-NEXT:    vmovd %eax, %xmm1 +; X32-NEXT:    vpbroadcastb %xmm1, %xmm1 +; X32-NEXT:    kmovw %ecx, %k1 +; X32-NEXT:    vmovdqa64 %xmm1, %xmm0 {%k1} +; X32-NEXT:    retl +; +; X64-LABEL: test_mm_mask_set1_epi64: +; X64:       # BB#0: # %entry +; X64-NEXT:    vmovd %esi, %xmm1 +; X64-NEXT:    vpbroadcastb %xmm1, %xmm1 +; X64-NEXT:    kmovw %edi, %k1 +; X64-NEXT:    vmovdqa64 %xmm1, %xmm0 {%k1} +; X64-NEXT:    retq +entry: +  %conv.i = trunc i64 %__A to i8 +  %vecinit.i.i = insertelement <16 x i8> undef, i8 %conv.i, i32 0 +  %vecinit15.i.i = shufflevector <16 x i8> %vecinit.i.i, <16 x i8> undef, <16 x i32> zeroinitializer +  %0 = bitcast <16 x i8> %vecinit15.i.i to <2 x i64> +  %1 = bitcast i8 %__M to <8 x i1> +  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1> +  %2 = select <2 x i1> %extract.i, <2 x i64> %0, <2 x i64> %__O +  ret <2 x i64> %2 +} + +define <2 x i64> @test_mm_maskz_set1_epi64(i8 zeroext %__M, i64 %__A)  { +; X32-LABEL: test_mm_maskz_set1_epi64: +; X32:       # BB#0: # %entry +; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax +; X32-NEXT:    movb {{[0-9]+}}(%esp), %cl +; X32-NEXT:    vmovd %eax, %xmm0 +; X32-NEXT:    vpbroadcastb %xmm0, %xmm0 +; X32-NEXT:    kmovw %ecx, %k1 +; X32-NEXT:    vmovdqa64 %xmm0, %xmm0 {%k1} {z} +; X32-NEXT:    retl +; +; X64-LABEL: test_mm_maskz_set1_epi64: +; X64:       # BB#0: # %entry +; X64-NEXT:    vmovd %esi, %xmm0 +; X64-NEXT:    vpbroadcastb %xmm0, %xmm0 +; X64-NEXT:    kmovw %edi, %k1 +; X64-NEXT:    vmovdqa64 %xmm0, %xmm0 {%k1} {z} +; X64-NEXT:    retq +entry: +  %conv.i = trunc i64 %__A to i8 +  %vecinit.i.i = insertelement <16 x i8> undef, i8 %conv.i, i32 0 +  %vecinit15.i.i = shufflevector <16 x i8> %vecinit.i.i, <16 x i8> undef, <16 x i32> zeroinitializer +  %0 = bitcast <16 x i8> %vecinit15.i.i to <2 x i64> +  %1 = bitcast i8 %__M to <8 x i1> +  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1> +  %2 = select <2 x i1> %extract.i, <2 x i64> %0, <2 x i64> zeroinitializer +  ret <2 x i64> %2 +} + + +define <4 x i64> @test_mm256_mask_set1_epi64(<4 
x i64> %__O, i8 zeroext %__M, i64 %__A) { +; X32-LABEL: test_mm256_mask_set1_epi64: +; X32:       # BB#0: # %entry +; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax +; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT:    movb {{[0-9]+}}(%esp), %dl +; X32-NEXT:    vmovd %ecx, %xmm1 +; X32-NEXT:    vpinsrd $1, %eax, %xmm1, %xmm1 +; X32-NEXT:    vpinsrd $2, %ecx, %xmm1, %xmm1 +; X32-NEXT:    vpinsrd $3, %eax, %xmm1, %xmm1 +; X32-NEXT:    vinserti128 $1, %xmm1, %ymm1, %ymm1 +; X32-NEXT:    kmovw %edx, %k1 +; X32-NEXT:    vmovdqa64 %ymm1, %ymm0 {%k1} +; X32-NEXT:    retl +; +; X64-LABEL: test_mm256_mask_set1_epi64: +; X64:       # BB#0: # %entry +; X64-NEXT:    kmovw %edi, %k1 +; X64-NEXT:    vpbroadcastq %rsi, %ymm0 {%k1} +; X64-NEXT:    retq +entry: +  %vecinit.i.i = insertelement <4 x i64> undef, i64 %__A, i32 0 +  %vecinit3.i.i = shufflevector <4 x i64> %vecinit.i.i, <4 x i64> undef, <4 x i32> zeroinitializer +  %0 = bitcast i8 %__M to <8 x i1> +  %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> +  %1 = select <4 x i1> %extract.i, <4 x i64> %vecinit3.i.i, <4 x i64> %__O +  ret <4 x i64> %1 +} + +define <4 x i64> @test_mm256_maskz_set1_epi64(i8 zeroext %__M, i64 %__A)  { +; X32-LABEL: test_mm256_maskz_set1_epi64: +; X32:       # BB#0: # %entry +; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax +; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT:    movb {{[0-9]+}}(%esp), %dl +; X32-NEXT:    vmovd %ecx, %xmm0 +; X32-NEXT:    vpinsrd $1, %eax, %xmm0, %xmm0 +; X32-NEXT:    vpinsrd $2, %ecx, %xmm0, %xmm0 +; X32-NEXT:    vpinsrd $3, %eax, %xmm0, %xmm0 +; X32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0 +; X32-NEXT:    kmovw %edx, %k1 +; X32-NEXT:    vmovdqa64 %ymm0, %ymm0 {%k1} {z} +; X32-NEXT:    retl +; +; X64-LABEL: test_mm256_maskz_set1_epi64: +; X64:       # BB#0: # %entry +; X64-NEXT:    kmovw %edi, %k1 +; X64-NEXT:    vpbroadcastq %rsi, %ymm0 {%k1} {z} +; X64-NEXT:    retq +entry: +  %vecinit.i.i = insertelement <4 x i64> undef, i64 %__A, i32 0 +  %vecinit3.i.i = shufflevector <4 x i64> %vecinit.i.i, <4 x i64> undef, <4 x i32> zeroinitializer +  %0 = bitcast i8 %__M to <8 x i1> +  %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> +  %1 = select <4 x i1> %extract.i, <4 x i64> %vecinit3.i.i, <4 x i64> zeroinitializer +  ret <4 x i64> %1 +} +  define <2 x i64> @test_mm_broadcastd_epi32(<2 x i64> %a0) {  ; X32-LABEL: test_mm_broadcastd_epi32:  ; X32:       # BB#0: diff --git a/llvm/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll index 948142d6fb8..992c4a10887 100644 --- a/llvm/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll +++ b/llvm/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll @@ -1,6 +1,90 @@  ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py  ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512vl --show-mc-encoding| FileCheck %s +declare <4 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.128(i32, <4 x i32>, i8) + +define <4 x i32>@test_int_x86_avx512_mask_pbroadcast_d_gpr_128(i32 %x0, <4 x i32> %x1, i8 %mask) { +; CHECK-LABEL: test_int_x86_avx512_mask_pbroadcast_d_gpr_128: +; CHECK:       ## BB#0: +; CHECK-NEXT:    vpbroadcastd %edi, %xmm1 ## encoding: [0x62,0xf2,0x7d,0x08,0x7c,0xcf] +; CHECK-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] +; CHECK-NEXT:    vpbroadcastd %edi, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x7c,0xc7] +; CHECK-NEXT:    vpaddd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX 
Compression encoding: [0xc5,0xf1,0xfe,0xc0] +; CHECK-NEXT:    vpbroadcastd %edi, %xmm1 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x7c,0xcf] +; CHECK-NEXT:    vpaddd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0] +; CHECK-NEXT:    retq ## encoding: [0xc3] +  %res = call <4 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.128(i32 %x0, <4 x i32> %x1, i8 -1) +  %res1 = call <4 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.128(i32 %x0, <4 x i32> %x1, i8 %mask) +  %res2 = call <4 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.128(i32 %x0, <4 x i32> zeroinitializer, i8 %mask) +  %res3 = add <4 x i32> %res, %res1 +  %res4 = add <4 x i32> %res2, %res3 +  ret <4 x i32> %res4 +} + + +declare <2 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.128(i64, <2 x i64>, i8) + +define <2 x i64>@test_int_x86_avx512_mask_pbroadcast_q_gpr_128(i64 %x0, <2 x i64> %x1, i8 %mask) { +; CHECK-LABEL: test_int_x86_avx512_mask_pbroadcast_q_gpr_128: +; CHECK:       ## BB#0: +; CHECK-NEXT:    vpbroadcastq %rdi, %xmm1 ## encoding: [0x62,0xf2,0xfd,0x08,0x7c,0xcf] +; CHECK-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] +; CHECK-NEXT:    vpbroadcastq %rdi, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x7c,0xc7] +; CHECK-NEXT:    vpaddq %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc0] +; CHECK-NEXT:    vpbroadcastq %rdi, %xmm1 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0x89,0x7c,0xcf] +; CHECK-NEXT:    vpaddq %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc0] +; CHECK-NEXT:    retq ## encoding: [0xc3] +  %res = call <2 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.128(i64 %x0, <2 x i64> %x1,i8 -1) +  %res1 = call <2 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.128(i64 %x0, <2 x i64> %x1,i8 %mask) +  %res2 = call <2 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.128(i64 %x0, <2 x i64> zeroinitializer,i8 %mask) +  %res3 = add <2 x i64> %res, %res1 +  %res4 = add <2 x i64> %res2, %res3 +  ret <2 x i64> %res4 +} + + + declare <8 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.256(i32, <8 x i32>, i8) + +  define <8 x i32>@test_int_x86_avx512_mask_pbroadcast_d_gpr_256(i32 %x0, <8 x i32> %x1, i8 %mask) { +; CHECK-LABEL: test_int_x86_avx512_mask_pbroadcast_d_gpr_256: +; CHECK:       ## BB#0: +; CHECK-NEXT:    vpbroadcastd %edi, %ymm1 ## encoding: [0x62,0xf2,0x7d,0x28,0x7c,0xcf] +; CHECK-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] +; CHECK-NEXT:    vpbroadcastd %edi, %ymm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x7c,0xc7] +; CHECK-NEXT:    vpaddd %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xc0] +; CHECK-NEXT:    vpbroadcastd %edi, %ymm1 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x7c,0xcf] +; CHECK-NEXT:    vpaddd %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xc0] +; CHECK-NEXT:    retq ## encoding: [0xc3] +    %res = call <8 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.256(i32 %x0, <8 x i32> %x1, i8 -1) +    %res1 = call <8 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.256(i32 %x0, <8 x i32> %x1, i8 %mask) +    %res2 = call <8 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.256(i32 %x0, <8 x i32> zeroinitializer, i8 %mask) +    %res3 = add <8 x i32> %res, %res1 +    %res4 = add <8 x i32> %res2, %res3 +    ret <8 x i32> %res4 +  } + + declare <4 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.256(i64, <4 x i64>, i8) + +  define <4 x i64>@test_int_x86_avx512_mask_pbroadcast_q_gpr_256(i64 %x0, <4 x i64> %x1, i8 %mask) { +; CHECK-LABEL: 
test_int_x86_avx512_mask_pbroadcast_q_gpr_256: +; CHECK:       ## BB#0: +; CHECK-NEXT:    vpbroadcastq %rdi, %ymm1 ## encoding: [0x62,0xf2,0xfd,0x28,0x7c,0xcf] +; CHECK-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] +; CHECK-NEXT:    vpbroadcastq %rdi, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x7c,0xc7] +; CHECK-NEXT:    vpaddq %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0] +; CHECK-NEXT:    vpbroadcastq %rdi, %ymm1 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xa9,0x7c,0xcf] +; CHECK-NEXT:    vpaddq %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0] +; CHECK-NEXT:    retq ## encoding: [0xc3] +    %res = call <4 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.256(i64 %x0, <4 x i64> %x1,i8 -1) +    %res1 = call <4 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.256(i64 %x0, <4 x i64> %x1,i8 %mask) +    %res2 = call <4 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.256(i64 %x0, <4 x i64> zeroinitializer,i8 %mask) +    %res3 = add <4 x i64> %res, %res1 +    %res4 = add <4 x i64> %res2, %res3 +    ret <4 x i64> %res4 +  } + + +  declare <8 x i32> @llvm.x86.avx512.pbroadcastd.256(<4 x i32>, <8 x i32>, i8)  define <8 x i32>@test_int_x86_avx512_pbroadcastd_256(<4 x i32> %x0, <8 x i32> %x1, i8 %mask, i32 * %y_ptr) { @@ -3925,9 +4009,9 @@ define <8 x i32>@test_int_x86_avx512_mask_psrav8_si_const() {  ; CHECK:       ## BB#0:  ; CHECK-NEXT:    vmovdqa {{.*}}(%rip), %ymm0 ## EVEX TO VEX Compression ymm0 = [2,9,4294967284,23,4294967270,37,4294967256,51]  ; CHECK-NEXT:    ## encoding: [0xc5,0xfd,0x6f,0x05,A,A,A,A] -; CHECK-NEXT:    ## fixup A - offset: 4, value: LCPI276_0-4, kind: reloc_riprel_4byte +; CHECK-NEXT:    ## fixup A - offset: 4, value: LCPI280_0-4, kind: reloc_riprel_4byte  ; CHECK-NEXT:    vpsravd {{.*}}(%rip), %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x46,0x05,A,A,A,A] -; CHECK-NEXT:    ## fixup A - offset: 5, value: LCPI276_1-4, kind: reloc_riprel_4byte +; CHECK-NEXT:    ## fixup A - offset: 5, value: LCPI280_1-4, kind: reloc_riprel_4byte  ; CHECK-NEXT:    retq ## encoding: [0xc3]    %res = call <8 x i32> @llvm.x86.avx512.mask.psrav8.si(<8 x i32> <i32 2, i32 9, i32 -12, i32 23, i32 -26, i32 37, i32 -40, i32 51>, <8 x i32> <i32 1, i32 18, i32 35, i32 52, i32 69, i32 15, i32 32, i32 49>, <8 x i32> zeroinitializer, i8 -1)    ret <8 x i32> %res @@ -4508,9 +4592,9 @@ define <2 x i64>@test_int_x86_avx512_mask_psrav_q_128_const(i8 %x3) {  ; CHECK:       ## BB#0:  ; CHECK-NEXT:    vmovdqa {{.*}}(%rip), %xmm0 ## EVEX TO VEX Compression xmm0 = [2,18446744073709551607]  ; CHECK-NEXT:    ## encoding: [0xc5,0xf9,0x6f,0x05,A,A,A,A] -; CHECK-NEXT:    ## fixup A - offset: 4, value: LCPI304_0-4, kind: reloc_riprel_4byte +; CHECK-NEXT:    ## fixup A - offset: 4, value: LCPI308_0-4, kind: reloc_riprel_4byte  ; CHECK-NEXT:    vpsravq {{.*}}(%rip), %xmm0, %xmm0 ## encoding: [0x62,0xf2,0xfd,0x08,0x46,0x05,A,A,A,A] -; CHECK-NEXT:    ## fixup A - offset: 6, value: LCPI304_1-4, kind: reloc_riprel_4byte +; CHECK-NEXT:    ## fixup A - offset: 6, value: LCPI308_1-4, kind: reloc_riprel_4byte  ; CHECK-NEXT:    retq ## encoding: [0xc3]    %res = call <2 x i64> @llvm.x86.avx512.mask.psrav.q.128(<2 x i64> <i64 2, i64 -9>, <2 x i64> <i64 1, i64 90>, <2 x i64> zeroinitializer, i8 -1)    ret <2 x i64> %res diff --git a/llvm/test/CodeGen/X86/avx512vl-intrinsics.ll b/llvm/test/CodeGen/X86/avx512vl-intrinsics.ll index bc10be199d5..9098ca30897 100644 --- a/llvm/test/CodeGen/X86/avx512vl-intrinsics.ll +++ 
b/llvm/test/CodeGen/X86/avx512vl-intrinsics.ll @@ -4093,85 +4093,6 @@ define i8@test_int_x86_avx512_ptestnm_q_256(<4 x i64> %x0, <4 x i64> %x1, i8 %x2    ret i8 %res2  } -declare <8 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.256(i32, <8 x i32>, i8) - -define <8 x i32>@test_int_x86_avx512_mask_pbroadcast_d_gpr_256(i32 %x0, <8 x i32> %x1, i8 %mask) { -; CHECK-LABEL: test_int_x86_avx512_mask_pbroadcast_d_gpr_256: -; CHECK:       ## BB#0: -; CHECK-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] -; CHECK-NEXT:    vpbroadcastd %edi, %ymm1 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x7c,0xcf] -; CHECK-NEXT:    vpbroadcastd %edi, %ymm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x7c,0xc7] -; CHECK-NEXT:    vpbroadcastd %edi, %ymm2 ## encoding: [0x62,0xf2,0x7d,0x28,0x7c,0xd7] -; CHECK-NEXT:    vpaddd %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xc0] -; CHECK-NEXT:    vpaddd %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xc0] -; CHECK-NEXT:    retq ## encoding: [0xc3] -  %res = call <8 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.256(i32 %x0, <8 x i32> %x1, i8 -1) -  %res1 = call <8 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.256(i32 %x0, <8 x i32> %x1, i8 %mask) -  %res2 = call <8 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.256(i32 %x0, <8 x i32> zeroinitializer, i8 %mask) -  %res3 = add <8 x i32> %res, %res1 -  %res4 = add <8 x i32> %res2, %res3 -  ret <8 x i32> %res4 -} - -declare <4 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.128(i32, <4 x i32>, i8) - -define <4 x i32>@test_int_x86_avx512_mask_pbroadcast_d_gpr_128(i32 %x0, <4 x i32> %x1, i8 %mask) { -; CHECK-LABEL: test_int_x86_avx512_mask_pbroadcast_d_gpr_128: -; CHECK:       ## BB#0: -; CHECK-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] -; CHECK-NEXT:    vpbroadcastd %edi, %xmm1 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x7c,0xcf] -; CHECK-NEXT:    vpbroadcastd %edi, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x7c,0xc7] -; CHECK-NEXT:    vpbroadcastd %edi, %xmm2 ## encoding: [0x62,0xf2,0x7d,0x08,0x7c,0xd7] -; CHECK-NEXT:    vpaddd %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xc0] -; CHECK-NEXT:    vpaddd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0] -; CHECK-NEXT:    retq ## encoding: [0xc3] -  %res = call <4 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.128(i32 %x0, <4 x i32> %x1, i8 -1) -  %res1 = call <4 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.128(i32 %x0, <4 x i32> %x1, i8 %mask) -  %res2 = call <4 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.128(i32 %x0, <4 x i32> zeroinitializer, i8 %mask) -  %res3 = add <4 x i32> %res, %res1 -  %res4 = add <4 x i32> %res2, %res3 -  ret <4 x i32> %res4 -} - -declare <4 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.256(i64, <4 x i64>, i8) - -define <4 x i64>@test_int_x86_avx512_mask_pbroadcast_q_gpr_256(i64 %x0, <4 x i64> %x1, i8 %mask) { -; CHECK-LABEL: test_int_x86_avx512_mask_pbroadcast_q_gpr_256: -; CHECK:       ## BB#0: -; CHECK-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] -; CHECK-NEXT:    vpbroadcastq %rdi, %ymm1 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xa9,0x7c,0xcf] -; CHECK-NEXT:    vpbroadcastq %rdi, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x7c,0xc7] -; CHECK-NEXT:    vpbroadcastq %rdi, %ymm2 ## encoding: [0x62,0xf2,0xfd,0x28,0x7c,0xd7] -; CHECK-NEXT:    vpaddq %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xc0] -; CHECK-NEXT:    vpaddq %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression 
encoding: [0xc5,0xf5,0xd4,0xc0] -; CHECK-NEXT:    retq ## encoding: [0xc3] -  %res = call <4 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.256(i64 %x0, <4 x i64> %x1,i8 -1) -  %res1 = call <4 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.256(i64 %x0, <4 x i64> %x1,i8 %mask) -  %res2 = call <4 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.256(i64 %x0, <4 x i64> zeroinitializer,i8 %mask) -  %res3 = add <4 x i64> %res, %res1 -  %res4 = add <4 x i64> %res2, %res3 -  ret <4 x i64> %res4 -} - -declare <2 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.128(i64, <2 x i64>, i8) - -define <2 x i64>@test_int_x86_avx512_mask_pbroadcast_q_gpr_128(i64 %x0, <2 x i64> %x1, i8 %mask) { -; CHECK-LABEL: test_int_x86_avx512_mask_pbroadcast_q_gpr_128: -; CHECK:       ## BB#0: -; CHECK-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] -; CHECK-NEXT:    vpbroadcastq %rdi, %xmm1 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0x89,0x7c,0xcf] -; CHECK-NEXT:    vpbroadcastq %rdi, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x7c,0xc7] -; CHECK-NEXT:    vpbroadcastq %rdi, %xmm2 ## encoding: [0x62,0xf2,0xfd,0x08,0x7c,0xd7] -; CHECK-NEXT:    vpaddq %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xd4,0xc0] -; CHECK-NEXT:    vpaddq %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc0] -; CHECK-NEXT:    retq ## encoding: [0xc3] -  %res = call <2 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.128(i64 %x0, <2 x i64> %x1,i8 -1) -  %res1 = call <2 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.128(i64 %x0, <2 x i64> %x1,i8 %mask) -  %res2 = call <2 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.128(i64 %x0, <2 x i64> zeroinitializer,i8 %mask) -  %res3 = add <2 x i64> %res, %res1 -  %res4 = add <2 x i64> %res2, %res3 -  ret <2 x i64> %res4 -}  define <2 x i64> @test_x86_avx512_psra_q_128(<2 x i64> %a0, <2 x i64> %a1) {  | 
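The fast-isel tests added above all exercise the same IR shape: after this patch a masked set1 is expressed as plain IR (a splat built from insertelement/shufflevector, and an i8 mask bitcast to <8 x i1> and applied with select) instead of a target intrinsic. A minimal standalone sketch of that shape, distilled from test_mm_mask_set1_epi32 above (the function name sketch_mask_set1_epi32 is illustrative only, not part of the patch):

; Splat-and-select form of a masked set1, as in the 128-bit epi32 case:
; bitcast the i8 mask to <8 x i1>, keep the low four bits, and select
; between the splatted constant and the passthru operand.
define <4 x i32> @sketch_mask_set1_epi32(<4 x i32> %passthru, i8 %mask) {
entry:
  %m = bitcast i8 %mask to <8 x i1>
  %m.lo = shufflevector <8 x i1> %m, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %r = select <4 x i1> %m.lo, <4 x i32> <i32 5, i32 5, i32 5, i32 5>, <4 x i32> %passthru
  ret <4 x i32> %r
}

With AVX512VL enabled, instruction selection folds this back into a single masked vpbroadcastd, which is what the X64 CHECK lines of test_mm_mask_set1_epi32 assert.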