diff options
Diffstat (limited to 'llvm/test/CodeGen/X86/sttni.ll')
| -rw-r--r-- | llvm/test/CodeGen/X86/sttni.ll | 1337 | 
1 files changed, 1337 insertions, 0 deletions
diff --git a/llvm/test/CodeGen/X86/sttni.ll b/llvm/test/CodeGen/X86/sttni.ll new file mode 100644 index 00000000000..21a34969cfb --- /dev/null +++ b/llvm/test/CodeGen/X86/sttni.ll @@ -0,0 +1,1337 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=sse4.2 -mcpu=x86-64 | FileCheck %s --check-prefix=X32 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=sse4.2 -mcpu=x86-64 | FileCheck %s --check-prefix=X64 + +declare i32 @llvm.x86.sse42.pcmpestric128(<16 x i8> %lhs, i32, <16 x i8>, i32, i8) +declare i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %lhs, i32, <16 x i8>, i32, i8) +declare <16 x i8> @llvm.x86.sse42.pcmpestrm128(<16 x i8> %lhs, i32, <16 x i8>, i32, i8) +declare i32 @llvm.x86.sse42.pcmpistric128(<16 x i8> %lhs, <16 x i8>, i8) +declare i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %lhs, <16 x i8>, i8) +declare <16 x i8> @llvm.x86.sse42.pcmpistrm128(<16 x i8> %lhs, <16 x i8>, i8) + +define i1 @pcmpestri_reg_eq_i8(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len) nounwind { +; X32-LABEL: pcmpestri_reg_eq_i8: +; X32:       # %bb.0: # %entry +; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax +; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx +; X32-NEXT:    pcmpestri $24, %xmm1, %xmm0 +; X32-NEXT:    setae %al +; X32-NEXT:    retl +; +; X64-LABEL: pcmpestri_reg_eq_i8: +; X64:       # %bb.0: # %entry +; X64-NEXT:    movl %edi, %eax +; X64-NEXT:    movl %esi, %edx +; X64-NEXT:    pcmpestri $24, %xmm1, %xmm0 +; X64-NEXT:    setae %al +; X64-NEXT:    retq +entry: +  %c = call i32 @llvm.x86.sse42.pcmpestric128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24) +  %result = icmp eq i32 %c, 0 +  ret i1 %result +} + +define i32 @pcmpestri_reg_idx_i8(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len) nounwind { +; X32-LABEL: pcmpestri_reg_idx_i8: +; X32:       # %bb.0: # %entry +; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax +; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx +; 
X32-NEXT:    pcmpestri $24, %xmm1, %xmm0 +; X32-NEXT:    movl %ecx, %eax +; X32-NEXT:    retl +; +; X64-LABEL: pcmpestri_reg_idx_i8: +; X64:       # %bb.0: # %entry +; X64-NEXT:    movl %edi, %eax +; X64-NEXT:    movl %esi, %edx +; X64-NEXT:    pcmpestri $24, %xmm1, %xmm0 +; X64-NEXT:    movl %ecx, %eax +; X64-NEXT:    retq +entry: +  %idx = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24) +  ret i32 %idx +} + +define i32 @pcmpestri_reg_diff_i8(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len) nounwind { +; X32-LABEL: pcmpestri_reg_diff_i8: +; X32:       # %bb.0: # %entry +; X32-NEXT:    pushl %ebp +; X32-NEXT:    movl %esp, %ebp +; X32-NEXT:    andl $-16, %esp +; X32-NEXT:    subl $48, %esp +; X32-NEXT:    movl 8(%ebp), %eax +; X32-NEXT:    movl 12(%ebp), %edx +; X32-NEXT:    pcmpestri $24, %xmm1, %xmm0 +; X32-NEXT:    cmpl $16, %ecx +; X32-NEXT:    jne .LBB2_2 +; X32-NEXT:  # %bb.1: +; X32-NEXT:    xorl %eax, %eax +; X32-NEXT:    jmp .LBB2_3 +; X32-NEXT:  .LBB2_2: # %compare +; X32-NEXT:    movdqa %xmm0, (%esp) +; X32-NEXT:    andl $15, %ecx +; X32-NEXT:    movb (%esp,%ecx), %al +; X32-NEXT:    movdqa %xmm1, {{[0-9]+}}(%esp) +; X32-NEXT:    subb 16(%esp,%ecx), %al +; X32-NEXT:  .LBB2_3: # %exit +; X32-NEXT:    movzbl %al, %eax +; X32-NEXT:    movl %ebp, %esp +; X32-NEXT:    popl %ebp +; X32-NEXT:    retl +; +; X64-LABEL: pcmpestri_reg_diff_i8: +; X64:       # %bb.0: # %entry +; X64-NEXT:    movl %edi, %eax +; X64-NEXT:    movl %esi, %edx +; X64-NEXT:    pcmpestri $24, %xmm1, %xmm0 +; X64-NEXT:    # kill: def $ecx killed $ecx def $rcx +; X64-NEXT:    cmpl $16, %ecx +; X64-NEXT:    jne .LBB2_2 +; X64-NEXT:  # %bb.1: +; X64-NEXT:    xorl %eax, %eax +; X64-NEXT:    movzbl %al, %eax +; X64-NEXT:    retq +; X64-NEXT:  .LBB2_2: # %compare +; X64-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp) +; X64-NEXT:    andl $15, %ecx +; X64-NEXT:    movb -24(%rsp,%rcx), %al +; X64-NEXT:    movdqa %xmm1, -{{[0-9]+}}(%rsp) 
+; X64-NEXT:    subb -40(%rsp,%rcx), %al +; X64-NEXT:    movzbl %al, %eax +; X64-NEXT:    retq +entry: +  %idx = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24) +  %eq = icmp eq i32 %idx, 16 +  br i1 %eq, label %exit, label %compare + +compare: +  %lhs_c = extractelement <16 x i8> %lhs, i32 %idx +  %rhs_c = extractelement <16 x i8> %rhs, i32 %idx +  %sub = sub i8 %lhs_c, %rhs_c +  br label %exit + +exit: +  %result = phi i8 [ 0, %entry ], [ %sub, %compare ] +  %result_ext = zext i8 %result to i32 +  ret i32 %result_ext +} + +define i1 @pcmpestri_mem_eq_i8(i8* %lhs_ptr, i32 %lhs_len, i8* %rhs_ptr, i32 %rhs_len) nounwind { +; X32-LABEL: pcmpestri_mem_eq_i8: +; X32:       # %bb.0: # %entry +; X32-NEXT:    pushl %esi +; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax +; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx +; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi +; X32-NEXT:    movdqu (%esi), %xmm0 +; X32-NEXT:    pcmpestri $24, (%ecx), %xmm0 +; X32-NEXT:    setae %al +; X32-NEXT:    popl %esi +; X32-NEXT:    retl +; +; X64-LABEL: pcmpestri_mem_eq_i8: +; X64:       # %bb.0: # %entry +; X64-NEXT:    movq %rdx, %r8 +; X64-NEXT:    movdqu (%rdi), %xmm0 +; X64-NEXT:    movl %esi, %eax +; X64-NEXT:    movl %ecx, %edx +; X64-NEXT:    pcmpestri $24, (%r8), %xmm0 +; X64-NEXT:    setae %al +; X64-NEXT:    retq +entry: +  %lhs_vptr = bitcast i8* %lhs_ptr to <16 x i8>* +  %lhs = load <16 x i8>, <16 x i8>* %lhs_vptr, align 1 +  %rhs_vptr = bitcast i8* %rhs_ptr to <16 x i8>* +  %rhs = load <16 x i8>, <16 x i8>* %rhs_vptr, align 1 +  %c = call i32 @llvm.x86.sse42.pcmpestric128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24) +  %result = icmp eq i32 %c, 0 +  ret i1 %result +} + +define i32 @pcmpestri_mem_idx_i8(i8* %lhs_ptr, i32 %lhs_len, i8* %rhs_ptr, i32 %rhs_len) nounwind { +; X32-LABEL: pcmpestri_mem_idx_i8: +; X32:       # %bb.0: # %entry +; X32-NEXT:    pushl %esi +; 
X32-NEXT:    movl {{[0-9]+}}(%esp), %eax +; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx +; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi +; X32-NEXT:    movdqu (%esi), %xmm0 +; X32-NEXT:    pcmpestri $24, (%ecx), %xmm0 +; X32-NEXT:    movl %ecx, %eax +; X32-NEXT:    popl %esi +; X32-NEXT:    retl +; +; X64-LABEL: pcmpestri_mem_idx_i8: +; X64:       # %bb.0: # %entry +; X64-NEXT:    movq %rdx, %r8 +; X64-NEXT:    movdqu (%rdi), %xmm0 +; X64-NEXT:    movl %esi, %eax +; X64-NEXT:    movl %ecx, %edx +; X64-NEXT:    pcmpestri $24, (%r8), %xmm0 +; X64-NEXT:    movl %ecx, %eax +; X64-NEXT:    retq +entry: +  %lhs_vptr = bitcast i8* %lhs_ptr to <16 x i8>* +  %lhs = load <16 x i8>, <16 x i8>* %lhs_vptr, align 1 +  %rhs_vptr = bitcast i8* %rhs_ptr to <16 x i8>* +  %rhs = load <16 x i8>, <16 x i8>* %rhs_vptr, align 1 +  %idx = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24) +  ret i32 %idx +} + +define i32 @pcmpestri_mem_diff_i8(i8* %lhs_ptr, i32 %lhs_len, i8* %rhs_ptr, i32 %rhs_len) nounwind { +; X32-LABEL: pcmpestri_mem_diff_i8: +; X32:       # %bb.0: # %entry +; X32-NEXT:    pushl %ebp +; X32-NEXT:    movl %esp, %ebp +; X32-NEXT:    pushl %esi +; X32-NEXT:    andl $-16, %esp +; X32-NEXT:    subl $48, %esp +; X32-NEXT:    movl 12(%ebp), %eax +; X32-NEXT:    movl 20(%ebp), %edx +; X32-NEXT:    movl 16(%ebp), %ecx +; X32-NEXT:    movl 8(%ebp), %esi +; X32-NEXT:    movdqu (%esi), %xmm1 +; X32-NEXT:    movdqu (%ecx), %xmm0 +; X32-NEXT:    pcmpestri $24, %xmm0, %xmm1 +; X32-NEXT:    cmpl $16, %ecx +; X32-NEXT:    jne .LBB5_2 +; X32-NEXT:  # %bb.1: +; X32-NEXT:    xorl %eax, %eax +; X32-NEXT:    jmp .LBB5_3 +; X32-NEXT:  .LBB5_2: # %compare +; X32-NEXT:    movdqa %xmm1, (%esp) +; X32-NEXT:    andl $15, %ecx +; X32-NEXT:    movb (%esp,%ecx), %al +; X32-NEXT:    movdqa %xmm0, {{[0-9]+}}(%esp) +; X32-NEXT:    subb 16(%esp,%ecx), %al +; X32-NEXT:  .LBB5_3: # %exit +; X32-NEXT:    movzbl %al, 
%eax +; X32-NEXT:    leal -4(%ebp), %esp +; X32-NEXT:    popl %esi +; X32-NEXT:    popl %ebp +; X32-NEXT:    retl +; +; X64-LABEL: pcmpestri_mem_diff_i8: +; X64:       # %bb.0: # %entry +; X64-NEXT:    movdqu (%rdi), %xmm1 +; X64-NEXT:    movdqu (%rdx), %xmm0 +; X64-NEXT:    movl %esi, %eax +; X64-NEXT:    movl %ecx, %edx +; X64-NEXT:    pcmpestri $24, %xmm0, %xmm1 +; X64-NEXT:    # kill: def $ecx killed $ecx def $rcx +; X64-NEXT:    cmpl $16, %ecx +; X64-NEXT:    jne .LBB5_2 +; X64-NEXT:  # %bb.1: +; X64-NEXT:    xorl %eax, %eax +; X64-NEXT:    movzbl %al, %eax +; X64-NEXT:    retq +; X64-NEXT:  .LBB5_2: # %compare +; X64-NEXT:    movdqa %xmm1, -{{[0-9]+}}(%rsp) +; X64-NEXT:    andl $15, %ecx +; X64-NEXT:    movb -24(%rsp,%rcx), %al +; X64-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp) +; X64-NEXT:    subb -40(%rsp,%rcx), %al +; X64-NEXT:    movzbl %al, %eax +; X64-NEXT:    retq +entry: +  %lhs_vptr = bitcast i8* %lhs_ptr to <16 x i8>* +  %lhs = load <16 x i8>, <16 x i8>* %lhs_vptr, align 1 +  %rhs_vptr = bitcast i8* %rhs_ptr to <16 x i8>* +  %rhs = load <16 x i8>, <16 x i8>* %rhs_vptr, align 1 +  %idx = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24) +  %eq = icmp eq i32 %idx, 16 +  br i1 %eq, label %exit, label %compare + +compare: +  %lhs_c = extractelement <16 x i8> %lhs, i32 %idx +  %rhs_c = extractelement <16 x i8> %rhs, i32 %idx +  %sub = sub i8 %lhs_c, %rhs_c +  br label %exit + +exit: +  %result = phi i8 [ 0, %entry ], [ %sub, %compare ] +  %result_ext = zext i8 %result to i32 +  ret i32 %result_ext +} + +define i1 @pcmpestri_reg_eq_i16(<8 x i16> %lhs, i32 %lhs_len, <8 x i16> %rhs, i32 %rhs_len) nounwind { +; X32-LABEL: pcmpestri_reg_eq_i16: +; X32:       # %bb.0: # %entry +; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax +; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx +; X32-NEXT:    pcmpestri $24, %xmm1, %xmm0 +; X32-NEXT:    setae %al +; X32-NEXT:    retl +; +; X64-LABEL: pcmpestri_reg_eq_i16: +; X64:       # 
%bb.0: # %entry +; X64-NEXT:    movl %edi, %eax +; X64-NEXT:    movl %esi, %edx +; X64-NEXT:    pcmpestri $24, %xmm1, %xmm0 +; X64-NEXT:    setae %al +; X64-NEXT:    retq +entry: +  %lhs_cast = bitcast <8 x i16> %lhs to <16 x i8> +  %rhs_cast = bitcast <8 x i16> %rhs to <16 x i8> +  %c = call i32 @llvm.x86.sse42.pcmpestric128(<16 x i8> %lhs_cast, i32 %lhs_len, <16 x i8> %rhs_cast, i32 %rhs_len, i8 24) +  %result = icmp eq i32 %c, 0 +  ret i1 %result +} + +define i32 @pcmpestri_reg_idx_i16(<8 x i16> %lhs, i32 %lhs_len, <8 x i16> %rhs, i32 %rhs_len) nounwind { +; X32-LABEL: pcmpestri_reg_idx_i16: +; X32:       # %bb.0: # %entry +; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax +; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx +; X32-NEXT:    pcmpestri $24, %xmm1, %xmm0 +; X32-NEXT:    movl %ecx, %eax +; X32-NEXT:    retl +; +; X64-LABEL: pcmpestri_reg_idx_i16: +; X64:       # %bb.0: # %entry +; X64-NEXT:    movl %edi, %eax +; X64-NEXT:    movl %esi, %edx +; X64-NEXT:    pcmpestri $24, %xmm1, %xmm0 +; X64-NEXT:    movl %ecx, %eax +; X64-NEXT:    retq +entry: +  %lhs_cast = bitcast <8 x i16> %lhs to <16 x i8> +  %rhs_cast = bitcast <8 x i16> %rhs to <16 x i8> +  %idx = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %lhs_cast, i32 %lhs_len, <16 x i8> %rhs_cast, i32 %rhs_len, i8 24) +  ret i32 %idx +} + +define i32 @pcmpestri_reg_diff_i16(<8 x i16> %lhs, i32 %lhs_len, <8 x i16> %rhs, i32 %rhs_len) nounwind { +; X32-LABEL: pcmpestri_reg_diff_i16: +; X32:       # %bb.0: # %entry +; X32-NEXT:    pushl %ebp +; X32-NEXT:    movl %esp, %ebp +; X32-NEXT:    andl $-16, %esp +; X32-NEXT:    subl $48, %esp +; X32-NEXT:    movl 8(%ebp), %eax +; X32-NEXT:    movl 12(%ebp), %edx +; X32-NEXT:    pcmpestri $24, %xmm1, %xmm0 +; X32-NEXT:    cmpl $16, %ecx +; X32-NEXT:    jne .LBB8_2 +; X32-NEXT:  # %bb.1: +; X32-NEXT:    xorl %eax, %eax +; X32-NEXT:    jmp .LBB8_3 +; X32-NEXT:  .LBB8_2: # %compare +; X32-NEXT:    movdqa %xmm0, (%esp) +; X32-NEXT:    addl %ecx, %ecx +; X32-NEXT:    andl $14, %ecx 
+; X32-NEXT:    movzwl (%esp,%ecx), %eax +; X32-NEXT:    movdqa %xmm1, {{[0-9]+}}(%esp) +; X32-NEXT:    subw 16(%esp,%ecx), %ax +; X32-NEXT:  .LBB8_3: # %exit +; X32-NEXT:    movzwl %ax, %eax +; X32-NEXT:    movl %ebp, %esp +; X32-NEXT:    popl %ebp +; X32-NEXT:    retl +; +; X64-LABEL: pcmpestri_reg_diff_i16: +; X64:       # %bb.0: # %entry +; X64-NEXT:    movl %edi, %eax +; X64-NEXT:    movl %esi, %edx +; X64-NEXT:    pcmpestri $24, %xmm1, %xmm0 +; X64-NEXT:    # kill: def $ecx killed $ecx def $rcx +; X64-NEXT:    cmpl $16, %ecx +; X64-NEXT:    jne .LBB8_2 +; X64-NEXT:  # %bb.1: +; X64-NEXT:    xorl %eax, %eax +; X64-NEXT:    movzwl %ax, %eax +; X64-NEXT:    retq +; X64-NEXT:  .LBB8_2: # %compare +; X64-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp) +; X64-NEXT:    andl $7, %ecx +; X64-NEXT:    movzwl -24(%rsp,%rcx,2), %eax +; X64-NEXT:    movdqa %xmm1, -{{[0-9]+}}(%rsp) +; X64-NEXT:    subw -40(%rsp,%rcx,2), %ax +; X64-NEXT:    movzwl %ax, %eax +; X64-NEXT:    retq +entry: +  %lhs_cast = bitcast <8 x i16> %lhs to <16 x i8> +  %rhs_cast = bitcast <8 x i16> %rhs to <16 x i8> +  %idx = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %lhs_cast, i32 %lhs_len, <16 x i8> %rhs_cast, i32 %rhs_len, i8 24) +  %eq = icmp eq i32 %idx, 16 +  br i1 %eq, label %exit, label %compare + +compare: +  %lhs_c = extractelement <8 x i16> %lhs, i32 %idx +  %rhs_c = extractelement <8 x i16> %rhs, i32 %idx +  %sub = sub i16 %lhs_c, %rhs_c +  br label %exit + +exit: +  %result = phi i16 [ 0, %entry ], [ %sub, %compare ] +  %result_ext = zext i16 %result to i32 +  ret i32 %result_ext +} + +define i1 @pcmpestri_mem_eq_i16(i16* %lhs_ptr, i32 %lhs_len, i16* %rhs_ptr, i32 %rhs_len) nounwind { +; X32-LABEL: pcmpestri_mem_eq_i16: +; X32:       # %bb.0: # %entry +; X32-NEXT:    pushl %esi +; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax +; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx +; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi +; X32-NEXT:    movdqu (%esi), %xmm0 +; 
X32-NEXT:    pcmpestri $25, (%ecx), %xmm0 +; X32-NEXT:    setae %al +; X32-NEXT:    popl %esi +; X32-NEXT:    retl +; +; X64-LABEL: pcmpestri_mem_eq_i16: +; X64:       # %bb.0: # %entry +; X64-NEXT:    movq %rdx, %r8 +; X64-NEXT:    movdqu (%rdi), %xmm0 +; X64-NEXT:    movl %esi, %eax +; X64-NEXT:    movl %ecx, %edx +; X64-NEXT:    pcmpestri $25, (%r8), %xmm0 +; X64-NEXT:    setae %al +; X64-NEXT:    retq +entry: +  %lhs_vptr = bitcast i16* %lhs_ptr to <8 x i16>* +  %lhs = load <8 x i16>, <8 x i16>* %lhs_vptr, align 1 +  %rhs_vptr = bitcast i16* %rhs_ptr to <8 x i16>* +  %rhs = load <8 x i16>, <8 x i16>* %rhs_vptr, align 1 +  %lhs_cast = bitcast <8 x i16> %lhs to <16 x i8> +  %rhs_cast = bitcast <8 x i16> %rhs to <16 x i8> +  %c = call i32 @llvm.x86.sse42.pcmpestric128(<16 x i8> %lhs_cast, i32 %lhs_len, <16 x i8> %rhs_cast, i32 %rhs_len, i8 25) +  %result = icmp eq i32 %c, 0 +  ret i1 %result +} + +define i32 @pcmpestri_mem_idx_i16(i16* %lhs_ptr, i32 %lhs_len, i16* %rhs_ptr, i32 %rhs_len) nounwind { +; X32-LABEL: pcmpestri_mem_idx_i16: +; X32:       # %bb.0: # %entry +; X32-NEXT:    pushl %esi +; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax +; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx +; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi +; X32-NEXT:    movdqu (%esi), %xmm0 +; X32-NEXT:    pcmpestri $25, (%ecx), %xmm0 +; X32-NEXT:    movl %ecx, %eax +; X32-NEXT:    popl %esi +; X32-NEXT:    retl +; +; X64-LABEL: pcmpestri_mem_idx_i16: +; X64:       # %bb.0: # %entry +; X64-NEXT:    movq %rdx, %r8 +; X64-NEXT:    movdqu (%rdi), %xmm0 +; X64-NEXT:    movl %esi, %eax +; X64-NEXT:    movl %ecx, %edx +; X64-NEXT:    pcmpestri $25, (%r8), %xmm0 +; X64-NEXT:    movl %ecx, %eax +; X64-NEXT:    retq +entry: +  %lhs_vptr = bitcast i16* %lhs_ptr to <8 x i16>* +  %lhs = load <8 x i16>, <8 x i16>* %lhs_vptr, align 1 +  %rhs_vptr = bitcast i16* %rhs_ptr to <8 x i16>* +  %rhs = load <8 x i16>, <8 x i16>* %rhs_vptr, align 1 +  %lhs_cast = bitcast <8 x 
i16> %lhs to <16 x i8> +  %rhs_cast = bitcast <8 x i16> %rhs to <16 x i8> +  %idx = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %lhs_cast, i32 %lhs_len, <16 x i8> %rhs_cast, i32 %rhs_len, i8 25) +  ret i32 %idx +} + +define i32 @pcmpestri_mem_diff_i16(i16* %lhs_ptr, i32 %lhs_len, i16* %rhs_ptr, i32 %rhs_len) nounwind { +; X32-LABEL: pcmpestri_mem_diff_i16: +; X32:       # %bb.0: # %entry +; X32-NEXT:    pushl %ebp +; X32-NEXT:    movl %esp, %ebp +; X32-NEXT:    pushl %esi +; X32-NEXT:    andl $-16, %esp +; X32-NEXT:    subl $48, %esp +; X32-NEXT:    movl 12(%ebp), %eax +; X32-NEXT:    movl 20(%ebp), %edx +; X32-NEXT:    movl 16(%ebp), %ecx +; X32-NEXT:    movl 8(%ebp), %esi +; X32-NEXT:    movdqu (%esi), %xmm1 +; X32-NEXT:    movdqu (%ecx), %xmm0 +; X32-NEXT:    pcmpestri $25, %xmm0, %xmm1 +; X32-NEXT:    cmpl $8, %ecx +; X32-NEXT:    jne .LBB11_2 +; X32-NEXT:  # %bb.1: +; X32-NEXT:    xorl %eax, %eax +; X32-NEXT:    jmp .LBB11_3 +; X32-NEXT:  .LBB11_2: # %compare +; X32-NEXT:    movdqa %xmm1, (%esp) +; X32-NEXT:    addl %ecx, %ecx +; X32-NEXT:    andl $14, %ecx +; X32-NEXT:    movzwl (%esp,%ecx), %eax +; X32-NEXT:    movdqa %xmm0, {{[0-9]+}}(%esp) +; X32-NEXT:    subw 16(%esp,%ecx), %ax +; X32-NEXT:  .LBB11_3: # %exit +; X32-NEXT:    movzwl %ax, %eax +; X32-NEXT:    leal -4(%ebp), %esp +; X32-NEXT:    popl %esi +; X32-NEXT:    popl %ebp +; X32-NEXT:    retl +; +; X64-LABEL: pcmpestri_mem_diff_i16: +; X64:       # %bb.0: # %entry +; X64-NEXT:    movdqu (%rdi), %xmm1 +; X64-NEXT:    movdqu (%rdx), %xmm0 +; X64-NEXT:    movl %esi, %eax +; X64-NEXT:    movl %ecx, %edx +; X64-NEXT:    pcmpestri $25, %xmm0, %xmm1 +; X64-NEXT:    # kill: def $ecx killed $ecx def $rcx +; X64-NEXT:    cmpl $8, %ecx +; X64-NEXT:    jne .LBB11_2 +; X64-NEXT:  # %bb.1: +; X64-NEXT:    xorl %eax, %eax +; X64-NEXT:    movzwl %ax, %eax +; X64-NEXT:    retq +; X64-NEXT:  .LBB11_2: # %compare +; X64-NEXT:    movdqa %xmm1, -{{[0-9]+}}(%rsp) +; X64-NEXT:    andl $7, %ecx +; X64-NEXT:    movzwl 
-24(%rsp,%rcx,2), %eax +; X64-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp) +; X64-NEXT:    subw -40(%rsp,%rcx,2), %ax +; X64-NEXT:    movzwl %ax, %eax +; X64-NEXT:    retq +entry: +  %lhs_vptr = bitcast i16* %lhs_ptr to <8 x i16>* +  %lhs = load <8 x i16>, <8 x i16>* %lhs_vptr, align 1 +  %rhs_vptr = bitcast i16* %rhs_ptr to <8 x i16>* +  %rhs = load <8 x i16>, <8 x i16>* %rhs_vptr, align 1 +  %lhs_cast = bitcast <8 x i16> %lhs to <16 x i8> +  %rhs_cast = bitcast <8 x i16> %rhs to <16 x i8> +  %idx = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %lhs_cast, i32 %lhs_len, <16 x i8> %rhs_cast, i32 %rhs_len, i8 25) +  %eq = icmp eq i32 %idx, 8 +  br i1 %eq, label %exit, label %compare + +compare: +  %lhs_c = extractelement <8 x i16> %lhs, i32 %idx +  %rhs_c = extractelement <8 x i16> %rhs, i32 %idx +  %sub = sub i16 %lhs_c, %rhs_c +  br label %exit + +exit: +  %result = phi i16 [ 0, %entry ], [ %sub, %compare ] +  %result_ext = zext i16 %result to i32 +  ret i32 %result_ext +} + +define i1 @pcmpistri_reg_eq_i8(<16 x i8> %lhs, <16 x i8> %rhs) nounwind { +; X32-LABEL: pcmpistri_reg_eq_i8: +; X32:       # %bb.0: # %entry +; X32-NEXT:    pcmpistri $24, %xmm1, %xmm0 +; X32-NEXT:    setae %al +; X32-NEXT:    retl +; +; X64-LABEL: pcmpistri_reg_eq_i8: +; X64:       # %bb.0: # %entry +; X64-NEXT:    pcmpistri $24, %xmm1, %xmm0 +; X64-NEXT:    setae %al +; X64-NEXT:    retq +entry: +  %c = call i32 @llvm.x86.sse42.pcmpistric128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24) +  %result = icmp eq i32 %c, 0 +  ret i1 %result +} + +define i32 @pcmpistri_reg_idx_i8(<16 x i8> %lhs, <16 x i8> %rhs) nounwind { +; X32-LABEL: pcmpistri_reg_idx_i8: +; X32:       # %bb.0: # %entry +; X32-NEXT:    pcmpistri $24, %xmm1, %xmm0 +; X32-NEXT:    movl %ecx, %eax +; X32-NEXT:    retl +; +; X64-LABEL: pcmpistri_reg_idx_i8: +; X64:       # %bb.0: # %entry +; X64-NEXT:    pcmpistri $24, %xmm1, %xmm0 +; X64-NEXT:    movl %ecx, %eax +; X64-NEXT:    retq +entry: +  %idx = call i32 @llvm.x86.sse42.pcmpistri128(<16 
x i8> %lhs, <16 x i8> %rhs, i8 24) +  ret i32 %idx +} + +define i32 @pcmpistri_reg_diff_i8(<16 x i8> %lhs, <16 x i8> %rhs) nounwind { +; X32-LABEL: pcmpistri_reg_diff_i8: +; X32:       # %bb.0: # %entry +; X32-NEXT:    pcmpistri $24, %xmm1, %xmm0 +; X32-NEXT:    cmpl $16, %ecx +; X32-NEXT:    jne .LBB14_2 +; X32-NEXT:  # %bb.1: +; X32-NEXT:    xorl %eax, %eax +; X32-NEXT:    movzbl %al, %eax +; X32-NEXT:    retl +; X32-NEXT:  .LBB14_2: # %compare +; X32-NEXT:    pushl %ebp +; X32-NEXT:    movl %esp, %ebp +; X32-NEXT:    andl $-16, %esp +; X32-NEXT:    subl $48, %esp +; X32-NEXT:    movdqa %xmm0, (%esp) +; X32-NEXT:    andl $15, %ecx +; X32-NEXT:    movb (%esp,%ecx), %al +; X32-NEXT:    movdqa %xmm1, {{[0-9]+}}(%esp) +; X32-NEXT:    subb 16(%esp,%ecx), %al +; X32-NEXT:    movl %ebp, %esp +; X32-NEXT:    popl %ebp +; X32-NEXT:    movzbl %al, %eax +; X32-NEXT:    retl +; +; X64-LABEL: pcmpistri_reg_diff_i8: +; X64:       # %bb.0: # %entry +; X64-NEXT:    pcmpistri $24, %xmm1, %xmm0 +; X64-NEXT:    # kill: def $ecx killed $ecx def $rcx +; X64-NEXT:    cmpl $16, %ecx +; X64-NEXT:    jne .LBB14_2 +; X64-NEXT:  # %bb.1: +; X64-NEXT:    xorl %eax, %eax +; X64-NEXT:    movzbl %al, %eax +; X64-NEXT:    retq +; X64-NEXT:  .LBB14_2: # %compare +; X64-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp) +; X64-NEXT:    andl $15, %ecx +; X64-NEXT:    movb -24(%rsp,%rcx), %al +; X64-NEXT:    movdqa %xmm1, -{{[0-9]+}}(%rsp) +; X64-NEXT:    subb -40(%rsp,%rcx), %al +; X64-NEXT:    movzbl %al, %eax +; X64-NEXT:    retq +entry: +  %idx = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24) +  %eq = icmp eq i32 %idx, 16 +  br i1 %eq, label %exit, label %compare + +compare: +  %lhs_c = extractelement <16 x i8> %lhs, i32 %idx +  %rhs_c = extractelement <16 x i8> %rhs, i32 %idx +  %sub = sub i8 %lhs_c, %rhs_c +  br label %exit + +exit: +  %result = phi i8 [ 0, %entry ], [ %sub, %compare ] +  %result_ext = zext i8 %result to i32 +  ret i32 %result_ext +} + +define i1 
@pcmpistri_mem_eq_i8(i8* %lhs_ptr, i8* %rhs_ptr) nounwind { +; X32-LABEL: pcmpistri_mem_eq_i8: +; X32:       # %bb.0: # %entry +; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax +; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT:    movdqu (%ecx), %xmm0 +; X32-NEXT:    pcmpistri $24, (%eax), %xmm0 +; X32-NEXT:    setae %al +; X32-NEXT:    retl +; +; X64-LABEL: pcmpistri_mem_eq_i8: +; X64:       # %bb.0: # %entry +; X64-NEXT:    movdqu (%rdi), %xmm0 +; X64-NEXT:    pcmpistri $24, (%rsi), %xmm0 +; X64-NEXT:    setae %al +; X64-NEXT:    retq +entry: +  %lhs_vptr = bitcast i8* %lhs_ptr to <16 x i8>* +  %lhs = load <16 x i8>, <16 x i8>* %lhs_vptr, align 1 +  %rhs_vptr = bitcast i8* %rhs_ptr to <16 x i8>* +  %rhs = load <16 x i8>, <16 x i8>* %rhs_vptr, align 1 +  %c = call i32 @llvm.x86.sse42.pcmpistric128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24) +  %result = icmp eq i32 %c, 0 +  ret i1 %result +} + +define i32 @pcmpistri_mem_idx_i8(i8* %lhs_ptr, i8* %rhs_ptr) nounwind { +; X32-LABEL: pcmpistri_mem_idx_i8: +; X32:       # %bb.0: # %entry +; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax +; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT:    movdqu (%ecx), %xmm0 +; X32-NEXT:    pcmpistri $24, (%eax), %xmm0 +; X32-NEXT:    movl %ecx, %eax +; X32-NEXT:    retl +; +; X64-LABEL: pcmpistri_mem_idx_i8: +; X64:       # %bb.0: # %entry +; X64-NEXT:    movdqu (%rdi), %xmm0 +; X64-NEXT:    pcmpistri $24, (%rsi), %xmm0 +; X64-NEXT:    movl %ecx, %eax +; X64-NEXT:    retq +entry: +  %lhs_vptr = bitcast i8* %lhs_ptr to <16 x i8>* +  %lhs = load <16 x i8>, <16 x i8>* %lhs_vptr, align 1 +  %rhs_vptr = bitcast i8* %rhs_ptr to <16 x i8>* +  %rhs = load <16 x i8>, <16 x i8>* %rhs_vptr, align 1 +  %idx = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24) +  ret i32 %idx +} + +define i32 @pcmpistri_mem_diff_i8(i8* %lhs_ptr, i8* %rhs_ptr) nounwind { +; X32-LABEL: pcmpistri_mem_diff_i8: +; X32:       # %bb.0: # %entry +; X32-NEXT:    pushl %ebp +; X32-NEXT:    movl %esp, 
%ebp +; X32-NEXT:    andl $-16, %esp +; X32-NEXT:    subl $48, %esp +; X32-NEXT:    movl 12(%ebp), %eax +; X32-NEXT:    movl 8(%ebp), %ecx +; X32-NEXT:    movdqu (%ecx), %xmm1 +; X32-NEXT:    movdqu (%eax), %xmm0 +; X32-NEXT:    pcmpistri $24, %xmm0, %xmm1 +; X32-NEXT:    cmpl $16, %ecx +; X32-NEXT:    jne .LBB17_2 +; X32-NEXT:  # %bb.1: +; X32-NEXT:    xorl %eax, %eax +; X32-NEXT:    jmp .LBB17_3 +; X32-NEXT:  .LBB17_2: # %compare +; X32-NEXT:    movdqa %xmm1, (%esp) +; X32-NEXT:    andl $15, %ecx +; X32-NEXT:    movb (%esp,%ecx), %al +; X32-NEXT:    movdqa %xmm0, {{[0-9]+}}(%esp) +; X32-NEXT:    subb 16(%esp,%ecx), %al +; X32-NEXT:  .LBB17_3: # %exit +; X32-NEXT:    movzbl %al, %eax +; X32-NEXT:    movl %ebp, %esp +; X32-NEXT:    popl %ebp +; X32-NEXT:    retl +; +; X64-LABEL: pcmpistri_mem_diff_i8: +; X64:       # %bb.0: # %entry +; X64-NEXT:    movdqu (%rdi), %xmm1 +; X64-NEXT:    movdqu (%rsi), %xmm0 +; X64-NEXT:    pcmpistri $24, %xmm0, %xmm1 +; X64-NEXT:    # kill: def $ecx killed $ecx def $rcx +; X64-NEXT:    cmpl $16, %ecx +; X64-NEXT:    jne .LBB17_2 +; X64-NEXT:  # %bb.1: +; X64-NEXT:    xorl %eax, %eax +; X64-NEXT:    movzbl %al, %eax +; X64-NEXT:    retq +; X64-NEXT:  .LBB17_2: # %compare +; X64-NEXT:    movdqa %xmm1, -{{[0-9]+}}(%rsp) +; X64-NEXT:    andl $15, %ecx +; X64-NEXT:    movb -24(%rsp,%rcx), %al +; X64-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp) +; X64-NEXT:    subb -40(%rsp,%rcx), %al +; X64-NEXT:    movzbl %al, %eax +; X64-NEXT:    retq +entry: +  %lhs_vptr = bitcast i8* %lhs_ptr to <16 x i8>* +  %lhs = load <16 x i8>, <16 x i8>* %lhs_vptr, align 1 +  %rhs_vptr = bitcast i8* %rhs_ptr to <16 x i8>* +  %rhs = load <16 x i8>, <16 x i8>* %rhs_vptr, align 1 +  %idx = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24) +  %eq = icmp eq i32 %idx, 16 +  br i1 %eq, label %exit, label %compare + +compare: +  %lhs_c = extractelement <16 x i8> %lhs, i32 %idx +  %rhs_c = extractelement <16 x i8> %rhs, i32 %idx +  %sub = sub i8 
%lhs_c, %rhs_c +  br label %exit + +exit: +  %result = phi i8 [ 0, %entry ], [ %sub, %compare ] +  %result_ext = zext i8 %result to i32 +  ret i32 %result_ext +} + +define i1 @pcmpistri_reg_eq_i16(<8 x i16> %lhs, <8 x i16> %rhs) nounwind { +; X32-LABEL: pcmpistri_reg_eq_i16: +; X32:       # %bb.0: # %entry +; X32-NEXT:    pcmpistri $24, %xmm1, %xmm0 +; X32-NEXT:    setae %al +; X32-NEXT:    retl +; +; X64-LABEL: pcmpistri_reg_eq_i16: +; X64:       # %bb.0: # %entry +; X64-NEXT:    pcmpistri $24, %xmm1, %xmm0 +; X64-NEXT:    setae %al +; X64-NEXT:    retq +entry: +  %lhs_cast = bitcast <8 x i16> %lhs to <16 x i8> +  %rhs_cast = bitcast <8 x i16> %rhs to <16 x i8> +  %c = call i32 @llvm.x86.sse42.pcmpistric128(<16 x i8> %lhs_cast, <16 x i8> %rhs_cast, i8 24) +  %result = icmp eq i32 %c, 0 +  ret i1 %result +} + +define i32 @pcmpistri_reg_idx_i16(<8 x i16> %lhs, <8 x i16> %rhs) nounwind { +; X32-LABEL: pcmpistri_reg_idx_i16: +; X32:       # %bb.0: # %entry +; X32-NEXT:    pcmpistri $24, %xmm1, %xmm0 +; X32-NEXT:    movl %ecx, %eax +; X32-NEXT:    retl +; +; X64-LABEL: pcmpistri_reg_idx_i16: +; X64:       # %bb.0: # %entry +; X64-NEXT:    pcmpistri $24, %xmm1, %xmm0 +; X64-NEXT:    movl %ecx, %eax +; X64-NEXT:    retq +entry: +  %lhs_cast = bitcast <8 x i16> %lhs to <16 x i8> +  %rhs_cast = bitcast <8 x i16> %rhs to <16 x i8> +  %idx = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %lhs_cast, <16 x i8> %rhs_cast, i8 24) +  ret i32 %idx +} + +define i32 @pcmpistri_reg_diff_i16(<8 x i16> %lhs, <8 x i16> %rhs) nounwind { +; X32-LABEL: pcmpistri_reg_diff_i16: +; X32:       # %bb.0: # %entry +; X32-NEXT:    pcmpistri $24, %xmm1, %xmm0 +; X32-NEXT:    cmpl $16, %ecx +; X32-NEXT:    jne .LBB20_2 +; X32-NEXT:  # %bb.1: +; X32-NEXT:    xorl %eax, %eax +; X32-NEXT:    movzwl %ax, %eax +; X32-NEXT:    retl +; X32-NEXT:  .LBB20_2: # %compare +; X32-NEXT:    pushl %ebp +; X32-NEXT:    movl %esp, %ebp +; X32-NEXT:    andl $-16, %esp +; X32-NEXT:    subl $48, %esp +; X32-NEXT:    
movdqa %xmm0, (%esp) +; X32-NEXT:    addl %ecx, %ecx +; X32-NEXT:    andl $14, %ecx +; X32-NEXT:    movzwl (%esp,%ecx), %eax +; X32-NEXT:    movdqa %xmm1, {{[0-9]+}}(%esp) +; X32-NEXT:    subw 16(%esp,%ecx), %ax +; X32-NEXT:    movl %ebp, %esp +; X32-NEXT:    popl %ebp +; X32-NEXT:    movzwl %ax, %eax +; X32-NEXT:    retl +; +; X64-LABEL: pcmpistri_reg_diff_i16: +; X64:       # %bb.0: # %entry +; X64-NEXT:    pcmpistri $24, %xmm1, %xmm0 +; X64-NEXT:    # kill: def $ecx killed $ecx def $rcx +; X64-NEXT:    cmpl $16, %ecx +; X64-NEXT:    jne .LBB20_2 +; X64-NEXT:  # %bb.1: +; X64-NEXT:    xorl %eax, %eax +; X64-NEXT:    movzwl %ax, %eax +; X64-NEXT:    retq +; X64-NEXT:  .LBB20_2: # %compare +; X64-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp) +; X64-NEXT:    andl $7, %ecx +; X64-NEXT:    movzwl -24(%rsp,%rcx,2), %eax +; X64-NEXT:    movdqa %xmm1, -{{[0-9]+}}(%rsp) +; X64-NEXT:    subw -40(%rsp,%rcx,2), %ax +; X64-NEXT:    movzwl %ax, %eax +; X64-NEXT:    retq +entry: +  %lhs_cast = bitcast <8 x i16> %lhs to <16 x i8> +  %rhs_cast = bitcast <8 x i16> %rhs to <16 x i8> +  %idx = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %lhs_cast, <16 x i8> %rhs_cast, i8 24) +  %eq = icmp eq i32 %idx, 16 +  br i1 %eq, label %exit, label %compare + +compare: +  %lhs_c = extractelement <8 x i16> %lhs, i32 %idx +  %rhs_c = extractelement <8 x i16> %rhs, i32 %idx +  %sub = sub i16 %lhs_c, %rhs_c +  br label %exit + +exit: +  %result = phi i16 [ 0, %entry ], [ %sub, %compare ] +  %result_ext = zext i16 %result to i32 +  ret i32 %result_ext +} + +define i1 @pcmpistri_mem_eq_i16(i16* %lhs_ptr, i16* %rhs_ptr) nounwind { +; X32-LABEL: pcmpistri_mem_eq_i16: +; X32:       # %bb.0: # %entry +; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax +; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT:    movdqu (%ecx), %xmm0 +; X32-NEXT:    pcmpistri $25, (%eax), %xmm0 +; X32-NEXT:    setae %al +; X32-NEXT:    retl +; +; X64-LABEL: pcmpistri_mem_eq_i16: +; X64:       # %bb.0: # %entry +; X64-NEXT:    movdqu 
(%rdi), %xmm0 +; X64-NEXT:    pcmpistri $25, (%rsi), %xmm0 +; X64-NEXT:    setae %al +; X64-NEXT:    retq +entry: +  %lhs_vptr = bitcast i16* %lhs_ptr to <8 x i16>* +  %lhs = load <8 x i16>, <8 x i16>* %lhs_vptr, align 1 +  %rhs_vptr = bitcast i16* %rhs_ptr to <8 x i16>* +  %rhs = load <8 x i16>, <8 x i16>* %rhs_vptr, align 1 +  %lhs_cast = bitcast <8 x i16> %lhs to <16 x i8> +  %rhs_cast = bitcast <8 x i16> %rhs to <16 x i8> +  %c = call i32 @llvm.x86.sse42.pcmpistric128(<16 x i8> %lhs_cast, <16 x i8> %rhs_cast, i8 25) +  %result = icmp eq i32 %c, 0 +  ret i1 %result +} + +define i32 @pcmpistri_mem_idx_i16(i16* %lhs_ptr, i16* %rhs_ptr) nounwind { +; X32-LABEL: pcmpistri_mem_idx_i16: +; X32:       # %bb.0: # %entry +; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax +; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT:    movdqu (%ecx), %xmm0 +; X32-NEXT:    pcmpistri $25, (%eax), %xmm0 +; X32-NEXT:    movl %ecx, %eax +; X32-NEXT:    retl +; +; X64-LABEL: pcmpistri_mem_idx_i16: +; X64:       # %bb.0: # %entry +; X64-NEXT:    movdqu (%rdi), %xmm0 +; X64-NEXT:    pcmpistri $25, (%rsi), %xmm0 +; X64-NEXT:    movl %ecx, %eax +; X64-NEXT:    retq +entry: +  %lhs_vptr = bitcast i16* %lhs_ptr to <8 x i16>* +  %lhs = load <8 x i16>, <8 x i16>* %lhs_vptr, align 1 +  %rhs_vptr = bitcast i16* %rhs_ptr to <8 x i16>* +  %rhs = load <8 x i16>, <8 x i16>* %rhs_vptr, align 1 +  %lhs_cast = bitcast <8 x i16> %lhs to <16 x i8> +  %rhs_cast = bitcast <8 x i16> %rhs to <16 x i8> +  %idx = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %lhs_cast, <16 x i8> %rhs_cast, i8 25) +  ret i32 %idx +} + +define i32 @pcmpistri_mem_diff_i16(i16* %lhs_ptr, i16* %rhs_ptr) nounwind { +; X32-LABEL: pcmpistri_mem_diff_i16: +; X32:       # %bb.0: # %entry +; X32-NEXT:    pushl %ebp +; X32-NEXT:    movl %esp, %ebp +; X32-NEXT:    andl $-16, %esp +; X32-NEXT:    subl $48, %esp +; X32-NEXT:    movl 12(%ebp), %eax +; X32-NEXT:    movl 8(%ebp), %ecx +; X32-NEXT:    movdqu (%ecx), %xmm1 +; X32-NEXT:    movdqu 
(%eax), %xmm0 +; X32-NEXT:    pcmpistri $25, %xmm0, %xmm1 +; X32-NEXT:    cmpl $8, %ecx +; X32-NEXT:    jne .LBB23_2 +; X32-NEXT:  # %bb.1: +; X32-NEXT:    xorl %eax, %eax +; X32-NEXT:    jmp .LBB23_3 +; X32-NEXT:  .LBB23_2: # %compare +; X32-NEXT:    movdqa %xmm1, (%esp) +; X32-NEXT:    addl %ecx, %ecx +; X32-NEXT:    andl $14, %ecx +; X32-NEXT:    movzwl (%esp,%ecx), %eax +; X32-NEXT:    movdqa %xmm0, {{[0-9]+}}(%esp) +; X32-NEXT:    subw 16(%esp,%ecx), %ax +; X32-NEXT:  .LBB23_3: # %exit +; X32-NEXT:    movzwl %ax, %eax +; X32-NEXT:    movl %ebp, %esp +; X32-NEXT:    popl %ebp +; X32-NEXT:    retl +; +; X64-LABEL: pcmpistri_mem_diff_i16: +; X64:       # %bb.0: # %entry +; X64-NEXT:    movdqu (%rdi), %xmm1 +; X64-NEXT:    movdqu (%rsi), %xmm0 +; X64-NEXT:    pcmpistri $25, %xmm0, %xmm1 +; X64-NEXT:    # kill: def $ecx killed $ecx def $rcx +; X64-NEXT:    cmpl $8, %ecx +; X64-NEXT:    jne .LBB23_2 +; X64-NEXT:  # %bb.1: +; X64-NEXT:    xorl %eax, %eax +; X64-NEXT:    movzwl %ax, %eax +; X64-NEXT:    retq +; X64-NEXT:  .LBB23_2: # %compare +; X64-NEXT:    movdqa %xmm1, -{{[0-9]+}}(%rsp) +; X64-NEXT:    andl $7, %ecx +; X64-NEXT:    movzwl -24(%rsp,%rcx,2), %eax +; X64-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp) +; X64-NEXT:    subw -40(%rsp,%rcx,2), %ax +; X64-NEXT:    movzwl %ax, %eax +; X64-NEXT:    retq +entry: +  %lhs_vptr = bitcast i16* %lhs_ptr to <8 x i16>* +  %lhs = load <8 x i16>, <8 x i16>* %lhs_vptr, align 1 +  %rhs_vptr = bitcast i16* %rhs_ptr to <8 x i16>* +  %rhs = load <8 x i16>, <8 x i16>* %rhs_vptr, align 1 +  %lhs_cast = bitcast <8 x i16> %lhs to <16 x i8> +  %rhs_cast = bitcast <8 x i16> %rhs to <16 x i8> +  %idx = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %lhs_cast, <16 x i8> %rhs_cast, i8 25) +  %eq = icmp eq i32 %idx, 8 +  br i1 %eq, label %exit, label %compare + +compare: +  %lhs_c = extractelement <8 x i16> %lhs, i32 %idx +  %rhs_c = extractelement <8 x i16> %rhs, i32 %idx +  %sub = sub i16 %lhs_c, %rhs_c +  br label %exit + +exit: +  
%result = phi i16 [ 0, %entry ], [ %sub, %compare ] +  %result_ext = zext i16 %result to i32 +  ret i32 %result_ext +} + +; Flag and index intrinsics called with identical operands and imm 24. +; The checks expect a single pcmpestri, with the flag materialized by setb. +define void @pcmpestr_index_flag(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i32* %iptr, i32* %fptr) nounwind { +; X32-LABEL: pcmpestr_index_flag: +; X32:       # %bb.0: # %entry +; X32-NEXT:    pushl %ebx +; X32-NEXT:    pushl %edi +; X32-NEXT:    pushl %esi +; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi +; X32-NEXT:    movl {{[0-9]+}}(%esp), %edi +; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax +; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx +; X32-NEXT:    xorl %ebx, %ebx +; X32-NEXT:    pcmpestri $24, %xmm1, %xmm0 +; X32-NEXT:    setb %bl +; X32-NEXT:    movl %ecx, (%edi) +; X32-NEXT:    movl %ebx, (%esi) +; X32-NEXT:    popl %esi +; X32-NEXT:    popl %edi +; X32-NEXT:    popl %ebx +; X32-NEXT:    retl +; +; X64-LABEL: pcmpestr_index_flag: +; X64:       # %bb.0: # %entry +; X64-NEXT:    movq %rcx, %r8 +; X64-NEXT:    movq %rdx, %r9 +; X64-NEXT:    xorl %r10d, %r10d +; X64-NEXT:    movl %edi, %eax +; X64-NEXT:    movl %esi, %edx +; X64-NEXT:    pcmpestri $24, %xmm1, %xmm0 +; X64-NEXT:    setb %r10b +; X64-NEXT:    movl %ecx, (%r9) +; X64-NEXT:    movl %r10d, (%r8) +; X64-NEXT:    retq +entry: +  %flag = call i32 @llvm.x86.sse42.pcmpestric128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24) +  %index = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24) +  store i32 %index, i32* %iptr +  store i32 %flag, i32* %fptr +  ret void +} + +; Flag and mask intrinsics called with identical operands and imm 24. +; The checks expect a single pcmpestrm, with the flag materialized by setb. +define void @pcmpestr_mask_flag(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, <16 x i8>* %mptr, i32* %fptr) nounwind { +; X32-LABEL: pcmpestr_mask_flag: +; X32:       # %bb.0: # %entry +; X32-NEXT:    pushl %ebx +; X32-NEXT:    pushl %esi +; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi +; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax +; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx +; X32-NEXT:    
xorl %ebx, %ebx +; X32-NEXT:    pcmpestrm $24, %xmm1, %xmm0 +; X32-NEXT:    setb %bl +; X32-NEXT:    movdqa %xmm0, (%esi) +; X32-NEXT:    movl %ebx, (%ecx) +; X32-NEXT:    popl %esi +; X32-NEXT:    popl %ebx +; X32-NEXT:    retl +; +; X64-LABEL: pcmpestr_mask_flag: +; X64:       # %bb.0: # %entry +; X64-NEXT:    movq %rdx, %r8 +; X64-NEXT:    xorl %r9d, %r9d +; X64-NEXT:    movl %edi, %eax +; X64-NEXT:    movl %esi, %edx +; X64-NEXT:    pcmpestrm $24, %xmm1, %xmm0 +; X64-NEXT:    setb %r9b +; X64-NEXT:    movdqa %xmm0, (%r8) +; X64-NEXT:    movl %r9d, (%rcx) +; X64-NEXT:    retq +entry: +  %flag = call i32 @llvm.x86.sse42.pcmpestric128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24) +  %mask = call <16 x i8> @llvm.x86.sse42.pcmpestrm128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24) +  store <16 x i8> %mask, <16 x i8>* %mptr +  store i32 %flag, i32* %fptr +  ret void +} + +; Index and mask intrinsics called with identical operands and imm 24. +; The checks expect both pcmpestrm and pcmpestri, the latter on a copy of %xmm0 in %xmm2. +define void @pcmpestr_mask_index(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, <16 x i8>* %mptr, i32* %iptr) nounwind { +; X32-LABEL: pcmpestr_mask_index: +; X32:       # %bb.0: # %entry +; X32-NEXT:    pushl %edi +; X32-NEXT:    pushl %esi +; X32-NEXT:    movdqa %xmm0, %xmm2 +; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax +; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx +; X32-NEXT:    pcmpestrm $24, %xmm1, %xmm0 +; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi +; X32-NEXT:    movl {{[0-9]+}}(%esp), %edi +; X32-NEXT:    pcmpestri $24, %xmm1, %xmm2 +; X32-NEXT:    movdqa %xmm0, (%edi) +; X32-NEXT:    movl %ecx, (%esi) +; X32-NEXT:    popl %esi +; X32-NEXT:    popl %edi +; X32-NEXT:    retl +; +; X64-LABEL: pcmpestr_mask_index: +; X64:       # %bb.0: # %entry +; X64-NEXT:    movq %rcx, %r8 +; X64-NEXT:    movq %rdx, %r9 +; X64-NEXT:    movdqa %xmm0, %xmm2 +; X64-NEXT:    movl %edi, %eax +; X64-NEXT:    movl %esi, %edx +; X64-NEXT:    pcmpestrm $24, %xmm1, %xmm0 +; X64-NEXT:    pcmpestri $24, %xmm1, %xmm2 +; X64-NEXT:    movdqa %xmm0, (%r9) +; X64-NEXT:    
movl %ecx, (%r8) +; X64-NEXT:    retq +entry: +  %index = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24) +  %mask = call <16 x i8> @llvm.x86.sse42.pcmpestrm128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24) +  store <16 x i8> %mask, <16 x i8>* %mptr +  store i32 %index, i32* %iptr +  ret void +} + +; Index, mask and flag intrinsics called with identical operands and imm 24. +; The checks expect pcmpestrm, then pcmpestri on a copy of %xmm0, with setb after pcmpestri. +define void @pcmpestr_mask_index_flag(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, <16 x i8>* %mptr, i32* %iptr, i32* %fptr) nounwind { +; X32-LABEL: pcmpestr_mask_index_flag: +; X32:       # %bb.0: # %entry +; X32-NEXT:    pushl %ebp +; X32-NEXT:    pushl %ebx +; X32-NEXT:    pushl %edi +; X32-NEXT:    pushl %esi +; X32-NEXT:    movdqa %xmm0, %xmm2 +; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax +; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx +; X32-NEXT:    pcmpestrm $24, %xmm1, %xmm0 +; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi +; X32-NEXT:    movl {{[0-9]+}}(%esp), %edi +; X32-NEXT:    movl {{[0-9]+}}(%esp), %ebp +; X32-NEXT:    xorl %ebx, %ebx +; X32-NEXT:    pcmpestri $24, %xmm1, %xmm2 +; X32-NEXT:    setb %bl +; X32-NEXT:    movdqa %xmm0, (%ebp) +; X32-NEXT:    movl %ecx, (%edi) +; X32-NEXT:    movl %ebx, (%esi) +; X32-NEXT:    popl %esi +; X32-NEXT:    popl %edi +; X32-NEXT:    popl %ebx +; X32-NEXT:    popl %ebp +; X32-NEXT:    retl +; +; X64-LABEL: pcmpestr_mask_index_flag: +; X64:       # %bb.0: # %entry +; X64-NEXT:    movq %rcx, %r9 +; X64-NEXT:    movq %rdx, %r10 +; X64-NEXT:    movdqa %xmm0, %xmm2 +; X64-NEXT:    movl %edi, %eax +; X64-NEXT:    movl %esi, %edx +; X64-NEXT:    pcmpestrm $24, %xmm1, %xmm0 +; X64-NEXT:    xorl %esi, %esi +; X64-NEXT:    pcmpestri $24, %xmm1, %xmm2 +; X64-NEXT:    setb %sil +; X64-NEXT:    movdqa %xmm0, (%r10) +; X64-NEXT:    movl %ecx, (%r9) +; X64-NEXT:    movl %esi, (%r8) +; X64-NEXT:    retq +entry: +  %index = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24) +  %mask = call <16 x i8> 
@llvm.x86.sse42.pcmpestrm128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24) +  %flag = call i32 @llvm.x86.sse42.pcmpestric128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24) +  store <16 x i8> %mask, <16 x i8>* %mptr +  store i32 %index, i32* %iptr +  store i32 %flag, i32* %fptr +  ret void +} + +; Flag and index intrinsics called with identical operands and imm 24. +; The checks expect a single pcmpistri, with the flag materialized by setb. +define void @pcmpistr_index_flag(<16 x i8> %lhs, <16 x i8> %rhs, i32* %iptr, i32* %fptr) nounwind { +; X32-LABEL: pcmpistr_index_flag: +; X32:       # %bb.0: # %entry +; X32-NEXT:    pushl %esi +; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax +; X32-NEXT:    xorl %edx, %edx +; X32-NEXT:    pcmpistri $24, %xmm1, %xmm0 +; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi +; X32-NEXT:    setb %dl +; X32-NEXT:    movl %ecx, (%esi) +; X32-NEXT:    movl %edx, (%eax) +; X32-NEXT:    popl %esi +; X32-NEXT:    retl +; +; X64-LABEL: pcmpistr_index_flag: +; X64:       # %bb.0: # %entry +; X64-NEXT:    xorl %eax, %eax +; X64-NEXT:    pcmpistri $24, %xmm1, %xmm0 +; X64-NEXT:    setb %al +; X64-NEXT:    movl %ecx, (%rdi) +; X64-NEXT:    movl %eax, (%rsi) +; X64-NEXT:    retq +entry: +  %flag = call i32 @llvm.x86.sse42.pcmpistric128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24) +  %index = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24) +  store i32 %index, i32* %iptr +  store i32 %flag, i32* %fptr +  ret void +} + +; Flag and mask intrinsics called with identical operands and imm 24. +; The checks expect a single pcmpistrm, with the flag materialized by setb. +define void @pcmpistr_mask_flag(<16 x i8> %lhs, <16 x i8> %rhs, <16 x i8>* %mptr, i32* %fptr) nounwind { +; X32-LABEL: pcmpistr_mask_flag: +; X32:       # %bb.0: # %entry +; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax +; X32-NEXT:    xorl %ecx, %ecx +; X32-NEXT:    pcmpistrm $24, %xmm1, %xmm0 +; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx +; X32-NEXT:    setb %cl +; X32-NEXT:    movdqa %xmm0, (%edx) +; X32-NEXT:    movl %ecx, (%eax) +; X32-NEXT:    retl +; +; X64-LABEL: pcmpistr_mask_flag: +; X64:       # %bb.0: # %entry +; X64-NEXT:    xorl %eax, %eax +; X64-NEXT:    pcmpistrm $24, %xmm1, %xmm0 +; X64-NEXT:    setb %al +; X64-NEXT:    
movdqa %xmm0, (%rdi) +; X64-NEXT:    movl %eax, (%rsi) +; X64-NEXT:    retq +entry: +  %flag = call i32 @llvm.x86.sse42.pcmpistric128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24) +  %mask = call <16 x i8> @llvm.x86.sse42.pcmpistrm128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24) +  store <16 x i8> %mask, <16 x i8>* %mptr +  store i32 %flag, i32* %fptr +  ret void +} + +; Index and mask intrinsics called with identical operands and imm 24. +; The checks expect pcmpistri followed by pcmpistrm on the same registers. +define void @pcmpistr_mask_index(<16 x i8> %lhs, <16 x i8> %rhs, <16 x i8>* %mptr, i32* %iptr) nounwind { +; X32-LABEL: pcmpistr_mask_index: +; X32:       # %bb.0: # %entry +; X32-NEXT:    pcmpistri $24, %xmm1, %xmm0 +; X32-NEXT:    pcmpistrm $24, %xmm1, %xmm0 +; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax +; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx +; X32-NEXT:    movdqa %xmm0, (%edx) +; X32-NEXT:    movl %ecx, (%eax) +; X32-NEXT:    retl +; +; X64-LABEL: pcmpistr_mask_index: +; X64:       # %bb.0: # %entry +; X64-NEXT:    pcmpistri $24, %xmm1, %xmm0 +; X64-NEXT:    pcmpistrm $24, %xmm1, %xmm0 +; X64-NEXT:    movdqa %xmm0, (%rdi) +; X64-NEXT:    movl %ecx, (%rsi) +; X64-NEXT:    retq +entry: +  %index = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24) +  %mask = call <16 x i8> @llvm.x86.sse42.pcmpistrm128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24) +  store <16 x i8> %mask, <16 x i8>* %mptr +  store i32 %index, i32* %iptr +  ret void +} + +; Index, mask and flag intrinsics called with identical operands and imm 24. +; The checks expect pcmpistrm, then pcmpistri on a copy of %xmm0, with setb after pcmpistri. +define void @pcmpistr_mask_index_flag(<16 x i8> %lhs, <16 x i8> %rhs, <16 x i8>* %mptr, i32* %iptr, i32* %fptr) nounwind { +; X32-LABEL: pcmpistr_mask_index_flag: +; X32:       # %bb.0: # %entry +; X32-NEXT:    pushl %ebx +; X32-NEXT:    pushl %esi +; X32-NEXT:    movdqa %xmm0, %xmm2 +; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax +; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx +; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi +; X32-NEXT:    pcmpistrm $24, %xmm1, %xmm0 +; X32-NEXT:    xorl %ebx, %ebx +; X32-NEXT:    pcmpistri $24, %xmm1, %xmm2 +; X32-NEXT:    setb %bl +; X32-NEXT:    movdqa %xmm0, (%esi) +; X32-NEXT:    movl %ecx, (%edx) +; X32-NEXT:    movl %ebx, 
(%eax) +; X32-NEXT:    popl %esi +; X32-NEXT:    popl %ebx +; X32-NEXT:    retl +; +; X64-LABEL: pcmpistr_mask_index_flag: +; X64:       # %bb.0: # %entry +; X64-NEXT:    movdqa %xmm0, %xmm2 +; X64-NEXT:    pcmpistrm $24, %xmm1, %xmm0 +; X64-NEXT:    xorl %eax, %eax +; X64-NEXT:    pcmpistri $24, %xmm1, %xmm2 +; X64-NEXT:    setb %al +; X64-NEXT:    movdqa %xmm0, (%rdi) +; X64-NEXT:    movl %ecx, (%rsi) +; X64-NEXT:    movl %eax, (%rdx) +; X64-NEXT:    retq +entry: +  %index = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24) +  %mask = call <16 x i8> @llvm.x86.sse42.pcmpistrm128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24) +  %flag = call i32 @llvm.x86.sse42.pcmpistric128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24) +  store <16 x i8> %mask, <16 x i8>* %mptr +  store i32 %index, i32* %iptr +  store i32 %flag, i32* %fptr +  ret void +} + +; Make sure we don't fold loads when we need to emit pcmpistrm and pcmpistri. +; The checks expect %rhs loaded once with movdqu into %xmm2 and used by both instructions. +define void @pcmpistr_mask_index_flag_load(<16 x i8> %lhs, <16 x i8>* %rhsptr, <16 x i8>* %mptr, i32* %iptr, i32* %fptr) nounwind { +; X32-LABEL: pcmpistr_mask_index_flag_load: +; X32:       # %bb.0: # %entry +; X32-NEXT:    pushl %ebx +; X32-NEXT:    pushl %esi +; X32-NEXT:    movdqa %xmm0, %xmm1 +; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax +; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx +; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi +; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT:    movdqu (%ecx), %xmm2 +; X32-NEXT:    pcmpistrm $24, %xmm2, %xmm0 +; X32-NEXT:    xorl %ebx, %ebx +; X32-NEXT:    pcmpistri $24, %xmm2, %xmm1 +; X32-NEXT:    setb %bl +; X32-NEXT:    movdqa %xmm0, (%esi) +; X32-NEXT:    movl %ecx, (%edx) +; X32-NEXT:    movl %ebx, (%eax) +; X32-NEXT:    popl %esi +; X32-NEXT:    popl %ebx +; X32-NEXT:    retl +; +; X64-LABEL: pcmpistr_mask_index_flag_load: +; X64:       # %bb.0: # %entry +; X64-NEXT:    movq %rcx, %rax +; X64-NEXT:    movdqa %xmm0, %xmm1 +; X64-NEXT:    movdqu (%rdi), %xmm2 +; X64-NEXT:    pcmpistrm $24, 
%xmm2, %xmm0 +; X64-NEXT:    xorl %edi, %edi +; X64-NEXT:    pcmpistri $24, %xmm2, %xmm1 +; X64-NEXT:    setb %dil +; X64-NEXT:    movdqa %xmm0, (%rsi) +; X64-NEXT:    movl %ecx, (%rdx) +; X64-NEXT:    movl %edi, (%rax) +; X64-NEXT:    retq +entry: +  %rhs = load <16 x i8>, <16 x i8>* %rhsptr, align 1 +  %index = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24) +  %mask = call <16 x i8> @llvm.x86.sse42.pcmpistrm128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24) +  %flag = call i32 @llvm.x86.sse42.pcmpistric128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24) +  store <16 x i8> %mask, <16 x i8>* %mptr +  store i32 %index, i32* %iptr +  store i32 %flag, i32* %fptr +  ret void +} + +; Make sure we don't fold nontemporal loads. +; The checks expect a separate movntdqa into %xmm1 feeding pcmpestri, with the flag +; result (the IR calls pcmpestric128) returned via setb. +define i32 @pcmpestri_nontemporal(<16 x i8> %lhs, i32 %lhs_len, <16 x i8>* %rhsptr, i32 %rhs_len) nounwind { +; X32-LABEL: pcmpestri_nontemporal: +; X32:       # %bb.0: # %entry +; X32-NEXT:    pushl %ebx +; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax +; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx +; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT:    movntdqa (%ecx), %xmm1 +; X32-NEXT:    xorl %ebx, %ebx +; X32-NEXT:    pcmpestri $24, %xmm1, %xmm0 +; X32-NEXT:    setb %bl +; X32-NEXT:    movl %ebx, %eax +; X32-NEXT:    popl %ebx +; X32-NEXT:    retl +; +; X64-LABEL: pcmpestri_nontemporal: +; X64:       # %bb.0: # %entry +; X64-NEXT:    movntdqa (%rsi), %xmm1 +; X64-NEXT:    xorl %esi, %esi +; X64-NEXT:    movl %edi, %eax +; X64-NEXT:    pcmpestri $24, %xmm1, %xmm0 +; X64-NEXT:    setb %sil +; X64-NEXT:    movl %esi, %eax +; X64-NEXT:    retq +entry: +  %rhs = load <16 x i8>, <16 x i8>* %rhsptr, align 16, !nontemporal !0 +  %flag = call i32 @llvm.x86.sse42.pcmpestric128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24) +  ret i32 %flag +} + +!0 = !{ i32 1 }  | 

