author    | Simon Pilgrim <llvm-dev@redking.me.uk> | 2016-05-19 16:55:52 +0000
committer | Simon Pilgrim <llvm-dev@redking.me.uk> | 2016-05-19 16:55:52 +0000
commit    | 7a8dcf25564accc21c94dff0106bcf16eecaba5a (patch)
tree      | c213bf12b0b49326e26e34d859587e3488f65631 /llvm/test/CodeGen/X86/sse-intrinsics-fast-isel.ll
parent    | b1ff2dd145567331d3f2db8cff3d14b297883c7c (diff)
download  | bcm5719-llvm-7a8dcf25564accc21c94dff0106bcf16eecaba5a.tar.gz, bcm5719-llvm-7a8dcf25564accc21c94dff0106bcf16eecaba5a.zip
[X86][SSE] Added fast-isel tests to sync with clang/test/CodeGen/sse-builtins.c
llvm-svn: 270081
Diffstat (limited to 'llvm/test/CodeGen/X86/sse-intrinsics-fast-isel.ll')
-rw-r--r-- | llvm/test/CodeGen/X86/sse-intrinsics-fast-isel.ll | 2280
1 file changed, 2280 insertions, 0 deletions
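For context (not part of the commit): the new test file hand-writes the IR that clang emits for the `<xmmintrin.h>` intrinsics covered by clang/test/CodeGen/sse-builtins.c, then checks the assembly that fast-isel produces for that IR on x86 and x86-64. A minimal C sketch of the first two intrinsics exercised below, with the IR each one lowers to noted in comments, is shown here; the function names are illustrative only.

```c
/* Illustrative sketch only: the C-level intrinsics whose clang-generated
 * IR the new test file mirrors (see clang/test/CodeGen/sse-builtins.c). */
#include <xmmintrin.h>

__m128 add_ps(__m128 a, __m128 b) {
  /* clang lowers this to: %res = fadd <4 x float> %a, %b
   * which test_mm_add_ps below checks compiles to a single addps. */
  return _mm_add_ps(a, b);
}

__m128 add_ss(__m128 a, __m128 b) {
  /* clang lowers this to: extract lane 0 of each operand, a scalar fadd,
   * then an insertelement back into %a -- the IR in test_mm_add_ss. */
  return _mm_add_ss(a, b);
}
```

Keeping the hand-written IR in sync with what clang actually generates is the point of the NOTE at the top of the new file.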
diff --git a/llvm/test/CodeGen/X86/sse-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/sse-intrinsics-fast-isel.ll new file mode 100644 index 00000000000..1c9791967cb --- /dev/null +++ b/llvm/test/CodeGen/X86/sse-intrinsics-fast-isel.ll @@ -0,0 +1,2280 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -fast-isel -mtriple=i386-unknown-unknown -mattr=+sse | FileCheck %s --check-prefix=ALL --check-prefix=X32 +; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+sse,-sse2 | FileCheck %s --check-prefix=ALL --check-prefix=X64 + +; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/sse-builtins.c + +define <4 x float> @test_mm_add_ps(<4 x float> %a0, <4 x float> %a1) nounwind { +; X32-LABEL: test_mm_add_ps: +; X32: # BB#0: +; X32-NEXT: addps %xmm1, %xmm0 +; X32-NEXT: retl +; +; X64-LABEL: test_mm_add_ps: +; X64: # BB#0: +; X64-NEXT: addps %xmm1, %xmm0 +; X64-NEXT: retq + %res = fadd <4 x float> %a0, %a1 + ret <4 x float> %res +} + +define <4 x float> @test_mm_add_ss(<4 x float> %a0, <4 x float> %a1) nounwind { +; X32-LABEL: test_mm_add_ss: +; X32: # BB#0: +; X32-NEXT: addss %xmm1, %xmm0 +; X32-NEXT: retl +; +; X64-LABEL: test_mm_add_ss: +; X64: # BB#0: +; X64-NEXT: addss %xmm1, %xmm0 +; X64-NEXT: retq + %ext0 = extractelement <4 x float> %a0, i32 0 + %ext1 = extractelement <4 x float> %a1, i32 0 + %fadd = fadd float %ext0, %ext1 + %res = insertelement <4 x float> %a0, float %fadd, i32 0 + ret <4 x float> %res +} + +define <4 x float> @test_mm_and_ps(<4 x float> %a0, <4 x float> %a1) nounwind { +; X32-LABEL: test_mm_and_ps: +; X32: # BB#0: +; X32-NEXT: pushl %ebp +; X32-NEXT: movl %esp, %ebp +; X32-NEXT: pushl %esi +; X32-NEXT: andl $-16, %esp +; X32-NEXT: subl $64, %esp +; X32-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: movl {{[0-9]+}}(%esp), %edx +; X32-NEXT: movl {{[0-9]+}}(%esp), %esi +; X32-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; X32-NEXT: andl {{[0-9]+}}(%esp), %esi +; X32-NEXT: movl %esi, (%esp) +; X32-NEXT: andl {{[0-9]+}}(%esp), %edx +; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X32-NEXT: andl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X32-NEXT: andl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X32-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; X32-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; X32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X32-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; X32-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; X32-NEXT: leal -4(%ebp), %esp +; X32-NEXT: popl %esi +; X32-NEXT: popl %ebp +; X32-NEXT: retl +; +; X64-LABEL: test_mm_and_ps: +; X64: # BB#0: +; X64-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rax +; X64-NEXT: movq -{{[0-9]+}}(%rsp), %r8 +; X64-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rdx +; X64-NEXT: movq %rdx, %rsi +; X64-NEXT: andl %eax, %edx +; X64-NEXT: shrq $32, %rax +; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rcx +; X64-NEXT: movq %rcx, %rdi +; X64-NEXT: andl %r8d, %ecx +; X64-NEXT: shrq $32, %r8 +; X64-NEXT: shrq $32, %rsi +; X64-NEXT: shrq $32, %rdi +; X64-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) +; X64-NEXT: movl %edx, -{{[0-9]+}}(%rsp) +; X64-NEXT: andl %r8d, %edi +; X64-NEXT: movl 
%edi, -{{[0-9]+}}(%rsp) +; X64-NEXT: andl %eax, %esi +; X64-NEXT: movl %esi, -{{[0-9]+}}(%rsp) +; X64-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X64-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X64-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; X64-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X64-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; X64-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; X64-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; X64-NEXT: retq + %arg0 = bitcast <4 x float> %a0 to <4 x i32> + %arg1 = bitcast <4 x float> %a1 to <4 x i32> + %res = and <4 x i32> %arg0, %arg1 + %bc = bitcast <4 x i32> %res to <4 x float> + ret <4 x float> %bc +} + +define <4 x float> @test_mm_andnot_ps(<4 x float> %a0, <4 x float> %a1) nounwind { +; X32-LABEL: test_mm_andnot_ps: +; X32: # BB#0: +; X32-NEXT: pushl %ebp +; X32-NEXT: movl %esp, %ebp +; X32-NEXT: pushl %esi +; X32-NEXT: andl $-16, %esp +; X32-NEXT: subl $64, %esp +; X32-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: movl {{[0-9]+}}(%esp), %edx +; X32-NEXT: movl {{[0-9]+}}(%esp), %esi +; X32-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; X32-NEXT: notl %edx +; X32-NEXT: notl %ecx +; X32-NEXT: notl %esi +; X32-NEXT: notl %eax +; X32-NEXT: andl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl %eax, (%esp) +; X32-NEXT: andl {{[0-9]+}}(%esp), %esi +; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X32-NEXT: andl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X32-NEXT: andl {{[0-9]+}}(%esp), %edx +; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X32-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; X32-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; X32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X32-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; X32-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; X32-NEXT: leal -4(%ebp), %esp +; X32-NEXT: popl %esi +; X32-NEXT: popl %ebp +; X32-NEXT: retl +; +; X64-LABEL: test_mm_andnot_ps: +; X64: # BB#0: +; X64-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rax +; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rcx +; X64-NEXT: movq %rcx, %rdx +; X64-NEXT: shrq $32, %rdx +; X64-NEXT: movq %rax, %rsi +; X64-NEXT: shrq $32, %rsi +; X64-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rdi +; X64-NEXT: movq -{{[0-9]+}}(%rsp), %r8 +; X64-NEXT: notl %eax +; X64-NEXT: andl %edi, %eax +; X64-NEXT: shrq $32, %rdi +; X64-NEXT: notl %ecx +; X64-NEXT: andl %r8d, %ecx +; X64-NEXT: shrq $32, %r8 +; X64-NEXT: notl %esi +; X64-NEXT: notl %edx +; X64-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) +; X64-NEXT: movl %eax, -{{[0-9]+}}(%rsp) +; X64-NEXT: andl %r8d, %edx +; X64-NEXT: movl %edx, -{{[0-9]+}}(%rsp) +; X64-NEXT: andl %edi, %esi +; X64-NEXT: movl %esi, -{{[0-9]+}}(%rsp) +; X64-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X64-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X64-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; X64-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X64-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; X64-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; X64-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; X64-NEXT: retq + %arg0 = bitcast <4 x 
float> %a0 to <4 x i32> + %arg1 = bitcast <4 x float> %a1 to <4 x i32> + %not = xor <4 x i32> %arg0, <i32 -1, i32 -1, i32 -1, i32 -1> + %res = and <4 x i32> %not, %arg1 + %bc = bitcast <4 x i32> %res to <4 x float> + ret <4 x float> %bc +} + +define <4 x float> @test_mm_cmpeq_ps(<4 x float> %a0, <4 x float> %a1) nounwind { +; X32-LABEL: test_mm_cmpeq_ps: +; X32: # BB#0: +; X32-NEXT: cmpeqps %xmm1, %xmm0 +; X32-NEXT: retl +; +; X64-LABEL: test_mm_cmpeq_ps: +; X64: # BB#0: +; X64-NEXT: cmpeqps %xmm1, %xmm0 +; X64-NEXT: retq + %res = call <4 x float> @llvm.x86.sse.cmp.ps(<4 x float> %a0, <4 x float> %a1, i8 0) + ret <4 x float> %res +} +declare <4 x float> @llvm.x86.sse.cmp.ps(<4 x float>, <4 x float>, i8) nounwind readnone + +define <4 x float> @test_mm_cmpeq_ss(<4 x float> %a0, <4 x float> %a1) nounwind { +; X32-LABEL: test_mm_cmpeq_ss: +; X32: # BB#0: +; X32-NEXT: cmpeqss %xmm1, %xmm0 +; X32-NEXT: retl +; +; X64-LABEL: test_mm_cmpeq_ss: +; X64: # BB#0: +; X64-NEXT: cmpeqss %xmm1, %xmm0 +; X64-NEXT: retq + %res = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a0, <4 x float> %a1, i8 0) + ret <4 x float> %res +} +declare <4 x float> @llvm.x86.sse.cmp.ss(<4 x float>, <4 x float>, i8) nounwind readnone + +define <4 x float> @test_mm_cmpge_ps(<4 x float> %a0, <4 x float> %a1) nounwind { +; X32-LABEL: test_mm_cmpge_ps: +; X32: # BB#0: +; X32-NEXT: cmpleps %xmm0, %xmm1 +; X32-NEXT: movaps %xmm1, %xmm0 +; X32-NEXT: retl +; +; X64-LABEL: test_mm_cmpge_ps: +; X64: # BB#0: +; X64-NEXT: cmpleps %xmm0, %xmm1 +; X64-NEXT: movaps %xmm1, %xmm0 +; X64-NEXT: retq + %res = call <4 x float> @llvm.x86.sse.cmp.ps(<4 x float> %a1, <4 x float> %a0, i8 2) + ret <4 x float> %res +} + +define <4 x float> @test_mm_cmpge_ss(<4 x float> %a0, <4 x float> %a1) nounwind { +; X32-LABEL: test_mm_cmpge_ss: +; X32: # BB#0: +; X32-NEXT: cmpless %xmm0, %xmm1 +; X32-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] +; X32-NEXT: retl +; +; X64-LABEL: test_mm_cmpge_ss: +; X64: # BB#0: +; X64-NEXT: cmpless %xmm0, %xmm1 +; X64-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] +; X64-NEXT: retq + %cmp = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a1, <4 x float> %a0, i8 2) + %res = shufflevector <4 x float> %a0, <4 x float> %cmp, <4 x i32> <i32 4, i32 1, i32 2, i32 3> + ret <4 x float> %res +} + +define <4 x float> @test_mm_cmpgt_ps(<4 x float> %a0, <4 x float> %a1) nounwind { +; X32-LABEL: test_mm_cmpgt_ps: +; X32: # BB#0: +; X32-NEXT: cmpltps %xmm0, %xmm1 +; X32-NEXT: movaps %xmm1, %xmm0 +; X32-NEXT: retl +; +; X64-LABEL: test_mm_cmpgt_ps: +; X64: # BB#0: +; X64-NEXT: cmpltps %xmm0, %xmm1 +; X64-NEXT: movaps %xmm1, %xmm0 +; X64-NEXT: retq + %res = call <4 x float> @llvm.x86.sse.cmp.ps(<4 x float> %a1, <4 x float> %a0, i8 1) + ret <4 x float> %res +} + +define <4 x float> @test_mm_cmpgt_ss(<4 x float> %a0, <4 x float> %a1) nounwind { +; X32-LABEL: test_mm_cmpgt_ss: +; X32: # BB#0: +; X32-NEXT: cmpltss %xmm0, %xmm1 +; X32-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] +; X32-NEXT: retl +; +; X64-LABEL: test_mm_cmpgt_ss: +; X64: # BB#0: +; X64-NEXT: cmpltss %xmm0, %xmm1 +; X64-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] +; X64-NEXT: retq + %cmp = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a1, <4 x float> %a0, i8 1) + %res = shufflevector <4 x float> %a0, <4 x float> %cmp, <4 x i32> <i32 4, i32 1, i32 2, i32 3> + ret <4 x float> %res +} + +define <4 x float> @test_mm_cmple_ps(<4 x float> %a0, <4 x float> %a1) nounwind { +; X32-LABEL: test_mm_cmple_ps: +; X32: # BB#0: +; X32-NEXT: cmpleps %xmm1, %xmm0 +; X32-NEXT: 
retl +; +; X64-LABEL: test_mm_cmple_ps: +; X64: # BB#0: +; X64-NEXT: cmpleps %xmm1, %xmm0 +; X64-NEXT: retq + %res = call <4 x float> @llvm.x86.sse.cmp.ps(<4 x float> %a0, <4 x float> %a1, i8 2) + ret <4 x float> %res +} + +define <4 x float> @test_mm_cmple_ss(<4 x float> %a0, <4 x float> %a1) nounwind { +; X32-LABEL: test_mm_cmple_ss: +; X32: # BB#0: +; X32-NEXT: cmpless %xmm1, %xmm0 +; X32-NEXT: retl +; +; X64-LABEL: test_mm_cmple_ss: +; X64: # BB#0: +; X64-NEXT: cmpless %xmm1, %xmm0 +; X64-NEXT: retq + %res = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a0, <4 x float> %a1, i8 2) + ret <4 x float> %res +} + +define <4 x float> @test_mm_cmplt_ps(<4 x float> %a0, <4 x float> %a1) nounwind { +; X32-LABEL: test_mm_cmplt_ps: +; X32: # BB#0: +; X32-NEXT: cmpltps %xmm1, %xmm0 +; X32-NEXT: retl +; +; X64-LABEL: test_mm_cmplt_ps: +; X64: # BB#0: +; X64-NEXT: cmpltps %xmm1, %xmm0 +; X64-NEXT: retq + %res = call <4 x float> @llvm.x86.sse.cmp.ps(<4 x float> %a0, <4 x float> %a1, i8 1) + ret <4 x float> %res +} + +define <4 x float> @test_mm_cmplt_ss(<4 x float> %a0, <4 x float> %a1) nounwind { +; X32-LABEL: test_mm_cmplt_ss: +; X32: # BB#0: +; X32-NEXT: cmpltss %xmm1, %xmm0 +; X32-NEXT: retl +; +; X64-LABEL: test_mm_cmplt_ss: +; X64: # BB#0: +; X64-NEXT: cmpltss %xmm1, %xmm0 +; X64-NEXT: retq + %res = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a0, <4 x float> %a1, i8 1) + ret <4 x float> %res +} + +define <4 x float> @test_mm_cmpneq_ps(<4 x float> %a0, <4 x float> %a1) nounwind { +; X32-LABEL: test_mm_cmpneq_ps: +; X32: # BB#0: +; X32-NEXT: cmpneqps %xmm1, %xmm0 +; X32-NEXT: retl +; +; X64-LABEL: test_mm_cmpneq_ps: +; X64: # BB#0: +; X64-NEXT: cmpneqps %xmm1, %xmm0 +; X64-NEXT: retq + %res = call <4 x float> @llvm.x86.sse.cmp.ps(<4 x float> %a0, <4 x float> %a1, i8 4) + ret <4 x float> %res +} + +define <4 x float> @test_mm_cmpneq_ss(<4 x float> %a0, <4 x float> %a1) nounwind { +; X32-LABEL: test_mm_cmpneq_ss: +; X32: # BB#0: +; X32-NEXT: cmpneqss %xmm1, %xmm0 +; X32-NEXT: retl +; +; X64-LABEL: test_mm_cmpneq_ss: +; X64: # BB#0: +; X64-NEXT: cmpneqss %xmm1, %xmm0 +; X64-NEXT: retq + %res = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a0, <4 x float> %a1, i8 4) + ret <4 x float> %res +} + +define <4 x float> @test_mm_cmpnge_ps(<4 x float> %a0, <4 x float> %a1) nounwind { +; X32-LABEL: test_mm_cmpnge_ps: +; X32: # BB#0: +; X32-NEXT: cmpnleps %xmm0, %xmm1 +; X32-NEXT: movaps %xmm1, %xmm0 +; X32-NEXT: retl +; +; X64-LABEL: test_mm_cmpnge_ps: +; X64: # BB#0: +; X64-NEXT: cmpnleps %xmm0, %xmm1 +; X64-NEXT: movaps %xmm1, %xmm0 +; X64-NEXT: retq + %res = call <4 x float> @llvm.x86.sse.cmp.ps(<4 x float> %a1, <4 x float> %a0, i8 6) + ret <4 x float> %res +} + +define <4 x float> @test_mm_cmpnge_ss(<4 x float> %a0, <4 x float> %a1) nounwind { +; X32-LABEL: test_mm_cmpnge_ss: +; X32: # BB#0: +; X32-NEXT: cmpnless %xmm0, %xmm1 +; X32-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] +; X32-NEXT: retl +; +; X64-LABEL: test_mm_cmpnge_ss: +; X64: # BB#0: +; X64-NEXT: cmpnless %xmm0, %xmm1 +; X64-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] +; X64-NEXT: retq + %cmp = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a1, <4 x float> %a0, i8 6) + %res = shufflevector <4 x float> %a0, <4 x float> %cmp, <4 x i32> <i32 4, i32 1, i32 2, i32 3> + ret <4 x float> %res +} + +define <4 x float> @test_mm_cmpngt_ps(<4 x float> %a0, <4 x float> %a1) nounwind { +; X32-LABEL: test_mm_cmpngt_ps: +; X32: # BB#0: +; X32-NEXT: cmpnltps %xmm0, %xmm1 +; X32-NEXT: movaps %xmm1, %xmm0 +; X32-NEXT: retl +; +; 
X64-LABEL: test_mm_cmpngt_ps: +; X64: # BB#0: +; X64-NEXT: cmpnltps %xmm0, %xmm1 +; X64-NEXT: movaps %xmm1, %xmm0 +; X64-NEXT: retq + %res = call <4 x float> @llvm.x86.sse.cmp.ps(<4 x float> %a1, <4 x float> %a0, i8 5) + ret <4 x float> %res +} + +define <4 x float> @test_mm_cmpngt_ss(<4 x float> %a0, <4 x float> %a1) nounwind { +; X32-LABEL: test_mm_cmpngt_ss: +; X32: # BB#0: +; X32-NEXT: cmpnltss %xmm0, %xmm1 +; X32-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] +; X32-NEXT: retl +; +; X64-LABEL: test_mm_cmpngt_ss: +; X64: # BB#0: +; X64-NEXT: cmpnltss %xmm0, %xmm1 +; X64-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] +; X64-NEXT: retq + %cmp = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a1, <4 x float> %a0, i8 5) + %res = shufflevector <4 x float> %a0, <4 x float> %cmp, <4 x i32> <i32 4, i32 1, i32 2, i32 3> + ret <4 x float> %res +} + +define <4 x float> @test_mm_cmpnle_ps(<4 x float> %a0, <4 x float> %a1) nounwind { +; X32-LABEL: test_mm_cmpnle_ps: +; X32: # BB#0: +; X32-NEXT: cmpnleps %xmm1, %xmm0 +; X32-NEXT: retl +; +; X64-LABEL: test_mm_cmpnle_ps: +; X64: # BB#0: +; X64-NEXT: cmpnleps %xmm1, %xmm0 +; X64-NEXT: retq + %res = call <4 x float> @llvm.x86.sse.cmp.ps(<4 x float> %a0, <4 x float> %a1, i8 6) + ret <4 x float> %res +} + +define <4 x float> @test_mm_cmpnle_ss(<4 x float> %a0, <4 x float> %a1) nounwind { +; X32-LABEL: test_mm_cmpnle_ss: +; X32: # BB#0: +; X32-NEXT: cmpnless %xmm1, %xmm0 +; X32-NEXT: retl +; +; X64-LABEL: test_mm_cmpnle_ss: +; X64: # BB#0: +; X64-NEXT: cmpnless %xmm1, %xmm0 +; X64-NEXT: retq + %res = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a0, <4 x float> %a1, i8 6) + ret <4 x float> %res +} + +define <4 x float> @test_mm_cmpnlt_ps(<4 x float> %a0, <4 x float> %a1) nounwind { +; X32-LABEL: test_mm_cmpnlt_ps: +; X32: # BB#0: +; X32-NEXT: cmpnltps %xmm1, %xmm0 +; X32-NEXT: retl +; +; X64-LABEL: test_mm_cmpnlt_ps: +; X64: # BB#0: +; X64-NEXT: cmpnltps %xmm1, %xmm0 +; X64-NEXT: retq + %res = call <4 x float> @llvm.x86.sse.cmp.ps(<4 x float> %a0, <4 x float> %a1, i8 5) + ret <4 x float> %res +} + +define <4 x float> @test_mm_cmpnlt_ss(<4 x float> %a0, <4 x float> %a1) nounwind { +; X32-LABEL: test_mm_cmpnlt_ss: +; X32: # BB#0: +; X32-NEXT: cmpnltss %xmm1, %xmm0 +; X32-NEXT: retl +; +; X64-LABEL: test_mm_cmpnlt_ss: +; X64: # BB#0: +; X64-NEXT: cmpnltss %xmm1, %xmm0 +; X64-NEXT: retq + %res = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a0, <4 x float> %a1, i8 5) + ret <4 x float> %res +} + +define <4 x float> @test_mm_cmpord_ps(<4 x float> %a0, <4 x float> %a1) nounwind { +; X32-LABEL: test_mm_cmpord_ps: +; X32: # BB#0: +; X32-NEXT: cmpordps %xmm1, %xmm0 +; X32-NEXT: retl +; +; X64-LABEL: test_mm_cmpord_ps: +; X64: # BB#0: +; X64-NEXT: cmpordps %xmm1, %xmm0 +; X64-NEXT: retq + %res = call <4 x float> @llvm.x86.sse.cmp.ps(<4 x float> %a0, <4 x float> %a1, i8 7) + ret <4 x float> %res +} + +define <4 x float> @test_mm_cmpord_ss(<4 x float> %a0, <4 x float> %a1) nounwind { +; X32-LABEL: test_mm_cmpord_ss: +; X32: # BB#0: +; X32-NEXT: cmpordss %xmm1, %xmm0 +; X32-NEXT: retl +; +; X64-LABEL: test_mm_cmpord_ss: +; X64: # BB#0: +; X64-NEXT: cmpordss %xmm1, %xmm0 +; X64-NEXT: retq + %res = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a0, <4 x float> %a1, i8 7) + ret <4 x float> %res +} + +define <4 x float> @test_mm_cmpunord_ps(<4 x float> %a0, <4 x float> %a1) nounwind { +; X32-LABEL: test_mm_cmpunord_ps: +; X32: # BB#0: +; X32-NEXT: cmpunordps %xmm1, %xmm0 +; X32-NEXT: retl +; +; X64-LABEL: test_mm_cmpunord_ps: +; X64: # BB#0: +; X64-NEXT: 
cmpunordps %xmm1, %xmm0 +; X64-NEXT: retq + %res = call <4 x float> @llvm.x86.sse.cmp.ps(<4 x float> %a0, <4 x float> %a1, i8 3) + ret <4 x float> %res +} + +define <4 x float> @test_mm_cmpunord_ss(<4 x float> %a0, <4 x float> %a1) nounwind { +; X32-LABEL: test_mm_cmpunord_ss: +; X32: # BB#0: +; X32-NEXT: cmpunordss %xmm1, %xmm0 +; X32-NEXT: retl +; +; X64-LABEL: test_mm_cmpunord_ss: +; X64: # BB#0: +; X64-NEXT: cmpunordss %xmm1, %xmm0 +; X64-NEXT: retq + %res = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a0, <4 x float> %a1, i8 3) + ret <4 x float> %res +} + +define i32 @test_mm_comieq_ss(<4 x float> %a0, <4 x float> %a1) nounwind { +; X32-LABEL: test_mm_comieq_ss: +; X32: # BB#0: +; X32-NEXT: comiss %xmm1, %xmm0 +; X32-NEXT: setnp %al +; X32-NEXT: sete %cl +; X32-NEXT: andb %al, %cl +; X32-NEXT: movzbl %cl, %eax +; X32-NEXT: retl +; +; X64-LABEL: test_mm_comieq_ss: +; X64: # BB#0: +; X64-NEXT: comiss %xmm1, %xmm0 +; X64-NEXT: setnp %al +; X64-NEXT: sete %cl +; X64-NEXT: andb %al, %cl +; X64-NEXT: movzbl %cl, %eax +; X64-NEXT: retq + %res = call i32 @llvm.x86.sse.comieq.ss(<4 x float> %a0, <4 x float> %a1) + ret i32 %res +} +declare i32 @llvm.x86.sse.comieq.ss(<4 x float>, <4 x float>) nounwind readnone + +define i32 @test_mm_comige_ss(<4 x float> %a0, <4 x float> %a1) nounwind { +; X32-LABEL: test_mm_comige_ss: +; X32: # BB#0: +; X32-NEXT: comiss %xmm1, %xmm0 +; X32-NEXT: setae %al +; X32-NEXT: movzbl %al, %eax +; X32-NEXT: retl +; +; X64-LABEL: test_mm_comige_ss: +; X64: # BB#0: +; X64-NEXT: comiss %xmm1, %xmm0 +; X64-NEXT: setae %al +; X64-NEXT: movzbl %al, %eax +; X64-NEXT: retq + %res = call i32 @llvm.x86.sse.comige.ss(<4 x float> %a0, <4 x float> %a1) + ret i32 %res +} +declare i32 @llvm.x86.sse.comige.ss(<4 x float>, <4 x float>) nounwind readnone + +define i32 @test_mm_comigt_ss(<4 x float> %a0, <4 x float> %a1) nounwind { +; X32-LABEL: test_mm_comigt_ss: +; X32: # BB#0: +; X32-NEXT: comiss %xmm1, %xmm0 +; X32-NEXT: seta %al +; X32-NEXT: movzbl %al, %eax +; X32-NEXT: retl +; +; X64-LABEL: test_mm_comigt_ss: +; X64: # BB#0: +; X64-NEXT: comiss %xmm1, %xmm0 +; X64-NEXT: seta %al +; X64-NEXT: movzbl %al, %eax +; X64-NEXT: retq + %res = call i32 @llvm.x86.sse.comigt.ss(<4 x float> %a0, <4 x float> %a1) + ret i32 %res +} +declare i32 @llvm.x86.sse.comigt.ss(<4 x float>, <4 x float>) nounwind readnone + +define i32 @test_mm_comile_ss(<4 x float> %a0, <4 x float> %a1) nounwind { +; X32-LABEL: test_mm_comile_ss: +; X32: # BB#0: +; X32-NEXT: comiss %xmm0, %xmm1 +; X32-NEXT: setae %al +; X32-NEXT: movzbl %al, %eax +; X32-NEXT: retl +; +; X64-LABEL: test_mm_comile_ss: +; X64: # BB#0: +; X64-NEXT: comiss %xmm0, %xmm1 +; X64-NEXT: setae %al +; X64-NEXT: movzbl %al, %eax +; X64-NEXT: retq + %res = call i32 @llvm.x86.sse.comile.ss(<4 x float> %a0, <4 x float> %a1) + ret i32 %res +} +declare i32 @llvm.x86.sse.comile.ss(<4 x float>, <4 x float>) nounwind readnone + +define i32 @test_mm_comilt_ss(<4 x float> %a0, <4 x float> %a1) nounwind { +; X32-LABEL: test_mm_comilt_ss: +; X32: # BB#0: +; X32-NEXT: comiss %xmm0, %xmm1 +; X32-NEXT: seta %al +; X32-NEXT: movzbl %al, %eax +; X32-NEXT: retl +; +; X64-LABEL: test_mm_comilt_ss: +; X64: # BB#0: +; X64-NEXT: comiss %xmm0, %xmm1 +; X64-NEXT: seta %al +; X64-NEXT: movzbl %al, %eax +; X64-NEXT: retq + %res = call i32 @llvm.x86.sse.comilt.ss(<4 x float> %a0, <4 x float> %a1) + ret i32 %res +} +declare i32 @llvm.x86.sse.comilt.ss(<4 x float>, <4 x float>) nounwind readnone + +define i32 @test_mm_comineq_ss(<4 x float> %a0, <4 x float> %a1) nounwind { 
+; X32-LABEL: test_mm_comineq_ss: +; X32: # BB#0: +; X32-NEXT: comiss %xmm1, %xmm0 +; X32-NEXT: setp %al +; X32-NEXT: setne %cl +; X32-NEXT: orb %al, %cl +; X32-NEXT: movzbl %cl, %eax +; X32-NEXT: retl +; +; X64-LABEL: test_mm_comineq_ss: +; X64: # BB#0: +; X64-NEXT: comiss %xmm1, %xmm0 +; X64-NEXT: setp %al +; X64-NEXT: setne %cl +; X64-NEXT: orb %al, %cl +; X64-NEXT: movzbl %cl, %eax +; X64-NEXT: retq + %res = call i32 @llvm.x86.sse.comineq.ss(<4 x float> %a0, <4 x float> %a1) + ret i32 %res +} +declare i32 @llvm.x86.sse.comineq.ss(<4 x float>, <4 x float>) nounwind readnone + +define i32 @test_mm_cvt_ss2si(<4 x float> %a0) nounwind { +; X32-LABEL: test_mm_cvt_ss2si: +; X32: # BB#0: +; X32-NEXT: cvtss2si %xmm0, %eax +; X32-NEXT: retl +; +; X64-LABEL: test_mm_cvt_ss2si: +; X64: # BB#0: +; X64-NEXT: cvtss2si %xmm0, %eax +; X64-NEXT: retq + %res = call i32 @llvm.x86.sse.cvtss2si(<4 x float> %a0) + ret i32 %res +} +declare i32 @llvm.x86.sse.cvtss2si(<4 x float>) nounwind readnone + +define <4 x float> @test_mm_cvtsi32_ss(<4 x float> %a0, i32 %a1) nounwind { +; X32-LABEL: test_mm_cvtsi32_ss: +; X32: # BB#0: +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: cvtsi2ssl %eax, %xmm1 +; X32-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] +; X32-NEXT: retl +; +; X64-LABEL: test_mm_cvtsi32_ss: +; X64: # BB#0: +; X64-NEXT: cvtsi2ssl %edi, %xmm1 +; X64-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] +; X64-NEXT: retq + %cvt = sitofp i32 %a1 to float + %res = insertelement <4 x float> %a0, float %cvt, i32 0 + ret <4 x float> %res +} + +define float @test_mm_cvtss_f32(<4 x float> %a0) nounwind { +; X32-LABEL: test_mm_cvtss_f32: +; X32: # BB#0: +; X32-NEXT: pushl %eax +; X32-NEXT: movss %xmm0, (%esp) +; X32-NEXT: flds (%esp) +; X32-NEXT: popl %eax +; X32-NEXT: retl +; +; X64-LABEL: test_mm_cvtss_f32: +; X64: # BB#0: +; X64-NEXT: retq + %res = extractelement <4 x float> %a0, i32 0 + ret float %res +} + +define i32 @test_mm_cvtss_si32(<4 x float> %a0) nounwind { +; X32-LABEL: test_mm_cvtss_si32: +; X32: # BB#0: +; X32-NEXT: cvtss2si %xmm0, %eax +; X32-NEXT: retl +; +; X64-LABEL: test_mm_cvtss_si32: +; X64: # BB#0: +; X64-NEXT: cvtss2si %xmm0, %eax +; X64-NEXT: retq + %res = call i32 @llvm.x86.sse.cvtss2si(<4 x float> %a0) + ret i32 %res +} + +define i32 @test_mm_cvttss_si(<4 x float> %a0) nounwind { +; X32-LABEL: test_mm_cvttss_si: +; X32: # BB#0: +; X32-NEXT: cvttss2si %xmm0, %eax +; X32-NEXT: retl +; +; X64-LABEL: test_mm_cvttss_si: +; X64: # BB#0: +; X64-NEXT: cvttss2si %xmm0, %eax +; X64-NEXT: retq + %cvt = extractelement <4 x float> %a0, i32 0 + %res = fptosi float %cvt to i32 + ret i32 %res +} + +define i32 @test_mm_cvttss_si32(<4 x float> %a0) nounwind { +; X32-LABEL: test_mm_cvttss_si32: +; X32: # BB#0: +; X32-NEXT: cvttss2si %xmm0, %eax +; X32-NEXT: retl +; +; X64-LABEL: test_mm_cvttss_si32: +; X64: # BB#0: +; X64-NEXT: cvttss2si %xmm0, %eax +; X64-NEXT: retq + %cvt = extractelement <4 x float> %a0, i32 0 + %res = fptosi float %cvt to i32 + ret i32 %res +} + +define <4 x float> @test_mm_div_ps(<4 x float> %a0, <4 x float> %a1) nounwind { +; X32-LABEL: test_mm_div_ps: +; X32: # BB#0: +; X32-NEXT: divps %xmm1, %xmm0 +; X32-NEXT: retl +; +; X64-LABEL: test_mm_div_ps: +; X64: # BB#0: +; X64-NEXT: divps %xmm1, %xmm0 +; X64-NEXT: retq + %res = fdiv <4 x float> %a0, %a1 + ret <4 x float> %res +} + +define <4 x float> @test_mm_div_ss(<4 x float> %a0, <4 x float> %a1) nounwind { +; X32-LABEL: test_mm_div_ss: +; X32: # BB#0: +; X32-NEXT: divss %xmm1, %xmm0 +; X32-NEXT: retl +; +; X64-LABEL: 
test_mm_div_ss: +; X64: # BB#0: +; X64-NEXT: divss %xmm1, %xmm0 +; X64-NEXT: retq + %ext0 = extractelement <4 x float> %a0, i32 0 + %ext1 = extractelement <4 x float> %a1, i32 0 + %fdiv = fdiv float %ext0, %ext1 + %res = insertelement <4 x float> %a0, float %fdiv, i32 0 + ret <4 x float> %res +} + +define i32 @test_MM_GET_EXCEPTION_MASK() nounwind { +; X32-LABEL: test_MM_GET_EXCEPTION_MASK: +; X32: # BB#0: +; X32-NEXT: pushl %eax +; X32-NEXT: leal (%esp), %eax +; X32-NEXT: stmxcsr (%eax) +; X32-NEXT: movl (%esp), %eax +; X32-NEXT: andl $8064, %eax # imm = 0x1F80 +; X32-NEXT: popl %ecx +; X32-NEXT: retl +; +; X64-LABEL: test_MM_GET_EXCEPTION_MASK: +; X64: # BB#0: +; X64-NEXT: leaq -{{[0-9]+}}(%rsp), %rax +; X64-NEXT: stmxcsr (%rax) +; X64-NEXT: movl -{{[0-9]+}}(%rsp), %eax +; X64-NEXT: andl $8064, %eax # imm = 0x1F80 +; X64-NEXT: retq + %1 = alloca i32, align 4 + %2 = bitcast i32* %1 to i8* + call void @llvm.x86.sse.stmxcsr(i8* %2) + %3 = load i32, i32* %1, align 4 + %4 = and i32 %3, 8064 + ret i32 %4 +} +declare void @llvm.x86.sse.stmxcsr(i8*) nounwind readnone + +define i32 @test_MM_GET_EXCEPTION_STATE() nounwind { +; X32-LABEL: test_MM_GET_EXCEPTION_STATE: +; X32: # BB#0: +; X32-NEXT: pushl %eax +; X32-NEXT: leal (%esp), %eax +; X32-NEXT: stmxcsr (%eax) +; X32-NEXT: movl (%esp), %eax +; X32-NEXT: andl $63, %eax +; X32-NEXT: popl %ecx +; X32-NEXT: retl +; +; X64-LABEL: test_MM_GET_EXCEPTION_STATE: +; X64: # BB#0: +; X64-NEXT: leaq -{{[0-9]+}}(%rsp), %rax +; X64-NEXT: stmxcsr (%rax) +; X64-NEXT: movl -{{[0-9]+}}(%rsp), %eax +; X64-NEXT: andl $63, %eax +; X64-NEXT: retq + %1 = alloca i32, align 4 + %2 = bitcast i32* %1 to i8* + call void @llvm.x86.sse.stmxcsr(i8* %2) + %3 = load i32, i32* %1, align 4 + %4 = and i32 %3, 63 + ret i32 %4 +} + +define i32 @test_MM_GET_FLUSH_ZERO_MODE() nounwind { +; X32-LABEL: test_MM_GET_FLUSH_ZERO_MODE: +; X32: # BB#0: +; X32-NEXT: pushl %eax +; X32-NEXT: leal (%esp), %eax +; X32-NEXT: stmxcsr (%eax) +; X32-NEXT: movl (%esp), %eax +; X32-NEXT: andl $32768, %eax # imm = 0x8000 +; X32-NEXT: popl %ecx +; X32-NEXT: retl +; +; X64-LABEL: test_MM_GET_FLUSH_ZERO_MODE: +; X64: # BB#0: +; X64-NEXT: leaq -{{[0-9]+}}(%rsp), %rax +; X64-NEXT: stmxcsr (%rax) +; X64-NEXT: movl -{{[0-9]+}}(%rsp), %eax +; X64-NEXT: andl $32768, %eax # imm = 0x8000 +; X64-NEXT: retq + %1 = alloca i32, align 4 + %2 = bitcast i32* %1 to i8* + call void @llvm.x86.sse.stmxcsr(i8* %2) + %3 = load i32, i32* %1, align 4 + %4 = and i32 %3, 32768 + ret i32 %4 +} + +define i32 @test_MM_GET_ROUNDING_MODE() nounwind { +; X32-LABEL: test_MM_GET_ROUNDING_MODE: +; X32: # BB#0: +; X32-NEXT: pushl %eax +; X32-NEXT: leal (%esp), %eax +; X32-NEXT: stmxcsr (%eax) +; X32-NEXT: movl (%esp), %eax +; X32-NEXT: andl $24576, %eax # imm = 0x6000 +; X32-NEXT: popl %ecx +; X32-NEXT: retl +; +; X64-LABEL: test_MM_GET_ROUNDING_MODE: +; X64: # BB#0: +; X64-NEXT: leaq -{{[0-9]+}}(%rsp), %rax +; X64-NEXT: stmxcsr (%rax) +; X64-NEXT: movl -{{[0-9]+}}(%rsp), %eax +; X64-NEXT: andl $24576, %eax # imm = 0x6000 +; X64-NEXT: retq + %1 = alloca i32, align 4 + %2 = bitcast i32* %1 to i8* + call void @llvm.x86.sse.stmxcsr(i8* %2) + %3 = load i32, i32* %1, align 4 + %4 = and i32 %3, 24576 + ret i32 %4 +} + +define i32 @test_mm_getcsr() nounwind { +; X32-LABEL: test_mm_getcsr: +; X32: # BB#0: +; X32-NEXT: pushl %eax +; X32-NEXT: leal (%esp), %eax +; X32-NEXT: stmxcsr (%eax) +; X32-NEXT: movl (%esp), %eax +; X32-NEXT: popl %ecx +; X32-NEXT: retl +; +; X64-LABEL: test_mm_getcsr: +; X64: # BB#0: +; X64-NEXT: leaq -{{[0-9]+}}(%rsp), %rax +; 
X64-NEXT: stmxcsr (%rax) +; X64-NEXT: movl -{{[0-9]+}}(%rsp), %eax +; X64-NEXT: retq + %1 = alloca i32, align 4 + %2 = bitcast i32* %1 to i8* + call void @llvm.x86.sse.stmxcsr(i8* %2) + %3 = load i32, i32* %1, align 4 + ret i32 %3 +} + +define <4 x float> @test_mm_load_ps(float* %a0) nounwind { +; X32-LABEL: test_mm_load_ps: +; X32: # BB#0: +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movaps (%eax), %xmm0 +; X32-NEXT: retl +; +; X64-LABEL: test_mm_load_ps: +; X64: # BB#0: +; X64-NEXT: movaps (%rdi), %xmm0 +; X64-NEXT: retq + %arg0 = bitcast float* %a0 to <4 x float>* + %res = load <4 x float>, <4 x float>* %arg0, align 16 + ret <4 x float> %res +} + +define <4 x float> @test_mm_load_ps1(float* %a0) nounwind { +; X32-LABEL: test_mm_load_ps1: +; X32: # BB#0: +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,0] +; X32-NEXT: retl +; +; X64-LABEL: test_mm_load_ps1: +; X64: # BB#0: +; X64-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,0] +; X64-NEXT: retq + %ld = load float, float* %a0, align 4 + %res0 = insertelement <4 x float> undef, float %ld, i32 0 + %res1 = insertelement <4 x float> %res0, float %ld, i32 1 + %res2 = insertelement <4 x float> %res1, float %ld, i32 2 + %res3 = insertelement <4 x float> %res2, float %ld, i32 3 + ret <4 x float> %res3 +} + +define <4 x float> @test_mm_load_ss(float* %a0) nounwind { +; X32-LABEL: test_mm_load_ss: +; X32: # BB#0: +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X32-NEXT: retl +; +; X64-LABEL: test_mm_load_ss: +; X64: # BB#0: +; X64-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X64-NEXT: retq + %ld = load float, float* %a0, align 1 + %res0 = insertelement <4 x float> undef, float %ld, i32 0 + %res1 = insertelement <4 x float> %res0, float 0.0, i32 1 + %res2 = insertelement <4 x float> %res1, float 0.0, i32 2 + %res3 = insertelement <4 x float> %res2, float 0.0, i32 3 + ret <4 x float> %res3 +} + +define <4 x float> @test_mm_load1_ps(float* %a0) nounwind { +; X32-LABEL: test_mm_load1_ps: +; X32: # BB#0: +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,0] +; X32-NEXT: retl +; +; X64-LABEL: test_mm_load1_ps: +; X64: # BB#0: +; X64-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,0] +; X64-NEXT: retq + %ld = load float, float* %a0, align 4 + %res0 = insertelement <4 x float> undef, float %ld, i32 0 + %res1 = insertelement <4 x float> %res0, float %ld, i32 1 + %res2 = insertelement <4 x float> %res1, float %ld, i32 2 + %res3 = insertelement <4 x float> %res2, float %ld, i32 3 + ret <4 x float> %res3 +} + +define <4 x float> @test_mm_loadh_pi(<4 x float> %a0, x86_mmx* %a1) { +; X32-LABEL: test_mm_loadh_pi: +; X32: # BB#0: +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X32-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; X32-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; X32-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; X32-NEXT: retl +; +; X64-LABEL: test_mm_loadh_pi: +; X64: # BB#0: +; X64-NEXT: movq (%rdi), %rax +; X64-NEXT: movl %eax, -{{[0-9]+}}(%rsp) +; X64-NEXT: shrq $32, %rax +; X64-NEXT: movl %eax, -{{[0-9]+}}(%rsp) +; X64-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X64-NEXT: movss {{.*#+}} xmm2 = 
mem[0],zero,zero,zero +; X64-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; X64-NEXT: xorps %xmm2, %xmm2 +; X64-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; X64-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; X64-NEXT: retq + %ptr = bitcast x86_mmx* %a1 to <2 x float>* + %ld = load <2 x float>, <2 x float>* %ptr + %ext = shufflevector <2 x float> %ld, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef> + %res = shufflevector <4 x float> %a0, <4 x float> %ext, <4 x i32> <i32 0, i32 1, i32 4, i32 5> + ret <4 x float> %res +} + +define <4 x float> @test_mm_loadl_pi(<4 x float> %a0, x86_mmx* %a1) { +; X32-LABEL: test_mm_loadl_pi: +; X32: # BB#0: +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X32-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; X32-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; X32-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] +; X32-NEXT: movaps %xmm1, %xmm0 +; X32-NEXT: retl +; +; X64-LABEL: test_mm_loadl_pi: +; X64: # BB#0: +; X64-NEXT: movq (%rdi), %rax +; X64-NEXT: movl %eax, -{{[0-9]+}}(%rsp) +; X64-NEXT: shrq $32, %rax +; X64-NEXT: movl %eax, -{{[0-9]+}}(%rsp) +; X64-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X64-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; X64-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; X64-NEXT: xorps %xmm2, %xmm2 +; X64-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; X64-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] +; X64-NEXT: movaps %xmm1, %xmm0 +; X64-NEXT: retq + %ptr = bitcast x86_mmx* %a1 to <2 x float>* + %ld = load <2 x float>, <2 x float>* %ptr + %ext = shufflevector <2 x float> %ld, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef> + %res = shufflevector <4 x float> %a0, <4 x float> %ext, <4 x i32> <i32 4, i32 5, i32 2, i32 3> + ret <4 x float> %res +} + +define <4 x float> @test_mm_loadr_ps(float* %a0) nounwind { +; X32-LABEL: test_mm_loadr_ps: +; X32: # BB#0: +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movaps (%eax), %xmm0 +; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,2,1,0] +; X32-NEXT: retl +; +; X64-LABEL: test_mm_loadr_ps: +; X64: # BB#0: +; X64-NEXT: movaps (%rdi), %xmm0 +; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,2,1,0] +; X64-NEXT: retq + %arg0 = bitcast float* %a0 to <4 x float>* + %ld = load <4 x float>, <4 x float>* %arg0, align 16 + %res = shufflevector <4 x float> %ld, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0> + ret <4 x float> %res +} + +define <4 x float> @test_mm_loadu_ps(float* %a0) nounwind { +; X32-LABEL: test_mm_loadu_ps: +; X32: # BB#0: +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movups (%eax), %xmm0 +; X32-NEXT: retl +; +; X64-LABEL: test_mm_loadu_ps: +; X64: # BB#0: +; X64-NEXT: movups (%rdi), %xmm0 +; X64-NEXT: retq + %arg0 = bitcast float* %a0 to <4 x float>* + %res = load <4 x float>, <4 x float>* %arg0, align 1 + ret <4 x float> %res +} + +define <4 x float> @test_mm_max_ps(<4 x float> %a0, <4 x float> %a1) { +; X32-LABEL: test_mm_max_ps: +; X32: # BB#0: +; X32-NEXT: maxps %xmm1, %xmm0 +; X32-NEXT: retl +; +; X64-LABEL: test_mm_max_ps: +; X64: # BB#0: +; X64-NEXT: maxps %xmm1, %xmm0 +; X64-NEXT: retq + %res = call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %a0, <4 x float> %a1) + ret <4 x float> %res +} +declare <4 x float> @llvm.x86.sse.max.ps(<4 x float>, <4 x float>) nounwind readnone + +define <4 x float> @test_mm_max_ss(<4 x float> %a0, <4 x float> %a1) { +; X32-LABEL: test_mm_max_ss: +; X32: # BB#0: 
+; X32-NEXT: maxss %xmm1, %xmm0 +; X32-NEXT: retl +; +; X64-LABEL: test_mm_max_ss: +; X64: # BB#0: +; X64-NEXT: maxss %xmm1, %xmm0 +; X64-NEXT: retq + %res = call <4 x float> @llvm.x86.sse.max.ss(<4 x float> %a0, <4 x float> %a1) + ret <4 x float> %res +} +declare <4 x float> @llvm.x86.sse.max.ss(<4 x float>, <4 x float>) nounwind readnone + +define <4 x float> @test_mm_min_ps(<4 x float> %a0, <4 x float> %a1) { +; X32-LABEL: test_mm_min_ps: +; X32: # BB#0: +; X32-NEXT: minps %xmm1, %xmm0 +; X32-NEXT: retl +; +; X64-LABEL: test_mm_min_ps: +; X64: # BB#0: +; X64-NEXT: minps %xmm1, %xmm0 +; X64-NEXT: retq + %res = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %a0, <4 x float> %a1) + ret <4 x float> %res +} +declare <4 x float> @llvm.x86.sse.min.ps(<4 x float>, <4 x float>) nounwind readnone + +define <4 x float> @test_mm_min_ss(<4 x float> %a0, <4 x float> %a1) { +; X32-LABEL: test_mm_min_ss: +; X32: # BB#0: +; X32-NEXT: minss %xmm1, %xmm0 +; X32-NEXT: retl +; +; X64-LABEL: test_mm_min_ss: +; X64: # BB#0: +; X64-NEXT: minss %xmm1, %xmm0 +; X64-NEXT: retq + %res = call <4 x float> @llvm.x86.sse.min.ss(<4 x float> %a0, <4 x float> %a1) + ret <4 x float> %res +} +declare <4 x float> @llvm.x86.sse.min.ss(<4 x float>, <4 x float>) nounwind readnone + +define <4 x float> @test_mm_move_ss(<4 x float> %a0, <4 x float> %a1) { +; X32-LABEL: test_mm_move_ss: +; X32: # BB#0: +; X32-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] +; X32-NEXT: retl +; +; X64-LABEL: test_mm_move_ss: +; X64: # BB#0: +; X64-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] +; X64-NEXT: retq + %res = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 4, i32 1, i32 2, i32 3> + ret <4 x float> %res +} + +define <4 x float> @test_mm_movehl_ps(<4 x float> %a0, <4 x float> %a1) { +; X32-LABEL: test_mm_movehl_ps: +; X32: # BB#0: +; X32-NEXT: movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; X32-NEXT: retl +; +; X64-LABEL: test_mm_movehl_ps: +; X64: # BB#0: +; X64-NEXT: movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; X64-NEXT: retq + %res = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 6, i32 7, i32 2, i32 3> + ret <4 x float> %res +} + +define <4 x float> @test_mm_movelh_ps(<4 x float> %a0, <4 x float> %a1) { +; X32-LABEL: test_mm_movelh_ps: +; X32: # BB#0: +; X32-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; X32-NEXT: retl +; +; X64-LABEL: test_mm_movelh_ps: +; X64: # BB#0: +; X64-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; X64-NEXT: retq + %res = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 0, i32 1, i32 4, i32 5> + ret <4 x float> %res +} + +define i32 @test_mm_movemask_ps(<4 x float> %a0) nounwind { +; X32-LABEL: test_mm_movemask_ps: +; X32: # BB#0: +; X32-NEXT: movmskps %xmm0, %eax +; X32-NEXT: retl +; +; X64-LABEL: test_mm_movemask_ps: +; X64: # BB#0: +; X64-NEXT: movmskps %xmm0, %eax +; X64-NEXT: retq + %res = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %a0) + ret i32 %res +} +declare i32 @llvm.x86.sse.movmsk.ps(<4 x float>) nounwind readnone + +define <4 x float> @test_mm_mul_ps(<4 x float> %a0, <4 x float> %a1) nounwind { +; X32-LABEL: test_mm_mul_ps: +; X32: # BB#0: +; X32-NEXT: mulps %xmm1, %xmm0 +; X32-NEXT: retl +; +; X64-LABEL: test_mm_mul_ps: +; X64: # BB#0: +; X64-NEXT: mulps %xmm1, %xmm0 +; X64-NEXT: retq + %res = fmul <4 x float> %a0, %a1 + ret <4 x float> %res +} + +define <4 x float> @test_mm_mul_ss(<4 x float> %a0, <4 x float> %a1) nounwind { +; X32-LABEL: test_mm_mul_ss: +; X32: # BB#0: +; X32-NEXT: mulss %xmm1, %xmm0 +; X32-NEXT: retl +; +; X64-LABEL: 
test_mm_mul_ss: +; X64: # BB#0: +; X64-NEXT: mulss %xmm1, %xmm0 +; X64-NEXT: retq + %ext0 = extractelement <4 x float> %a0, i32 0 + %ext1 = extractelement <4 x float> %a1, i32 0 + %fmul = fmul float %ext0, %ext1 + %res = insertelement <4 x float> %a0, float %fmul, i32 0 + ret <4 x float> %res +} + +define <4 x float> @test_mm_or_ps(<4 x float> %a0, <4 x float> %a1) nounwind { +; X32-LABEL: test_mm_or_ps: +; X32: # BB#0: +; X32-NEXT: pushl %ebp +; X32-NEXT: movl %esp, %ebp +; X32-NEXT: pushl %esi +; X32-NEXT: andl $-16, %esp +; X32-NEXT: subl $64, %esp +; X32-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: movl {{[0-9]+}}(%esp), %edx +; X32-NEXT: movl {{[0-9]+}}(%esp), %esi +; X32-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; X32-NEXT: orl {{[0-9]+}}(%esp), %esi +; X32-NEXT: movl %esi, (%esp) +; X32-NEXT: orl {{[0-9]+}}(%esp), %edx +; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X32-NEXT: orl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X32-NEXT: orl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X32-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; X32-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; X32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X32-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; X32-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; X32-NEXT: leal -4(%ebp), %esp +; X32-NEXT: popl %esi +; X32-NEXT: popl %ebp +; X32-NEXT: retl +; +; X64-LABEL: test_mm_or_ps: +; X64: # BB#0: +; X64-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rax +; X64-NEXT: movq -{{[0-9]+}}(%rsp), %r8 +; X64-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rdx +; X64-NEXT: movq %rdx, %rsi +; X64-NEXT: orl %eax, %edx +; X64-NEXT: shrq $32, %rax +; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rcx +; X64-NEXT: movq %rcx, %rdi +; X64-NEXT: orl %r8d, %ecx +; X64-NEXT: shrq $32, %r8 +; X64-NEXT: shrq $32, %rsi +; X64-NEXT: shrq $32, %rdi +; X64-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) +; X64-NEXT: movl %edx, -{{[0-9]+}}(%rsp) +; X64-NEXT: orl %r8d, %edi +; X64-NEXT: movl %edi, -{{[0-9]+}}(%rsp) +; X64-NEXT: orl %eax, %esi +; X64-NEXT: movl %esi, -{{[0-9]+}}(%rsp) +; X64-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X64-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X64-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; X64-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X64-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; X64-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; X64-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; X64-NEXT: retq + %arg0 = bitcast <4 x float> %a0 to <4 x i32> + %arg1 = bitcast <4 x float> %a1 to <4 x i32> + %res = or <4 x i32> %arg0, %arg1 + %bc = bitcast <4 x i32> %res to <4 x float> + ret <4 x float> %bc +} + +define void @test_mm_prefetch(i8* %a0) { +; X32-LABEL: test_mm_prefetch: +; X32: # BB#0: +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: prefetchnta (%eax) +; X32-NEXT: retl +; +; X64-LABEL: test_mm_prefetch: +; X64: # BB#0: +; X64-NEXT: prefetchnta (%rdi) +; X64-NEXT: retq + call void @llvm.prefetch(i8* %a0, i32 0, i32 0, i32 1) + ret void +} +declare void @llvm.prefetch(i8* nocapture, i32, i32, i32) nounwind readnone + +define <4 x float> @test_mm_rcp_ps(<4 x float> 
%a0) { +; X32-LABEL: test_mm_rcp_ps: +; X32: # BB#0: +; X32-NEXT: rcpps %xmm0, %xmm0 +; X32-NEXT: retl +; +; X64-LABEL: test_mm_rcp_ps: +; X64: # BB#0: +; X64-NEXT: rcpps %xmm0, %xmm0 +; X64-NEXT: retq + %res = call <4 x float> @llvm.x86.sse.rcp.ps(<4 x float> %a0) + ret <4 x float> %res +} +declare <4 x float> @llvm.x86.sse.rcp.ps(<4 x float>) nounwind readnone + +define <4 x float> @test_mm_rcp_ss(<4 x float> %a0) { +; X32-LABEL: test_mm_rcp_ss: +; X32: # BB#0: +; X32-NEXT: rcpss %xmm0, %xmm0 +; X32-NEXT: retl +; +; X64-LABEL: test_mm_rcp_ss: +; X64: # BB#0: +; X64-NEXT: rcpss %xmm0, %xmm0 +; X64-NEXT: retq + %rcp = call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> %a0) + %ext0 = extractelement <4 x float> %rcp, i32 0 + %ins0 = insertelement <4 x float> undef, float %ext0, i32 0 + %ext1 = extractelement <4 x float> %a0, i32 1 + %ins1 = insertelement <4 x float> %ins0, float %ext1, i32 1 + %ext2 = extractelement <4 x float> %a0, i32 2 + %ins2 = insertelement <4 x float> %ins1, float %ext2, i32 2 + %ext3 = extractelement <4 x float> %a0, i32 3 + %ins3 = insertelement <4 x float> %ins2, float %ext3, i32 3 + ret <4 x float> %ins3 +} +declare <4 x float> @llvm.x86.sse.rcp.ss(<4 x float>) nounwind readnone + +define <4 x float> @test_mm_rsqrt_ps(<4 x float> %a0) { +; X32-LABEL: test_mm_rsqrt_ps: +; X32: # BB#0: +; X32-NEXT: rsqrtps %xmm0, %xmm0 +; X32-NEXT: retl +; +; X64-LABEL: test_mm_rsqrt_ps: +; X64: # BB#0: +; X64-NEXT: rsqrtps %xmm0, %xmm0 +; X64-NEXT: retq + %res = call <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float> %a0) + ret <4 x float> %res +} +declare <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float>) nounwind readnone + +define <4 x float> @test_mm_rsqrt_ss(<4 x float> %a0) { +; X32-LABEL: test_mm_rsqrt_ss: +; X32: # BB#0: +; X32-NEXT: rsqrtss %xmm0, %xmm0 +; X32-NEXT: retl +; +; X64-LABEL: test_mm_rsqrt_ss: +; X64: # BB#0: +; X64-NEXT: rsqrtss %xmm0, %xmm0 +; X64-NEXT: retq + %rsqrt = call <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float> %a0) + %ext0 = extractelement <4 x float> %rsqrt, i32 0 + %ins0 = insertelement <4 x float> undef, float %ext0, i32 0 + %ext1 = extractelement <4 x float> %a0, i32 1 + %ins1 = insertelement <4 x float> %ins0, float %ext1, i32 1 + %ext2 = extractelement <4 x float> %a0, i32 2 + %ins2 = insertelement <4 x float> %ins1, float %ext2, i32 2 + %ext3 = extractelement <4 x float> %a0, i32 3 + %ins3 = insertelement <4 x float> %ins2, float %ext3, i32 3 + ret <4 x float> %ins3 +} +declare <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float>) nounwind readnone + +define void @test_MM_SET_EXCEPTION_MASK(i32 %a0) nounwind { +; X32-LABEL: test_MM_SET_EXCEPTION_MASK: +; X32: # BB#0: +; X32-NEXT: pushl %eax +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: leal (%esp), %ecx +; X32-NEXT: stmxcsr (%ecx) +; X32-NEXT: movl (%esp), %edx +; X32-NEXT: andl $-8065, %edx # imm = 0xFFFFFFFFFFFFE07F +; X32-NEXT: orl %eax, %edx +; X32-NEXT: movl %edx, (%esp) +; X32-NEXT: ldmxcsr (%ecx) +; X32-NEXT: popl %eax +; X32-NEXT: retl +; +; X64-LABEL: test_MM_SET_EXCEPTION_MASK: +; X64: # BB#0: +; X64-NEXT: leaq -{{[0-9]+}}(%rsp), %rax +; X64-NEXT: stmxcsr (%rax) +; X64-NEXT: movl -{{[0-9]+}}(%rsp), %ecx +; X64-NEXT: andl $-8065, %ecx # imm = 0xFFFFFFFFFFFFE07F +; X64-NEXT: orl %edi, %ecx +; X64-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) +; X64-NEXT: ldmxcsr (%rax) +; X64-NEXT: retq + %1 = alloca i32, align 4 + %2 = bitcast i32* %1 to i8* + call void @llvm.x86.sse.stmxcsr(i8* %2) + %3 = load i32, i32* %1 + %4 = and i32 %3, -8065 + %5 = or i32 %4, %a0 + store i32 %5, i32* %1 + call void 
@llvm.x86.sse.ldmxcsr(i8* %2) + ret void +} +declare void @llvm.x86.sse.ldmxcsr(i8*) nounwind readnone + +define void @test_MM_SET_EXCEPTION_STATE(i32 %a0) nounwind { +; X32-LABEL: test_MM_SET_EXCEPTION_STATE: +; X32: # BB#0: +; X32-NEXT: pushl %eax +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: leal (%esp), %ecx +; X32-NEXT: stmxcsr (%ecx) +; X32-NEXT: movl (%esp), %edx +; X32-NEXT: andl $-64, %edx +; X32-NEXT: orl %eax, %edx +; X32-NEXT: movl %edx, (%esp) +; X32-NEXT: ldmxcsr (%ecx) +; X32-NEXT: popl %eax +; X32-NEXT: retl +; +; X64-LABEL: test_MM_SET_EXCEPTION_STATE: +; X64: # BB#0: +; X64-NEXT: leaq -{{[0-9]+}}(%rsp), %rax +; X64-NEXT: stmxcsr (%rax) +; X64-NEXT: movl -{{[0-9]+}}(%rsp), %ecx +; X64-NEXT: andl $-64, %ecx +; X64-NEXT: orl %edi, %ecx +; X64-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) +; X64-NEXT: ldmxcsr (%rax) +; X64-NEXT: retq + %1 = alloca i32, align 4 + %2 = bitcast i32* %1 to i8* + call void @llvm.x86.sse.stmxcsr(i8* %2) + %3 = load i32, i32* %1 + %4 = and i32 %3, -64 + %5 = or i32 %4, %a0 + store i32 %5, i32* %1 + call void @llvm.x86.sse.ldmxcsr(i8* %2) + ret void +} + +define void @test_MM_SET_FLUSH_ZERO_MODE(i32 %a0) nounwind { +; X32-LABEL: test_MM_SET_FLUSH_ZERO_MODE: +; X32: # BB#0: +; X32-NEXT: pushl %eax +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: leal (%esp), %ecx +; X32-NEXT: stmxcsr (%ecx) +; X32-NEXT: movl (%esp), %edx +; X32-NEXT: andl $-32769, %edx # imm = 0xFFFFFFFFFFFF7FFF +; X32-NEXT: orl %eax, %edx +; X32-NEXT: movl %edx, (%esp) +; X32-NEXT: ldmxcsr (%ecx) +; X32-NEXT: popl %eax +; X32-NEXT: retl +; +; X64-LABEL: test_MM_SET_FLUSH_ZERO_MODE: +; X64: # BB#0: +; X64-NEXT: leaq -{{[0-9]+}}(%rsp), %rax +; X64-NEXT: stmxcsr (%rax) +; X64-NEXT: movl -{{[0-9]+}}(%rsp), %ecx +; X64-NEXT: andl $-32769, %ecx # imm = 0xFFFFFFFFFFFF7FFF +; X64-NEXT: orl %edi, %ecx +; X64-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) +; X64-NEXT: ldmxcsr (%rax) +; X64-NEXT: retq + %1 = alloca i32, align 4 + %2 = bitcast i32* %1 to i8* + call void @llvm.x86.sse.stmxcsr(i8* %2) + %3 = load i32, i32* %1 + %4 = and i32 %3, -32769 + %5 = or i32 %4, %a0 + store i32 %5, i32* %1 + call void @llvm.x86.sse.ldmxcsr(i8* %2) + ret void +} + +define <4 x float> @test_mm_set_ps(float %a0, float %a1, float %a2, float %a3) nounwind { +; X32-LABEL: test_mm_set_ps: +; X32: # BB#0: +; X32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X32-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; X32-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero +; X32-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; X32-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; X32-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; X32-NEXT: retl +; +; X64-LABEL: test_mm_set_ps: +; X64: # BB#0: +; X64-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; X64-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; X64-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; X64-NEXT: movaps %xmm3, %xmm0 +; X64-NEXT: retq + %res0 = insertelement <4 x float> undef, float %a3, i32 0 + %res1 = insertelement <4 x float> %res0, float %a2, i32 1 + %res2 = insertelement <4 x float> %res1, float %a1, i32 2 + %res3 = insertelement <4 x float> %res2, float %a0, i32 3 + ret <4 x float> %res3 +} + +define <4 x float> @test_mm_set_ps1(float %a0) nounwind { +; X32-LABEL: test_mm_set_ps1: +; X32: # BB#0: +; X32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X32-NEXT: shufps {{.*#+}} xmm0 = 
xmm0[0,0,0,0] +; X32-NEXT: retl +; +; X64-LABEL: test_mm_set_ps1: +; X64: # BB#0: +; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,0] +; X64-NEXT: retq + %res0 = insertelement <4 x float> undef, float %a0, i32 0 + %res1 = insertelement <4 x float> %res0, float %a0, i32 1 + %res2 = insertelement <4 x float> %res1, float %a0, i32 2 + %res3 = insertelement <4 x float> %res2, float %a0, i32 3 + ret <4 x float> %res3 +} + +define void @test_MM_SET_ROUNDING_MODE(i32 %a0) nounwind { +; X32-LABEL: test_MM_SET_ROUNDING_MODE: +; X32: # BB#0: +; X32-NEXT: pushl %eax +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: leal (%esp), %ecx +; X32-NEXT: stmxcsr (%ecx) +; X32-NEXT: movl (%esp), %edx +; X32-NEXT: andl $-24577, %edx # imm = 0xFFFFFFFFFFFF9FFF +; X32-NEXT: orl %eax, %edx +; X32-NEXT: movl %edx, (%esp) +; X32-NEXT: ldmxcsr (%ecx) +; X32-NEXT: popl %eax +; X32-NEXT: retl +; +; X64-LABEL: test_MM_SET_ROUNDING_MODE: +; X64: # BB#0: +; X64-NEXT: leaq -{{[0-9]+}}(%rsp), %rax +; X64-NEXT: stmxcsr (%rax) +; X64-NEXT: movl -{{[0-9]+}}(%rsp), %ecx +; X64-NEXT: andl $-24577, %ecx # imm = 0xFFFFFFFFFFFF9FFF +; X64-NEXT: orl %edi, %ecx +; X64-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) +; X64-NEXT: ldmxcsr (%rax) +; X64-NEXT: retq + %1 = alloca i32, align 4 + %2 = bitcast i32* %1 to i8* + call void @llvm.x86.sse.stmxcsr(i8* %2) + %3 = load i32, i32* %1 + %4 = and i32 %3, -24577 + %5 = or i32 %4, %a0 + store i32 %5, i32* %1 + call void @llvm.x86.sse.ldmxcsr(i8* %2) + ret void +} + +define <4 x float> @test_mm_set_ss(float %a0) nounwind { +; X32-LABEL: test_mm_set_ss: +; X32: # BB#0: +; X32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X32-NEXT: xorps %xmm0, %xmm0 +; X32-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] +; X32-NEXT: retl +; +; X64-LABEL: test_mm_set_ss: +; X64: # BB#0: +; X64-NEXT: xorps %xmm1, %xmm1 +; X64-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] +; X64-NEXT: movaps %xmm1, %xmm0 +; X64-NEXT: retq + %res0 = insertelement <4 x float> undef, float %a0, i32 0 + %res1 = insertelement <4 x float> %res0, float 0.0, i32 1 + %res2 = insertelement <4 x float> %res1, float 0.0, i32 2 + %res3 = insertelement <4 x float> %res2, float 0.0, i32 3 + ret <4 x float> %res3 +} + +define <4 x float> @test_mm_set1_ps(float %a0) nounwind { +; X32-LABEL: test_mm_set1_ps: +; X32: # BB#0: +; X32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,0] +; X32-NEXT: retl +; +; X64-LABEL: test_mm_set1_ps: +; X64: # BB#0: +; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,0] +; X64-NEXT: retq + %res0 = insertelement <4 x float> undef, float %a0, i32 0 + %res1 = insertelement <4 x float> %res0, float %a0, i32 1 + %res2 = insertelement <4 x float> %res1, float %a0, i32 2 + %res3 = insertelement <4 x float> %res2, float %a0, i32 3 + ret <4 x float> %res3 +} + +define void @test_mm_setcsr(i32 %a0) nounwind { +; X32-LABEL: test_mm_setcsr: +; X32: # BB#0: +; X32-NEXT: pushl %eax +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: leal (%esp), %ecx +; X32-NEXT: movl %eax, (%esp) +; X32-NEXT: ldmxcsr (%ecx) +; X32-NEXT: popl %eax +; X32-NEXT: retl +; +; X64-LABEL: test_mm_setcsr: +; X64: # BB#0: +; X64-NEXT: leaq -{{[0-9]+}}(%rsp), %rax +; X64-NEXT: movl %edi, -{{[0-9]+}}(%rsp) +; X64-NEXT: ldmxcsr (%rax) +; X64-NEXT: retq + %st = alloca i32, align 4 + store i32 %a0, i32* %st, align 4 + %bc = bitcast i32* %st to i8* + call void @llvm.x86.sse.ldmxcsr(i8* %bc) + ret void +} + +define <4 x float> @test_mm_setr_ps(float %a0, float %a1, float %a2, float %a3) nounwind { +; X32-LABEL: 
test_mm_setr_ps: +; X32: # BB#0: +; X32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X32-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; X32-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero +; X32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X32-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; X32-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; X32-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; X32-NEXT: retl +; +; X64-LABEL: test_mm_setr_ps: +; X64: # BB#0: +; X64-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; X64-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; X64-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; X64-NEXT: retq + %res0 = insertelement <4 x float> undef, float %a0, i32 0 + %res1 = insertelement <4 x float> %res0, float %a1, i32 1 + %res2 = insertelement <4 x float> %res1, float %a2, i32 2 + %res3 = insertelement <4 x float> %res2, float %a3, i32 3 + ret <4 x float> %res3 +} + +define <4 x float> @test_mm_setzero_ps() { +; X32-LABEL: test_mm_setzero_ps: +; X32: # BB#0: +; X32-NEXT: xorps %xmm0, %xmm0 +; X32-NEXT: retl +; +; X64-LABEL: test_mm_setzero_ps: +; X64: # BB#0: +; X64-NEXT: xorps %xmm0, %xmm0 +; X64-NEXT: retq + ret <4 x float> zeroinitializer +} + +define void @test_mm_sfence() nounwind { +; X32-LABEL: test_mm_sfence: +; X32: # BB#0: +; X32-NEXT: sfence +; X32-NEXT: retl +; +; X64-LABEL: test_mm_sfence: +; X64: # BB#0: +; X64-NEXT: sfence +; X64-NEXT: retq + call void @llvm.x86.sse.sfence() + ret void +} +declare void @llvm.x86.sse.sfence() nounwind readnone + +define <4 x float> @test_mm_shuffle_ps(<4 x float> %a0, <4 x float> %a1) nounwind { +; X32-LABEL: test_mm_shuffle_ps: +; X32: # BB#0: +; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0] +; X32-NEXT: retl +; +; X64-LABEL: test_mm_shuffle_ps: +; X64: # BB#0: +; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0] +; X64-NEXT: retq + %res = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 0, i32 0, i32 4, i32 4> + ret <4 x float> %res +} + +define <4 x float> @test_mm_sqrt_ps(<4 x float> %a0) { +; X32-LABEL: test_mm_sqrt_ps: +; X32: # BB#0: +; X32-NEXT: sqrtps %xmm0, %xmm0 +; X32-NEXT: retl +; +; X64-LABEL: test_mm_sqrt_ps: +; X64: # BB#0: +; X64-NEXT: sqrtps %xmm0, %xmm0 +; X64-NEXT: retq + %res = call <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float> %a0) + ret <4 x float> %res +} +declare <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float>) nounwind readnone + +define <4 x float> @test_mm_sqrt_ss(<4 x float> %a0) { +; X32-LABEL: test_mm_sqrt_ss: +; X32: # BB#0: +; X32-NEXT: sqrtss %xmm0, %xmm0 +; X32-NEXT: retl +; +; X64-LABEL: test_mm_sqrt_ss: +; X64: # BB#0: +; X64-NEXT: sqrtss %xmm0, %xmm0 +; X64-NEXT: retq + %sqrt = call <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float> %a0) + %ext0 = extractelement <4 x float> %sqrt, i32 0 + %ins0 = insertelement <4 x float> undef, float %ext0, i32 0 + %ext1 = extractelement <4 x float> %a0, i32 1 + %ins1 = insertelement <4 x float> %ins0, float %ext1, i32 1 + %ext2 = extractelement <4 x float> %a0, i32 2 + %ins2 = insertelement <4 x float> %ins1, float %ext2, i32 2 + %ext3 = extractelement <4 x float> %a0, i32 3 + %ins3 = insertelement <4 x float> %ins2, float %ext3, i32 3 + ret <4 x float> %ins3 +} +declare <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float>) nounwind readnone + +define void @test_mm_store_ps(float *%a0, <4 x float> %a1) { +; X32-LABEL: test_mm_store_ps: +; X32: # BB#0: +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: 
+
+define void @test_mm_store_ps(float *%a0, <4 x float> %a1) {
+; X32-LABEL: test_mm_store_ps:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movaps %xmm0, (%eax)
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_store_ps:
+; X64: # BB#0:
+; X64-NEXT: movaps %xmm0, (%rdi)
+; X64-NEXT: retq
+ %arg0 = bitcast float* %a0 to <4 x float>*
+ store <4 x float> %a1, <4 x float>* %arg0, align 16
+ ret void
+}
+
+define void @test_mm_store_ps1(float *%a0, <4 x float> %a1) {
+; X32-LABEL: test_mm_store_ps1:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; X32-NEXT: movaps %xmm0, (%eax)
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_store_ps1:
+; X64: # BB#0:
+; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; X64-NEXT: movaps %xmm0, (%rdi)
+; X64-NEXT: retq
+ %arg0 = bitcast float* %a0 to <4 x float>*
+ %shuf = shufflevector <4 x float> %a1, <4 x float> undef, <4 x i32> zeroinitializer
+ store <4 x float> %shuf, <4 x float>* %arg0, align 16
+ ret void
+}
+
+define void @test_mm_store_ss(float *%a0, <4 x float> %a1) {
+; X32-LABEL: test_mm_store_ss:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movss %xmm0, (%eax)
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_store_ss:
+; X64: # BB#0:
+; X64-NEXT: movss %xmm0, (%rdi)
+; X64-NEXT: retq
+ %ext = extractelement <4 x float> %a1, i32 0
+ store float %ext, float* %a0, align 1
+ ret void
+}
+
+define void @test_mm_store1_ps(float *%a0, <4 x float> %a1) {
+; X32-LABEL: test_mm_store1_ps:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; X32-NEXT: movaps %xmm0, (%eax)
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_store1_ps:
+; X64: # BB#0:
+; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; X64-NEXT: movaps %xmm0, (%rdi)
+; X64-NEXT: retq
+ %arg0 = bitcast float* %a0 to <4 x float>*
+ %shuf = shufflevector <4 x float> %a1, <4 x float> undef, <4 x i32> zeroinitializer
+ store <4 x float> %shuf, <4 x float>* %arg0, align 16
+ ret void
+}
+
+define void @test_mm_storeh_ps(x86_mmx *%a0, <4 x float> %a1) nounwind {
+; X32-LABEL: test_mm_storeh_ps:
+; X32: # BB#0:
+; X32-NEXT: pushl %ebp
+; X32-NEXT: movl %esp, %ebp
+; X32-NEXT: andl $-16, %esp
+; X32-NEXT: subl $32, %esp
+; X32-NEXT: movl 8(%ebp), %eax
+; X32-NEXT: movaps %xmm0, (%esp)
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT: movl %edx, 4(%eax)
+; X32-NEXT: movl %ecx, (%eax)
+; X32-NEXT: movl %ebp, %esp
+; X32-NEXT: popl %ebp
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_storeh_ps:
+; X64: # BB#0:
+; X64-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; X64-NEXT: movq %rax, (%rdi)
+; X64-NEXT: retq
+ %ptr = bitcast x86_mmx* %a0 to i64*
+ %bc = bitcast <4 x float> %a1 to <2 x i64>
+ %ext = extractelement <2 x i64> %bc, i32 1
+ store i64 %ext, i64* %ptr
+ ret void
+}
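test_mm_storeh_ps models _mm_storeh_pi: the <2 x i64> bitcast plus extract of element 1 selects bits 64-127 of the vector. A short usage sketch under the usual intrinsic signatures (store_high is an illustrative name):

#include <xmmintrin.h>

/* Writes the upper two floats of a (elements 2 and 3) to an 8-byte
   location; the sibling test_mm_storel_ps below extracts element 0 of
   the same bitcast to store the lower two floats instead. */
void store_high(__m64 *dst, __m128 a) {
  _mm_storeh_pi(dst, a);
}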
+
+define void @test_mm_storel_ps(x86_mmx *%a0, <4 x float> %a1) nounwind {
+; X32-LABEL: test_mm_storel_ps:
+; X32: # BB#0:
+; X32-NEXT: pushl %ebp
+; X32-NEXT: movl %esp, %ebp
+; X32-NEXT: andl $-16, %esp
+; X32-NEXT: subl $32, %esp
+; X32-NEXT: movl 8(%ebp), %eax
+; X32-NEXT: movaps %xmm0, (%esp)
+; X32-NEXT: movl (%esp), %ecx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT: movl %edx, 4(%eax)
+; X32-NEXT: movl %ecx, (%eax)
+; X32-NEXT: movl %ebp, %esp
+; X32-NEXT: popl %ebp
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_storel_ps:
+; X64: # BB#0:
+; X64-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; X64-NEXT: movq %rax, (%rdi)
+; X64-NEXT: retq
+ %ptr = bitcast x86_mmx* %a0 to i64*
+ %bc = bitcast <4 x float> %a1 to <2 x i64>
+ %ext = extractelement <2 x i64> %bc, i32 0
+ store i64 %ext, i64* %ptr
+ ret void
+}
+
+define void @test_mm_storer_ps(float *%a0, <4 x float> %a1) {
+; X32-LABEL: test_mm_storer_ps:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; X32-NEXT: movaps %xmm0, (%eax)
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_storer_ps:
+; X64: # BB#0:
+; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; X64-NEXT: movaps %xmm0, (%rdi)
+; X64-NEXT: retq
+ %arg0 = bitcast float* %a0 to <4 x float>*
+ %shuf = shufflevector <4 x float> %a1, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+ store <4 x float> %shuf, <4 x float>* %arg0, align 16
+ ret void
+}
+
+define void @test_mm_storeu_ps(float *%a0, <4 x float> %a1) {
+; X32-LABEL: test_mm_storeu_ps:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movups %xmm0, (%eax)
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_storeu_ps:
+; X64: # BB#0:
+; X64-NEXT: movups %xmm0, (%rdi)
+; X64-NEXT: retq
+ %arg0 = bitcast float* %a0 to <4 x float>*
+ store <4 x float> %a1, <4 x float>* %arg0, align 1
+ ret void
+}
+
+define void @test_mm_stream_ps(float *%a0, <4 x float> %a1) {
+; X32-LABEL: test_mm_stream_ps:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movntps %xmm0, (%eax)
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_stream_ps:
+; X64: # BB#0:
+; X64-NEXT: movntps %xmm0, (%rdi)
+; X64-NEXT: retq
+ %arg0 = bitcast float* %a0 to <4 x float>*
+ store <4 x float> %a1, <4 x float>* %arg0, align 16, !nontemporal !0
+ ret void
+}
+
+define <4 x float> @test_mm_sub_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
+; X32-LABEL: test_mm_sub_ps:
+; X32: # BB#0:
+; X32-NEXT: subps %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_sub_ps:
+; X64: # BB#0:
+; X64-NEXT: subps %xmm1, %xmm0
+; X64-NEXT: retq
+ %res = fsub <4 x float> %a0, %a1
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_mm_sub_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
+; X32-LABEL: test_mm_sub_ss:
+; X32: # BB#0:
+; X32-NEXT: subss %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_sub_ss:
+; X64: # BB#0:
+; X64-NEXT: subss %xmm1, %xmm0
+; X64-NEXT: retq
+ %ext0 = extractelement <4 x float> %a0, i32 0
+ %ext1 = extractelement <4 x float> %a1, i32 0
+ %fsub = fsub float %ext0, %ext1
+ %res = insertelement <4 x float> %a0, float %fsub, i32 0
+ ret <4 x float> %res
+}
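test_mm_stream_ps above shows how _mm_stream_ps reaches the backend: a plain 16-byte aligned store tagged with !nontemporal metadata (the !0 = !{i32 1} node defined at the end of the file), which llc selects to movntps. A usage sketch, assuming the standard intrinsic:

#include <xmmintrin.h>

/* Non-temporal store: bypasses the cache hierarchy, so it is only a
   win for write-once data. p must be 16-byte aligned. */
void stream_out(float *p, __m128 v) {
  _mm_stream_ps(p, v);
}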
+
+define void @test_MM_TRANSPOSE4_PS(<4 x float>* %a0, <4 x float>* %a1, <4 x float>* %a2, <4 x float>* %a3) nounwind {
+; X32-LABEL: test_MM_TRANSPOSE4_PS:
+; X32: # BB#0:
+; X32-NEXT: pushl %esi
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X32-NEXT: movaps (%esi), %xmm0
+; X32-NEXT: movaps (%edx), %xmm1
+; X32-NEXT: movaps (%ecx), %xmm2
+; X32-NEXT: movaps (%eax), %xmm3
+; X32-NEXT: movaps %xmm0, %xmm4
+; X32-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
+; X32-NEXT: movaps %xmm2, %xmm5
+; X32-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1]
+; X32-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; X32-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; X32-NEXT: movaps %xmm4, %xmm1
+; X32-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm5[0]
+; X32-NEXT: movhlps {{.*#+}} xmm5 = xmm4[1],xmm5[1]
+; X32-NEXT: movaps %xmm0, %xmm3
+; X32-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm2[0]
+; X32-NEXT: movhlps {{.*#+}} xmm2 = xmm0[1],xmm2[1]
+; X32-NEXT: movaps %xmm1, (%esi)
+; X32-NEXT: movaps %xmm5, (%edx)
+; X32-NEXT: movaps %xmm3, (%ecx)
+; X32-NEXT: movaps %xmm2, (%eax)
+; X32-NEXT: popl %esi
+; X32-NEXT: retl
+;
+; X64-LABEL: test_MM_TRANSPOSE4_PS:
+; X64: # BB#0:
+; X64-NEXT: movaps (%rdi), %xmm0
+; X64-NEXT: movaps (%rsi), %xmm1
+; X64-NEXT: movaps (%rdx), %xmm2
+; X64-NEXT: movaps (%rcx), %xmm3
+; X64-NEXT: movaps %xmm0, %xmm4
+; X64-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
+; X64-NEXT: movaps %xmm2, %xmm5
+; X64-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1]
+; X64-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; X64-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; X64-NEXT: movaps %xmm4, %xmm1
+; X64-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm5[0]
+; X64-NEXT: movhlps {{.*#+}} xmm5 = xmm4[1],xmm5[1]
+; X64-NEXT: movaps %xmm0, %xmm3
+; X64-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm2[0]
+; X64-NEXT: movhlps {{.*#+}} xmm2 = xmm0[1],xmm2[1]
+; X64-NEXT: movaps %xmm1, (%rdi)
+; X64-NEXT: movaps %xmm5, (%rsi)
+; X64-NEXT: movaps %xmm3, (%rdx)
+; X64-NEXT: movaps %xmm2, (%rcx)
+; X64-NEXT: retq
+ %row0 = load <4 x float>, <4 x float>* %a0, align 16
+ %row1 = load <4 x float>, <4 x float>* %a1, align 16
+ %row2 = load <4 x float>, <4 x float>* %a2, align 16
+ %row3 = load <4 x float>, <4 x float>* %a3, align 16
+ %tmp0 = shufflevector <4 x float> %row0, <4 x float> %row1, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+ %tmp2 = shufflevector <4 x float> %row2, <4 x float> %row3, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+ %tmp1 = shufflevector <4 x float> %row0, <4 x float> %row1, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+ %tmp3 = shufflevector <4 x float> %row2, <4 x float> %row3, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+ %res0 = shufflevector <4 x float> %tmp0, <4 x float> %tmp2, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+ %res1 = shufflevector <4 x float> %tmp2, <4 x float> %tmp0, <4 x i32> <i32 6, i32 7, i32 2, i32 3>
+ %res2 = shufflevector <4 x float> %tmp1, <4 x float> %tmp3, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+ %res3 = shufflevector <4 x float> %tmp3, <4 x float> %tmp1, <4 x i32> <i32 6, i32 7, i32 2, i32 3>
+ store <4 x float> %res0, <4 x float>* %a0, align 16
+ store <4 x float> %res1, <4 x float>* %a1, align 16
+ store <4 x float> %res2, <4 x float>* %a2, align 16
+ store <4 x float> %res3, <4 x float>* %a3, align 16
+ ret void
+}
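The shufflevector sequence in test_MM_TRANSPOSE4_PS is easier to read as the <xmmintrin.h> macro it was generated from. A sketch of the equivalent intrinsic sequence (transpose4 is an illustrative wrapper, not the macro itself):

#include <xmmintrin.h>

/* 4x4 transpose: the unpacks interleave pairs of rows into 2x2 blocks
   (the <0,4,1,5> / <2,6,3,7> masks above), then movelh/movehl stitch
   the blocks into columns (the <0,1,4,5> / <6,7,2,3> masks). */
static void transpose4(__m128 *r0, __m128 *r1, __m128 *r2, __m128 *r3) {
  __m128 t0 = _mm_unpacklo_ps(*r0, *r1); /* r00 r10 r01 r11 */
  __m128 t2 = _mm_unpacklo_ps(*r2, *r3); /* r20 r30 r21 r31 */
  __m128 t1 = _mm_unpackhi_ps(*r0, *r1); /* r02 r12 r03 r13 */
  __m128 t3 = _mm_unpackhi_ps(*r2, *r3); /* r22 r32 r23 r33 */
  *r0 = _mm_movelh_ps(t0, t2);           /* r00 r10 r20 r30 */
  *r1 = _mm_movehl_ps(t2, t0);           /* r01 r11 r21 r31 */
  *r2 = _mm_movelh_ps(t1, t3);           /* r02 r12 r22 r32 */
  *r3 = _mm_movehl_ps(t3, t1);           /* r03 r13 r23 r33 */
}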
+
+define i32 @test_mm_ucomieq_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
+; X32-LABEL: test_mm_ucomieq_ss:
+; X32: # BB#0:
+; X32-NEXT: ucomiss %xmm1, %xmm0
+; X32-NEXT: setnp %al
+; X32-NEXT: sete %cl
+; X32-NEXT: andb %al, %cl
+; X32-NEXT: movzbl %cl, %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_ucomieq_ss:
+; X64: # BB#0:
+; X64-NEXT: ucomiss %xmm1, %xmm0
+; X64-NEXT: setnp %al
+; X64-NEXT: sete %cl
+; X64-NEXT: andb %al, %cl
+; X64-NEXT: movzbl %cl, %eax
+; X64-NEXT: retq
+ %res = call i32 @llvm.x86.sse.ucomieq.ss(<4 x float> %a0, <4 x float> %a1)
+ ret i32 %res
+}
+declare i32 @llvm.x86.sse.ucomieq.ss(<4 x float>, <4 x float>) nounwind readnone
+
+define i32 @test_mm_ucomige_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
+; X32-LABEL: test_mm_ucomige_ss:
+; X32: # BB#0:
+; X32-NEXT: ucomiss %xmm1, %xmm0
+; X32-NEXT: setae %al
+; X32-NEXT: movzbl %al, %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_ucomige_ss:
+; X64: # BB#0:
+; X64-NEXT: ucomiss %xmm1, %xmm0
+; X64-NEXT: setae %al
+; X64-NEXT: movzbl %al, %eax
+; X64-NEXT: retq
+ %res = call i32 @llvm.x86.sse.ucomige.ss(<4 x float> %a0, <4 x float> %a1)
+ ret i32 %res
+}
+declare i32 @llvm.x86.sse.ucomige.ss(<4 x float>, <4 x float>) nounwind readnone
+
+define i32 @test_mm_ucomigt_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
+; X32-LABEL: test_mm_ucomigt_ss:
+; X32: # BB#0:
+; X32-NEXT: ucomiss %xmm1, %xmm0
+; X32-NEXT: seta %al
+; X32-NEXT: movzbl %al, %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_ucomigt_ss:
+; X64: # BB#0:
+; X64-NEXT: ucomiss %xmm1, %xmm0
+; X64-NEXT: seta %al
+; X64-NEXT: movzbl %al, %eax
+; X64-NEXT: retq
+ %res = call i32 @llvm.x86.sse.ucomigt.ss(<4 x float> %a0, <4 x float> %a1)
+ ret i32 %res
+}
+declare i32 @llvm.x86.sse.ucomigt.ss(<4 x float>, <4 x float>) nounwind readnone
+
+define i32 @test_mm_ucomile_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
+; X32-LABEL: test_mm_ucomile_ss:
+; X32: # BB#0:
+; X32-NEXT: ucomiss %xmm0, %xmm1
+; X32-NEXT: setae %al
+; X32-NEXT: movzbl %al, %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_ucomile_ss:
+; X64: # BB#0:
+; X64-NEXT: ucomiss %xmm0, %xmm1
+; X64-NEXT: setae %al
+; X64-NEXT: movzbl %al, %eax
+; X64-NEXT: retq
+ %res = call i32 @llvm.x86.sse.ucomile.ss(<4 x float> %a0, <4 x float> %a1)
+ ret i32 %res
+}
+declare i32 @llvm.x86.sse.ucomile.ss(<4 x float>, <4 x float>) nounwind readnone
+
+define i32 @test_mm_ucomilt_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
+; X32-LABEL: test_mm_ucomilt_ss:
+; X32: # BB#0:
+; X32-NEXT: ucomiss %xmm0, %xmm1
+; X32-NEXT: seta %al
+; X32-NEXT: movzbl %al, %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_ucomilt_ss:
+; X64: # BB#0:
+; X64-NEXT: ucomiss %xmm0, %xmm1
+; X64-NEXT: seta %al
+; X64-NEXT: movzbl %al, %eax
+; X64-NEXT: retq
+ %res = call i32 @llvm.x86.sse.ucomilt.ss(<4 x float> %a0, <4 x float> %a1)
+ ret i32 %res
+}
+declare i32 @llvm.x86.sse.ucomilt.ss(<4 x float>, <4 x float>) nounwind readnone
+
+define i32 @test_mm_ucomineq_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
+; X32-LABEL: test_mm_ucomineq_ss:
+; X32: # BB#0:
+; X32-NEXT: ucomiss %xmm1, %xmm0
+; X32-NEXT: setp %al
+; X32-NEXT: setne %cl
+; X32-NEXT: orb %al, %cl
+; X32-NEXT: movzbl %cl, %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_ucomineq_ss:
+; X64: # BB#0:
+; X64-NEXT: ucomiss %xmm1, %xmm0
+; X64-NEXT: setp %al
+; X64-NEXT: setne %cl
+; X64-NEXT: orb %al, %cl
+; X64-NEXT: movzbl %cl, %eax
+; X64-NEXT: retq
+ %res = call i32 @llvm.x86.sse.ucomineq.ss(<4 x float> %a0, <4 x float> %a1)
+ ret i32 %res
+}
+declare i32 @llvm.x86.sse.ucomineq.ss(<4 x float>, <4 x float>) nounwind readnone
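The setcc patterns in the six ucomi tests above encode the unordered-compare semantics: ucomiss raises PF when either operand is NaN, so equality must AND in setnp (NaN compares not-equal, result 0) and inequality must OR in setp (result 1), while ge/gt/le/lt need only one flag each, with le/lt realised by swapping the ucomiss operands. A hypothetical scalar model of the eq case, for illustration:

/* Scalar model of the ucomieq lowering: 'unordered' plays the role of
   the parity flag set by ucomiss. */
int ucomieq_model(float a, float b) {
  int unordered = (a != a) || (b != b);   /* PF: at least one NaN */
  return !unordered && (a == b);          /* setnp AND sete */
}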
+
+define <4 x float> @test_mm_undefined_ps() {
+; X32-LABEL: test_mm_undefined_ps:
+; X32: # BB#0:
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_undefined_ps:
+; X64: # BB#0:
+; X64-NEXT: retq
+ ret <4 x float> undef
+}
+
+define <4 x float> @test_mm_unpackhi_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
+; X32-LABEL: test_mm_unpackhi_ps:
+; X32: # BB#0:
+; X32-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_unpackhi_ps:
+; X64: # BB#0:
+; X64-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; X64-NEXT: retq
+ %res = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_mm_unpacklo_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
+; X32-LABEL: test_mm_unpacklo_ps:
+; X32: # BB#0:
+; X32-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_unpacklo_ps:
+; X64: # BB#0:
+; X64-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X64-NEXT: retq
+ %res = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_mm_xor_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
+; X32-LABEL: test_mm_xor_ps:
+; X32: # BB#0:
+; X32-NEXT: pushl %ebp
+; X32-NEXT: movl %esp, %ebp
+; X32-NEXT: pushl %esi
+; X32-NEXT: andl $-16, %esp
+; X32-NEXT: subl $64, %esp
+; X32-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X32-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X32-NEXT: xorl {{[0-9]+}}(%esp), %esi
+; X32-NEXT: movl %esi, (%esp)
+; X32-NEXT: xorl {{[0-9]+}}(%esp), %edx
+; X32-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X32-NEXT: xorl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X32-NEXT: xorl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X32-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; X32-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; X32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; X32-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X32-NEXT: leal -4(%ebp), %esp
+; X32-NEXT: popl %esi
+; X32-NEXT: popl %ebp
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_xor_ps:
+; X64: # BB#0:
+; X64-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; X64-NEXT: movq -{{[0-9]+}}(%rsp), %r8
+; X64-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rdx
+; X64-NEXT: movq %rdx, %rsi
+; X64-NEXT: xorl %eax, %edx
+; X64-NEXT: shrq $32, %rax
+; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; X64-NEXT: movq %rcx, %rdi
+; X64-NEXT: xorl %r8d, %ecx
+; X64-NEXT: shrq $32, %r8
+; X64-NEXT: shrq $32, %rsi
+; X64-NEXT: shrq $32, %rdi
+; X64-NEXT: movl %ecx, -{{[0-9]+}}(%rsp)
+; X64-NEXT: movl %edx, -{{[0-9]+}}(%rsp)
+; X64-NEXT: xorl %r8d, %edi
+; X64-NEXT: movl %edi, -{{[0-9]+}}(%rsp)
+; X64-NEXT: xorl %eax, %esi
+; X64-NEXT: movl %esi, -{{[0-9]+}}(%rsp)
+; X64-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X64-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X64-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X64-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X64-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; X64-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; X64-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x float> %a0 to <4 x i32>
+ %arg1 = bitcast <4 x float> %a1 to <4 x i32>
+ %res = xor <4 x i32> %arg0, %arg1
+ %bc = bitcast <4 x i32> %res to <4 x float>
+ ret <4 x float> %bc
+}
+
+!0 = !{i32 1}
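Finally, test_mm_xor_ps (like the and/or variants earlier in the file) bitcasts to <4 x i32> because LLVM IR has no floating-point bitwise operations; the bulky store/xorl/reload sequence in the checks arises because the RUN lines provide SSE1 only, so there is no legal integer vector type and the <4 x i32> xor is presumably scalarized through the stack. A typical use of the intrinsic, as a sketch (negate_ps is an illustrative name):

#include <xmmintrin.h>

/* Flip the sign of all four lanes by xor'ing the sign bits; -0.0f is
   0x80000000 in each 32-bit lane. */
__m128 negate_ps(__m128 a) {
  return _mm_xor_ps(a, _mm_set1_ps(-0.0f));
}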