| author | Simon Pilgrim <llvm-dev@redking.me.uk> | 2018-06-02 20:25:56 +0000 |
|---|---|---|
| committer | Simon Pilgrim <llvm-dev@redking.me.uk> | 2018-06-02 20:25:56 +0000 |
| commit | 58ff2ecc4be562b7302309338a5c0bf980eb93f9 (patch) | |
| tree | 2b2cd63e486b5f5f5e818d3fd1a1b5aa30220878 | |
| parent | 87908448486b6d076bad6802d82ae544577acdcf (diff) | |
[X86][SSE] Cleanup SSE1 intrinsics tests
Ensure we cover 32/64-bit targets for SSE/AVX/AVX512 cases as necessary
llvm-svn: 333833
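The cleanup standardizes these tests on multi-prefix RUN lines: one IR body is compiled for several feature sets, and FileCheck prefixes (CHECK, SSE, AVX, AVX1, AVX512, plus X86-/X64- variants) select the per-target assertions. A minimal sketch of the pattern — a hypothetical test written for illustration, not taken from the patch:

```llvm
; Hypothetical .ll test showing the prefix scheme used throughout this patch.
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse | FileCheck %s --check-prefixes=CHECK,SSE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX,AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefixes=CHECK,AVX,AVX512

define <4 x float> @mul_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
; SSE-LABEL: mul_ps:
; SSE:       mulps %xmm1, %xmm0
;
; AVX-LABEL: mul_ps:
; AVX:       vmulps %xmm1, %xmm0, %xmm0
  %res = fmul <4 x float> %a0, %a1
  ret <4 x float> %res
}
```

When AVX1 and AVX512 produce the same instruction, the shared AVX (or CHECK) prefix keeps one check block instead of three; the prefixes split only where codegen actually diverges, as in the cmp*_ps tests in the diff below.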
| -rw-r--r-- | llvm/test/CodeGen/X86/sse-intrinsics-fast-isel-x86_64.ll | 43 |
| -rw-r--r-- | llvm/test/CodeGen/X86/sse-intrinsics-fast-isel.ll | 3277 |
| -rw-r--r-- | llvm/test/CodeGen/X86/sse-intrinsics-x86-upgrade.ll | 149 |
| -rw-r--r-- | llvm/test/CodeGen/X86/sse-intrinsics-x86.ll | 692 |
| -rw-r--r-- | llvm/test/CodeGen/X86/sse-intrinsics-x86_64-upgrade.ll | 22 |
| -rw-r--r-- | llvm/test/CodeGen/X86/sse-intrinsics-x86_64.ll | 38 |
| -rw-r--r-- | llvm/test/CodeGen/X86/sse-scalar-fp-arith-unary.ll | 27 |
| -rw-r--r-- | llvm/test/CodeGen/X86/sse-scalar-fp-arith.ll | 720 |
| -rw-r--r-- | llvm/test/CodeGen/X86/sse1.ll | 220 |
9 files changed, 3097 insertions, 2091 deletions
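A note on the regenerated assertions below (autogenerated with utils/update_llc_test_checks.py, per the NOTE lines in each file): in the combined 32/64-bit test files, the return instruction is often the only difference between the i386 and x86_64 runs, so the checks use a FileCheck regex that lets both runs share one block. A hypothetical fragment illustrating the idiom:

```llvm
; ret{{[l|q]}} is a FileCheck regex: it accepts retl (i386) and retq (x86_64),
; so a single SSE block can serve both -mtriple=i386 and -mtriple=x86_64 runs.
; SSE-LABEL: test_add:
; SSE:       addps %xmm1, %xmm0
; SSE-NEXT:  ret{{[l|q]}}
```

Where the 32-bit and 64-bit output differs beyond the return (stack arguments vs. %edi, stack-pointer fixups), the tests fall back to the X86-*/X64-* prefixes instead.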
diff --git a/llvm/test/CodeGen/X86/sse-intrinsics-fast-isel-x86_64.ll b/llvm/test/CodeGen/X86/sse-intrinsics-fast-isel-x86_64.ll
index 753f787e2d9..78335b6551c 100644
--- a/llvm/test/CodeGen/X86/sse-intrinsics-fast-isel-x86_64.ll
+++ b/llvm/test/CodeGen/X86/sse-intrinsics-fast-isel-x86_64.ll
@@ -1,33 +1,50 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+sse | FileCheck %s --check-prefix=X64
+; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+sse | FileCheck %s --check-prefixes=CHECK,SSE
+; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX,AVX1
+; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512dq,+avx512vl | FileCheck %s --check-prefixes=CHECK,AVX,AVX512
 ; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/sse-builtins.c

 define <4 x float> @test_mm_cvtsi64_ss(<4 x float> %a0, i64 %a1) nounwind {
-; X64-LABEL: test_mm_cvtsi64_ss:
-; X64: # %bb.0:
-; X64-NEXT: cvtsi2ssq %rdi, %xmm0
-; X64-NEXT: retq
+; SSE-LABEL: test_mm_cvtsi64_ss:
+; SSE: # %bb.0:
+; SSE-NEXT: cvtsi2ssq %rdi, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_mm_cvtsi64_ss:
+; AVX: # %bb.0:
+; AVX-NEXT: vcvtsi2ssq %rdi, %xmm0, %xmm0
+; AVX-NEXT: retq
 %res = call <4 x float> @llvm.x86.sse.cvtsi642ss(<4 x float> %a0, i64 %a1)
 ret <4 x float> %res
 }
 declare <4 x float> @llvm.x86.sse.cvtsi642ss(<4 x float>, i64) nounwind readnone

 define i64 @test_mm_cvtss_si64(<4 x float> %a0) nounwind {
-; X64-LABEL: test_mm_cvtss_si64:
-; X64: # %bb.0:
-; X64-NEXT: cvtss2si %xmm0, %rax
-; X64-NEXT: retq
+; SSE-LABEL: test_mm_cvtss_si64:
+; SSE: # %bb.0:
+; SSE-NEXT: cvtss2si %xmm0, %rax
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_mm_cvtss_si64:
+; AVX: # %bb.0:
+; AVX-NEXT: vcvtss2si %xmm0, %rax
+; AVX-NEXT: retq
 %res = call i64 @llvm.x86.sse.cvtss2si64(<4 x float> %a0)
 ret i64 %res
 }
 declare i64 @llvm.x86.sse.cvtss2si64(<4 x float>) nounwind readnone

 define i64 @test_mm_cvttss_si64(<4 x float> %a0) nounwind {
-; X64-LABEL: test_mm_cvttss_si64:
-; X64: # %bb.0:
-; X64-NEXT: cvttss2si %xmm0, %rax
-; X64-NEXT: retq
+; SSE-LABEL: test_mm_cvttss_si64:
+; SSE: # %bb.0:
+; SSE-NEXT: cvttss2si %xmm0, %rax
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_mm_cvttss_si64:
+; AVX: # %bb.0:
+; AVX-NEXT: vcvttss2si %xmm0, %rax
+; AVX-NEXT: retq
 %res = call i64 @llvm.x86.sse.cvttss2si64(<4 x float> %a0)
 ret i64 %res
 }
diff --git a/llvm/test/CodeGen/X86/sse-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/sse-intrinsics-fast-isel.ll
index f592ea0b381..d47bf63f9e5 100644
--- a/llvm/test/CodeGen/X86/sse-intrinsics-fast-isel.ll
+++ b/llvm/test/CodeGen/X86/sse-intrinsics-fast-isel.ll
@@ -1,33 +1,37 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -fast-isel-sink-local-values < %s -fast-isel -mtriple=i386-unknown-unknown -mattr=+sse | FileCheck %s --check-prefix=ALL --check-prefix=X32
-; RUN: llc -fast-isel-sink-local-values < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+sse,-sse2 | FileCheck %s --check-prefix=ALL --check-prefix=X64
+; RUN: llc -fast-isel-sink-local-values < %s -fast-isel -mtriple=i386-unknown-unknown -mattr=+sse | FileCheck %s --check-prefixes=CHECK,X86,SSE,X86-SSE
+; RUN: llc -fast-isel-sink-local-values < %s -fast-isel -mtriple=i386-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=CHECK,X86,AVX,X86-AVX,AVX1,X86-AVX1
+; RUN: llc -fast-isel-sink-local-values < %s -fast-isel -mtriple=i386-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512dq,+avx512vl | FileCheck %s --check-prefixes=CHECK,X86,AVX,X86-AVX,AVX512,X86-AVX512
+; RUN: llc -fast-isel-sink-local-values < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+sse,-sse2 | FileCheck %s --check-prefixes=CHECK,X64,SSE,X64-SSE
+; RUN: llc -fast-isel-sink-local-values < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=CHECK,X64,AVX,X64-AVX,AVX1,X64-AVX1
+; RUN: llc -fast-isel-sink-local-values < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512dq,+avx512vl | FileCheck %s --check-prefixes=CHECK,X64,AVX,X64-AVX,AVX512,X64-AVX512
 ; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/sse-builtins.c

 define <4 x float> @test_mm_add_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
-; X32-LABEL: test_mm_add_ps:
-; X32: # %bb.0:
-; X32-NEXT: addps %xmm1, %xmm0
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm_add_ps:
-; X64: # %bb.0:
-; X64-NEXT: addps %xmm1, %xmm0
-; X64-NEXT: retq
+; SSE-LABEL: test_mm_add_ps:
+; SSE: # %bb.0:
+; SSE-NEXT: addps %xmm1, %xmm0
+; SSE-NEXT: ret{{[l|q]}}
+;
+; AVX-LABEL: test_mm_add_ps:
+; AVX: # %bb.0:
+; AVX-NEXT: vaddps %xmm1, %xmm0, %xmm0
+; AVX-NEXT: ret{{[l|q]}}
 %res = fadd <4 x float> %a0, %a1
 ret <4 x float> %res
 }

 define <4 x float> @test_mm_add_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
-; X32-LABEL: test_mm_add_ss:
-; X32: # %bb.0:
-; X32-NEXT: addss %xmm1, %xmm0
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm_add_ss:
-; X64: # %bb.0:
-; X64-NEXT: addss %xmm1, %xmm0
-; X64-NEXT: retq
+; SSE-LABEL: test_mm_add_ss:
+; SSE: # %bb.0:
+; SSE-NEXT: addss %xmm1, %xmm0
+; SSE-NEXT: ret{{[l|q]}}
+;
+; AVX-LABEL: test_mm_add_ss:
+; AVX: # %bb.0:
+; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0
+; AVX-NEXT: ret{{[l|q]}}
 %ext0 = extractelement <4 x float> %a0, i32 0
 %ext1 = extractelement <4 x float> %a1, i32 0
 %fadd = fadd float %ext0, %ext1
@@ -36,15 +40,15 @@ define <4 x float> @test_mm_add_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
 }

 define <4 x float> @test_mm_and_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
-; X32-LABEL: test_mm_and_ps:
-; X32: # %bb.0:
-; X32-NEXT: andps %xmm1, %xmm0
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm_and_ps:
-; X64: # %bb.0:
-; X64-NEXT: andps %xmm1, %xmm0
-; X64-NEXT: retq
+; SSE-LABEL: test_mm_and_ps:
+; SSE: # %bb.0:
+; SSE-NEXT: andps %xmm1, %xmm0
+; SSE-NEXT: ret{{[l|q]}}
+;
+; AVX-LABEL: test_mm_and_ps:
+; AVX: # %bb.0:
+; AVX-NEXT: vandps %xmm1, %xmm0, %xmm0
+; AVX-NEXT: ret{{[l|q]}}
 %arg0 = bitcast <4 x float> %a0 to <4 x i32>
 %arg1 = bitcast <4 x float> %a1 to <4 x i32>
 %res = and <4 x i32> %arg0, %arg1
@@ -53,15 +57,15 @@ define <4 x float> @test_mm_and_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
 }

 define <4 x float> @test_mm_andnot_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
-; X32-LABEL: test_mm_andnot_ps:
-; X32: # %bb.0:
-; X32-NEXT: andnps %xmm1, %xmm0
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm_andnot_ps:
-; X64: # %bb.0:
-; X64-NEXT: andnps %xmm1, %xmm0
-; X64-NEXT: retq
+; SSE-LABEL: test_mm_andnot_ps:
+; SSE: # %bb.0:
+; SSE-NEXT: andnps %xmm1, %xmm0
+; SSE-NEXT: ret{{[l|q]}}
+;
+; AVX-LABEL: test_mm_andnot_ps:
+; AVX: # %bb.0:
+; AVX-NEXT: vandnps %xmm1, %xmm0, %xmm0
+; AVX-NEXT: ret{{[l|q]}}
 %arg0 = bitcast <4 x float> %a0 to <4 x i32>
 %arg1 = bitcast <4 x float> %a1 to <4 x i32>
 %not = xor <4 x i32> %arg0, <i32 -1, i32 -1, i32 -1, i32 -1>
@@ -71,15 +75,21 @@ define <4 x float> @test_mm_andnot_ps(<4 x float> %a0, <4 x float> %a1) nounwind
 }

 define <4 x float> @test_mm_cmpeq_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
-; X32-LABEL: test_mm_cmpeq_ps:
-; X32: # %bb.0:
-; X32-NEXT: cmpeqps %xmm1, %xmm0
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm_cmpeq_ps:
-; X64: # %bb.0:
-; X64-NEXT: cmpeqps %xmm1, %xmm0
-; X64-NEXT: retq
+; SSE-LABEL: test_mm_cmpeq_ps:
+; SSE: # %bb.0:
+; SSE-NEXT: cmpeqps %xmm1, %xmm0
+; SSE-NEXT: ret{{[l|q]}}
+;
+; AVX1-LABEL: test_mm_cmpeq_ps:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vcmpeqps %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: ret{{[l|q]}}
+;
+; AVX512-LABEL: test_mm_cmpeq_ps:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vcmpeqps %xmm1, %xmm0, %k0
+; AVX512-NEXT: vpmovm2d %k0, %xmm0
+; AVX512-NEXT: ret{{[l|q]}}
 %cmp = fcmp oeq <4 x float> %a0, %a1
 %sext = sext <4 x i1> %cmp to <4 x i32>
 %res = bitcast <4 x i32> %sext to <4 x float>
@@ -87,32 +97,37 @@ define <4 x float> @test_mm_cmpeq_ps(<4 x float> %a0, <4 x float> %a1) nounwind
 }

 define <4 x float> @test_mm_cmpeq_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
-; X32-LABEL: test_mm_cmpeq_ss:
-; X32: # %bb.0:
-; X32-NEXT: cmpeqss %xmm1, %xmm0
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm_cmpeq_ss:
-; X64: # %bb.0:
-; X64-NEXT: cmpeqss %xmm1, %xmm0
-; X64-NEXT: retq
+; SSE-LABEL: test_mm_cmpeq_ss:
+; SSE: # %bb.0:
+; SSE-NEXT: cmpeqss %xmm1, %xmm0
+; SSE-NEXT: ret{{[l|q]}}
+;
+; AVX-LABEL: test_mm_cmpeq_ss:
+; AVX: # %bb.0:
+; AVX-NEXT: vcmpeqss %xmm1, %xmm0, %xmm0
+; AVX-NEXT: ret{{[l|q]}}
 %res = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a0, <4 x float> %a1, i8 0)
 ret <4 x float> %res
 }
 declare <4 x float> @llvm.x86.sse.cmp.ss(<4 x float>, <4 x float>, i8) nounwind readnone

 define <4 x float> @test_mm_cmpge_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
-; X32-LABEL: test_mm_cmpge_ps:
-; X32: # %bb.0:
-; X32-NEXT: cmpleps %xmm0, %xmm1
-; X32-NEXT: movaps %xmm1, %xmm0
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm_cmpge_ps:
-; X64: # %bb.0:
-; X64-NEXT: cmpleps %xmm0, %xmm1
-; X64-NEXT: movaps %xmm1, %xmm0
-; X64-NEXT: retq
+; SSE-LABEL: test_mm_cmpge_ps:
+; SSE: # %bb.0:
+; SSE-NEXT: cmpleps %xmm0, %xmm1
+; SSE-NEXT: movaps %xmm1, %xmm0
+; SSE-NEXT: ret{{[l|q]}}
+;
+; AVX1-LABEL: test_mm_cmpge_ps:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vcmpleps %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: ret{{[l|q]}}
+;
+; AVX512-LABEL: test_mm_cmpge_ps:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vcmpleps %xmm0, %xmm1, %k0
+; AVX512-NEXT: vpmovm2d %k0, %xmm0
+; AVX512-NEXT: ret{{[l|q]}}
 %cmp = fcmp ole <4 x float> %a1, %a0
 %sext = sext <4 x i1> %cmp to <4 x i32>
 %res = bitcast <4 x i32> %sext to <4 x float>
@@ -120,34 +135,45 @@ define <4 x float> @test_mm_cmpge_ps(<4 x float> %a0, <4 x float> %a1) nounwind
 }

 define <4 x float> @test_mm_cmpge_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
-; X32-LABEL: test_mm_cmpge_ss:
-; X32: # %bb.0:
-; X32-NEXT: cmpless %xmm0, %xmm1
-; X32-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm_cmpge_ss:
-; X64: # %bb.0:
-; X64-NEXT: cmpless %xmm0, %xmm1
-; X64-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
-; X64-NEXT: retq
+; SSE-LABEL: test_mm_cmpge_ss:
+; SSE: # %bb.0:
+; SSE-NEXT: cmpless %xmm0, %xmm1
+; SSE-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; SSE-NEXT: ret{{[l|q]}}
+;
+; AVX1-LABEL: test_mm_cmpge_ss:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vcmpless %xmm0, %xmm1, %xmm1
+; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; AVX1-NEXT: ret{{[l|q]}}
+;
+; AVX512-LABEL: test_mm_cmpge_ss:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vcmpless %xmm0, %xmm1, %xmm1
+; AVX512-NEXT: vmovss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; AVX512-NEXT: ret{{[l|q]}}
 %cmp = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a1, <4 x float> %a0, i8 2)
 %res = shufflevector <4 x float> %a0, <4 x float> %cmp, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
 ret <4 x float> %res
 }

 define <4 x float> @test_mm_cmpgt_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
-; X32-LABEL: test_mm_cmpgt_ps:
-; X32: # %bb.0:
-; X32-NEXT: cmpltps %xmm0, %xmm1
-; X32-NEXT: movaps %xmm1, %xmm0
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm_cmpgt_ps:
-; X64: # %bb.0:
-; X64-NEXT: cmpltps %xmm0, %xmm1
-; X64-NEXT: movaps %xmm1, %xmm0
-; X64-NEXT: retq
+; SSE-LABEL: test_mm_cmpgt_ps:
+; SSE: # %bb.0:
+; SSE-NEXT: cmpltps %xmm0, %xmm1
+; SSE-NEXT: movaps %xmm1, %xmm0
+; SSE-NEXT: ret{{[l|q]}}
+;
+; AVX1-LABEL: test_mm_cmpgt_ps:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vcmpltps %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: ret{{[l|q]}}
+;
+; AVX512-LABEL: test_mm_cmpgt_ps:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vcmpltps %xmm0, %xmm1, %k0
+; AVX512-NEXT: vpmovm2d %k0, %xmm0
+; AVX512-NEXT: ret{{[l|q]}}
 %cmp = fcmp olt <4 x float> %a1, %a0
 %sext = sext <4 x i1> %cmp to <4 x i32>
 %res = bitcast <4 x i32> %sext to <4 x float>
@@ -155,32 +181,44 @@ define <4 x float> @test_mm_cmpgt_ps(<4 x float> %a0, <4 x float> %a1) nounwind
 }

 define <4 x float> @test_mm_cmpgt_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
-; X32-LABEL: test_mm_cmpgt_ss:
-; X32: # %bb.0:
-; X32-NEXT: cmpltss %xmm0, %xmm1
-; X32-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm_cmpgt_ss:
-; X64: # %bb.0:
-; X64-NEXT: cmpltss %xmm0, %xmm1
-; X64-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
-; X64-NEXT: retq
+; SSE-LABEL: test_mm_cmpgt_ss:
+; SSE: # %bb.0:
+; SSE-NEXT: cmpltss %xmm0, %xmm1
+; SSE-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; SSE-NEXT: ret{{[l|q]}}
+;
+; AVX1-LABEL: test_mm_cmpgt_ss:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vcmpltss %xmm0, %xmm1, %xmm1
+; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; AVX1-NEXT: ret{{[l|q]}}
+;
+; AVX512-LABEL: test_mm_cmpgt_ss:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vcmpltss %xmm0, %xmm1, %xmm1
+; AVX512-NEXT: vmovss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; AVX512-NEXT: ret{{[l|q]}}
 %cmp = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a1, <4 x float> %a0, i8 1)
 %res = shufflevector <4 x float> %a0, <4 x float> %cmp, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
 ret <4 x float> %res
 }

 define <4 x float> @test_mm_cmple_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
-; X32-LABEL: test_mm_cmple_ps:
-; X32: # %bb.0:
-; X32-NEXT: cmpleps %xmm1, %xmm0
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm_cmple_ps:
-; X64: # %bb.0:
-; X64-NEXT: cmpleps %xmm1, %xmm0
-; X64-NEXT: retq
+; SSE-LABEL: test_mm_cmple_ps:
+; SSE: # %bb.0:
+; SSE-NEXT: cmpleps %xmm1, %xmm0
+; SSE-NEXT: ret{{[l|q]}}
+;
+; AVX1-LABEL: test_mm_cmple_ps:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vcmpleps %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: ret{{[l|q]}}
+;
+; AVX512-LABEL: test_mm_cmple_ps:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vcmpleps %xmm1, %xmm0, %k0
+; AVX512-NEXT: vpmovm2d %k0, %xmm0
+; AVX512-NEXT: ret{{[l|q]}}
 %cmp = fcmp ole <4 x float> %a0, %a1
 %sext = sext <4 x i1> %cmp to <4 x i32>
 %res = bitcast <4 x i32> %sext to <4 x float>
@@ -188,29 +226,35 @@ define <4 x float> @test_mm_cmple_ps(<4 x float> %a0, <4 x float> %a1) nounwind
 }

 define <4 x float> @test_mm_cmple_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
-; X32-LABEL: test_mm_cmple_ss:
-; X32: # %bb.0:
-; X32-NEXT: cmpless %xmm1, %xmm0
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm_cmple_ss:
-; X64: # %bb.0:
-; X64-NEXT: cmpless %xmm1, %xmm0
-; X64-NEXT: retq
+; SSE-LABEL: test_mm_cmple_ss:
+; SSE: # %bb.0:
+; SSE-NEXT: cmpless %xmm1, %xmm0
+; SSE-NEXT: ret{{[l|q]}}
+;
+; AVX-LABEL: test_mm_cmple_ss:
+; AVX: # %bb.0:
+; AVX-NEXT: vcmpless %xmm1, %xmm0, %xmm0
+; AVX-NEXT: ret{{[l|q]}}
 %res = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a0, <4 x float> %a1, i8 2)
 ret <4 x float> %res
 }

 define <4 x float> @test_mm_cmplt_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
-; X32-LABEL: test_mm_cmplt_ps:
-; X32: # %bb.0:
-; X32-NEXT: cmpltps %xmm1, %xmm0
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm_cmplt_ps:
-; X64: # %bb.0:
-; X64-NEXT: cmpltps %xmm1, %xmm0
-; X64-NEXT: retq
+; SSE-LABEL: test_mm_cmplt_ps:
+; SSE: # %bb.0:
+; SSE-NEXT: cmpltps %xmm1, %xmm0
+; SSE-NEXT: ret{{[l|q]}}
+;
+; AVX1-LABEL: test_mm_cmplt_ps:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vcmpltps %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: ret{{[l|q]}}
+;
+; AVX512-LABEL: test_mm_cmplt_ps:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vcmpltps %xmm1, %xmm0, %k0
+; AVX512-NEXT: vpmovm2d %k0, %xmm0
+; AVX512-NEXT: ret{{[l|q]}}
 %cmp = fcmp olt <4 x float> %a0, %a1
 %sext = sext <4 x i1> %cmp to <4 x i32>
 %res = bitcast <4 x i32> %sext to <4 x float>
@@ -218,29 +262,35 @@ define <4 x float> @test_mm_cmplt_ps(<4 x float> %a0, <4 x float> %a1) nounwind
 }

 define <4 x float> @test_mm_cmplt_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
-; X32-LABEL: test_mm_cmplt_ss:
-; X32: # %bb.0:
-; X32-NEXT: cmpltss %xmm1, %xmm0
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm_cmplt_ss:
-; X64: # %bb.0:
-; X64-NEXT: cmpltss %xmm1, %xmm0
-; X64-NEXT: retq
+; SSE-LABEL: test_mm_cmplt_ss:
+; SSE: # %bb.0:
+; SSE-NEXT: cmpltss %xmm1, %xmm0
+; SSE-NEXT: ret{{[l|q]}}
+;
+; AVX-LABEL: test_mm_cmplt_ss:
+; AVX: # %bb.0:
+; AVX-NEXT: vcmpltss %xmm1, %xmm0, %xmm0
+; AVX-NEXT: ret{{[l|q]}}
 %res = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a0, <4 x float> %a1, i8 1)
 ret <4 x float> %res
 }

 define <4 x float> @test_mm_cmpneq_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
-; X32-LABEL: test_mm_cmpneq_ps:
-; X32: # %bb.0:
-; X32-NEXT: cmpneqps %xmm1, %xmm0
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm_cmpneq_ps:
-; X64: # %bb.0:
-; X64-NEXT: cmpneqps %xmm1, %xmm0
-; X64-NEXT: retq
+; SSE-LABEL: test_mm_cmpneq_ps:
+; SSE: # %bb.0:
+; SSE-NEXT: cmpneqps %xmm1, %xmm0
+; SSE-NEXT: ret{{[l|q]}}
+;
+; AVX1-LABEL: test_mm_cmpneq_ps:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vcmpneqps %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: ret{{[l|q]}}
+;
+; AVX512-LABEL: test_mm_cmpneq_ps:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vcmpneqps %xmm1, %xmm0, %k0
+; AVX512-NEXT: vpmovm2d %k0, %xmm0
+; AVX512-NEXT: ret{{[l|q]}}
 %cmp = fcmp une <4 x float> %a0, %a1
 %sext = sext <4 x i1> %cmp to <4 x i32>
 %res = bitcast <4 x i32> %sext to <4 x float>
@@ -248,31 +298,36 @@ define <4 x float> @test_mm_cmpneq_ps(<4 x float> %a0, <4 x float> %a1) nounwind
 }

 define <4 x float> @test_mm_cmpneq_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
-; X32-LABEL: test_mm_cmpneq_ss:
-; X32: # %bb.0:
-; X32-NEXT: cmpneqss %xmm1, %xmm0
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm_cmpneq_ss:
-; X64: # %bb.0:
-; X64-NEXT: cmpneqss %xmm1, %xmm0
-; X64-NEXT: retq
+; SSE-LABEL: test_mm_cmpneq_ss:
+; SSE: # %bb.0:
+; SSE-NEXT: cmpneqss %xmm1, %xmm0
+; SSE-NEXT: ret{{[l|q]}}
+;
+; AVX-LABEL: test_mm_cmpneq_ss:
+; AVX: # %bb.0:
+; AVX-NEXT: vcmpneqss %xmm1, %xmm0, %xmm0
+; AVX-NEXT: ret{{[l|q]}}
 %res = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a0, <4 x float> %a1, i8 4)
 ret <4 x float> %res
 }

 define <4 x float> @test_mm_cmpnge_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
-; X32-LABEL: test_mm_cmpnge_ps:
-; X32: # %bb.0:
-; X32-NEXT: cmpnleps %xmm0, %xmm1
-; X32-NEXT: movaps %xmm1, %xmm0
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm_cmpnge_ps:
-; X64: # %bb.0:
-; X64-NEXT: cmpnleps %xmm0, %xmm1
-; X64-NEXT: movaps %xmm1, %xmm0
-; X64-NEXT: retq
+; SSE-LABEL: test_mm_cmpnge_ps:
+; SSE: # %bb.0:
+; SSE-NEXT: cmpnleps %xmm0, %xmm1
+; SSE-NEXT: movaps %xmm1, %xmm0
+; SSE-NEXT: ret{{[l|q]}}
+;
+; AVX1-LABEL: test_mm_cmpnge_ps:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vcmpnleps %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: ret{{[l|q]}}
+;
+; AVX512-LABEL: test_mm_cmpnge_ps:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vcmpnleps %xmm0, %xmm1, %k0
+; AVX512-NEXT: vpmovm2d %k0, %xmm0
+; AVX512-NEXT: ret{{[l|q]}}
 %cmp = fcmp ugt <4 x float> %a1, %a0
 %sext = sext <4 x i1> %cmp to <4 x i32>
 %res = bitcast <4 x i32> %sext to <4 x float>
@@ -280,34 +335,45 @@ define <4 x float> @test_mm_cmpnge_ps(<4 x float> %a0, <4 x float> %a1) nounwind
 }

 define <4 x float> @test_mm_cmpnge_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
-; X32-LABEL: test_mm_cmpnge_ss:
-; X32: # %bb.0:
-; X32-NEXT: cmpnless %xmm0, %xmm1
-; X32-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm_cmpnge_ss:
-; X64: # %bb.0:
-; X64-NEXT: cmpnless %xmm0, %xmm1
-; X64-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
-; X64-NEXT: retq
+; SSE-LABEL: test_mm_cmpnge_ss:
+; SSE: # %bb.0:
+; SSE-NEXT: cmpnless %xmm0, %xmm1
+; SSE-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; SSE-NEXT: ret{{[l|q]}}
+;
+; AVX1-LABEL: test_mm_cmpnge_ss:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vcmpnless %xmm0, %xmm1, %xmm1
+; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; AVX1-NEXT: ret{{[l|q]}}
+;
+; AVX512-LABEL: test_mm_cmpnge_ss:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vcmpnless %xmm0, %xmm1, %xmm1
+; AVX512-NEXT: vmovss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; AVX512-NEXT: ret{{[l|q]}}
 %cmp = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a1, <4 x float> %a0, i8 6)
 %res = shufflevector <4 x float> %a0, <4 x float> %cmp, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
 ret <4 x float> %res
 }

 define <4 x float> @test_mm_cmpngt_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
-; X32-LABEL: test_mm_cmpngt_ps:
-; X32: # %bb.0:
-; X32-NEXT: cmpnltps %xmm0, %xmm1
-; X32-NEXT: movaps %xmm1, %xmm0
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm_cmpngt_ps:
-; X64: # %bb.0:
-; X64-NEXT: cmpnltps %xmm0, %xmm1
-; X64-NEXT: movaps %xmm1, %xmm0
-; X64-NEXT: retq
+; SSE-LABEL: test_mm_cmpngt_ps:
+; SSE: # %bb.0:
+; SSE-NEXT: cmpnltps %xmm0, %xmm1
+; SSE-NEXT: movaps %xmm1, %xmm0
+; SSE-NEXT: ret{{[l|q]}}
+;
+; AVX1-LABEL: test_mm_cmpngt_ps:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vcmpnltps %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: ret{{[l|q]}}
+;
+; AVX512-LABEL: test_mm_cmpngt_ps:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vcmpnltps %xmm0, %xmm1, %k0
+; AVX512-NEXT: vpmovm2d %k0, %xmm0
+; AVX512-NEXT: ret{{[l|q]}}
 %cmp = fcmp uge <4 x float> %a1, %a0
 %sext = sext <4 x i1> %cmp to <4 x i32>
 %res = bitcast <4 x i32> %sext to <4 x float>
@@ -315,32 +381,44 @@ define <4 x float> @test_mm_cmpngt_ps(<4 x float> %a0, <4 x float> %a1) nounwind
 }

 define <4 x float> @test_mm_cmpngt_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
-; X32-LABEL: test_mm_cmpngt_ss:
-; X32: # %bb.0:
-; X32-NEXT: cmpnltss %xmm0, %xmm1
-; X32-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm_cmpngt_ss:
-; X64: # %bb.0:
-; X64-NEXT: cmpnltss %xmm0, %xmm1
-; X64-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
-; X64-NEXT: retq
+; SSE-LABEL: test_mm_cmpngt_ss:
+; SSE: # %bb.0:
+; SSE-NEXT: cmpnltss %xmm0, %xmm1
+; SSE-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; SSE-NEXT: ret{{[l|q]}}
+;
+; AVX1-LABEL: test_mm_cmpngt_ss:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vcmpnltss %xmm0, %xmm1, %xmm1
+; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; AVX1-NEXT: ret{{[l|q]}}
+;
+; AVX512-LABEL: test_mm_cmpngt_ss:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vcmpnltss %xmm0, %xmm1, %xmm1
+; AVX512-NEXT: vmovss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; AVX512-NEXT: ret{{[l|q]}}
 %cmp = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a1, <4 x float> %a0, i8 5)
 %res = shufflevector <4 x float> %a0, <4 x float> %cmp, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
 ret <4 x float> %res
 }

 define <4 x float> @test_mm_cmpnle_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
-; X32-LABEL: test_mm_cmpnle_ps:
-; X32: # %bb.0:
-; X32-NEXT: cmpnleps %xmm1, %xmm0
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm_cmpnle_ps:
-; X64: # %bb.0:
-; X64-NEXT: cmpnleps %xmm1, %xmm0
-; X64-NEXT: retq
+; SSE-LABEL: test_mm_cmpnle_ps:
+; SSE: # %bb.0:
+; SSE-NEXT: cmpnleps %xmm1, %xmm0
+; SSE-NEXT: ret{{[l|q]}}
+;
+; AVX1-LABEL: test_mm_cmpnle_ps:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vcmpnleps %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: ret{{[l|q]}}
+;
+; AVX512-LABEL: test_mm_cmpnle_ps:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vcmpnleps %xmm1, %xmm0, %k0
+; AVX512-NEXT: vpmovm2d %k0, %xmm0
+; AVX512-NEXT: ret{{[l|q]}}
 %cmp = fcmp ugt <4 x float> %a0, %a1
 %sext = sext <4 x i1> %cmp to <4 x i32>
 %res = bitcast <4 x i32> %sext to <4 x float>
@@ -348,29 +426,35 @@ define <4 x float> @test_mm_cmpnle_ps(<4 x float> %a0, <4 x float> %a1) nounwind
 }

 define <4 x float> @test_mm_cmpnle_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
-; X32-LABEL: test_mm_cmpnle_ss:
-; X32: # %bb.0:
-; X32-NEXT: cmpnless %xmm1, %xmm0
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm_cmpnle_ss:
-; X64: # %bb.0:
-; X64-NEXT: cmpnless %xmm1, %xmm0
-; X64-NEXT: retq
+; SSE-LABEL: test_mm_cmpnle_ss:
+; SSE: # %bb.0:
+; SSE-NEXT: cmpnless %xmm1, %xmm0
+; SSE-NEXT: ret{{[l|q]}}
+;
+; AVX-LABEL: test_mm_cmpnle_ss:
+; AVX: # %bb.0:
+; AVX-NEXT: vcmpnless %xmm1, %xmm0, %xmm0
+; AVX-NEXT: ret{{[l|q]}}
 %res = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a0, <4 x float> %a1, i8 6)
 ret <4 x float> %res
 }

 define <4 x float> @test_mm_cmpnlt_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
-; X32-LABEL: test_mm_cmpnlt_ps:
-; X32: # %bb.0:
-; X32-NEXT: cmpnltps %xmm1, %xmm0
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm_cmpnlt_ps:
-; X64: # %bb.0:
-; X64-NEXT: cmpnltps %xmm1, %xmm0
-; X64-NEXT: retq
+; SSE-LABEL: test_mm_cmpnlt_ps:
+; SSE: # %bb.0:
+; SSE-NEXT: cmpnltps %xmm1, %xmm0
+; SSE-NEXT: ret{{[l|q]}}
+;
+; AVX1-LABEL: test_mm_cmpnlt_ps:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vcmpnltps %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: ret{{[l|q]}}
+;
+; AVX512-LABEL: test_mm_cmpnlt_ps:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vcmpnltps %xmm1, %xmm0, %k0
+; AVX512-NEXT: vpmovm2d %k0, %xmm0
+; AVX512-NEXT: ret{{[l|q]}}
 %cmp = fcmp uge <4 x float> %a0, %a1
 %sext = sext <4 x i1> %cmp to <4 x i32>
 %res = bitcast <4 x i32> %sext to <4 x float>
@@ -378,29 +462,35 @@ define <4 x float> @test_mm_cmpnlt_ps(<4 x float> %a0, <4 x float> %a1) nounwind
 }

 define <4 x float> @test_mm_cmpnlt_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
-; X32-LABEL: test_mm_cmpnlt_ss:
-; X32: # %bb.0:
-; X32-NEXT: cmpnltss %xmm1, %xmm0
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm_cmpnlt_ss:
-; X64: # %bb.0:
-; X64-NEXT: cmpnltss %xmm1, %xmm0
-; X64-NEXT: retq
+; SSE-LABEL: test_mm_cmpnlt_ss:
+; SSE: # %bb.0:
+; SSE-NEXT: cmpnltss %xmm1, %xmm0
+; SSE-NEXT: ret{{[l|q]}}
+;
+; AVX-LABEL: test_mm_cmpnlt_ss:
+; AVX: # %bb.0:
+; AVX-NEXT: vcmpnltss %xmm1, %xmm0, %xmm0
+; AVX-NEXT: ret{{[l|q]}}
 %res = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a0, <4 x float> %a1, i8 5)
 ret <4 x float> %res
 }

 define <4 x float> @test_mm_cmpord_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
-; X32-LABEL: test_mm_cmpord_ps:
-; X32: # %bb.0:
-; X32-NEXT: cmpordps %xmm1, %xmm0
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm_cmpord_ps:
-; X64: # %bb.0:
-; X64-NEXT: cmpordps %xmm1, %xmm0
-; X64-NEXT: retq
+; SSE-LABEL: test_mm_cmpord_ps:
+; SSE: # %bb.0:
+; SSE-NEXT: cmpordps %xmm1, %xmm0
+; SSE-NEXT: ret{{[l|q]}}
+;
+; AVX1-LABEL: test_mm_cmpord_ps:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vcmpordps %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: ret{{[l|q]}}
+;
+; AVX512-LABEL: test_mm_cmpord_ps:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vcmpordps %xmm1, %xmm0, %k0
+; AVX512-NEXT: vpmovm2d %k0, %xmm0
+; AVX512-NEXT: ret{{[l|q]}}
 %cmp = fcmp ord <4 x float> %a0, %a1
 %sext = sext <4 x i1> %cmp to <4 x i32>
 %res = bitcast <4 x i32> %sext to <4 x float>
@@ -408,29 +498,35 @@ define <4 x float> @test_mm_cmpord_ps(<4 x float> %a0, <4 x float> %a1) nounwind
 }

 define <4 x float> @test_mm_cmpord_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
-; X32-LABEL: test_mm_cmpord_ss:
-; X32: # %bb.0:
-; X32-NEXT: cmpordss %xmm1, %xmm0
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm_cmpord_ss:
-; X64: # %bb.0:
-; X64-NEXT: cmpordss %xmm1, %xmm0
-; X64-NEXT: retq
+; SSE-LABEL: test_mm_cmpord_ss:
+; SSE: # %bb.0:
+; SSE-NEXT: cmpordss %xmm1, %xmm0
+; SSE-NEXT: ret{{[l|q]}}
+;
+; AVX-LABEL: test_mm_cmpord_ss:
+; AVX: # %bb.0:
+; AVX-NEXT: vcmpordss %xmm1, %xmm0, %xmm0
+; AVX-NEXT: ret{{[l|q]}}
 %res = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a0, <4 x float> %a1, i8 7)
 ret <4 x float> %res
 }

 define <4 x float> @test_mm_cmpunord_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
-; X32-LABEL: test_mm_cmpunord_ps:
-; X32: # %bb.0:
-; X32-NEXT: cmpunordps %xmm1, %xmm0
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm_cmpunord_ps:
-; X64: # %bb.0:
-; X64-NEXT: cmpunordps %xmm1, %xmm0
-; X64-NEXT: retq
+; SSE-LABEL: test_mm_cmpunord_ps:
+; SSE: # %bb.0:
+; SSE-NEXT: cmpunordps %xmm1, %xmm0
+; SSE-NEXT: ret{{[l|q]}}
+;
+; AVX1-LABEL: test_mm_cmpunord_ps:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vcmpunordps %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: ret{{[l|q]}}
+;
+; AVX512-LABEL: test_mm_cmpunord_ps:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vcmpunordps %xmm1, %xmm0, %k0
+; AVX512-NEXT: vpmovm2d %k0, %xmm0
+; AVX512-NEXT: ret{{[l|q]}}
 %cmp = fcmp uno <4 x float> %a0, %a1
 %sext = sext <4 x i1> %cmp to <4 x i32>
 %res = bitcast <4 x i32> %sext to <4 x float>
@@ -438,179 +534,203 @@ define <4 x float> @test_mm_cmpunord_ps(<4 x float> %a0, <4 x float> %a1) nounwi
 }

 define <4 x float> @test_mm_cmpunord_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
-; X32-LABEL: test_mm_cmpunord_ss:
-; X32: # %bb.0:
-; X32-NEXT: cmpunordss %xmm1, %xmm0
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm_cmpunord_ss:
-; X64: # %bb.0:
-; X64-NEXT: cmpunordss %xmm1, %xmm0
-; X64-NEXT: retq
+; SSE-LABEL: test_mm_cmpunord_ss:
+; SSE: # %bb.0:
+; SSE-NEXT: cmpunordss %xmm1, %xmm0
+; SSE-NEXT: ret{{[l|q]}}
+;
+; AVX-LABEL: test_mm_cmpunord_ss:
+; AVX: # %bb.0:
+; AVX-NEXT: vcmpunordss %xmm1, %xmm0, %xmm0
+; AVX-NEXT: ret{{[l|q]}}
 %res = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a0, <4 x float> %a1, i8 3)
 ret <4 x float> %res
 }

 define i32 @test_mm_comieq_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
-; X32-LABEL: test_mm_comieq_ss:
-; X32: # %bb.0:
-; X32-NEXT: comiss %xmm1, %xmm0
-; X32-NEXT: setnp %al
-; X32-NEXT: sete %cl
-; X32-NEXT: andb %al, %cl
-; X32-NEXT: movzbl %cl, %eax
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm_comieq_ss:
-; X64: # %bb.0:
-; X64-NEXT: comiss %xmm1, %xmm0
-; X64-NEXT: setnp %al
-; X64-NEXT: sete %cl
-; X64-NEXT: andb %al, %cl
-; X64-NEXT: movzbl %cl, %eax
-; X64-NEXT: retq
+; SSE-LABEL: test_mm_comieq_ss:
+; SSE: # %bb.0:
+; SSE-NEXT: comiss %xmm1, %xmm0
+; SSE-NEXT: setnp %al
+; SSE-NEXT: sete %cl
+; SSE-NEXT: andb %al, %cl
+; SSE-NEXT: movzbl %cl, %eax
+; SSE-NEXT: ret{{[l|q]}}
+;
+; AVX-LABEL: test_mm_comieq_ss:
+; AVX: # %bb.0:
+; AVX-NEXT: vcomiss %xmm1, %xmm0
+; AVX-NEXT: setnp %al
+; AVX-NEXT: sete %cl
+; AVX-NEXT: andb %al, %cl
+; AVX-NEXT: movzbl %cl, %eax
+; AVX-NEXT: ret{{[l|q]}}
 %res = call i32 @llvm.x86.sse.comieq.ss(<4 x float> %a0, <4 x float> %a1)
 ret i32 %res
 }
 declare i32 @llvm.x86.sse.comieq.ss(<4 x float>, <4 x float>) nounwind readnone

 define i32 @test_mm_comige_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
-; X32-LABEL: test_mm_comige_ss:
-; X32: # %bb.0:
-; X32-NEXT: xorl %eax, %eax
-; X32-NEXT: comiss %xmm1, %xmm0
-; X32-NEXT: setae %al
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm_comige_ss:
-; X64: # %bb.0:
-; X64-NEXT: xorl %eax, %eax
-; X64-NEXT: comiss %xmm1, %xmm0
-; X64-NEXT: setae %al
-; X64-NEXT: retq
+; SSE-LABEL: test_mm_comige_ss:
+; SSE: # %bb.0:
+; SSE-NEXT: xorl %eax, %eax
+; SSE-NEXT: comiss %xmm1, %xmm0
+; SSE-NEXT: setae %al
+; SSE-NEXT: ret{{[l|q]}}
+;
+; AVX-LABEL: test_mm_comige_ss:
+; AVX: # %bb.0:
+; AVX-NEXT: xorl %eax, %eax
+; AVX-NEXT: vcomiss %xmm1, %xmm0
+; AVX-NEXT: setae %al
+; AVX-NEXT: ret{{[l|q]}}
 %res = call i32 @llvm.x86.sse.comige.ss(<4 x float> %a0, <4 x float> %a1)
 ret i32 %res
 }
 declare i32 @llvm.x86.sse.comige.ss(<4 x float>, <4 x float>) nounwind readnone

 define i32 @test_mm_comigt_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
-; X32-LABEL: test_mm_comigt_ss:
-; X32: # %bb.0:
-; X32-NEXT: xorl %eax, %eax
-; X32-NEXT: comiss %xmm1, %xmm0
-; X32-NEXT: seta %al
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm_comigt_ss:
-; X64: # %bb.0:
-; X64-NEXT: xorl %eax, %eax
-; X64-NEXT: comiss %xmm1, %xmm0
-; X64-NEXT: seta %al
-; X64-NEXT: retq
+; SSE-LABEL: test_mm_comigt_ss:
+; SSE: # %bb.0:
+; SSE-NEXT: xorl %eax, %eax
+; SSE-NEXT: comiss %xmm1, %xmm0
+; SSE-NEXT: seta %al
+; SSE-NEXT: ret{{[l|q]}}
+;
+; AVX-LABEL: test_mm_comigt_ss:
+; AVX: # %bb.0:
+; AVX-NEXT: xorl %eax, %eax
+; AVX-NEXT: vcomiss %xmm1, %xmm0
+; AVX-NEXT: seta %al
+; AVX-NEXT: ret{{[l|q]}}
 %res = call i32 @llvm.x86.sse.comigt.ss(<4 x float> %a0, <4 x float> %a1)
 ret i32 %res
 }
 declare i32 @llvm.x86.sse.comigt.ss(<4 x float>, <4 x float>) nounwind readnone

 define i32 @test_mm_comile_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
-; X32-LABEL: test_mm_comile_ss:
-; X32: # %bb.0:
-; X32-NEXT: xorl %eax, %eax
-; X32-NEXT: comiss %xmm0, %xmm1
-; X32-NEXT: setae %al
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm_comile_ss:
-; X64: # %bb.0:
-; X64-NEXT: xorl %eax, %eax
-; X64-NEXT: comiss %xmm0, %xmm1
-; X64-NEXT: setae %al
-; X64-NEXT: retq
+; SSE-LABEL: test_mm_comile_ss:
+; SSE: # %bb.0:
+; SSE-NEXT: xorl %eax, %eax
+; SSE-NEXT: comiss %xmm0, %xmm1
+; SSE-NEXT: setae %al
+; SSE-NEXT: ret{{[l|q]}}
+;
+; AVX-LABEL: test_mm_comile_ss:
+; AVX: # %bb.0:
+; AVX-NEXT: xorl %eax, %eax
+; AVX-NEXT: vcomiss %xmm0, %xmm1
+; AVX-NEXT: setae %al
+; AVX-NEXT: ret{{[l|q]}}
 %res = call i32 @llvm.x86.sse.comile.ss(<4 x float> %a0, <4 x float> %a1)
 ret i32 %res
 }
 declare i32 @llvm.x86.sse.comile.ss(<4 x float>, <4 x float>) nounwind readnone

 define i32 @test_mm_comilt_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
-; X32-LABEL: test_mm_comilt_ss:
-; X32: # %bb.0:
-; X32-NEXT: xorl %eax, %eax
-; X32-NEXT: comiss %xmm0, %xmm1
-; X32-NEXT: seta %al
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm_comilt_ss:
-; X64: # %bb.0:
-; X64-NEXT: xorl %eax, %eax
-; X64-NEXT: comiss %xmm0, %xmm1
-; X64-NEXT: seta %al
-; X64-NEXT: retq
+; SSE-LABEL: test_mm_comilt_ss:
+; SSE: # %bb.0:
+; SSE-NEXT: xorl %eax, %eax
+; SSE-NEXT: comiss %xmm0, %xmm1
+; SSE-NEXT: seta %al
+; SSE-NEXT: ret{{[l|q]}}
+;
+; AVX-LABEL: test_mm_comilt_ss:
+; AVX: # %bb.0:
+; AVX-NEXT: xorl %eax, %eax
+; AVX-NEXT: vcomiss %xmm0, %xmm1
+; AVX-NEXT: seta %al
+; AVX-NEXT: ret{{[l|q]}}
 %res = call i32 @llvm.x86.sse.comilt.ss(<4 x float> %a0, <4 x float> %a1)
 ret i32 %res
 }
 declare i32 @llvm.x86.sse.comilt.ss(<4 x float>, <4 x float>) nounwind readnone

 define i32 @test_mm_comineq_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
-; X32-LABEL: test_mm_comineq_ss:
-; X32: # %bb.0:
-; X32-NEXT: comiss %xmm1, %xmm0
-; X32-NEXT: setp %al
-; X32-NEXT: setne %cl
-; X32-NEXT: orb %al, %cl
-; X32-NEXT: movzbl %cl, %eax
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm_comineq_ss:
-; X64: # %bb.0:
-; X64-NEXT: comiss %xmm1, %xmm0
-; X64-NEXT: setp %al
-; X64-NEXT: setne %cl
-; X64-NEXT: orb %al, %cl
-; X64-NEXT: movzbl %cl, %eax
-; X64-NEXT: retq
+; SSE-LABEL: test_mm_comineq_ss:
+; SSE: # %bb.0:
+; SSE-NEXT: comiss %xmm1, %xmm0
+; SSE-NEXT: setp %al
+; SSE-NEXT: setne %cl
+; SSE-NEXT: orb %al, %cl
+; SSE-NEXT: movzbl %cl, %eax
+; SSE-NEXT: ret{{[l|q]}}
+;
+; AVX-LABEL: test_mm_comineq_ss:
+; AVX: # %bb.0:
+; AVX-NEXT: vcomiss %xmm1, %xmm0
+; AVX-NEXT: setp %al
+; AVX-NEXT: setne %cl
+; AVX-NEXT: orb %al, %cl
+; AVX-NEXT: movzbl %cl, %eax
+; AVX-NEXT: ret{{[l|q]}}
 %res = call i32 @llvm.x86.sse.comineq.ss(<4 x float> %a0, <4 x float> %a1)
 ret i32 %res
 }
 declare i32 @llvm.x86.sse.comineq.ss(<4 x float>, <4 x float>) nounwind readnone

 define i32 @test_mm_cvt_ss2si(<4 x float> %a0) nounwind {
-; X32-LABEL: test_mm_cvt_ss2si:
-; X32: # %bb.0:
-; X32-NEXT: cvtss2si %xmm0, %eax
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm_cvt_ss2si:
-; X64: # %bb.0:
-; X64-NEXT: cvtss2si %xmm0, %eax
-; X64-NEXT: retq
+; SSE-LABEL: test_mm_cvt_ss2si:
+; SSE: # %bb.0:
+; SSE-NEXT: cvtss2si %xmm0, %eax
+; SSE-NEXT: ret{{[l|q]}}
+;
+; AVX-LABEL: test_mm_cvt_ss2si:
+; AVX: # %bb.0:
+; AVX-NEXT: vcvtss2si %xmm0, %eax
+; AVX-NEXT: ret{{[l|q]}}
 %res = call i32 @llvm.x86.sse.cvtss2si(<4 x float> %a0)
 ret i32 %res
 }
 declare i32 @llvm.x86.sse.cvtss2si(<4 x float>) nounwind readnone

 define <4 x float> @test_mm_cvtsi32_ss(<4 x float> %a0, i32 %a1) nounwind {
-; X32-LABEL: test_mm_cvtsi32_ss:
-; X32: # %bb.0:
-; X32-NEXT: cvtsi2ssl {{[0-9]+}}(%esp), %xmm0
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm_cvtsi32_ss:
-; X64: # %bb.0:
-; X64-NEXT: cvtsi2ssl %edi, %xmm0
-; X64-NEXT: retq
+; X86-SSE-LABEL: test_mm_cvtsi32_ss:
+; X86-SSE: # %bb.0:
+; X86-SSE-NEXT: cvtsi2ssl {{[0-9]+}}(%esp), %xmm0
+; X86-SSE-NEXT: retl
+;
+; X86-AVX1-LABEL: test_mm_cvtsi32_ss:
+; X86-AVX1: # %bb.0:
+; X86-AVX1-NEXT: vcvtsi2ssl {{[0-9]+}}(%esp), %xmm0, %xmm0
+; X86-AVX1-NEXT: retl
+;
+; X86-AVX512-LABEL: test_mm_cvtsi32_ss:
+; X86-AVX512: # %bb.0:
+; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX512-NEXT: vcvtsi2ssl %eax, %xmm0, %xmm0
+; X86-AVX512-NEXT: retl
+;
+; X64-SSE-LABEL: test_mm_cvtsi32_ss:
+; X64-SSE: # %bb.0:
+; X64-SSE-NEXT: cvtsi2ssl %edi, %xmm0
+; X64-SSE-NEXT: retq
+;
+; X64-AVX-LABEL: test_mm_cvtsi32_ss:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: vcvtsi2ssl %edi, %xmm0, %xmm0
+; X64-AVX-NEXT: retq
 %res = call <4 x float> @llvm.x86.sse.cvtsi2ss(<4 x float> %a0, i32 %a1)
 ret <4 x float> %res
 }
 declare <4 x float> @llvm.x86.sse.cvtsi2ss(<4 x float>, i32) nounwind readnone

 define float @test_mm_cvtss_f32(<4 x float> %a0) nounwind {
-; X32-LABEL: test_mm_cvtss_f32:
-; X32: # %bb.0:
-; X32-NEXT: pushl %eax
-; X32-NEXT: movss %xmm0, (%esp)
-; X32-NEXT: flds (%esp)
-; X32-NEXT: popl %eax
-; X32-NEXT: retl
+; X86-SSE-LABEL: test_mm_cvtss_f32:
+; X86-SSE: # %bb.0:
+; X86-SSE-NEXT: pushl %eax
+; X86-SSE-NEXT: movss %xmm0, (%esp)
+; X86-SSE-NEXT: flds (%esp)
+; X86-SSE-NEXT: popl %eax
+; X86-SSE-NEXT: retl
+;
+; X86-AVX-LABEL: test_mm_cvtss_f32:
+; X86-AVX: # %bb.0:
+; X86-AVX-NEXT: pushl %eax
+; X86-AVX-NEXT: vmovss %xmm0, (%esp)
+; X86-AVX-NEXT: flds (%esp)
+; X86-AVX-NEXT: popl %eax
+; X86-AVX-NEXT: retl
 ;
 ; X64-LABEL: test_mm_cvtss_f32:
 ; X64: # %bb.0:
@@ -620,72 +740,72 @@ define float @test_mm_cvtss_f32(<4 x float> %a0) nounwind {
 }

 define i32 @test_mm_cvtss_si32(<4 x float> %a0) nounwind {
-; X32-LABEL: test_mm_cvtss_si32:
-; X32: # %bb.0:
-; X32-NEXT: cvtss2si %xmm0, %eax
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm_cvtss_si32:
-; X64: # %bb.0:
-; X64-NEXT: cvtss2si %xmm0, %eax
-; X64-NEXT: retq
+; SSE-LABEL: test_mm_cvtss_si32:
+; SSE: # %bb.0:
+; SSE-NEXT: cvtss2si %xmm0, %eax
+; SSE-NEXT: ret{{[l|q]}}
+;
+; AVX-LABEL: test_mm_cvtss_si32:
+; AVX: # %bb.0:
+; AVX-NEXT: vcvtss2si %xmm0, %eax
+; AVX-NEXT: ret{{[l|q]}}
 %res = call i32 @llvm.x86.sse.cvtss2si(<4 x float> %a0)
 ret i32 %res
 }

 define i32 @test_mm_cvttss_si(<4 x float> %a0) nounwind {
-; X32-LABEL: test_mm_cvttss_si:
-; X32: # %bb.0:
-; X32-NEXT: cvttss2si %xmm0, %eax
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm_cvttss_si:
-; X64: # %bb.0:
-; X64-NEXT: cvttss2si %xmm0, %eax
-; X64-NEXT: retq
+; SSE-LABEL: test_mm_cvttss_si:
+; SSE: # %bb.0:
+; SSE-NEXT: cvttss2si %xmm0, %eax
+; SSE-NEXT: ret{{[l|q]}}
+;
+; AVX-LABEL: test_mm_cvttss_si:
+; AVX: # %bb.0:
+; AVX-NEXT: vcvttss2si %xmm0, %eax
+; AVX-NEXT: ret{{[l|q]}}
 %res = call i32 @llvm.x86.sse.cvttss2si(<4 x float> %a0)
 ret i32 %res
 }
 declare i32 @llvm.x86.sse.cvttss2si(<4 x float>) nounwind readnone

 define i32 @test_mm_cvttss_si32(<4 x float> %a0) nounwind {
-; X32-LABEL: test_mm_cvttss_si32:
-; X32: # %bb.0:
-; X32-NEXT: cvttss2si %xmm0, %eax
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm_cvttss_si32:
-; X64: # %bb.0:
-; X64-NEXT: cvttss2si %xmm0, %eax
-; X64-NEXT: retq
+; SSE-LABEL: test_mm_cvttss_si32:
+; SSE: # %bb.0:
+; SSE-NEXT: cvttss2si %xmm0, %eax
+; SSE-NEXT: ret{{[l|q]}}
+;
+; AVX-LABEL: test_mm_cvttss_si32:
+; AVX: # %bb.0:
+; AVX-NEXT: vcvttss2si %xmm0, %eax
+; AVX-NEXT: ret{{[l|q]}}
 %res = call i32 @llvm.x86.sse.cvttss2si(<4 x float> %a0)
 ret i32 %res
 }

 define <4 x float> @test_mm_div_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
-; X32-LABEL: test_mm_div_ps:
-; X32: # %bb.0:
-; X32-NEXT: divps %xmm1, %xmm0
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm_div_ps:
-; X64: # %bb.0:
-; X64-NEXT: divps %xmm1, %xmm0
-; X64-NEXT: retq
+; SSE-LABEL: test_mm_div_ps:
+; SSE: # %bb.0:
+; SSE-NEXT: divps %xmm1, %xmm0
+; SSE-NEXT: ret{{[l|q]}}
+;
+; AVX-LABEL: test_mm_div_ps:
+; AVX: # %bb.0:
+; AVX-NEXT: vdivps %xmm1, %xmm0, %xmm0
+; AVX-NEXT: ret{{[l|q]}}
 %res = fdiv <4 x float> %a0, %a1
 ret <4 x float> %res
 }

 define <4 x float> @test_mm_div_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
-; X32-LABEL: test_mm_div_ss:
-; X32: # %bb.0:
-; X32-NEXT: divss %xmm1, %xmm0
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm_div_ss:
-; X64: # %bb.0:
-; X64-NEXT: divss %xmm1, %xmm0
-; X64-NEXT: retq
+; SSE-LABEL: test_mm_div_ss:
+; SSE: # %bb.0:
+; SSE-NEXT: divss %xmm1, %xmm0
+; SSE-NEXT: ret{{[l|q]}}
+;
+; AVX-LABEL: test_mm_div_ss:
+; AVX: # %bb.0:
+; AVX-NEXT: vdivss %xmm1, %xmm0, %xmm0
+; AVX-NEXT: ret{{[l|q]}}
 %ext0 = extractelement <4 x float> %a0, i32 0
 %ext1 = extractelement <4 x float> %a1, i32 0
 %fdiv = fdiv float %ext0, %ext1
@@ -694,23 +814,41 @@ define <4 x float> @test_mm_div_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
 }

 define i32 @test_MM_GET_EXCEPTION_MASK() nounwind {
-; X32-LABEL: test_MM_GET_EXCEPTION_MASK:
-; X32: # %bb.0:
-; X32-NEXT: pushl %eax
-; X32-NEXT: movl %esp, %eax
-; X32-NEXT: stmxcsr (%eax)
-; X32-NEXT: movl (%esp), %eax
-; X32-NEXT: andl $8064, %eax # imm = 0x1F80
-; X32-NEXT: popl %ecx
-; X32-NEXT: retl
-;
-; X64-LABEL: test_MM_GET_EXCEPTION_MASK:
-; X64: # %bb.0:
-; X64-NEXT: leaq -{{[0-9]+}}(%rsp), %rax
-; X64-NEXT: stmxcsr (%rax)
-; X64-NEXT: movl -{{[0-9]+}}(%rsp), %eax
-; X64-NEXT: andl $8064, %eax # imm = 0x1F80
-; X64-NEXT: retq
+; X86-SSE-LABEL: test_MM_GET_EXCEPTION_MASK:
+; X86-SSE: # %bb.0:
+; X86-SSE-NEXT: pushl %eax
+; X86-SSE-NEXT: movl %esp, %eax
+; X86-SSE-NEXT: stmxcsr (%eax)
+; X86-SSE-NEXT: movl (%esp), %eax
+; X86-SSE-NEXT: andl $8064, %eax # imm = 0x1F80
+; X86-SSE-NEXT: popl %ecx
+; X86-SSE-NEXT: retl
+;
+; X86-AVX-LABEL: test_MM_GET_EXCEPTION_MASK:
+; X86-AVX: # %bb.0:
+; X86-AVX-NEXT: pushl %eax
+; X86-AVX-NEXT: movl %esp, %eax
+; X86-AVX-NEXT: vstmxcsr (%eax)
+; X86-AVX-NEXT: movl (%esp), %eax
+; X86-AVX-NEXT: andl $8064, %eax # imm = 0x1F80
+; X86-AVX-NEXT: popl %ecx
+; X86-AVX-NEXT: retl
+;
+; X64-SSE-LABEL: test_MM_GET_EXCEPTION_MASK:
+; X64-SSE: # %bb.0:
+; X64-SSE-NEXT: leaq -{{[0-9]+}}(%rsp), %rax
+; X64-SSE-NEXT: stmxcsr (%rax)
+; X64-SSE-NEXT: movl -{{[0-9]+}}(%rsp), %eax
+; X64-SSE-NEXT: andl $8064, %eax # imm = 0x1F80
+; X64-SSE-NEXT: retq
+;
+; X64-AVX-LABEL: test_MM_GET_EXCEPTION_MASK:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: leaq -{{[0-9]+}}(%rsp), %rax
+; X64-AVX-NEXT: vstmxcsr (%rax)
+; X64-AVX-NEXT: movl -{{[0-9]+}}(%rsp), %eax
+; X64-AVX-NEXT: andl $8064, %eax # imm = 0x1F80
+; X64-AVX-NEXT: retq
 %1 = alloca i32, align 4
 %2 = bitcast i32* %1 to i8*
 call void @llvm.x86.sse.stmxcsr(i8* %2)
@@ -721,23 +859,41 @@ define i32 @test_MM_GET_EXCEPTION_MASK() nounwind {
 }
 declare void @llvm.x86.sse.stmxcsr(i8*) nounwind readnone

 define i32 @test_MM_GET_EXCEPTION_STATE() nounwind {
-; X32-LABEL: test_MM_GET_EXCEPTION_STATE:
-; X32: # %bb.0:
-; X32-NEXT: pushl %eax
-; X32-NEXT: movl %esp, %eax
-; X32-NEXT: stmxcsr (%eax)
-; X32-NEXT: movl (%esp), %eax
-; X32-NEXT: andl $63, %eax
-; X32-NEXT: popl %ecx
-; X32-NEXT: retl
-;
-; X64-LABEL: test_MM_GET_EXCEPTION_STATE:
-; X64: # %bb.0:
-; X64-NEXT: leaq -{{[0-9]+}}(%rsp), %rax
-; X64-NEXT: stmxcsr (%rax)
-; X64-NEXT: movl -{{[0-9]+}}(%rsp), %eax
-; X64-NEXT: andl $63, %eax
-; X64-NEXT: retq
+; X86-SSE-LABEL: test_MM_GET_EXCEPTION_STATE:
+; X86-SSE: # %bb.0:
+; X86-SSE-NEXT: pushl %eax
+; X86-SSE-NEXT: movl %esp, %eax
+; X86-SSE-NEXT: stmxcsr (%eax)
+; X86-SSE-NEXT: movl (%esp), %eax
+; X86-SSE-NEXT: andl $63, %eax
+; X86-SSE-NEXT: popl %ecx
+; X86-SSE-NEXT: retl
+;
+; X86-AVX-LABEL: test_MM_GET_EXCEPTION_STATE:
+; X86-AVX: # %bb.0:
+; X86-AVX-NEXT: pushl %eax
+; X86-AVX-NEXT: movl %esp, %eax
+; X86-AVX-NEXT: vstmxcsr (%eax)
+; X86-AVX-NEXT: movl (%esp), %eax
+; X86-AVX-NEXT: andl $63, %eax
+; X86-AVX-NEXT: popl %ecx
+; X86-AVX-NEXT: retl
+;
+; X64-SSE-LABEL: test_MM_GET_EXCEPTION_STATE:
+; X64-SSE: # %bb.0:
+; X64-SSE-NEXT: leaq -{{[0-9]+}}(%rsp), %rax
+; X64-SSE-NEXT: stmxcsr (%rax)
+; X64-SSE-NEXT: movl -{{[0-9]+}}(%rsp), %eax
+; X64-SSE-NEXT: andl $63, %eax
+; X64-SSE-NEXT: retq
+;
+; X64-AVX-LABEL: test_MM_GET_EXCEPTION_STATE:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: leaq -{{[0-9]+}}(%rsp), %rax
+; X64-AVX-NEXT: vstmxcsr (%rax)
+; X64-AVX-NEXT: movl -{{[0-9]+}}(%rsp), %eax
+; X64-AVX-NEXT: andl $63, %eax
+; X64-AVX-NEXT: retq
 %1 = alloca i32, align 4
 %2 = bitcast i32* %1 to i8*
 call void @llvm.x86.sse.stmxcsr(i8* %2)
@@ -747,23 +903,41 @@ define i32 @test_MM_GET_EXCEPTION_STATE() nounwind {
 }

 define i32 @test_MM_GET_FLUSH_ZERO_MODE() nounwind {
-; X32-LABEL: test_MM_GET_FLUSH_ZERO_MODE:
-; X32: # %bb.0:
-; X32-NEXT: pushl %eax
-; X32-NEXT: movl %esp, %eax
-; X32-NEXT: stmxcsr (%eax)
-; X32-NEXT: movl (%esp), %eax
-; X32-NEXT: andl $32768, %eax # imm = 0x8000
-; X32-NEXT: popl %ecx
-; X32-NEXT: retl
-;
-; X64-LABEL: test_MM_GET_FLUSH_ZERO_MODE:
-; X64: # %bb.0:
-; X64-NEXT: leaq -{{[0-9]+}}(%rsp), %rax
-; X64-NEXT: stmxcsr (%rax)
-; X64-NEXT: movl -{{[0-9]+}}(%rsp), %eax
-; X64-NEXT: andl $32768, %eax # imm = 0x8000
-; X64-NEXT: retq
+; X86-SSE-LABEL: test_MM_GET_FLUSH_ZERO_MODE:
+; X86-SSE: # %bb.0:
+; X86-SSE-NEXT: pushl %eax
+; X86-SSE-NEXT: movl %esp, %eax
+; X86-SSE-NEXT: stmxcsr (%eax)
+; X86-SSE-NEXT: movl (%esp), %eax
+; X86-SSE-NEXT: andl $32768, %eax # imm = 0x8000
+; X86-SSE-NEXT: popl %ecx
+; X86-SSE-NEXT: retl
+;
+; X86-AVX-LABEL: test_MM_GET_FLUSH_ZERO_MODE:
+; X86-AVX: # %bb.0:
+; X86-AVX-NEXT: pushl %eax
+; X86-AVX-NEXT: movl %esp, %eax
+; X86-AVX-NEXT: vstmxcsr (%eax)
+; X86-AVX-NEXT: movl (%esp), %eax
+; X86-AVX-NEXT: andl $32768, %eax # imm = 0x8000
+; X86-AVX-NEXT: popl %ecx
+; X86-AVX-NEXT: retl
+;
+; X64-SSE-LABEL: test_MM_GET_FLUSH_ZERO_MODE:
+; X64-SSE: # %bb.0:
+; X64-SSE-NEXT: leaq -{{[0-9]+}}(%rsp), %rax
+; X64-SSE-NEXT: stmxcsr (%rax)
+; X64-SSE-NEXT: movl -{{[0-9]+}}(%rsp), %eax
+; X64-SSE-NEXT: andl $32768, %eax # imm = 0x8000
+; X64-SSE-NEXT: retq
+;
+; X64-AVX-LABEL: test_MM_GET_FLUSH_ZERO_MODE:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: leaq -{{[0-9]+}}(%rsp), %rax
+; X64-AVX-NEXT: vstmxcsr (%rax)
+; X64-AVX-NEXT: movl -{{[0-9]+}}(%rsp), %eax
+; X64-AVX-NEXT: andl $32768, %eax # imm = 0x8000
+; X64-AVX-NEXT: retq
 %1 = alloca i32, align 4
 %2 = bitcast i32* %1 to i8*
 call void @llvm.x86.sse.stmxcsr(i8* %2)
@@ -773,23 +947,41 @@ define i32 @test_MM_GET_FLUSH_ZERO_MODE() nounwind {
 }

 define i32 @test_MM_GET_ROUNDING_MODE() nounwind {
-; X32-LABEL: test_MM_GET_ROUNDING_MODE:
-; X32: # %bb.0:
-; X32-NEXT: pushl %eax
-; X32-NEXT: movl %esp, %eax
-; X32-NEXT: stmxcsr (%eax)
-; X32-NEXT: movl (%esp), %eax
-; X32-NEXT: andl $24576, %eax # imm = 0x6000
-; X32-NEXT: popl %ecx
-; X32-NEXT: retl
-;
-; X64-LABEL: test_MM_GET_ROUNDING_MODE:
-; X64: # %bb.0:
-; X64-NEXT: leaq -{{[0-9]+}}(%rsp), %rax
-; X64-NEXT: stmxcsr (%rax)
-; X64-NEXT: movl -{{[0-9]+}}(%rsp), %eax
-; X64-NEXT: andl $24576, %eax # imm = 0x6000
-; X64-NEXT: retq
+; X86-SSE-LABEL: test_MM_GET_ROUNDING_MODE:
+; X86-SSE: # %bb.0:
+; X86-SSE-NEXT: pushl %eax
+; X86-SSE-NEXT: movl %esp, %eax
+; X86-SSE-NEXT: stmxcsr (%eax)
+; X86-SSE-NEXT: movl (%esp), %eax
+; X86-SSE-NEXT: andl $24576, %eax # imm = 0x6000
+; X86-SSE-NEXT: popl %ecx
+; X86-SSE-NEXT: retl
+;
+; X86-AVX-LABEL: test_MM_GET_ROUNDING_MODE:
+; X86-AVX: # %bb.0:
+; X86-AVX-NEXT: pushl %eax
+; X86-AVX-NEXT: movl %esp, %eax
+; X86-AVX-NEXT: vstmxcsr (%eax)
+; X86-AVX-NEXT: movl (%esp), %eax
+; X86-AVX-NEXT: andl $24576, %eax # imm = 0x6000
+; X86-AVX-NEXT: popl %ecx
+; X86-AVX-NEXT: retl
+;
+; X64-SSE-LABEL: test_MM_GET_ROUNDING_MODE:
+; X64-SSE: # %bb.0:
+; X64-SSE-NEXT: leaq -{{[0-9]+}}(%rsp), %rax
+; X64-SSE-NEXT: stmxcsr (%rax)
+; X64-SSE-NEXT: movl -{{[0-9]+}}(%rsp), %eax
+; X64-SSE-NEXT: andl $24576, %eax # imm = 0x6000
+; X64-SSE-NEXT: retq
+;
+; X64-AVX-LABEL: test_MM_GET_ROUNDING_MODE:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: leaq -{{[0-9]+}}(%rsp), %rax
+; X64-AVX-NEXT: vstmxcsr (%rax)
+; X64-AVX-NEXT: movl -{{[0-9]+}}(%rsp), %eax
+; X64-AVX-NEXT: andl $24576, %eax # imm = 0x6000
+; X64-AVX-NEXT: retq
 %1 = alloca i32, align 4
 %2 = bitcast i32* %1 to i8*
 call void @llvm.x86.sse.stmxcsr(i8* %2)
@@ -799,21 +991,37 @@ define i32 @test_MM_GET_ROUNDING_MODE() nounwind {
 }

 define i32 @test_mm_getcsr() nounwind {
-; X32-LABEL: test_mm_getcsr:
-; X32: # %bb.0:
-; X32-NEXT: pushl %eax
-; X32-NEXT: movl %esp, %eax
-; X32-NEXT: stmxcsr (%eax)
-; X32-NEXT: movl (%esp), %eax
-; X32-NEXT: popl %ecx
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm_getcsr:
-; X64: # %bb.0:
-; X64-NEXT: leaq -{{[0-9]+}}(%rsp), %rax
-; X64-NEXT: stmxcsr (%rax)
-; X64-NEXT: movl -{{[0-9]+}}(%rsp), %eax
-; X64-NEXT: retq
+; X86-SSE-LABEL: test_mm_getcsr:
+; X86-SSE: # %bb.0:
+; X86-SSE-NEXT: pushl %eax
+; X86-SSE-NEXT: movl %esp, %eax
+; X86-SSE-NEXT: stmxcsr (%eax)
+; X86-SSE-NEXT: movl (%esp), %eax
+; X86-SSE-NEXT: popl %ecx
+; X86-SSE-NEXT: retl
+;
+; X86-AVX-LABEL: test_mm_getcsr:
+; X86-AVX: # %bb.0:
+; X86-AVX-NEXT: pushl %eax
+; X86-AVX-NEXT: movl %esp, %eax
+; X86-AVX-NEXT: vstmxcsr (%eax)
+; X86-AVX-NEXT: movl (%esp), %eax
+; X86-AVX-NEXT: popl %ecx
+; X86-AVX-NEXT: retl
+;
+; X64-SSE-LABEL: test_mm_getcsr:
+; X64-SSE: # %bb.0:
+; X64-SSE-NEXT: leaq -{{[0-9]+}}(%rsp), %rax
+; X64-SSE-NEXT: stmxcsr (%rax)
+; X64-SSE-NEXT: movl -{{[0-9]+}}(%rsp), %eax
+; X64-SSE-NEXT: retq
+;
+; X64-AVX-LABEL: test_mm_getcsr:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: leaq -{{[0-9]+}}(%rsp), %rax
+; X64-AVX-NEXT: vstmxcsr (%rax)
+; X64-AVX-NEXT: movl -{{[0-9]+}}(%rsp), %eax
+; X64-AVX-NEXT: retq
 %1 = alloca i32, align 4
 %2 = bitcast i32* %1 to i8*
 call void @llvm.x86.sse.stmxcsr(i8* %2)
@@ -822,34 +1030,56 @@ define i32 @test_mm_getcsr() nounwind {
 }

 define <4 x float> @test_mm_load_ps(float* %a0) nounwind {
-; X32-LABEL: test_mm_load_ps:
-; X32: # %bb.0:
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movaps (%eax), %xmm0
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm_load_ps:
-; X64: # %bb.0:
-; X64-NEXT: movaps (%rdi), %xmm0
-; X64-NEXT: retq
+; X86-SSE-LABEL: test_mm_load_ps:
+; X86-SSE: # %bb.0:
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SSE-NEXT: movaps (%eax), %xmm0
+; X86-SSE-NEXT: retl
+;
+; X86-AVX-LABEL: test_mm_load_ps:
+; X86-AVX: # %bb.0:
+; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX-NEXT: vmovaps (%eax), %xmm0
+; X86-AVX-NEXT: retl
+;
+; X64-SSE-LABEL: test_mm_load_ps:
+; X64-SSE: # %bb.0:
+; X64-SSE-NEXT: movaps (%rdi), %xmm0
+; X64-SSE-NEXT: retq
+;
+; X64-AVX-LABEL: test_mm_load_ps:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: vmovaps (%rdi), %xmm0
+; X64-AVX-NEXT: retq
 %arg0 = bitcast float* %a0 to <4 x float>*
 %res = load <4 x float>, <4 x float>* %arg0, align 16
 ret <4 x float> %res
 }

 define <4 x float> @test_mm_load_ps1(float* %a0) nounwind {
-; X32-LABEL: test_mm_load_ps1:
-; X32: # %bb.0:
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm_load_ps1:
-; X64: # %bb.0:
-; X64-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
-; X64-NEXT: retq
+; X86-SSE-LABEL: test_mm_load_ps1:
+; X86-SSE: # %bb.0:
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; X86-SSE-NEXT: retl
+;
+; X86-AVX-LABEL: test_mm_load_ps1:
+; X86-AVX: # %bb.0:
+; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX-NEXT: vbroadcastss (%eax), %xmm0
+; X86-AVX-NEXT: retl
+;
+; X64-SSE-LABEL: test_mm_load_ps1:
+; X64-SSE: # %bb.0:
+; X64-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X64-SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; X64-SSE-NEXT: retq
+;
+; X64-AVX-LABEL: test_mm_load_ps1:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: vbroadcastss (%rdi), %xmm0
+; X64-AVX-NEXT: retq
 %ld = load float, float* %a0, align 4
 %res0 = insertelement <4 x float> undef, float %ld, i32 0
 %res1 = insertelement <4 x float> %res0, float %ld, i32 1
@@ -859,16 +1089,27 @@ define <4 x float> @test_mm_load_ps1(float* %a0) nounwind {
 }

 define <4 x float> @test_mm_load_ss(float* %a0) nounwind {
-; X32-LABEL: test_mm_load_ss:
-; X32: # %bb.0:
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm_load_ss:
-; X64: # %bb.0:
-; X64-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X64-NEXT: retq
+; X86-SSE-LABEL: test_mm_load_ss:
+; X86-SSE: # %bb.0:
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-SSE-NEXT: retl
+;
+; X86-AVX-LABEL: test_mm_load_ss:
+; X86-AVX: # %bb.0:
+; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-AVX-NEXT: retl
+;
+; X64-SSE-LABEL: test_mm_load_ss:
+; X64-SSE: # %bb.0:
+; X64-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X64-SSE-NEXT: retq
+;
+; X64-AVX-LABEL: test_mm_load_ss:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X64-AVX-NEXT: retq
 %ld = load float, float* %a0, align 1
 %res0 = insertelement <4 x float> undef, float %ld, i32 0
 %res1 = insertelement <4 x float> %res0, float 0.0, i32 1
@@ -878,18 +1119,29 @@ define <4 x float> @test_mm_load_ss(float* %a0) nounwind {
 }

 define <4 x float> @test_mm_load1_ps(float* %a0) nounwind {
-; X32-LABEL: test_mm_load1_ps:
-; X32: # %bb.0:
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm_load1_ps:
-; X64: # %bb.0:
-; X64-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
-; X64-NEXT: retq
+; X86-SSE-LABEL: test_mm_load1_ps:
+; X86-SSE: # %bb.0:
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; X86-SSE-NEXT: retl
+;
+; X86-AVX-LABEL: test_mm_load1_ps:
+; X86-AVX: # %bb.0:
+; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX-NEXT: vbroadcastss (%eax), %xmm0
+; X86-AVX-NEXT: retl
+;
+; X64-SSE-LABEL: test_mm_load1_ps:
+; X64-SSE: # %bb.0:
+; X64-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X64-SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; X64-SSE-NEXT: retq
+;
+; X64-AVX-LABEL: test_mm_load1_ps:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: vbroadcastss (%rdi), %xmm0
+; X64-AVX-NEXT: retq
 %ld = load float, float* %a0, align 4
 %res0 = insertelement <4 x float> undef, float %ld, i32 0
 %res1 = insertelement <4 x float> %res0, float %ld, i32 1
@@ -899,26 +1151,38 @@ define <4 x float> @test_mm_load1_ps(float* %a0) nounwind {
 }

 define <4 x float> @test_mm_loadh_pi(<4 x float> %a0, x86_mmx* %a1) {
-; X32-LABEL: test_mm_loadh_pi:
-; X32: # %bb.0:
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X32-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; X32-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; X32-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm_loadh_pi:
-; X64: # %bb.0:
-; X64-NEXT: movq (%rdi), %rax
-; X64-NEXT: movl %eax, -{{[0-9]+}}(%rsp)
-; X64-NEXT: shrq $32, %rax
-; X64-NEXT: movl %eax, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X64-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; X64-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; X64-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; X64-NEXT: retq
+; X86-SSE-LABEL: test_mm_loadh_pi:
+; X86-SSE: # %bb.0:
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X86-SSE-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; X86-SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; X86-SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X86-SSE-NEXT: retl
+;
+; X86-AVX-LABEL: test_mm_loadh_pi:
+; X86-AVX: # %bb.0:
+; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; X86-AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X86-AVX-NEXT: retl
+;
+; X64-SSE-LABEL: test_mm_loadh_pi:
+; X64-SSE: # %bb.0:
+; X64-SSE-NEXT: movq (%rdi), %rax
+; X64-SSE-NEXT: movl %eax, -{{[0-9]+}}(%rsp)
+; X64-SSE-NEXT: shrq $32, %rax
+; X64-SSE-NEXT: movl %eax, -{{[0-9]+}}(%rsp)
+; X64-SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X64-SSE-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; X64-SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; X64-SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X64-SSE-NEXT: retq
+;
+; X64-AVX-LABEL: test_mm_loadh_pi:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
+; X64-AVX-NEXT: retq
 %ptr = bitcast x86_mmx* %a1 to <2 x float>*
 %ld = load <2 x float>, <2 x float>* %ptr
 %ext = shufflevector <2 x float> %ld, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
@@ -927,28 +1191,47 @@ define <4 x float> @test_mm_loadh_pi(<4 x float> %a0, x86_mmx* %a1) {
 }

 define <4 x float> @test_mm_loadl_pi(<4 x float> %a0, x86_mmx* %a1) {
-; X32-LABEL: test_mm_loadl_pi:
-; X32: # %bb.0:
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X32-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; X32-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; X32-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
-; X32-NEXT: movaps %xmm1, %xmm0
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm_loadl_pi:
-; X64: # %bb.0:
-; X64-NEXT: movq (%rdi), %rax
-; X64-NEXT: movl %eax, -{{[0-9]+}}(%rsp)
-; X64-NEXT: shrq $32, %rax
-; X64-NEXT: movl %eax, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X64-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; X64-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; X64-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
-; X64-NEXT: movaps %xmm1, %xmm0
-; X64-NEXT: retq
+; X86-SSE-LABEL: test_mm_loadl_pi:
+; X86-SSE: # %bb.0:
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X86-SSE-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; X86-SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; X86-SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
+; X86-SSE-NEXT: movaps %xmm1, %xmm0
+; X86-SSE-NEXT: retl
+;
+; X86-AVX1-LABEL: test_mm_loadl_pi:
+; X86-AVX1: # %bb.0:
+; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX1-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; X86-AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; X86-AVX1-NEXT: retl
+;
+; X86-AVX512-LABEL: test_mm_loadl_pi:
+; X86-AVX512: # %bb.0:
+; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX512-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; X86-AVX512-NEXT: vmovsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; X86-AVX512-NEXT: retl
+;
+; X64-SSE-LABEL: test_mm_loadl_pi:
+; X64-SSE: # %bb.0:
+; X64-SSE-NEXT: movq (%rdi), %rax
+; X64-SSE-NEXT: movl %eax, -{{[0-9]+}}(%rsp)
+; X64-SSE-NEXT: shrq $32, %rax
+; X64-SSE-NEXT: movl %eax, -{{[0-9]+}}(%rsp)
+; X64-SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X64-SSE-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; X64-SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; X64-SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
+; X64-SSE-NEXT: movaps %xmm1, %xmm0
+; X64-SSE-NEXT: retq
+;
+; X64-AVX-LABEL: test_mm_loadl_pi:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: vmovlpd {{.*#+}} xmm0 = mem[0],xmm0[1]
+; X64-AVX-NEXT: retq
 %ptr = bitcast x86_mmx* %a1 to <2 x float>*
 %ld = load <2 x float>, <2 x float>* %ptr
 %ext = shufflevector <2 x float> %ld, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
@@ -957,18 +1240,29 @@ define <4 x float> @test_mm_loadl_pi(<4 x float> %a0, x86_mmx* %a1) {
 }

 define <4 x float> @test_mm_loadr_ps(float* %a0) nounwind {
-; X32-LABEL: test_mm_loadr_ps:
-; X32: # %bb.0:
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movaps (%eax), %xmm0
-; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,2,1,0]
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm_loadr_ps:
-; X64: # %bb.0:
-; X64-NEXT: movaps (%rdi), %xmm0
-; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,2,1,0]
-; X64-NEXT: retq
+; X86-SSE-LABEL: test_mm_loadr_ps:
+; X86-SSE: # %bb.0:
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SSE-NEXT: movaps (%eax), %xmm0
+; X86-SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; X86-SSE-NEXT: retl
+;
+; X86-AVX-LABEL: test_mm_loadr_ps:
+; X86-AVX: # %bb.0:
+; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX-NEXT: vpermilps {{.*#+}} xmm0 = mem[3,2,1,0]
+; X86-AVX-NEXT: retl
+;
+; X64-SSE-LABEL: test_mm_loadr_ps:
+; X64-SSE: # %bb.0:
+; X64-SSE-NEXT: movaps (%rdi), %xmm0
+; X64-SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; X64-SSE-NEXT: retq
+;
+; X64-AVX-LABEL: test_mm_loadr_ps:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: vpermilps {{.*#+}} xmm0 = mem[3,2,1,0]
+; X64-AVX-NEXT: retq
 %arg0 = bitcast float* %a0 to <4 x float>*
 %ld = load <4 x float>, <4 x float>* %arg0, align 16
 %res = shufflevector <4 x float> %ld, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
@@ -976,162 +1270,178 @@ define <4 x float> @test_mm_loadr_ps(float* %a0) nounwind {
 }

 define <4 x float> @test_mm_loadu_ps(float* %a0) nounwind {
-; X32-LABEL: test_mm_loadu_ps:
-; X32: # %bb.0:
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movups (%eax), %xmm0
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm_loadu_ps:
-; X64: #
%bb.0: -; X64-NEXT: movups (%rdi), %xmm0 -; X64-NEXT: retq +; X86-SSE-LABEL: test_mm_loadu_ps: +; X86-SSE: # %bb.0: +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: movups (%eax), %xmm0 +; X86-SSE-NEXT: retl +; +; X86-AVX-LABEL: test_mm_loadu_ps: +; X86-AVX: # %bb.0: +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX-NEXT: vmovups (%eax), %xmm0 +; X86-AVX-NEXT: retl +; +; X64-SSE-LABEL: test_mm_loadu_ps: +; X64-SSE: # %bb.0: +; X64-SSE-NEXT: movups (%rdi), %xmm0 +; X64-SSE-NEXT: retq +; +; X64-AVX-LABEL: test_mm_loadu_ps: +; X64-AVX: # %bb.0: +; X64-AVX-NEXT: vmovups (%rdi), %xmm0 +; X64-AVX-NEXT: retq %arg0 = bitcast float* %a0 to <4 x float>* %res = load <4 x float>, <4 x float>* %arg0, align 1 ret <4 x float> %res } define <4 x float> @test_mm_max_ps(<4 x float> %a0, <4 x float> %a1) { -; X32-LABEL: test_mm_max_ps: -; X32: # %bb.0: -; X32-NEXT: maxps %xmm1, %xmm0 -; X32-NEXT: retl -; -; X64-LABEL: test_mm_max_ps: -; X64: # %bb.0: -; X64-NEXT: maxps %xmm1, %xmm0 -; X64-NEXT: retq +; SSE-LABEL: test_mm_max_ps: +; SSE: # %bb.0: +; SSE-NEXT: maxps %xmm1, %xmm0 +; SSE-NEXT: ret{{[l|q]}} +; +; AVX-LABEL: test_mm_max_ps: +; AVX: # %bb.0: +; AVX-NEXT: vmaxps %xmm1, %xmm0, %xmm0 +; AVX-NEXT: ret{{[l|q]}} %res = call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %a0, <4 x float> %a1) ret <4 x float> %res } declare <4 x float> @llvm.x86.sse.max.ps(<4 x float>, <4 x float>) nounwind readnone define <4 x float> @test_mm_max_ss(<4 x float> %a0, <4 x float> %a1) { -; X32-LABEL: test_mm_max_ss: -; X32: # %bb.0: -; X32-NEXT: maxss %xmm1, %xmm0 -; X32-NEXT: retl -; -; X64-LABEL: test_mm_max_ss: -; X64: # %bb.0: -; X64-NEXT: maxss %xmm1, %xmm0 -; X64-NEXT: retq +; SSE-LABEL: test_mm_max_ss: +; SSE: # %bb.0: +; SSE-NEXT: maxss %xmm1, %xmm0 +; SSE-NEXT: ret{{[l|q]}} +; +; AVX-LABEL: test_mm_max_ss: +; AVX: # %bb.0: +; AVX-NEXT: vmaxss %xmm1, %xmm0, %xmm0 +; AVX-NEXT: ret{{[l|q]}} %res = call <4 x float> @llvm.x86.sse.max.ss(<4 x float> %a0, <4 x float> %a1) ret <4 x float> %res } declare <4 x float> @llvm.x86.sse.max.ss(<4 x float>, <4 x float>) nounwind readnone define <4 x float> @test_mm_min_ps(<4 x float> %a0, <4 x float> %a1) { -; X32-LABEL: test_mm_min_ps: -; X32: # %bb.0: -; X32-NEXT: minps %xmm1, %xmm0 -; X32-NEXT: retl -; -; X64-LABEL: test_mm_min_ps: -; X64: # %bb.0: -; X64-NEXT: minps %xmm1, %xmm0 -; X64-NEXT: retq +; SSE-LABEL: test_mm_min_ps: +; SSE: # %bb.0: +; SSE-NEXT: minps %xmm1, %xmm0 +; SSE-NEXT: ret{{[l|q]}} +; +; AVX-LABEL: test_mm_min_ps: +; AVX: # %bb.0: +; AVX-NEXT: vminps %xmm1, %xmm0, %xmm0 +; AVX-NEXT: ret{{[l|q]}} %res = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %a0, <4 x float> %a1) ret <4 x float> %res } declare <4 x float> @llvm.x86.sse.min.ps(<4 x float>, <4 x float>) nounwind readnone define <4 x float> @test_mm_min_ss(<4 x float> %a0, <4 x float> %a1) { -; X32-LABEL: test_mm_min_ss: -; X32: # %bb.0: -; X32-NEXT: minss %xmm1, %xmm0 -; X32-NEXT: retl -; -; X64-LABEL: test_mm_min_ss: -; X64: # %bb.0: -; X64-NEXT: minss %xmm1, %xmm0 -; X64-NEXT: retq +; SSE-LABEL: test_mm_min_ss: +; SSE: # %bb.0: +; SSE-NEXT: minss %xmm1, %xmm0 +; SSE-NEXT: ret{{[l|q]}} +; +; AVX-LABEL: test_mm_min_ss: +; AVX: # %bb.0: +; AVX-NEXT: vminss %xmm1, %xmm0, %xmm0 +; AVX-NEXT: ret{{[l|q]}} %res = call <4 x float> @llvm.x86.sse.min.ss(<4 x float> %a0, <4 x float> %a1) ret <4 x float> %res } declare <4 x float> @llvm.x86.sse.min.ss(<4 x float>, <4 x float>) nounwind readnone define <4 x float> @test_mm_move_ss(<4 x float> %a0, <4 x float> %a1) { -; X32-LABEL: test_mm_move_ss: 
-; X32: # %bb.0: -; X32-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] -; X32-NEXT: retl -; -; X64-LABEL: test_mm_move_ss: -; X64: # %bb.0: -; X64-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] -; X64-NEXT: retq +; SSE-LABEL: test_mm_move_ss: +; SSE: # %bb.0: +; SSE-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] +; SSE-NEXT: ret{{[l|q]}} +; +; AVX1-LABEL: test_mm_move_ss: +; AVX1: # %bb.0: +; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] +; AVX1-NEXT: ret{{[l|q]}} +; +; AVX512-LABEL: test_mm_move_ss: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] +; AVX512-NEXT: ret{{[l|q]}} %res = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 4, i32 1, i32 2, i32 3> ret <4 x float> %res } define <4 x float> @test_mm_movehl_ps(<4 x float> %a0, <4 x float> %a1) { -; X32-LABEL: test_mm_movehl_ps: -; X32: # %bb.0: -; X32-NEXT: movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1] -; X32-NEXT: retl -; -; X64-LABEL: test_mm_movehl_ps: -; X64: # %bb.0: -; X64-NEXT: movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1] -; X64-NEXT: retq +; SSE-LABEL: test_mm_movehl_ps: +; SSE: # %bb.0: +; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; SSE-NEXT: ret{{[l|q]}} +; +; AVX-LABEL: test_mm_movehl_ps: +; AVX: # %bb.0: +; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; AVX-NEXT: ret{{[l|q]}} %res = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 6, i32 7, i32 2, i32 3> ret <4 x float> %res } define <4 x float> @test_mm_movelh_ps(<4 x float> %a0, <4 x float> %a1) { -; X32-LABEL: test_mm_movelh_ps: -; X32: # %bb.0: -; X32-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; X32-NEXT: retl -; -; X64-LABEL: test_mm_movelh_ps: -; X64: # %bb.0: -; X64-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; X64-NEXT: retq +; SSE-LABEL: test_mm_movelh_ps: +; SSE: # %bb.0: +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE-NEXT: ret{{[l|q]}} +; +; AVX-LABEL: test_mm_movelh_ps: +; AVX: # %bb.0: +; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX-NEXT: ret{{[l|q]}} %res = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 0, i32 1, i32 4, i32 5> ret <4 x float> %res } define i32 @test_mm_movemask_ps(<4 x float> %a0) nounwind { -; X32-LABEL: test_mm_movemask_ps: -; X32: # %bb.0: -; X32-NEXT: movmskps %xmm0, %eax -; X32-NEXT: retl -; -; X64-LABEL: test_mm_movemask_ps: -; X64: # %bb.0: -; X64-NEXT: movmskps %xmm0, %eax -; X64-NEXT: retq +; SSE-LABEL: test_mm_movemask_ps: +; SSE: # %bb.0: +; SSE-NEXT: movmskps %xmm0, %eax +; SSE-NEXT: ret{{[l|q]}} +; +; AVX-LABEL: test_mm_movemask_ps: +; AVX: # %bb.0: +; AVX-NEXT: vmovmskps %xmm0, %eax +; AVX-NEXT: ret{{[l|q]}} %res = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %a0) ret i32 %res } declare i32 @llvm.x86.sse.movmsk.ps(<4 x float>) nounwind readnone define <4 x float> @test_mm_mul_ps(<4 x float> %a0, <4 x float> %a1) nounwind { -; X32-LABEL: test_mm_mul_ps: -; X32: # %bb.0: -; X32-NEXT: mulps %xmm1, %xmm0 -; X32-NEXT: retl -; -; X64-LABEL: test_mm_mul_ps: -; X64: # %bb.0: -; X64-NEXT: mulps %xmm1, %xmm0 -; X64-NEXT: retq +; SSE-LABEL: test_mm_mul_ps: +; SSE: # %bb.0: +; SSE-NEXT: mulps %xmm1, %xmm0 +; SSE-NEXT: ret{{[l|q]}} +; +; AVX-LABEL: test_mm_mul_ps: +; AVX: # %bb.0: +; AVX-NEXT: vmulps %xmm1, %xmm0, %xmm0 +; AVX-NEXT: ret{{[l|q]}} %res = fmul <4 x float> %a0, %a1 ret <4 x float> %res } define <4 x float> @test_mm_mul_ss(<4 x float> %a0, <4 x float> %a1) nounwind { -; X32-LABEL: test_mm_mul_ss: -; X32: # %bb.0: -; X32-NEXT: mulss %xmm1, %xmm0 -; X32-NEXT: retl -; -; X64-LABEL: test_mm_mul_ss: -; X64: # 
%bb.0: -; X64-NEXT: mulss %xmm1, %xmm0 -; X64-NEXT: retq +; SSE-LABEL: test_mm_mul_ss: +; SSE: # %bb.0: +; SSE-NEXT: mulss %xmm1, %xmm0 +; SSE-NEXT: ret{{[l|q]}} +; +; AVX-LABEL: test_mm_mul_ss: +; AVX: # %bb.0: +; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0 +; AVX-NEXT: ret{{[l|q]}} %ext0 = extractelement <4 x float> %a0, i32 0 %ext1 = extractelement <4 x float> %a1, i32 0 %fmul = fmul float %ext0, %ext1 @@ -1140,15 +1450,15 @@ define <4 x float> @test_mm_mul_ss(<4 x float> %a0, <4 x float> %a1) nounwind { } define <4 x float> @test_mm_or_ps(<4 x float> %a0, <4 x float> %a1) nounwind { -; X32-LABEL: test_mm_or_ps: -; X32: # %bb.0: -; X32-NEXT: orps %xmm1, %xmm0 -; X32-NEXT: retl -; -; X64-LABEL: test_mm_or_ps: -; X64: # %bb.0: -; X64-NEXT: orps %xmm1, %xmm0 -; X64-NEXT: retq +; SSE-LABEL: test_mm_or_ps: +; SSE: # %bb.0: +; SSE-NEXT: orps %xmm1, %xmm0 +; SSE-NEXT: ret{{[l|q]}} +; +; AVX-LABEL: test_mm_or_ps: +; AVX: # %bb.0: +; AVX-NEXT: vorps %xmm1, %xmm0, %xmm0 +; AVX-NEXT: ret{{[l|q]}} %arg0 = bitcast <4 x float> %a0 to <4 x i32> %arg1 = bitcast <4 x float> %a1 to <4 x i32> %res = or <4 x i32> %arg0, %arg1 @@ -1157,11 +1467,11 @@ define <4 x float> @test_mm_or_ps(<4 x float> %a0, <4 x float> %a1) nounwind { } define void @test_mm_prefetch(i8* %a0) { -; X32-LABEL: test_mm_prefetch: -; X32: # %bb.0: -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: prefetchnta (%eax) -; X32-NEXT: retl +; X86-LABEL: test_mm_prefetch: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: prefetchnta (%eax) +; X86-NEXT: retl ; ; X64-LABEL: test_mm_prefetch: ; X64: # %bb.0: @@ -1173,90 +1483,115 @@ define void @test_mm_prefetch(i8* %a0) { declare void @llvm.prefetch(i8* nocapture, i32, i32, i32) nounwind readnone define <4 x float> @test_mm_rcp_ps(<4 x float> %a0) { -; X32-LABEL: test_mm_rcp_ps: -; X32: # %bb.0: -; X32-NEXT: rcpps %xmm0, %xmm0 -; X32-NEXT: retl -; -; X64-LABEL: test_mm_rcp_ps: -; X64: # %bb.0: -; X64-NEXT: rcpps %xmm0, %xmm0 -; X64-NEXT: retq +; SSE-LABEL: test_mm_rcp_ps: +; SSE: # %bb.0: +; SSE-NEXT: rcpps %xmm0, %xmm0 +; SSE-NEXT: ret{{[l|q]}} +; +; AVX-LABEL: test_mm_rcp_ps: +; AVX: # %bb.0: +; AVX-NEXT: vrcpps %xmm0, %xmm0 +; AVX-NEXT: ret{{[l|q]}} %res = call <4 x float> @llvm.x86.sse.rcp.ps(<4 x float> %a0) ret <4 x float> %res } declare <4 x float> @llvm.x86.sse.rcp.ps(<4 x float>) nounwind readnone define <4 x float> @test_mm_rcp_ss(<4 x float> %a0) { -; X32-LABEL: test_mm_rcp_ss: -; X32: # %bb.0: -; X32-NEXT: rcpss %xmm0, %xmm0 -; X32-NEXT: retl -; -; X64-LABEL: test_mm_rcp_ss: -; X64: # %bb.0: -; X64-NEXT: rcpss %xmm0, %xmm0 -; X64-NEXT: retq +; SSE-LABEL: test_mm_rcp_ss: +; SSE: # %bb.0: +; SSE-NEXT: rcpss %xmm0, %xmm0 +; SSE-NEXT: ret{{[l|q]}} +; +; AVX-LABEL: test_mm_rcp_ss: +; AVX: # %bb.0: +; AVX-NEXT: vrcpss %xmm0, %xmm0, %xmm0 +; AVX-NEXT: ret{{[l|q]}} %rcp = call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> %a0) ret <4 x float> %rcp } declare <4 x float> @llvm.x86.sse.rcp.ss(<4 x float>) nounwind readnone define <4 x float> @test_mm_rsqrt_ps(<4 x float> %a0) { -; X32-LABEL: test_mm_rsqrt_ps: -; X32: # %bb.0: -; X32-NEXT: rsqrtps %xmm0, %xmm0 -; X32-NEXT: retl -; -; X64-LABEL: test_mm_rsqrt_ps: -; X64: # %bb.0: -; X64-NEXT: rsqrtps %xmm0, %xmm0 -; X64-NEXT: retq +; SSE-LABEL: test_mm_rsqrt_ps: +; SSE: # %bb.0: +; SSE-NEXT: rsqrtps %xmm0, %xmm0 +; SSE-NEXT: ret{{[l|q]}} +; +; AVX-LABEL: test_mm_rsqrt_ps: +; AVX: # %bb.0: +; AVX-NEXT: vrsqrtps %xmm0, %xmm0 +; AVX-NEXT: ret{{[l|q]}} %res = call <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float> %a0) ret <4 x float> 
%res } declare <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float>) nounwind readnone define <4 x float> @test_mm_rsqrt_ss(<4 x float> %a0) { -; X32-LABEL: test_mm_rsqrt_ss: -; X32: # %bb.0: -; X32-NEXT: rsqrtss %xmm0, %xmm0 -; X32-NEXT: retl -; -; X64-LABEL: test_mm_rsqrt_ss: -; X64: # %bb.0: -; X64-NEXT: rsqrtss %xmm0, %xmm0 -; X64-NEXT: retq +; SSE-LABEL: test_mm_rsqrt_ss: +; SSE: # %bb.0: +; SSE-NEXT: rsqrtss %xmm0, %xmm0 +; SSE-NEXT: ret{{[l|q]}} +; +; AVX-LABEL: test_mm_rsqrt_ss: +; AVX: # %bb.0: +; AVX-NEXT: vrsqrtss %xmm0, %xmm0, %xmm0 +; AVX-NEXT: ret{{[l|q]}} %rsqrt = call <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float> %a0) ret <4 x float> %rsqrt } declare <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float>) nounwind readnone define void @test_MM_SET_EXCEPTION_MASK(i32 %a0) nounwind { -; X32-LABEL: test_MM_SET_EXCEPTION_MASK: -; X32: # %bb.0: -; X32-NEXT: pushl %eax -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movl %esp, %ecx -; X32-NEXT: stmxcsr (%ecx) -; X32-NEXT: movl (%esp), %edx -; X32-NEXT: andl $-8065, %edx # imm = 0xE07F -; X32-NEXT: orl %eax, %edx -; X32-NEXT: movl %edx, (%esp) -; X32-NEXT: ldmxcsr (%ecx) -; X32-NEXT: popl %eax -; X32-NEXT: retl -; -; X64-LABEL: test_MM_SET_EXCEPTION_MASK: -; X64: # %bb.0: -; X64-NEXT: leaq -{{[0-9]+}}(%rsp), %rax -; X64-NEXT: stmxcsr (%rax) -; X64-NEXT: movl -{{[0-9]+}}(%rsp), %ecx -; X64-NEXT: andl $-8065, %ecx # imm = 0xE07F -; X64-NEXT: orl %edi, %ecx -; X64-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) -; X64-NEXT: ldmxcsr (%rax) -; X64-NEXT: retq +; X86-SSE-LABEL: test_MM_SET_EXCEPTION_MASK: +; X86-SSE: # %bb.0: +; X86-SSE-NEXT: pushl %eax +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: movl %esp, %ecx +; X86-SSE-NEXT: stmxcsr (%ecx) +; X86-SSE-NEXT: movl (%esp), %edx +; X86-SSE-NEXT: andl $-8065, %edx # imm = 0xE07F +; X86-SSE-NEXT: orl %eax, %edx +; X86-SSE-NEXT: movl %edx, (%esp) +; X86-SSE-NEXT: ldmxcsr (%ecx) +; X86-SSE-NEXT: popl %eax +; X86-SSE-NEXT: retl +; +; X86-AVX-LABEL: test_MM_SET_EXCEPTION_MASK: +; X86-AVX: # %bb.0: +; X86-AVX-NEXT: pushl %eax +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX-NEXT: movl %esp, %ecx +; X86-AVX-NEXT: vstmxcsr (%ecx) +; X86-AVX-NEXT: movl (%esp), %edx +; X86-AVX-NEXT: andl $-8065, %edx # imm = 0xE07F +; X86-AVX-NEXT: orl %eax, %edx +; X86-AVX-NEXT: movl %edx, (%esp) +; X86-AVX-NEXT: vldmxcsr (%ecx) +; X86-AVX-NEXT: popl %eax +; X86-AVX-NEXT: retl +; +; X64-SSE-LABEL: test_MM_SET_EXCEPTION_MASK: +; X64-SSE: # %bb.0: +; X64-SSE-NEXT: leaq -{{[0-9]+}}(%rsp), %rax +; X64-SSE-NEXT: stmxcsr (%rax) +; X64-SSE-NEXT: movl -{{[0-9]+}}(%rsp), %ecx +; X64-SSE-NEXT: andl $-8065, %ecx # imm = 0xE07F +; X64-SSE-NEXT: orl %edi, %ecx +; X64-SSE-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) +; X64-SSE-NEXT: ldmxcsr (%rax) +; X64-SSE-NEXT: retq +; +; X64-AVX-LABEL: test_MM_SET_EXCEPTION_MASK: +; X64-AVX: # %bb.0: +; X64-AVX-NEXT: leaq -{{[0-9]+}}(%rsp), %rax +; X64-AVX-NEXT: vstmxcsr (%rax) +; X64-AVX-NEXT: movl -{{[0-9]+}}(%rsp), %ecx +; X64-AVX-NEXT: andl $-8065, %ecx # imm = 0xE07F +; X64-AVX-NEXT: orl %edi, %ecx +; X64-AVX-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) +; X64-AVX-NEXT: vldmxcsr (%rax) +; X64-AVX-NEXT: retq %1 = alloca i32, align 4 %2 = bitcast i32* %1 to i8* call void @llvm.x86.sse.stmxcsr(i8* %2) @@ -1270,30 +1605,55 @@ define void @test_MM_SET_EXCEPTION_MASK(i32 %a0) nounwind { declare void @llvm.x86.sse.ldmxcsr(i8*) nounwind readnone define void @test_MM_SET_EXCEPTION_STATE(i32 %a0) nounwind { -; X32-LABEL: test_MM_SET_EXCEPTION_STATE: -; X32: # %bb.0: -; X32-NEXT: pushl %eax -; X32-NEXT: movl 
{{[0-9]+}}(%esp), %eax -; X32-NEXT: movl %esp, %ecx -; X32-NEXT: stmxcsr (%ecx) -; X32-NEXT: movl (%esp), %edx -; X32-NEXT: andl $-64, %edx -; X32-NEXT: orl %eax, %edx -; X32-NEXT: movl %edx, (%esp) -; X32-NEXT: ldmxcsr (%ecx) -; X32-NEXT: popl %eax -; X32-NEXT: retl -; -; X64-LABEL: test_MM_SET_EXCEPTION_STATE: -; X64: # %bb.0: -; X64-NEXT: leaq -{{[0-9]+}}(%rsp), %rax -; X64-NEXT: stmxcsr (%rax) -; X64-NEXT: movl -{{[0-9]+}}(%rsp), %ecx -; X64-NEXT: andl $-64, %ecx -; X64-NEXT: orl %edi, %ecx -; X64-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) -; X64-NEXT: ldmxcsr (%rax) -; X64-NEXT: retq +; X86-SSE-LABEL: test_MM_SET_EXCEPTION_STATE: +; X86-SSE: # %bb.0: +; X86-SSE-NEXT: pushl %eax +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: movl %esp, %ecx +; X86-SSE-NEXT: stmxcsr (%ecx) +; X86-SSE-NEXT: movl (%esp), %edx +; X86-SSE-NEXT: andl $-64, %edx +; X86-SSE-NEXT: orl %eax, %edx +; X86-SSE-NEXT: movl %edx, (%esp) +; X86-SSE-NEXT: ldmxcsr (%ecx) +; X86-SSE-NEXT: popl %eax +; X86-SSE-NEXT: retl +; +; X86-AVX-LABEL: test_MM_SET_EXCEPTION_STATE: +; X86-AVX: # %bb.0: +; X86-AVX-NEXT: pushl %eax +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX-NEXT: movl %esp, %ecx +; X86-AVX-NEXT: vstmxcsr (%ecx) +; X86-AVX-NEXT: movl (%esp), %edx +; X86-AVX-NEXT: andl $-64, %edx +; X86-AVX-NEXT: orl %eax, %edx +; X86-AVX-NEXT: movl %edx, (%esp) +; X86-AVX-NEXT: vldmxcsr (%ecx) +; X86-AVX-NEXT: popl %eax +; X86-AVX-NEXT: retl +; +; X64-SSE-LABEL: test_MM_SET_EXCEPTION_STATE: +; X64-SSE: # %bb.0: +; X64-SSE-NEXT: leaq -{{[0-9]+}}(%rsp), %rax +; X64-SSE-NEXT: stmxcsr (%rax) +; X64-SSE-NEXT: movl -{{[0-9]+}}(%rsp), %ecx +; X64-SSE-NEXT: andl $-64, %ecx +; X64-SSE-NEXT: orl %edi, %ecx +; X64-SSE-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) +; X64-SSE-NEXT: ldmxcsr (%rax) +; X64-SSE-NEXT: retq +; +; X64-AVX-LABEL: test_MM_SET_EXCEPTION_STATE: +; X64-AVX: # %bb.0: +; X64-AVX-NEXT: leaq -{{[0-9]+}}(%rsp), %rax +; X64-AVX-NEXT: vstmxcsr (%rax) +; X64-AVX-NEXT: movl -{{[0-9]+}}(%rsp), %ecx +; X64-AVX-NEXT: andl $-64, %ecx +; X64-AVX-NEXT: orl %edi, %ecx +; X64-AVX-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) +; X64-AVX-NEXT: vldmxcsr (%rax) +; X64-AVX-NEXT: retq %1 = alloca i32, align 4 %2 = bitcast i32* %1 to i8* call void @llvm.x86.sse.stmxcsr(i8* %2) @@ -1306,30 +1666,55 @@ define void @test_MM_SET_EXCEPTION_STATE(i32 %a0) nounwind { } define void @test_MM_SET_FLUSH_ZERO_MODE(i32 %a0) nounwind { -; X32-LABEL: test_MM_SET_FLUSH_ZERO_MODE: -; X32: # %bb.0: -; X32-NEXT: pushl %eax -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movl %esp, %ecx -; X32-NEXT: stmxcsr (%ecx) -; X32-NEXT: movl (%esp), %edx -; X32-NEXT: andl $-32769, %edx # imm = 0xFFFF7FFF -; X32-NEXT: orl %eax, %edx -; X32-NEXT: movl %edx, (%esp) -; X32-NEXT: ldmxcsr (%ecx) -; X32-NEXT: popl %eax -; X32-NEXT: retl -; -; X64-LABEL: test_MM_SET_FLUSH_ZERO_MODE: -; X64: # %bb.0: -; X64-NEXT: leaq -{{[0-9]+}}(%rsp), %rax -; X64-NEXT: stmxcsr (%rax) -; X64-NEXT: movl -{{[0-9]+}}(%rsp), %ecx -; X64-NEXT: andl $-32769, %ecx # imm = 0xFFFF7FFF -; X64-NEXT: orl %edi, %ecx -; X64-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) -; X64-NEXT: ldmxcsr (%rax) -; X64-NEXT: retq +; X86-SSE-LABEL: test_MM_SET_FLUSH_ZERO_MODE: +; X86-SSE: # %bb.0: +; X86-SSE-NEXT: pushl %eax +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: movl %esp, %ecx +; X86-SSE-NEXT: stmxcsr (%ecx) +; X86-SSE-NEXT: movl (%esp), %edx +; X86-SSE-NEXT: andl $-32769, %edx # imm = 0xFFFF7FFF +; X86-SSE-NEXT: orl %eax, %edx +; X86-SSE-NEXT: movl %edx, (%esp) +; X86-SSE-NEXT: ldmxcsr (%ecx) +; X86-SSE-NEXT: popl 
%eax +; X86-SSE-NEXT: retl +; +; X86-AVX-LABEL: test_MM_SET_FLUSH_ZERO_MODE: +; X86-AVX: # %bb.0: +; X86-AVX-NEXT: pushl %eax +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX-NEXT: movl %esp, %ecx +; X86-AVX-NEXT: vstmxcsr (%ecx) +; X86-AVX-NEXT: movl (%esp), %edx +; X86-AVX-NEXT: andl $-32769, %edx # imm = 0xFFFF7FFF +; X86-AVX-NEXT: orl %eax, %edx +; X86-AVX-NEXT: movl %edx, (%esp) +; X86-AVX-NEXT: vldmxcsr (%ecx) +; X86-AVX-NEXT: popl %eax +; X86-AVX-NEXT: retl +; +; X64-SSE-LABEL: test_MM_SET_FLUSH_ZERO_MODE: +; X64-SSE: # %bb.0: +; X64-SSE-NEXT: leaq -{{[0-9]+}}(%rsp), %rax +; X64-SSE-NEXT: stmxcsr (%rax) +; X64-SSE-NEXT: movl -{{[0-9]+}}(%rsp), %ecx +; X64-SSE-NEXT: andl $-32769, %ecx # imm = 0xFFFF7FFF +; X64-SSE-NEXT: orl %edi, %ecx +; X64-SSE-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) +; X64-SSE-NEXT: ldmxcsr (%rax) +; X64-SSE-NEXT: retq +; +; X64-AVX-LABEL: test_MM_SET_FLUSH_ZERO_MODE: +; X64-AVX: # %bb.0: +; X64-AVX-NEXT: leaq -{{[0-9]+}}(%rsp), %rax +; X64-AVX-NEXT: vstmxcsr (%rax) +; X64-AVX-NEXT: movl -{{[0-9]+}}(%rsp), %ecx +; X64-AVX-NEXT: andl $-32769, %ecx # imm = 0xFFFF7FFF +; X64-AVX-NEXT: orl %edi, %ecx +; X64-AVX-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) +; X64-AVX-NEXT: vldmxcsr (%rax) +; X64-AVX-NEXT: retq %1 = alloca i32, align 4 %2 = bitcast i32* %1 to i8* call void @llvm.x86.sse.stmxcsr(i8* %2) @@ -1342,24 +1727,42 @@ define void @test_MM_SET_FLUSH_ZERO_MODE(i32 %a0) nounwind { } define <4 x float> @test_mm_set_ps(float %a0, float %a1, float %a2, float %a3) nounwind { -; X32-LABEL: test_mm_set_ps: -; X32: # %bb.0: -; X32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; X32-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; X32-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero -; X32-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; X32-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; X32-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; X32-NEXT: retl -; -; X64-LABEL: test_mm_set_ps: -; X64: # %bb.0: -; X64-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; X64-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; X64-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm1[0] -; X64-NEXT: movaps %xmm3, %xmm0 -; X64-NEXT: retq +; X86-SSE-LABEL: test_mm_set_ps: +; X86-SSE: # %bb.0: +; X86-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X86-SSE-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; X86-SSE-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero +; X86-SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; X86-SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; X86-SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; X86-SSE-NEXT: retl +; +; X86-AVX-LABEL: test_mm_set_ps: +; X86-AVX: # %bb.0: +; X86-AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X86-AVX-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; X86-AVX-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero +; X86-AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] +; X86-AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3] +; X86-AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[0] +; X86-AVX-NEXT: retl +; +; X64-SSE-LABEL: test_mm_set_ps: +; X64-SSE: # %bb.0: +; X64-SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; X64-SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] 
+; X64-SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm1[0] +; X64-SSE-NEXT: movaps %xmm3, %xmm0 +; X64-SSE-NEXT: retq +; +; X64-AVX-LABEL: test_mm_set_ps: +; X64-AVX: # %bb.0: +; X64-AVX-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3] +; X64-AVX-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3] +; X64-AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; X64-AVX-NEXT: retq %res0 = insertelement <4 x float> undef, float %a3, i32 0 %res1 = insertelement <4 x float> %res0, float %a2, i32 1 %res2 = insertelement <4 x float> %res1, float %a1, i32 2 @@ -1368,16 +1771,38 @@ define <4 x float> @test_mm_set_ps(float %a0, float %a1, float %a2, float %a3) n } define <4 x float> @test_mm_set_ps1(float %a0) nounwind { -; X32-LABEL: test_mm_set_ps1: -; X32: # %bb.0: -; X32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,0] -; X32-NEXT: retl -; -; X64-LABEL: test_mm_set_ps1: -; X64: # %bb.0: -; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,0] -; X64-NEXT: retq +; X86-SSE-LABEL: test_mm_set_ps1: +; X86-SSE: # %bb.0: +; X86-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,0] +; X86-SSE-NEXT: retl +; +; X86-AVX1-LABEL: test_mm_set_ps1: +; X86-AVX1: # %bb.0: +; X86-AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0] +; X86-AVX1-NEXT: retl +; +; X86-AVX512-LABEL: test_mm_set_ps1: +; X86-AVX512: # %bb.0: +; X86-AVX512-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-AVX512-NEXT: vbroadcastss %xmm0, %xmm0 +; X86-AVX512-NEXT: retl +; +; X64-SSE-LABEL: test_mm_set_ps1: +; X64-SSE: # %bb.0: +; X64-SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,0] +; X64-SSE-NEXT: retq +; +; X64-AVX1-LABEL: test_mm_set_ps1: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0] +; X64-AVX1-NEXT: retq +; +; X64-AVX512-LABEL: test_mm_set_ps1: +; X64-AVX512: # %bb.0: +; X64-AVX512-NEXT: vbroadcastss %xmm0, %xmm0 +; X64-AVX512-NEXT: retq %res0 = insertelement <4 x float> undef, float %a0, i32 0 %res1 = insertelement <4 x float> %res0, float %a0, i32 1 %res2 = insertelement <4 x float> %res1, float %a0, i32 2 @@ -1386,30 +1811,55 @@ define <4 x float> @test_mm_set_ps1(float %a0) nounwind { } define void @test_MM_SET_ROUNDING_MODE(i32 %a0) nounwind { -; X32-LABEL: test_MM_SET_ROUNDING_MODE: -; X32: # %bb.0: -; X32-NEXT: pushl %eax -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movl %esp, %ecx -; X32-NEXT: stmxcsr (%ecx) -; X32-NEXT: movl (%esp), %edx -; X32-NEXT: andl $-24577, %edx # imm = 0x9FFF -; X32-NEXT: orl %eax, %edx -; X32-NEXT: movl %edx, (%esp) -; X32-NEXT: ldmxcsr (%ecx) -; X32-NEXT: popl %eax -; X32-NEXT: retl -; -; X64-LABEL: test_MM_SET_ROUNDING_MODE: -; X64: # %bb.0: -; X64-NEXT: leaq -{{[0-9]+}}(%rsp), %rax -; X64-NEXT: stmxcsr (%rax) -; X64-NEXT: movl -{{[0-9]+}}(%rsp), %ecx -; X64-NEXT: andl $-24577, %ecx # imm = 0x9FFF -; X64-NEXT: orl %edi, %ecx -; X64-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) -; X64-NEXT: ldmxcsr (%rax) -; X64-NEXT: retq +; X86-SSE-LABEL: test_MM_SET_ROUNDING_MODE: +; X86-SSE: # %bb.0: +; X86-SSE-NEXT: pushl %eax +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: movl %esp, %ecx +; X86-SSE-NEXT: stmxcsr (%ecx) +; X86-SSE-NEXT: movl (%esp), %edx +; X86-SSE-NEXT: andl $-24577, %edx # imm = 0x9FFF +; X86-SSE-NEXT: orl %eax, %edx +; X86-SSE-NEXT: movl %edx, (%esp) +; X86-SSE-NEXT: ldmxcsr (%ecx) +; X86-SSE-NEXT: popl %eax +; X86-SSE-NEXT: retl +; +; X86-AVX-LABEL: 
test_MM_SET_ROUNDING_MODE: +; X86-AVX: # %bb.0: +; X86-AVX-NEXT: pushl %eax +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX-NEXT: movl %esp, %ecx +; X86-AVX-NEXT: vstmxcsr (%ecx) +; X86-AVX-NEXT: movl (%esp), %edx +; X86-AVX-NEXT: andl $-24577, %edx # imm = 0x9FFF +; X86-AVX-NEXT: orl %eax, %edx +; X86-AVX-NEXT: movl %edx, (%esp) +; X86-AVX-NEXT: vldmxcsr (%ecx) +; X86-AVX-NEXT: popl %eax +; X86-AVX-NEXT: retl +; +; X64-SSE-LABEL: test_MM_SET_ROUNDING_MODE: +; X64-SSE: # %bb.0: +; X64-SSE-NEXT: leaq -{{[0-9]+}}(%rsp), %rax +; X64-SSE-NEXT: stmxcsr (%rax) +; X64-SSE-NEXT: movl -{{[0-9]+}}(%rsp), %ecx +; X64-SSE-NEXT: andl $-24577, %ecx # imm = 0x9FFF +; X64-SSE-NEXT: orl %edi, %ecx +; X64-SSE-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) +; X64-SSE-NEXT: ldmxcsr (%rax) +; X64-SSE-NEXT: retq +; +; X64-AVX-LABEL: test_MM_SET_ROUNDING_MODE: +; X64-AVX: # %bb.0: +; X64-AVX-NEXT: leaq -{{[0-9]+}}(%rsp), %rax +; X64-AVX-NEXT: vstmxcsr (%rax) +; X64-AVX-NEXT: movl -{{[0-9]+}}(%rsp), %ecx +; X64-AVX-NEXT: andl $-24577, %ecx # imm = 0x9FFF +; X64-AVX-NEXT: orl %edi, %ecx +; X64-AVX-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) +; X64-AVX-NEXT: vldmxcsr (%rax) +; X64-AVX-NEXT: retq %1 = alloca i32, align 4 %2 = bitcast i32* %1 to i8* call void @llvm.x86.sse.stmxcsr(i8* %2) @@ -1422,19 +1872,45 @@ define void @test_MM_SET_ROUNDING_MODE(i32 %a0) nounwind { } define <4 x float> @test_mm_set_ss(float %a0) nounwind { -; X32-LABEL: test_mm_set_ss: -; X32: # %bb.0: -; X32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; X32-NEXT: xorps %xmm0, %xmm0 -; X32-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] -; X32-NEXT: retl -; -; X64-LABEL: test_mm_set_ss: -; X64: # %bb.0: -; X64-NEXT: xorps %xmm1, %xmm1 -; X64-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] -; X64-NEXT: movaps %xmm1, %xmm0 -; X64-NEXT: retq +; X86-SSE-LABEL: test_mm_set_ss: +; X86-SSE: # %bb.0: +; X86-SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X86-SSE-NEXT: xorps %xmm0, %xmm0 +; X86-SSE-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] +; X86-SSE-NEXT: retl +; +; X86-AVX1-LABEL: test_mm_set_ss: +; X86-AVX1: # %bb.0: +; X86-AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; X86-AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] +; X86-AVX1-NEXT: retl +; +; X86-AVX512-LABEL: test_mm_set_ss: +; X86-AVX512: # %bb.0: +; X86-AVX512-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; X86-AVX512-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] +; X86-AVX512-NEXT: retl +; +; X64-SSE-LABEL: test_mm_set_ss: +; X64-SSE: # %bb.0: +; X64-SSE-NEXT: xorps %xmm1, %xmm1 +; X64-SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] +; X64-SSE-NEXT: movaps %xmm1, %xmm0 +; X64-SSE-NEXT: retq +; +; X64-AVX1-LABEL: test_mm_set_ss: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; X64-AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] +; X64-AVX1-NEXT: retq +; +; X64-AVX512-LABEL: test_mm_set_ss: +; X64-AVX512: # %bb.0: +; X64-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; X64-AVX512-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] +; X64-AVX512-NEXT: retq %res0 = insertelement <4 x float> undef, float %a0, i32 0 %res1 = insertelement <4 x float> %res0, float 0.0, i32 1 %res2 = insertelement <4 x float> %res1, float 0.0, i32 2 @@ -1443,16 +1919,38 @@ define <4 x float> @test_mm_set_ss(float %a0) nounwind { } define <4 x float> @test_mm_set1_ps(float %a0) nounwind { -; X32-LABEL: test_mm_set1_ps: -; X32: # %bb.0: -; X32-NEXT: movss {{.*#+}} 
xmm0 = mem[0],zero,zero,zero -; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,0] -; X32-NEXT: retl -; -; X64-LABEL: test_mm_set1_ps: -; X64: # %bb.0: -; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,0] -; X64-NEXT: retq +; X86-SSE-LABEL: test_mm_set1_ps: +; X86-SSE: # %bb.0: +; X86-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,0] +; X86-SSE-NEXT: retl +; +; X86-AVX1-LABEL: test_mm_set1_ps: +; X86-AVX1: # %bb.0: +; X86-AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0] +; X86-AVX1-NEXT: retl +; +; X86-AVX512-LABEL: test_mm_set1_ps: +; X86-AVX512: # %bb.0: +; X86-AVX512-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-AVX512-NEXT: vbroadcastss %xmm0, %xmm0 +; X86-AVX512-NEXT: retl +; +; X64-SSE-LABEL: test_mm_set1_ps: +; X64-SSE: # %bb.0: +; X64-SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,0] +; X64-SSE-NEXT: retq +; +; X64-AVX1-LABEL: test_mm_set1_ps: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0] +; X64-AVX1-NEXT: retq +; +; X64-AVX512-LABEL: test_mm_set1_ps: +; X64-AVX512: # %bb.0: +; X64-AVX512-NEXT: vbroadcastss %xmm0, %xmm0 +; X64-AVX512-NEXT: retq %res0 = insertelement <4 x float> undef, float %a0, i32 0 %res1 = insertelement <4 x float> %res0, float %a0, i32 1 %res2 = insertelement <4 x float> %res1, float %a0, i32 2 @@ -1461,18 +1959,31 @@ define <4 x float> @test_mm_set1_ps(float %a0) nounwind { } define void @test_mm_setcsr(i32 %a0) nounwind { -; X32-LABEL: test_mm_setcsr: -; X32: # %bb.0: -; X32-NEXT: leal {{[0-9]+}}(%esp), %eax -; X32-NEXT: ldmxcsr (%eax) -; X32-NEXT: retl -; -; X64-LABEL: test_mm_setcsr: -; X64: # %bb.0: -; X64-NEXT: movl %edi, -{{[0-9]+}}(%rsp) -; X64-NEXT: leaq -{{[0-9]+}}(%rsp), %rax -; X64-NEXT: ldmxcsr (%rax) -; X64-NEXT: retq +; X86-SSE-LABEL: test_mm_setcsr: +; X86-SSE: # %bb.0: +; X86-SSE-NEXT: leal {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: ldmxcsr (%eax) +; X86-SSE-NEXT: retl +; +; X86-AVX-LABEL: test_mm_setcsr: +; X86-AVX: # %bb.0: +; X86-AVX-NEXT: leal {{[0-9]+}}(%esp), %eax +; X86-AVX-NEXT: vldmxcsr (%eax) +; X86-AVX-NEXT: retl +; +; X64-SSE-LABEL: test_mm_setcsr: +; X64-SSE: # %bb.0: +; X64-SSE-NEXT: movl %edi, -{{[0-9]+}}(%rsp) +; X64-SSE-NEXT: leaq -{{[0-9]+}}(%rsp), %rax +; X64-SSE-NEXT: ldmxcsr (%rax) +; X64-SSE-NEXT: retq +; +; X64-AVX-LABEL: test_mm_setcsr: +; X64-AVX: # %bb.0: +; X64-AVX-NEXT: movl %edi, -{{[0-9]+}}(%rsp) +; X64-AVX-NEXT: leaq -{{[0-9]+}}(%rsp), %rax +; X64-AVX-NEXT: vldmxcsr (%rax) +; X64-AVX-NEXT: retq %st = alloca i32, align 4 store i32 %a0, i32* %st, align 4 %bc = bitcast i32* %st to i8* @@ -1481,23 +1992,41 @@ define void @test_mm_setcsr(i32 %a0) nounwind { } define <4 x float> @test_mm_setr_ps(float %a0, float %a1, float %a2, float %a3) nounwind { -; X32-LABEL: test_mm_setr_ps: -; X32: # %bb.0: -; X32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; X32-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; X32-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero -; X32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X32-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; X32-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; X32-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; X32-NEXT: retl -; -; X64-LABEL: test_mm_setr_ps: -; X64: # %bb.0: -; X64-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; X64-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; X64-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] 
-; X64-NEXT: retq +; X86-SSE-LABEL: test_mm_setr_ps: +; X86-SSE: # %bb.0: +; X86-SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X86-SSE-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; X86-SSE-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero +; X86-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; X86-SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; X86-SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; X86-SSE-NEXT: retl +; +; X86-AVX-LABEL: test_mm_setr_ps: +; X86-AVX: # %bb.0: +; X86-AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X86-AVX-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; X86-AVX-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero +; X86-AVX-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3] +; X86-AVX-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3] +; X86-AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; X86-AVX-NEXT: retl +; +; X64-SSE-LABEL: test_mm_setr_ps: +; X64-SSE: # %bb.0: +; X64-SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; X64-SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; X64-SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; X64-SSE-NEXT: retq +; +; X64-AVX-LABEL: test_mm_setr_ps: +; X64-AVX: # %bb.0: +; X64-AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] +; X64-AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3] +; X64-AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[0] +; X64-AVX-NEXT: retq %res0 = insertelement <4 x float> undef, float %a0, i32 0 %res1 = insertelement <4 x float> %res0, float %a1, i32 1 %res2 = insertelement <4 x float> %res1, float %a2, i32 2 @@ -1506,106 +2035,138 @@ define <4 x float> @test_mm_setr_ps(float %a0, float %a1, float %a2, float %a3) } define <4 x float> @test_mm_setzero_ps() { -; X32-LABEL: test_mm_setzero_ps: -; X32: # %bb.0: -; X32-NEXT: xorps %xmm0, %xmm0 -; X32-NEXT: retl -; -; X64-LABEL: test_mm_setzero_ps: -; X64: # %bb.0: -; X64-NEXT: xorps %xmm0, %xmm0 -; X64-NEXT: retq +; SSE-LABEL: test_mm_setzero_ps: +; SSE: # %bb.0: +; SSE-NEXT: xorps %xmm0, %xmm0 +; SSE-NEXT: ret{{[l|q]}} +; +; AVX-LABEL: test_mm_setzero_ps: +; AVX: # %bb.0: +; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX-NEXT: ret{{[l|q]}} ret <4 x float> zeroinitializer } define void @test_mm_sfence() nounwind { -; X32-LABEL: test_mm_sfence: -; X32: # %bb.0: -; X32-NEXT: sfence -; X32-NEXT: retl -; -; X64-LABEL: test_mm_sfence: -; X64: # %bb.0: -; X64-NEXT: sfence -; X64-NEXT: retq +; CHECK-LABEL: test_mm_sfence: +; CHECK: # %bb.0: +; CHECK-NEXT: sfence +; CHECK-NEXT: ret{{[l|q]}} call void @llvm.x86.sse.sfence() ret void } declare void @llvm.x86.sse.sfence() nounwind readnone define <4 x float> @test_mm_shuffle_ps(<4 x float> %a0, <4 x float> %a1) nounwind { -; X32-LABEL: test_mm_shuffle_ps: -; X32: # %bb.0: -; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0] -; X32-NEXT: retl -; -; X64-LABEL: test_mm_shuffle_ps: -; X64: # %bb.0: -; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0] -; X64-NEXT: retq +; SSE-LABEL: test_mm_shuffle_ps: +; SSE: # %bb.0: +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0] +; SSE-NEXT: ret{{[l|q]}} +; +; AVX-LABEL: test_mm_shuffle_ps: +; AVX: # %bb.0: +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0] +; AVX-NEXT: ret{{[l|q]}} %res = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 0, i32 0, i32 4, i32 4> ret 
<4 x float> %res } define <4 x float> @test_mm_sqrt_ps(<4 x float> %a0) { -; X32-LABEL: test_mm_sqrt_ps: -; X32: # %bb.0: -; X32-NEXT: sqrtps %xmm0, %xmm0 -; X32-NEXT: retl -; -; X64-LABEL: test_mm_sqrt_ps: -; X64: # %bb.0: -; X64-NEXT: sqrtps %xmm0, %xmm0 -; X64-NEXT: retq +; SSE-LABEL: test_mm_sqrt_ps: +; SSE: # %bb.0: +; SSE-NEXT: sqrtps %xmm0, %xmm0 +; SSE-NEXT: ret{{[l|q]}} +; +; AVX-LABEL: test_mm_sqrt_ps: +; AVX: # %bb.0: +; AVX-NEXT: vsqrtps %xmm0, %xmm0 +; AVX-NEXT: ret{{[l|q]}} %res = call <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float> %a0) ret <4 x float> %res } declare <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float>) nounwind readnone define <4 x float> @test_mm_sqrt_ss(<4 x float> %a0) { -; X32-LABEL: test_mm_sqrt_ss: -; X32: # %bb.0: -; X32-NEXT: sqrtss %xmm0, %xmm0 -; X32-NEXT: retl -; -; X64-LABEL: test_mm_sqrt_ss: -; X64: # %bb.0: -; X64-NEXT: sqrtss %xmm0, %xmm0 -; X64-NEXT: retq +; SSE-LABEL: test_mm_sqrt_ss: +; SSE: # %bb.0: +; SSE-NEXT: sqrtss %xmm0, %xmm0 +; SSE-NEXT: ret{{[l|q]}} +; +; AVX-LABEL: test_mm_sqrt_ss: +; AVX: # %bb.0: +; AVX-NEXT: vsqrtss %xmm0, %xmm0, %xmm0 +; AVX-NEXT: ret{{[l|q]}} %sqrt = call <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float> %a0) ret <4 x float> %sqrt } declare <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float>) nounwind readnone define void @test_mm_store_ps(float *%a0, <4 x float> %a1) { -; X32-LABEL: test_mm_store_ps: -; X32: # %bb.0: -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movaps %xmm0, (%eax) -; X32-NEXT: retl -; -; X64-LABEL: test_mm_store_ps: -; X64: # %bb.0: -; X64-NEXT: movaps %xmm0, (%rdi) -; X64-NEXT: retq +; X86-SSE-LABEL: test_mm_store_ps: +; X86-SSE: # %bb.0: +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: movaps %xmm0, (%eax) +; X86-SSE-NEXT: retl +; +; X86-AVX-LABEL: test_mm_store_ps: +; X86-AVX: # %bb.0: +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX-NEXT: vmovaps %xmm0, (%eax) +; X86-AVX-NEXT: retl +; +; X64-SSE-LABEL: test_mm_store_ps: +; X64-SSE: # %bb.0: +; X64-SSE-NEXT: movaps %xmm0, (%rdi) +; X64-SSE-NEXT: retq +; +; X64-AVX-LABEL: test_mm_store_ps: +; X64-AVX: # %bb.0: +; X64-AVX-NEXT: vmovaps %xmm0, (%rdi) +; X64-AVX-NEXT: retq %arg0 = bitcast float* %a0 to <4 x float>* store <4 x float> %a1, <4 x float>* %arg0, align 16 ret void } define void @test_mm_store_ps1(float *%a0, <4 x float> %a1) { -; X32-LABEL: test_mm_store_ps1: -; X32: # %bb.0: -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,0] -; X32-NEXT: movaps %xmm0, (%eax) -; X32-NEXT: retl -; -; X64-LABEL: test_mm_store_ps1: -; X64: # %bb.0: -; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,0] -; X64-NEXT: movaps %xmm0, (%rdi) -; X64-NEXT: retq +; X86-SSE-LABEL: test_mm_store_ps1: +; X86-SSE: # %bb.0: +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,0] +; X86-SSE-NEXT: movaps %xmm0, (%eax) +; X86-SSE-NEXT: retl +; +; X86-AVX1-LABEL: test_mm_store_ps1: +; X86-AVX1: # %bb.0: +; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0] +; X86-AVX1-NEXT: vmovaps %xmm0, (%eax) +; X86-AVX1-NEXT: retl +; +; X86-AVX512-LABEL: test_mm_store_ps1: +; X86-AVX512: # %bb.0: +; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX512-NEXT: vbroadcastss %xmm0, %xmm0 +; X86-AVX512-NEXT: vmovaps %xmm0, (%eax) +; X86-AVX512-NEXT: retl +; +; X64-SSE-LABEL: test_mm_store_ps1: +; X64-SSE: # %bb.0: +; X64-SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,0] +; X64-SSE-NEXT: movaps %xmm0, (%rdi) +; X64-SSE-NEXT: retq +; +; 
X64-AVX1-LABEL: test_mm_store_ps1: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0] +; X64-AVX1-NEXT: vmovaps %xmm0, (%rdi) +; X64-AVX1-NEXT: retq +; +; X64-AVX512-LABEL: test_mm_store_ps1: +; X64-AVX512: # %bb.0: +; X64-AVX512-NEXT: vbroadcastss %xmm0, %xmm0 +; X64-AVX512-NEXT: vmovaps %xmm0, (%rdi) +; X64-AVX512-NEXT: retq %arg0 = bitcast float* %a0 to <4 x float>* %shuf = shufflevector <4 x float> %a1, <4 x float> undef, <4 x i32> zeroinitializer store <4 x float> %shuf, <4 x float>* %arg0, align 16 @@ -1613,34 +2174,71 @@ define void @test_mm_store_ps1(float *%a0, <4 x float> %a1) { } define void @test_mm_store_ss(float *%a0, <4 x float> %a1) { -; X32-LABEL: test_mm_store_ss: -; X32: # %bb.0: -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movss %xmm0, (%eax) -; X32-NEXT: retl -; -; X64-LABEL: test_mm_store_ss: -; X64: # %bb.0: -; X64-NEXT: movss %xmm0, (%rdi) -; X64-NEXT: retq +; X86-SSE-LABEL: test_mm_store_ss: +; X86-SSE: # %bb.0: +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: movss %xmm0, (%eax) +; X86-SSE-NEXT: retl +; +; X86-AVX-LABEL: test_mm_store_ss: +; X86-AVX: # %bb.0: +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX-NEXT: vmovss %xmm0, (%eax) +; X86-AVX-NEXT: retl +; +; X64-SSE-LABEL: test_mm_store_ss: +; X64-SSE: # %bb.0: +; X64-SSE-NEXT: movss %xmm0, (%rdi) +; X64-SSE-NEXT: retq +; +; X64-AVX-LABEL: test_mm_store_ss: +; X64-AVX: # %bb.0: +; X64-AVX-NEXT: vmovss %xmm0, (%rdi) +; X64-AVX-NEXT: retq %ext = extractelement <4 x float> %a1, i32 0 store float %ext, float* %a0, align 1 ret void } define void @test_mm_store1_ps(float *%a0, <4 x float> %a1) { -; X32-LABEL: test_mm_store1_ps: -; X32: # %bb.0: -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,0] -; X32-NEXT: movaps %xmm0, (%eax) -; X32-NEXT: retl -; -; X64-LABEL: test_mm_store1_ps: -; X64: # %bb.0: -; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,0] -; X64-NEXT: movaps %xmm0, (%rdi) -; X64-NEXT: retq +; X86-SSE-LABEL: test_mm_store1_ps: +; X86-SSE: # %bb.0: +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,0] +; X86-SSE-NEXT: movaps %xmm0, (%eax) +; X86-SSE-NEXT: retl +; +; X86-AVX1-LABEL: test_mm_store1_ps: +; X86-AVX1: # %bb.0: +; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0] +; X86-AVX1-NEXT: vmovaps %xmm0, (%eax) +; X86-AVX1-NEXT: retl +; +; X86-AVX512-LABEL: test_mm_store1_ps: +; X86-AVX512: # %bb.0: +; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX512-NEXT: vbroadcastss %xmm0, %xmm0 +; X86-AVX512-NEXT: vmovaps %xmm0, (%eax) +; X86-AVX512-NEXT: retl +; +; X64-SSE-LABEL: test_mm_store1_ps: +; X64-SSE: # %bb.0: +; X64-SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,0] +; X64-SSE-NEXT: movaps %xmm0, (%rdi) +; X64-SSE-NEXT: retq +; +; X64-AVX1-LABEL: test_mm_store1_ps: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0] +; X64-AVX1-NEXT: vmovaps %xmm0, (%rdi) +; X64-AVX1-NEXT: retq +; +; X64-AVX512-LABEL: test_mm_store1_ps: +; X64-AVX512: # %bb.0: +; X64-AVX512-NEXT: vbroadcastss %xmm0, %xmm0 +; X64-AVX512-NEXT: vmovaps %xmm0, (%rdi) +; X64-AVX512-NEXT: retq %arg0 = bitcast float* %a0 to <4 x float>* %shuf = shufflevector <4 x float> %a1, <4 x float> undef, <4 x i32> zeroinitializer store <4 x float> %shuf, <4 x float>* %arg0, align 16 @@ -1648,28 +2246,40 @@ define void @test_mm_store1_ps(float *%a0, <4 x float> %a1) { } define void @test_mm_storeh_ps(x86_mmx *%a0, <4 x float> %a1) nounwind 
{ -; X32-LABEL: test_mm_storeh_ps: -; X32: # %bb.0: -; X32-NEXT: pushl %ebp -; X32-NEXT: movl %esp, %ebp -; X32-NEXT: andl $-16, %esp -; X32-NEXT: subl $32, %esp -; X32-NEXT: movl 8(%ebp), %eax -; X32-NEXT: movaps %xmm0, (%esp) -; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: movl {{[0-9]+}}(%esp), %edx -; X32-NEXT: movl %edx, 4(%eax) -; X32-NEXT: movl %ecx, (%eax) -; X32-NEXT: movl %ebp, %esp -; X32-NEXT: popl %ebp -; X32-NEXT: retl -; -; X64-LABEL: test_mm_storeh_ps: -; X64: # %bb.0: -; X64-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; X64-NEXT: movq %rax, (%rdi) -; X64-NEXT: retq +; X86-SSE-LABEL: test_mm_storeh_ps: +; X86-SSE: # %bb.0: +; X86-SSE-NEXT: pushl %ebp +; X86-SSE-NEXT: movl %esp, %ebp +; X86-SSE-NEXT: andl $-16, %esp +; X86-SSE-NEXT: subl $32, %esp +; X86-SSE-NEXT: movl 8(%ebp), %eax +; X86-SSE-NEXT: movaps %xmm0, (%esp) +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-SSE-NEXT: movl %edx, 4(%eax) +; X86-SSE-NEXT: movl %ecx, (%eax) +; X86-SSE-NEXT: movl %ebp, %esp +; X86-SSE-NEXT: popl %ebp +; X86-SSE-NEXT: retl +; +; X86-AVX-LABEL: test_mm_storeh_ps: +; X86-AVX: # %bb.0: +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX-NEXT: vmovhpd %xmm0, (%eax) +; X86-AVX-NEXT: retl +; +; X64-SSE-LABEL: test_mm_storeh_ps: +; X64-SSE: # %bb.0: +; X64-SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax +; X64-SSE-NEXT: movq %rax, (%rdi) +; X64-SSE-NEXT: retq +; +; X64-AVX-LABEL: test_mm_storeh_ps: +; X64-AVX: # %bb.0: +; X64-AVX-NEXT: vpextrq $1, %xmm0, %rax +; X64-AVX-NEXT: movq %rax, (%rdi) +; X64-AVX-NEXT: retq %ptr = bitcast x86_mmx* %a0 to i64* %bc = bitcast <4 x float> %a1 to <2 x i64> %ext = extractelement <2 x i64> %bc, i32 1 @@ -1678,28 +2288,40 @@ define void @test_mm_storeh_ps(x86_mmx *%a0, <4 x float> %a1) nounwind { } define void @test_mm_storel_ps(x86_mmx *%a0, <4 x float> %a1) nounwind { -; X32-LABEL: test_mm_storel_ps: -; X32: # %bb.0: -; X32-NEXT: pushl %ebp -; X32-NEXT: movl %esp, %ebp -; X32-NEXT: andl $-16, %esp -; X32-NEXT: subl $32, %esp -; X32-NEXT: movl 8(%ebp), %eax -; X32-NEXT: movaps %xmm0, (%esp) -; X32-NEXT: movl (%esp), %ecx -; X32-NEXT: movl {{[0-9]+}}(%esp), %edx -; X32-NEXT: movl %edx, 4(%eax) -; X32-NEXT: movl %ecx, (%eax) -; X32-NEXT: movl %ebp, %esp -; X32-NEXT: popl %ebp -; X32-NEXT: retl -; -; X64-LABEL: test_mm_storel_ps: -; X64: # %bb.0: -; X64-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; X64-NEXT: movq %rax, (%rdi) -; X64-NEXT: retq +; X86-SSE-LABEL: test_mm_storel_ps: +; X86-SSE: # %bb.0: +; X86-SSE-NEXT: pushl %ebp +; X86-SSE-NEXT: movl %esp, %ebp +; X86-SSE-NEXT: andl $-16, %esp +; X86-SSE-NEXT: subl $32, %esp +; X86-SSE-NEXT: movl 8(%ebp), %eax +; X86-SSE-NEXT: movaps %xmm0, (%esp) +; X86-SSE-NEXT: movl (%esp), %ecx +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-SSE-NEXT: movl %edx, 4(%eax) +; X86-SSE-NEXT: movl %ecx, (%eax) +; X86-SSE-NEXT: movl %ebp, %esp +; X86-SSE-NEXT: popl %ebp +; X86-SSE-NEXT: retl +; +; X86-AVX-LABEL: test_mm_storel_ps: +; X86-AVX: # %bb.0: +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX-NEXT: vmovlps %xmm0, (%eax) +; X86-AVX-NEXT: retl +; +; X64-SSE-LABEL: test_mm_storel_ps: +; X64-SSE: # %bb.0: +; X64-SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax +; X64-SSE-NEXT: movq %rax, (%rdi) +; X64-SSE-NEXT: retq +; +; X64-AVX-LABEL: test_mm_storel_ps: +; X64-AVX: # %bb.0: +; X64-AVX-NEXT: vmovq %xmm0, %rax 
+; X64-AVX-NEXT: movq %rax, (%rdi) +; X64-AVX-NEXT: retq %ptr = bitcast x86_mmx* %a0 to i64* %bc = bitcast <4 x float> %a1 to <2 x i64> %ext = extractelement <2 x i64> %bc, i32 0 @@ -1708,18 +2330,31 @@ define void @test_mm_storel_ps(x86_mmx *%a0, <4 x float> %a1) nounwind { } define void @test_mm_storer_ps(float *%a0, <4 x float> %a1) { -; X32-LABEL: test_mm_storer_ps: -; X32: # %bb.0: -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,2,1,0] -; X32-NEXT: movaps %xmm0, (%eax) -; X32-NEXT: retl -; -; X64-LABEL: test_mm_storer_ps: -; X64: # %bb.0: -; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,2,1,0] -; X64-NEXT: movaps %xmm0, (%rdi) -; X64-NEXT: retq +; X86-SSE-LABEL: test_mm_storer_ps: +; X86-SSE: # %bb.0: +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,2,1,0] +; X86-SSE-NEXT: movaps %xmm0, (%eax) +; X86-SSE-NEXT: retl +; +; X86-AVX-LABEL: test_mm_storer_ps: +; X86-AVX: # %bb.0: +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0] +; X86-AVX-NEXT: vmovaps %xmm0, (%eax) +; X86-AVX-NEXT: retl +; +; X64-SSE-LABEL: test_mm_storer_ps: +; X64-SSE: # %bb.0: +; X64-SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,2,1,0] +; X64-SSE-NEXT: movaps %xmm0, (%rdi) +; X64-SSE-NEXT: retq +; +; X64-AVX-LABEL: test_mm_storer_ps: +; X64-AVX: # %bb.0: +; X64-AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0] +; X64-AVX-NEXT: vmovaps %xmm0, (%rdi) +; X64-AVX-NEXT: retq %arg0 = bitcast float* %a0 to <4 x float>* %shuf = shufflevector <4 x float> %a1, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0> store <4 x float> %shuf, <4 x float>* %arg0, align 16 @@ -1727,61 +2362,83 @@ define void @test_mm_storer_ps(float *%a0, <4 x float> %a1) { } define void @test_mm_storeu_ps(float *%a0, <4 x float> %a1) { -; X32-LABEL: test_mm_storeu_ps: -; X32: # %bb.0: -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movups %xmm0, (%eax) -; X32-NEXT: retl -; -; X64-LABEL: test_mm_storeu_ps: -; X64: # %bb.0: -; X64-NEXT: movups %xmm0, (%rdi) -; X64-NEXT: retq +; X86-SSE-LABEL: test_mm_storeu_ps: +; X86-SSE: # %bb.0: +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: movups %xmm0, (%eax) +; X86-SSE-NEXT: retl +; +; X86-AVX-LABEL: test_mm_storeu_ps: +; X86-AVX: # %bb.0: +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX-NEXT: vmovups %xmm0, (%eax) +; X86-AVX-NEXT: retl +; +; X64-SSE-LABEL: test_mm_storeu_ps: +; X64-SSE: # %bb.0: +; X64-SSE-NEXT: movups %xmm0, (%rdi) +; X64-SSE-NEXT: retq +; +; X64-AVX-LABEL: test_mm_storeu_ps: +; X64-AVX: # %bb.0: +; X64-AVX-NEXT: vmovups %xmm0, (%rdi) +; X64-AVX-NEXT: retq %arg0 = bitcast float* %a0 to <4 x float>* store <4 x float> %a1, <4 x float>* %arg0, align 1 ret void } define void @test_mm_stream_ps(float *%a0, <4 x float> %a1) { -; X32-LABEL: test_mm_stream_ps: -; X32: # %bb.0: -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movntps %xmm0, (%eax) -; X32-NEXT: retl -; -; X64-LABEL: test_mm_stream_ps: -; X64: # %bb.0: -; X64-NEXT: movntps %xmm0, (%rdi) -; X64-NEXT: retq +; X86-SSE-LABEL: test_mm_stream_ps: +; X86-SSE: # %bb.0: +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: movntps %xmm0, (%eax) +; X86-SSE-NEXT: retl +; +; X86-AVX-LABEL: test_mm_stream_ps: +; X86-AVX: # %bb.0: +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX-NEXT: vmovntps %xmm0, (%eax) +; X86-AVX-NEXT: retl +; +; X64-SSE-LABEL: test_mm_stream_ps: +; X64-SSE: # %bb.0: +; X64-SSE-NEXT: movntps %xmm0, (%rdi) +; X64-SSE-NEXT: retq +; +; X64-AVX-LABEL: 
test_mm_stream_ps: +; X64-AVX: # %bb.0: +; X64-AVX-NEXT: vmovntps %xmm0, (%rdi) +; X64-AVX-NEXT: retq %arg0 = bitcast float* %a0 to <4 x float>* store <4 x float> %a1, <4 x float>* %arg0, align 16, !nontemporal !0 ret void } define <4 x float> @test_mm_sub_ps(<4 x float> %a0, <4 x float> %a1) nounwind { -; X32-LABEL: test_mm_sub_ps: -; X32: # %bb.0: -; X32-NEXT: subps %xmm1, %xmm0 -; X32-NEXT: retl -; -; X64-LABEL: test_mm_sub_ps: -; X64: # %bb.0: -; X64-NEXT: subps %xmm1, %xmm0 -; X64-NEXT: retq +; SSE-LABEL: test_mm_sub_ps: +; SSE: # %bb.0: +; SSE-NEXT: subps %xmm1, %xmm0 +; SSE-NEXT: ret{{[l|q]}} +; +; AVX-LABEL: test_mm_sub_ps: +; AVX: # %bb.0: +; AVX-NEXT: vsubps %xmm1, %xmm0, %xmm0 +; AVX-NEXT: ret{{[l|q]}} %res = fsub <4 x float> %a0, %a1 ret <4 x float> %res } define <4 x float> @test_mm_sub_ss(<4 x float> %a0, <4 x float> %a1) nounwind { -; X32-LABEL: test_mm_sub_ss: -; X32: # %bb.0: -; X32-NEXT: subss %xmm1, %xmm0 -; X32-NEXT: retl -; -; X64-LABEL: test_mm_sub_ss: -; X64: # %bb.0: -; X64-NEXT: subss %xmm1, %xmm0 -; X64-NEXT: retq +; SSE-LABEL: test_mm_sub_ss: +; SSE: # %bb.0: +; SSE-NEXT: subss %xmm1, %xmm0 +; SSE-NEXT: ret{{[l|q]}} +; +; AVX-LABEL: test_mm_sub_ss: +; AVX: # %bb.0: +; AVX-NEXT: vsubss %xmm1, %xmm0, %xmm0 +; AVX-NEXT: ret{{[l|q]}} %ext0 = extractelement <4 x float> %a0, i32 0 %ext1 = extractelement <4 x float> %a1, i32 0 %fsub = fsub float %ext0, %ext1 @@ -1790,59 +2447,105 @@ define <4 x float> @test_mm_sub_ss(<4 x float> %a0, <4 x float> %a1) nounwind { } define void @test_MM_TRANSPOSE4_PS(<4 x float>* %a0, <4 x float>* %a1, <4 x float>* %a2, <4 x float>* %a3) nounwind { -; X32-LABEL: test_MM_TRANSPOSE4_PS: -; X32: # %bb.0: -; X32-NEXT: pushl %esi -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: movl {{[0-9]+}}(%esp), %edx -; X32-NEXT: movl {{[0-9]+}}(%esp), %esi -; X32-NEXT: movaps (%esi), %xmm0 -; X32-NEXT: movaps (%edx), %xmm1 -; X32-NEXT: movaps (%ecx), %xmm2 -; X32-NEXT: movaps (%eax), %xmm3 -; X32-NEXT: movaps %xmm0, %xmm4 -; X32-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; X32-NEXT: movaps %xmm2, %xmm5 -; X32-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] -; X32-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; X32-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; X32-NEXT: movaps %xmm4, %xmm1 -; X32-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm5[0] -; X32-NEXT: movhlps {{.*#+}} xmm5 = xmm4[1],xmm5[1] -; X32-NEXT: movaps %xmm0, %xmm3 -; X32-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm2[0] -; X32-NEXT: movhlps {{.*#+}} xmm2 = xmm0[1],xmm2[1] -; X32-NEXT: movaps %xmm1, (%esi) -; X32-NEXT: movaps %xmm5, (%edx) -; X32-NEXT: movaps %xmm3, (%ecx) -; X32-NEXT: movaps %xmm2, (%eax) -; X32-NEXT: popl %esi -; X32-NEXT: retl -; -; X64-LABEL: test_MM_TRANSPOSE4_PS: -; X64: # %bb.0: -; X64-NEXT: movaps (%rdi), %xmm0 -; X64-NEXT: movaps (%rsi), %xmm1 -; X64-NEXT: movaps (%rdx), %xmm2 -; X64-NEXT: movaps (%rcx), %xmm3 -; X64-NEXT: movaps %xmm0, %xmm4 -; X64-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; X64-NEXT: movaps %xmm2, %xmm5 -; X64-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] -; X64-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; X64-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; X64-NEXT: movaps %xmm4, %xmm1 -; X64-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm5[0] -; X64-NEXT: movhlps {{.*#+}} xmm5 = xmm4[1],xmm5[1] -; X64-NEXT: movaps %xmm0, %xmm3 -; X64-NEXT: movlhps {{.*#+}} xmm3 = 
xmm3[0],xmm2[0] -; X64-NEXT: movhlps {{.*#+}} xmm2 = xmm0[1],xmm2[1] -; X64-NEXT: movaps %xmm1, (%rdi) -; X64-NEXT: movaps %xmm5, (%rsi) -; X64-NEXT: movaps %xmm3, (%rdx) -; X64-NEXT: movaps %xmm2, (%rcx) -; X64-NEXT: retq +; X86-SSE-LABEL: test_MM_TRANSPOSE4_PS: +; X86-SSE: # %bb.0: +; X86-SSE-NEXT: pushl %esi +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-SSE-NEXT: movaps (%esi), %xmm0 +; X86-SSE-NEXT: movaps (%edx), %xmm1 +; X86-SSE-NEXT: movaps (%ecx), %xmm2 +; X86-SSE-NEXT: movaps (%eax), %xmm3 +; X86-SSE-NEXT: movaps %xmm0, %xmm4 +; X86-SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; X86-SSE-NEXT: movaps %xmm2, %xmm5 +; X86-SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] +; X86-SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; X86-SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; X86-SSE-NEXT: movaps %xmm4, %xmm1 +; X86-SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm5[0] +; X86-SSE-NEXT: movhlps {{.*#+}} xmm5 = xmm4[1],xmm5[1] +; X86-SSE-NEXT: movaps %xmm0, %xmm3 +; X86-SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm2[0] +; X86-SSE-NEXT: movhlps {{.*#+}} xmm2 = xmm0[1],xmm2[1] +; X86-SSE-NEXT: movaps %xmm1, (%esi) +; X86-SSE-NEXT: movaps %xmm5, (%edx) +; X86-SSE-NEXT: movaps %xmm3, (%ecx) +; X86-SSE-NEXT: movaps %xmm2, (%eax) +; X86-SSE-NEXT: popl %esi +; X86-SSE-NEXT: retl +; +; X86-AVX-LABEL: test_MM_TRANSPOSE4_PS: +; X86-AVX: # %bb.0: +; X86-AVX-NEXT: pushl %esi +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-AVX-NEXT: vmovaps (%esi), %xmm0 +; X86-AVX-NEXT: vmovaps (%edx), %xmm1 +; X86-AVX-NEXT: vmovaps (%ecx), %xmm2 +; X86-AVX-NEXT: vmovaps (%eax), %xmm3 +; X86-AVX-NEXT: vunpcklps {{.*#+}} xmm4 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; X86-AVX-NEXT: vunpcklps {{.*#+}} xmm5 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; X86-AVX-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; X86-AVX-NEXT: vunpckhps {{.*#+}} xmm1 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; X86-AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm4[0],xmm5[0] +; X86-AVX-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm4[1],xmm5[1] +; X86-AVX-NEXT: vmovlhps {{.*#+}} xmm4 = xmm0[0],xmm1[0] +; X86-AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; X86-AVX-NEXT: vmovaps %xmm2, (%esi) +; X86-AVX-NEXT: vmovaps %xmm3, (%edx) +; X86-AVX-NEXT: vmovaps %xmm4, (%ecx) +; X86-AVX-NEXT: vmovaps %xmm0, (%eax) +; X86-AVX-NEXT: popl %esi +; X86-AVX-NEXT: retl +; +; X64-SSE-LABEL: test_MM_TRANSPOSE4_PS: +; X64-SSE: # %bb.0: +; X64-SSE-NEXT: movaps (%rdi), %xmm0 +; X64-SSE-NEXT: movaps (%rsi), %xmm1 +; X64-SSE-NEXT: movaps (%rdx), %xmm2 +; X64-SSE-NEXT: movaps (%rcx), %xmm3 +; X64-SSE-NEXT: movaps %xmm0, %xmm4 +; X64-SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; X64-SSE-NEXT: movaps %xmm2, %xmm5 +; X64-SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] +; X64-SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; X64-SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; X64-SSE-NEXT: movaps %xmm4, %xmm1 +; X64-SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm5[0] +; X64-SSE-NEXT: movhlps {{.*#+}} xmm5 = xmm4[1],xmm5[1] +; X64-SSE-NEXT: movaps %xmm0, %xmm3 +; X64-SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm2[0] +; X64-SSE-NEXT: movhlps {{.*#+}} 
xmm2 = xmm0[1],xmm2[1] +; X64-SSE-NEXT: movaps %xmm1, (%rdi) +; X64-SSE-NEXT: movaps %xmm5, (%rsi) +; X64-SSE-NEXT: movaps %xmm3, (%rdx) +; X64-SSE-NEXT: movaps %xmm2, (%rcx) +; X64-SSE-NEXT: retq +; +; X64-AVX-LABEL: test_MM_TRANSPOSE4_PS: +; X64-AVX: # %bb.0: +; X64-AVX-NEXT: vmovaps (%rdi), %xmm0 +; X64-AVX-NEXT: vmovaps (%rsi), %xmm1 +; X64-AVX-NEXT: vmovaps (%rdx), %xmm2 +; X64-AVX-NEXT: vmovaps (%rcx), %xmm3 +; X64-AVX-NEXT: vunpcklps {{.*#+}} xmm4 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; X64-AVX-NEXT: vunpcklps {{.*#+}} xmm5 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; X64-AVX-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; X64-AVX-NEXT: vunpckhps {{.*#+}} xmm1 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; X64-AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm4[0],xmm5[0] +; X64-AVX-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm4[1],xmm5[1] +; X64-AVX-NEXT: vmovlhps {{.*#+}} xmm4 = xmm0[0],xmm1[0] +; X64-AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; X64-AVX-NEXT: vmovaps %xmm2, (%rdi) +; X64-AVX-NEXT: vmovaps %xmm3, (%rsi) +; X64-AVX-NEXT: vmovaps %xmm4, (%rdx) +; X64-AVX-NEXT: vmovaps %xmm0, (%rcx) +; X64-AVX-NEXT: retq %row0 = load <4 x float>, <4 x float>* %a0, align 16 %row1 = load <4 x float>, <4 x float>* %a1, align 16 %row2 = load <4 x float>, <4 x float>* %a2, align 16 @@ -1863,176 +2566,172 @@ define void @test_MM_TRANSPOSE4_PS(<4 x float>* %a0, <4 x float>* %a1, <4 x floa } define i32 @test_mm_ucomieq_ss(<4 x float> %a0, <4 x float> %a1) nounwind { -; X32-LABEL: test_mm_ucomieq_ss: -; X32: # %bb.0: -; X32-NEXT: ucomiss %xmm1, %xmm0 -; X32-NEXT: setnp %al -; X32-NEXT: sete %cl -; X32-NEXT: andb %al, %cl -; X32-NEXT: movzbl %cl, %eax -; X32-NEXT: retl -; -; X64-LABEL: test_mm_ucomieq_ss: -; X64: # %bb.0: -; X64-NEXT: ucomiss %xmm1, %xmm0 -; X64-NEXT: setnp %al -; X64-NEXT: sete %cl -; X64-NEXT: andb %al, %cl -; X64-NEXT: movzbl %cl, %eax -; X64-NEXT: retq +; SSE-LABEL: test_mm_ucomieq_ss: +; SSE: # %bb.0: +; SSE-NEXT: ucomiss %xmm1, %xmm0 +; SSE-NEXT: setnp %al +; SSE-NEXT: sete %cl +; SSE-NEXT: andb %al, %cl +; SSE-NEXT: movzbl %cl, %eax +; SSE-NEXT: ret{{[l|q]}} +; +; AVX-LABEL: test_mm_ucomieq_ss: +; AVX: # %bb.0: +; AVX-NEXT: vucomiss %xmm1, %xmm0 +; AVX-NEXT: setnp %al +; AVX-NEXT: sete %cl +; AVX-NEXT: andb %al, %cl +; AVX-NEXT: movzbl %cl, %eax +; AVX-NEXT: ret{{[l|q]}} %res = call i32 @llvm.x86.sse.ucomieq.ss(<4 x float> %a0, <4 x float> %a1) ret i32 %res } declare i32 @llvm.x86.sse.ucomieq.ss(<4 x float>, <4 x float>) nounwind readnone define i32 @test_mm_ucomige_ss(<4 x float> %a0, <4 x float> %a1) nounwind { -; X32-LABEL: test_mm_ucomige_ss: -; X32: # %bb.0: -; X32-NEXT: xorl %eax, %eax -; X32-NEXT: ucomiss %xmm1, %xmm0 -; X32-NEXT: setae %al -; X32-NEXT: retl -; -; X64-LABEL: test_mm_ucomige_ss: -; X64: # %bb.0: -; X64-NEXT: xorl %eax, %eax -; X64-NEXT: ucomiss %xmm1, %xmm0 -; X64-NEXT: setae %al -; X64-NEXT: retq +; SSE-LABEL: test_mm_ucomige_ss: +; SSE: # %bb.0: +; SSE-NEXT: xorl %eax, %eax +; SSE-NEXT: ucomiss %xmm1, %xmm0 +; SSE-NEXT: setae %al +; SSE-NEXT: ret{{[l|q]}} +; +; AVX-LABEL: test_mm_ucomige_ss: +; AVX: # %bb.0: +; AVX-NEXT: xorl %eax, %eax +; AVX-NEXT: vucomiss %xmm1, %xmm0 +; AVX-NEXT: setae %al +; AVX-NEXT: ret{{[l|q]}} %res = call i32 @llvm.x86.sse.ucomige.ss(<4 x float> %a0, <4 x float> %a1) ret i32 %res } declare i32 @llvm.x86.sse.ucomige.ss(<4 x float>, <4 x float>) nounwind readnone define i32 @test_mm_ucomigt_ss(<4 x float> %a0, <4 x float> %a1) nounwind { -; X32-LABEL: test_mm_ucomigt_ss: -; X32: # %bb.0: -; X32-NEXT: xorl %eax, %eax -; X32-NEXT: ucomiss 
%xmm1, %xmm0 -; X32-NEXT: seta %al -; X32-NEXT: retl -; -; X64-LABEL: test_mm_ucomigt_ss: -; X64: # %bb.0: -; X64-NEXT: xorl %eax, %eax -; X64-NEXT: ucomiss %xmm1, %xmm0 -; X64-NEXT: seta %al -; X64-NEXT: retq +; SSE-LABEL: test_mm_ucomigt_ss: +; SSE: # %bb.0: +; SSE-NEXT: xorl %eax, %eax +; SSE-NEXT: ucomiss %xmm1, %xmm0 +; SSE-NEXT: seta %al +; SSE-NEXT: ret{{[l|q]}} +; +; AVX-LABEL: test_mm_ucomigt_ss: +; AVX: # %bb.0: +; AVX-NEXT: xorl %eax, %eax +; AVX-NEXT: vucomiss %xmm1, %xmm0 +; AVX-NEXT: seta %al +; AVX-NEXT: ret{{[l|q]}} %res = call i32 @llvm.x86.sse.ucomigt.ss(<4 x float> %a0, <4 x float> %a1) ret i32 %res } declare i32 @llvm.x86.sse.ucomigt.ss(<4 x float>, <4 x float>) nounwind readnone define i32 @test_mm_ucomile_ss(<4 x float> %a0, <4 x float> %a1) nounwind { -; X32-LABEL: test_mm_ucomile_ss: -; X32: # %bb.0: -; X32-NEXT: xorl %eax, %eax -; X32-NEXT: ucomiss %xmm0, %xmm1 -; X32-NEXT: setae %al -; X32-NEXT: retl -; -; X64-LABEL: test_mm_ucomile_ss: -; X64: # %bb.0: -; X64-NEXT: xorl %eax, %eax -; X64-NEXT: ucomiss %xmm0, %xmm1 -; X64-NEXT: setae %al -; X64-NEXT: retq +; SSE-LABEL: test_mm_ucomile_ss: +; SSE: # %bb.0: +; SSE-NEXT: xorl %eax, %eax +; SSE-NEXT: ucomiss %xmm0, %xmm1 +; SSE-NEXT: setae %al +; SSE-NEXT: ret{{[l|q]}} +; +; AVX-LABEL: test_mm_ucomile_ss: +; AVX: # %bb.0: +; AVX-NEXT: xorl %eax, %eax +; AVX-NEXT: vucomiss %xmm0, %xmm1 +; AVX-NEXT: setae %al +; AVX-NEXT: ret{{[l|q]}} %res = call i32 @llvm.x86.sse.ucomile.ss(<4 x float> %a0, <4 x float> %a1) ret i32 %res } declare i32 @llvm.x86.sse.ucomile.ss(<4 x float>, <4 x float>) nounwind readnone define i32 @test_mm_ucomilt_ss(<4 x float> %a0, <4 x float> %a1) nounwind { -; X32-LABEL: test_mm_ucomilt_ss: -; X32: # %bb.0: -; X32-NEXT: xorl %eax, %eax -; X32-NEXT: ucomiss %xmm0, %xmm1 -; X32-NEXT: seta %al -; X32-NEXT: retl -; -; X64-LABEL: test_mm_ucomilt_ss: -; X64: # %bb.0: -; X64-NEXT: xorl %eax, %eax -; X64-NEXT: ucomiss %xmm0, %xmm1 -; X64-NEXT: seta %al -; X64-NEXT: retq +; SSE-LABEL: test_mm_ucomilt_ss: +; SSE: # %bb.0: +; SSE-NEXT: xorl %eax, %eax +; SSE-NEXT: ucomiss %xmm0, %xmm1 +; SSE-NEXT: seta %al +; SSE-NEXT: ret{{[l|q]}} +; +; AVX-LABEL: test_mm_ucomilt_ss: +; AVX: # %bb.0: +; AVX-NEXT: xorl %eax, %eax +; AVX-NEXT: vucomiss %xmm0, %xmm1 +; AVX-NEXT: seta %al +; AVX-NEXT: ret{{[l|q]}} %res = call i32 @llvm.x86.sse.ucomilt.ss(<4 x float> %a0, <4 x float> %a1) ret i32 %res } declare i32 @llvm.x86.sse.ucomilt.ss(<4 x float>, <4 x float>) nounwind readnone define i32 @test_mm_ucomineq_ss(<4 x float> %a0, <4 x float> %a1) nounwind { -; X32-LABEL: test_mm_ucomineq_ss: -; X32: # %bb.0: -; X32-NEXT: ucomiss %xmm1, %xmm0 -; X32-NEXT: setp %al -; X32-NEXT: setne %cl -; X32-NEXT: orb %al, %cl -; X32-NEXT: movzbl %cl, %eax -; X32-NEXT: retl -; -; X64-LABEL: test_mm_ucomineq_ss: -; X64: # %bb.0: -; X64-NEXT: ucomiss %xmm1, %xmm0 -; X64-NEXT: setp %al -; X64-NEXT: setne %cl -; X64-NEXT: orb %al, %cl -; X64-NEXT: movzbl %cl, %eax -; X64-NEXT: retq +; SSE-LABEL: test_mm_ucomineq_ss: +; SSE: # %bb.0: +; SSE-NEXT: ucomiss %xmm1, %xmm0 +; SSE-NEXT: setp %al +; SSE-NEXT: setne %cl +; SSE-NEXT: orb %al, %cl +; SSE-NEXT: movzbl %cl, %eax +; SSE-NEXT: ret{{[l|q]}} +; +; AVX-LABEL: test_mm_ucomineq_ss: +; AVX: # %bb.0: +; AVX-NEXT: vucomiss %xmm1, %xmm0 +; AVX-NEXT: setp %al +; AVX-NEXT: setne %cl +; AVX-NEXT: orb %al, %cl +; AVX-NEXT: movzbl %cl, %eax +; AVX-NEXT: ret{{[l|q]}} %res = call i32 @llvm.x86.sse.ucomineq.ss(<4 x float> %a0, <4 x float> %a1) ret i32 %res } declare i32 @llvm.x86.sse.ucomineq.ss(<4 x float>, <4 x 
float>) nounwind readnone define <4 x float> @test_mm_undefined_ps() { -; X32-LABEL: test_mm_undefined_ps: -; X32: # %bb.0: -; X32-NEXT: retl -; -; X64-LABEL: test_mm_undefined_ps: -; X64: # %bb.0: -; X64-NEXT: retq +; CHECK-LABEL: test_mm_undefined_ps: +; CHECK: # %bb.0: +; CHECK-NEXT: ret{{[l|q]}} ret <4 x float> undef } define <4 x float> @test_mm_unpackhi_ps(<4 x float> %a0, <4 x float> %a1) nounwind { -; X32-LABEL: test_mm_unpackhi_ps: -; X32: # %bb.0: -; X32-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; X32-NEXT: retl -; -; X64-LABEL: test_mm_unpackhi_ps: -; X64: # %bb.0: -; X64-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; X64-NEXT: retq +; SSE-LABEL: test_mm_unpackhi_ps: +; SSE: # %bb.0: +; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE-NEXT: ret{{[l|q]}} +; +; AVX-LABEL: test_mm_unpackhi_ps: +; AVX: # %bb.0: +; AVX-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX-NEXT: ret{{[l|q]}} %res = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 2, i32 6, i32 3, i32 7> ret <4 x float> %res } define <4 x float> @test_mm_unpacklo_ps(<4 x float> %a0, <4 x float> %a1) nounwind { -; X32-LABEL: test_mm_unpacklo_ps: -; X32: # %bb.0: -; X32-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; X32-NEXT: retl -; -; X64-LABEL: test_mm_unpacklo_ps: -; X64: # %bb.0: -; X64-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; X64-NEXT: retq +; SSE-LABEL: test_mm_unpacklo_ps: +; SSE: # %bb.0: +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: ret{{[l|q]}} +; +; AVX-LABEL: test_mm_unpacklo_ps: +; AVX: # %bb.0: +; AVX-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX-NEXT: ret{{[l|q]}} %res = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 0, i32 4, i32 1, i32 5> ret <4 x float> %res } define <4 x float> @test_mm_xor_ps(<4 x float> %a0, <4 x float> %a1) nounwind { -; X32-LABEL: test_mm_xor_ps: -; X32: # %bb.0: -; X32-NEXT: xorps %xmm1, %xmm0 -; X32-NEXT: retl -; -; X64-LABEL: test_mm_xor_ps: -; X64: # %bb.0: -; X64-NEXT: xorps %xmm1, %xmm0 -; X64-NEXT: retq +; SSE-LABEL: test_mm_xor_ps: +; SSE: # %bb.0: +; SSE-NEXT: xorps %xmm1, %xmm0 +; SSE-NEXT: ret{{[l|q]}} +; +; AVX-LABEL: test_mm_xor_ps: +; AVX: # %bb.0: +; AVX-NEXT: vxorps %xmm1, %xmm0, %xmm0 +; AVX-NEXT: ret{{[l|q]}} %arg0 = bitcast <4 x float> %a0 to <4 x i32> %arg1 = bitcast <4 x float> %a1 to <4 x i32> %res = xor <4 x i32> %arg0, %arg1 diff --git a/llvm/test/CodeGen/X86/sse-intrinsics-x86-upgrade.ll b/llvm/test/CodeGen/X86/sse-intrinsics-x86-upgrade.ll index 47c3c0b2261..60a455ae148 100644 --- a/llvm/test/CodeGen/X86/sse-intrinsics-x86-upgrade.ll +++ b/llvm/test/CodeGen/X86/sse-intrinsics-x86-upgrade.ll @@ -1,12 +1,44 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+sse2 | FileCheck %s +; RUN: llc < %s -mtriple=i386-apple-darwin -mattr=+sse -show-mc-encoding | FileCheck %s --check-prefixes=CHECK,X86,SSE,X86-SSE +; RUN: llc < %s -mtriple=i386-apple-darwin -mattr=+avx -show-mc-encoding | FileCheck %s --check-prefixes=CHECK,X86,AVX,X86-AVX,AVX1,X86-AVX1 +; RUN: llc < %s -mtriple=i386-apple-darwin -mattr=+avx512f,+avx512bw,+avx512dq,+avx512vl -show-mc-encoding | FileCheck %s --check-prefixes=CHECK,X86,AVX,X86-AVX,AVX512,X86-AVX512 +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=-sse2 -show-mc-encoding | FileCheck %s --check-prefixes=CHECK,X64,SSE,X64-SSE +; RUN: llc < 
%s -mtriple=x86_64-apple-darwin -mattr=+avx -show-mc-encoding | FileCheck %s --check-prefixes=CHECK,X64,AVX,X64-AVX,AVX1,X64-AVX1 +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f,+avx512bw,+avx512dq,+avx512vl -show-mc-encoding | FileCheck %s --check-prefixes=CHECK,X64,AVX,X64-AVX,AVX512,X64-AVX512 define void @test_x86_sse_storeu_ps(i8* %a0, <4 x float> %a1) { -; CHECK-LABEL: test_x86_sse_storeu_ps: -; CHECK: ## %bb.0: -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax -; CHECK-NEXT: movups %xmm0, (%eax) -; CHECK-NEXT: retl +; X86-SSE-LABEL: test_x86_sse_storeu_ps: +; X86-SSE: ## %bb.0: +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] +; X86-SSE-NEXT: movups %xmm0, (%eax) ## encoding: [0x0f,0x11,0x00] +; X86-SSE-NEXT: retl ## encoding: [0xc3] +; +; X86-AVX1-LABEL: test_x86_sse_storeu_ps: +; X86-AVX1: ## %bb.0: +; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] +; X86-AVX1-NEXT: vmovups %xmm0, (%eax) ## encoding: [0xc5,0xf8,0x11,0x00] +; X86-AVX1-NEXT: retl ## encoding: [0xc3] +; +; X86-AVX512-LABEL: test_x86_sse_storeu_ps: +; X86-AVX512: ## %bb.0: +; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] +; X86-AVX512-NEXT: vmovups %xmm0, (%eax) ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x11,0x00] +; X86-AVX512-NEXT: retl ## encoding: [0xc3] +; +; X64-SSE-LABEL: test_x86_sse_storeu_ps: +; X64-SSE: ## %bb.0: +; X64-SSE-NEXT: movups %xmm0, (%rdi) ## encoding: [0x0f,0x11,0x07] +; X64-SSE-NEXT: retq ## encoding: [0xc3] +; +; X64-AVX1-LABEL: test_x86_sse_storeu_ps: +; X64-AVX1: ## %bb.0: +; X64-AVX1-NEXT: vmovups %xmm0, (%rdi) ## encoding: [0xc5,0xf8,0x11,0x07] +; X64-AVX1-NEXT: retq ## encoding: [0xc3] +; +; X64-AVX512-LABEL: test_x86_sse_storeu_ps: +; X64-AVX512: ## %bb.0: +; X64-AVX512-NEXT: vmovups %xmm0, (%rdi) ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x11,0x07] +; X64-AVX512-NEXT: retq ## encoding: [0xc3] call void @llvm.x86.sse.storeu.ps(i8* %a0, <4 x float> %a1) ret void } @@ -14,10 +46,20 @@ declare void @llvm.x86.sse.storeu.ps(i8*, <4 x float>) nounwind define <4 x float> @test_x86_sse_add_ss(<4 x float> %a0, <4 x float> %a1) { -; CHECK-LABEL: test_x86_sse_add_ss: -; CHECK: ## %bb.0: -; CHECK-NEXT: addss %xmm1, %xmm0 -; CHECK-NEXT: retl +; SSE-LABEL: test_x86_sse_add_ss: +; SSE: ## %bb.0: +; SSE-NEXT: addss %xmm1, %xmm0 ## encoding: [0xf3,0x0f,0x58,0xc1] +; SSE-NEXT: ret{{[l|q]}} ## encoding: [0xc3] +; +; AVX1-LABEL: test_x86_sse_add_ss: +; AVX1: ## %bb.0: +; AVX1-NEXT: vaddss %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xfa,0x58,0xc1] +; AVX1-NEXT: ret{{[l|q]}} ## encoding: [0xc3] +; +; AVX512-LABEL: test_x86_sse_add_ss: +; AVX512: ## %bb.0: +; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x58,0xc1] +; AVX512-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call <4 x float> @llvm.x86.sse.add.ss(<4 x float> %a0, <4 x float> %a1) ; <<4 x float>> [#uses=1] ret <4 x float> %res } @@ -25,10 +67,20 @@ declare <4 x float> @llvm.x86.sse.add.ss(<4 x float>, <4 x float>) nounwind read define <4 x float> @test_x86_sse_sub_ss(<4 x float> %a0, <4 x float> %a1) { -; CHECK-LABEL: test_x86_sse_sub_ss: -; CHECK: ## %bb.0: -; CHECK-NEXT: subss %xmm1, %xmm0 -; CHECK-NEXT: retl +; SSE-LABEL: test_x86_sse_sub_ss: +; SSE: ## %bb.0: +; SSE-NEXT: subss %xmm1, %xmm0 ## encoding: [0xf3,0x0f,0x5c,0xc1] +; SSE-NEXT: ret{{[l|q]}} ## encoding: [0xc3] +; +; AVX1-LABEL: test_x86_sse_sub_ss: +; AVX1: ## %bb.0: +; AVX1-NEXT: vsubss %xmm1, %xmm0, %xmm0 ## encoding: 
[0xc5,0xfa,0x5c,0xc1] +; AVX1-NEXT: ret{{[l|q]}} ## encoding: [0xc3] +; +; AVX512-LABEL: test_x86_sse_sub_ss: +; AVX512: ## %bb.0: +; AVX512-NEXT: vsubss %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x5c,0xc1] +; AVX512-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call <4 x float> @llvm.x86.sse.sub.ss(<4 x float> %a0, <4 x float> %a1) ; <<4 x float>> [#uses=1] ret <4 x float> %res } @@ -36,10 +88,20 @@ declare <4 x float> @llvm.x86.sse.sub.ss(<4 x float>, <4 x float>) nounwind read define <4 x float> @test_x86_sse_mul_ss(<4 x float> %a0, <4 x float> %a1) { -; CHECK-LABEL: test_x86_sse_mul_ss: -; CHECK: ## %bb.0: -; CHECK-NEXT: mulss %xmm1, %xmm0 -; CHECK-NEXT: retl +; SSE-LABEL: test_x86_sse_mul_ss: +; SSE: ## %bb.0: +; SSE-NEXT: mulss %xmm1, %xmm0 ## encoding: [0xf3,0x0f,0x59,0xc1] +; SSE-NEXT: ret{{[l|q]}} ## encoding: [0xc3] +; +; AVX1-LABEL: test_x86_sse_mul_ss: +; AVX1: ## %bb.0: +; AVX1-NEXT: vmulss %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xfa,0x59,0xc1] +; AVX1-NEXT: ret{{[l|q]}} ## encoding: [0xc3] +; +; AVX512-LABEL: test_x86_sse_mul_ss: +; AVX512: ## %bb.0: +; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x59,0xc1] +; AVX512-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call <4 x float> @llvm.x86.sse.mul.ss(<4 x float> %a0, <4 x float> %a1) ; <<4 x float>> [#uses=1] ret <4 x float> %res } @@ -47,10 +109,20 @@ declare <4 x float> @llvm.x86.sse.mul.ss(<4 x float>, <4 x float>) nounwind read define <4 x float> @test_x86_sse_div_ss(<4 x float> %a0, <4 x float> %a1) { -; CHECK-LABEL: test_x86_sse_div_ss: -; CHECK: ## %bb.0: -; CHECK-NEXT: divss %xmm1, %xmm0 -; CHECK-NEXT: retl +; SSE-LABEL: test_x86_sse_div_ss: +; SSE: ## %bb.0: +; SSE-NEXT: divss %xmm1, %xmm0 ## encoding: [0xf3,0x0f,0x5e,0xc1] +; SSE-NEXT: ret{{[l|q]}} ## encoding: [0xc3] +; +; AVX1-LABEL: test_x86_sse_div_ss: +; AVX1: ## %bb.0: +; AVX1-NEXT: vdivss %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xfa,0x5e,0xc1] +; AVX1-NEXT: ret{{[l|q]}} ## encoding: [0xc3] +; +; AVX512-LABEL: test_x86_sse_div_ss: +; AVX512: ## %bb.0: +; AVX512-NEXT: vdivss %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x5e,0xc1] +; AVX512-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call <4 x float> @llvm.x86.sse.div.ss(<4 x float> %a0, <4 x float> %a1) ; <<4 x float>> [#uses=1] ret <4 x float> %res } @@ -58,10 +130,35 @@ declare <4 x float> @llvm.x86.sse.div.ss(<4 x float>, <4 x float>) nounwind read define <4 x float> @test_x86_sse_cvtsi2ss(<4 x float> %a0, i32 %a1) { -; CHECK-LABEL: test_x86_sse_cvtsi2ss: -; CHECK: ## %bb.0: -; CHECK-NEXT: cvtsi2ssl {{[0-9]+}}(%esp), %xmm0 -; CHECK-NEXT: retl +; X86-SSE-LABEL: test_x86_sse_cvtsi2ss: +; X86-SSE: ## %bb.0: +; X86-SSE-NEXT: cvtsi2ssl {{[0-9]+}}(%esp), %xmm0 ## encoding: [0xf3,0x0f,0x2a,0x44,0x24,0x04] +; X86-SSE-NEXT: retl ## encoding: [0xc3] +; +; X86-AVX1-LABEL: test_x86_sse_cvtsi2ss: +; X86-AVX1: ## %bb.0: +; X86-AVX1-NEXT: vcvtsi2ssl {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc5,0xfa,0x2a,0x44,0x24,0x04] +; X86-AVX1-NEXT: retl ## encoding: [0xc3] +; +; X86-AVX512-LABEL: test_x86_sse_cvtsi2ss: +; X86-AVX512: ## %bb.0: +; X86-AVX512-NEXT: vcvtsi2ssl {{[0-9]+}}(%esp), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x2a,0x44,0x24,0x04] +; X86-AVX512-NEXT: retl ## encoding: [0xc3] +; +; X64-SSE-LABEL: test_x86_sse_cvtsi2ss: +; X64-SSE: ## %bb.0: +; X64-SSE-NEXT: cvtsi2ssl %edi, %xmm0 ## encoding: [0xf3,0x0f,0x2a,0xc7] +; X64-SSE-NEXT: retq ## encoding: [0xc3] +; +; X64-AVX1-LABEL: test_x86_sse_cvtsi2ss: 
+; X64-AVX1: ## %bb.0: +; X64-AVX1-NEXT: vcvtsi2ssl %edi, %xmm0, %xmm0 ## encoding: [0xc5,0xfa,0x2a,0xc7] +; X64-AVX1-NEXT: retq ## encoding: [0xc3] +; +; X64-AVX512-LABEL: test_x86_sse_cvtsi2ss: +; X64-AVX512: ## %bb.0: +; X64-AVX512-NEXT: vcvtsi2ssl %edi, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x2a,0xc7] +; X64-AVX512-NEXT: retq ## encoding: [0xc3] %res = call <4 x float> @llvm.x86.sse.cvtsi2ss(<4 x float> %a0, i32 %a1) ; <<4 x float>> [#uses=1] ret <4 x float> %res } diff --git a/llvm/test/CodeGen/X86/sse-intrinsics-x86.ll b/llvm/test/CodeGen/X86/sse-intrinsics-x86.ll index 3eb64698905..0014da6b2ec 100644 --- a/llvm/test/CodeGen/X86/sse-intrinsics-x86.ll +++ b/llvm/test/CodeGen/X86/sse-intrinsics-x86.ll @@ -1,18 +1,21 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=i386-apple-darwin -mattr=-avx,+sse -show-mc-encoding | FileCheck %s --check-prefix=SSE -; RUN: llc < %s -mtriple=i386-apple-darwin -mattr=+avx2 -show-mc-encoding | FileCheck %s --check-prefix=VCHECK --check-prefix=AVX2 -; RUN: llc < %s -mtriple=i386-apple-darwin -mcpu=skx -show-mc-encoding | FileCheck %s --check-prefix=VCHECK --check-prefix=SKX +; RUN: llc < %s -mtriple=i386-apple-darwin -mattr=+sse -show-mc-encoding | FileCheck %s --check-prefixes=CHECK,X86,SSE,X86-SSE +; RUN: llc < %s -mtriple=i386-apple-darwin -mattr=+avx -show-mc-encoding | FileCheck %s --check-prefixes=CHECK,X86,AVX,X86-AVX,AVX1,X86-AVX1 +; RUN: llc < %s -mtriple=i386-apple-darwin -mattr=+avx512f,+avx512bw,+avx512dq,+avx512vl -show-mc-encoding | FileCheck %s --check-prefixes=CHECK,X86,AVX,X86-AVX,AVX512,X86-AVX512 +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=-sse2 -show-mc-encoding | FileCheck %s --check-prefixes=CHECK,X64,SSE,X64-SSE +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx -show-mc-encoding | FileCheck %s --check-prefixes=CHECK,X64,AVX,X64-AVX,AVX1,X64-AVX1 +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f,+avx512bw,+avx512dq,+avx512vl -show-mc-encoding | FileCheck %s --check-prefixes=CHECK,X64,AVX,X64-AVX,AVX512,X64-AVX512 define <4 x float> @test_x86_sse_cmp_ps(<4 x float> %a0, <4 x float> %a1) { ; SSE-LABEL: test_x86_sse_cmp_ps: ; SSE: ## %bb.0: ; SSE-NEXT: cmpordps %xmm1, %xmm0 ## encoding: [0x0f,0xc2,0xc1,0x07] -; SSE-NEXT: retl ## encoding: [0xc3] +; SSE-NEXT: ret{{[l|q]}} ## encoding: [0xc3] ; -; VCHECK-LABEL: test_x86_sse_cmp_ps: -; VCHECK: ## %bb.0: -; VCHECK-NEXT: vcmpordps %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0xc2,0xc1,0x07] -; VCHECK-NEXT: retl ## encoding: [0xc3] +; AVX-LABEL: test_x86_sse_cmp_ps: +; AVX: ## %bb.0: +; AVX-NEXT: vcmpordps %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0xc2,0xc1,0x07] +; AVX-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call <4 x float> @llvm.x86.sse.cmp.ps(<4 x float> %a0, <4 x float> %a1, i8 7) ; <<4 x float>> [#uses=1] ret <4 x float> %res } @@ -23,12 +26,12 @@ define <4 x float> @test_x86_sse_cmp_ss(<4 x float> %a0, <4 x float> %a1) { ; SSE-LABEL: test_x86_sse_cmp_ss: ; SSE: ## %bb.0: ; SSE-NEXT: cmpordss %xmm1, %xmm0 ## encoding: [0xf3,0x0f,0xc2,0xc1,0x07] -; SSE-NEXT: retl ## encoding: [0xc3] +; SSE-NEXT: ret{{[l|q]}} ## encoding: [0xc3] ; -; VCHECK-LABEL: test_x86_sse_cmp_ss: -; VCHECK: ## %bb.0: -; VCHECK-NEXT: vcmpordss %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xfa,0xc2,0xc1,0x07] -; VCHECK-NEXT: retl ## encoding: [0xc3] +; AVX-LABEL: test_x86_sse_cmp_ss: +; AVX: ## %bb.0: +; AVX-NEXT: vcmpordss %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xfa,0xc2,0xc1,0x07] +; AVX-NEXT: ret{{[l|q]}} 
## encoding: [0xc3] %res = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a0, <4 x float> %a1, i8 7) ; <<4 x float>> [#uses=1] ret <4 x float> %res } @@ -43,25 +46,25 @@ define i32 @test_x86_sse_comieq_ss(<4 x float> %a0, <4 x float> %a1) { ; SSE-NEXT: sete %cl ## encoding: [0x0f,0x94,0xc1] ; SSE-NEXT: andb %al, %cl ## encoding: [0x20,0xc1] ; SSE-NEXT: movzbl %cl, %eax ## encoding: [0x0f,0xb6,0xc1] -; SSE-NEXT: retl ## encoding: [0xc3] -; -; AVX2-LABEL: test_x86_sse_comieq_ss: -; AVX2: ## %bb.0: -; AVX2-NEXT: vcomiss %xmm1, %xmm0 ## encoding: [0xc5,0xf8,0x2f,0xc1] -; AVX2-NEXT: setnp %al ## encoding: [0x0f,0x9b,0xc0] -; AVX2-NEXT: sete %cl ## encoding: [0x0f,0x94,0xc1] -; AVX2-NEXT: andb %al, %cl ## encoding: [0x20,0xc1] -; AVX2-NEXT: movzbl %cl, %eax ## encoding: [0x0f,0xb6,0xc1] -; AVX2-NEXT: retl ## encoding: [0xc3] -; -; SKX-LABEL: test_x86_sse_comieq_ss: -; SKX: ## %bb.0: -; SKX-NEXT: vcomiss %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2f,0xc1] -; SKX-NEXT: setnp %al ## encoding: [0x0f,0x9b,0xc0] -; SKX-NEXT: sete %cl ## encoding: [0x0f,0x94,0xc1] -; SKX-NEXT: andb %al, %cl ## encoding: [0x20,0xc1] -; SKX-NEXT: movzbl %cl, %eax ## encoding: [0x0f,0xb6,0xc1] -; SKX-NEXT: retl ## encoding: [0xc3] +; SSE-NEXT: ret{{[l|q]}} ## encoding: [0xc3] +; +; AVX1-LABEL: test_x86_sse_comieq_ss: +; AVX1: ## %bb.0: +; AVX1-NEXT: vcomiss %xmm1, %xmm0 ## encoding: [0xc5,0xf8,0x2f,0xc1] +; AVX1-NEXT: setnp %al ## encoding: [0x0f,0x9b,0xc0] +; AVX1-NEXT: sete %cl ## encoding: [0x0f,0x94,0xc1] +; AVX1-NEXT: andb %al, %cl ## encoding: [0x20,0xc1] +; AVX1-NEXT: movzbl %cl, %eax ## encoding: [0x0f,0xb6,0xc1] +; AVX1-NEXT: ret{{[l|q]}} ## encoding: [0xc3] +; +; AVX512-LABEL: test_x86_sse_comieq_ss: +; AVX512: ## %bb.0: +; AVX512-NEXT: vcomiss %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2f,0xc1] +; AVX512-NEXT: setnp %al ## encoding: [0x0f,0x9b,0xc0] +; AVX512-NEXT: sete %cl ## encoding: [0x0f,0x94,0xc1] +; AVX512-NEXT: andb %al, %cl ## encoding: [0x20,0xc1] +; AVX512-NEXT: movzbl %cl, %eax ## encoding: [0x0f,0xb6,0xc1] +; AVX512-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call i32 @llvm.x86.sse.comieq.ss(<4 x float> %a0, <4 x float> %a1) ; <i32> [#uses=1] ret i32 %res } @@ -74,21 +77,21 @@ define i32 @test_x86_sse_comige_ss(<4 x float> %a0, <4 x float> %a1) { ; SSE-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0] ; SSE-NEXT: comiss %xmm1, %xmm0 ## encoding: [0x0f,0x2f,0xc1] ; SSE-NEXT: setae %al ## encoding: [0x0f,0x93,0xc0] -; SSE-NEXT: retl ## encoding: [0xc3] -; -; AVX2-LABEL: test_x86_sse_comige_ss: -; AVX2: ## %bb.0: -; AVX2-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0] -; AVX2-NEXT: vcomiss %xmm1, %xmm0 ## encoding: [0xc5,0xf8,0x2f,0xc1] -; AVX2-NEXT: setae %al ## encoding: [0x0f,0x93,0xc0] -; AVX2-NEXT: retl ## encoding: [0xc3] -; -; SKX-LABEL: test_x86_sse_comige_ss: -; SKX: ## %bb.0: -; SKX-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0] -; SKX-NEXT: vcomiss %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2f,0xc1] -; SKX-NEXT: setae %al ## encoding: [0x0f,0x93,0xc0] -; SKX-NEXT: retl ## encoding: [0xc3] +; SSE-NEXT: ret{{[l|q]}} ## encoding: [0xc3] +; +; AVX1-LABEL: test_x86_sse_comige_ss: +; AVX1: ## %bb.0: +; AVX1-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0] +; AVX1-NEXT: vcomiss %xmm1, %xmm0 ## encoding: [0xc5,0xf8,0x2f,0xc1] +; AVX1-NEXT: setae %al ## encoding: [0x0f,0x93,0xc0] +; AVX1-NEXT: ret{{[l|q]}} ## encoding: [0xc3] +; +; AVX512-LABEL: test_x86_sse_comige_ss: +; AVX512: ## %bb.0: +; AVX512-NEXT: xorl %eax, %eax ## encoding: 
[0x31,0xc0] +; AVX512-NEXT: vcomiss %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2f,0xc1] +; AVX512-NEXT: setae %al ## encoding: [0x0f,0x93,0xc0] +; AVX512-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call i32 @llvm.x86.sse.comige.ss(<4 x float> %a0, <4 x float> %a1) ; <i32> [#uses=1] ret i32 %res } @@ -101,21 +104,21 @@ define i32 @test_x86_sse_comigt_ss(<4 x float> %a0, <4 x float> %a1) { ; SSE-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0] ; SSE-NEXT: comiss %xmm1, %xmm0 ## encoding: [0x0f,0x2f,0xc1] ; SSE-NEXT: seta %al ## encoding: [0x0f,0x97,0xc0] -; SSE-NEXT: retl ## encoding: [0xc3] -; -; AVX2-LABEL: test_x86_sse_comigt_ss: -; AVX2: ## %bb.0: -; AVX2-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0] -; AVX2-NEXT: vcomiss %xmm1, %xmm0 ## encoding: [0xc5,0xf8,0x2f,0xc1] -; AVX2-NEXT: seta %al ## encoding: [0x0f,0x97,0xc0] -; AVX2-NEXT: retl ## encoding: [0xc3] -; -; SKX-LABEL: test_x86_sse_comigt_ss: -; SKX: ## %bb.0: -; SKX-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0] -; SKX-NEXT: vcomiss %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2f,0xc1] -; SKX-NEXT: seta %al ## encoding: [0x0f,0x97,0xc0] -; SKX-NEXT: retl ## encoding: [0xc3] +; SSE-NEXT: ret{{[l|q]}} ## encoding: [0xc3] +; +; AVX1-LABEL: test_x86_sse_comigt_ss: +; AVX1: ## %bb.0: +; AVX1-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0] +; AVX1-NEXT: vcomiss %xmm1, %xmm0 ## encoding: [0xc5,0xf8,0x2f,0xc1] +; AVX1-NEXT: seta %al ## encoding: [0x0f,0x97,0xc0] +; AVX1-NEXT: ret{{[l|q]}} ## encoding: [0xc3] +; +; AVX512-LABEL: test_x86_sse_comigt_ss: +; AVX512: ## %bb.0: +; AVX512-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0] +; AVX512-NEXT: vcomiss %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2f,0xc1] +; AVX512-NEXT: seta %al ## encoding: [0x0f,0x97,0xc0] +; AVX512-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call i32 @llvm.x86.sse.comigt.ss(<4 x float> %a0, <4 x float> %a1) ; <i32> [#uses=1] ret i32 %res } @@ -128,21 +131,21 @@ define i32 @test_x86_sse_comile_ss(<4 x float> %a0, <4 x float> %a1) { ; SSE-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0] ; SSE-NEXT: comiss %xmm0, %xmm1 ## encoding: [0x0f,0x2f,0xc8] ; SSE-NEXT: setae %al ## encoding: [0x0f,0x93,0xc0] -; SSE-NEXT: retl ## encoding: [0xc3] -; -; AVX2-LABEL: test_x86_sse_comile_ss: -; AVX2: ## %bb.0: -; AVX2-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0] -; AVX2-NEXT: vcomiss %xmm0, %xmm1 ## encoding: [0xc5,0xf8,0x2f,0xc8] -; AVX2-NEXT: setae %al ## encoding: [0x0f,0x93,0xc0] -; AVX2-NEXT: retl ## encoding: [0xc3] -; -; SKX-LABEL: test_x86_sse_comile_ss: -; SKX: ## %bb.0: -; SKX-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0] -; SKX-NEXT: vcomiss %xmm0, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2f,0xc8] -; SKX-NEXT: setae %al ## encoding: [0x0f,0x93,0xc0] -; SKX-NEXT: retl ## encoding: [0xc3] +; SSE-NEXT: ret{{[l|q]}} ## encoding: [0xc3] +; +; AVX1-LABEL: test_x86_sse_comile_ss: +; AVX1: ## %bb.0: +; AVX1-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0] +; AVX1-NEXT: vcomiss %xmm0, %xmm1 ## encoding: [0xc5,0xf8,0x2f,0xc8] +; AVX1-NEXT: setae %al ## encoding: [0x0f,0x93,0xc0] +; AVX1-NEXT: ret{{[l|q]}} ## encoding: [0xc3] +; +; AVX512-LABEL: test_x86_sse_comile_ss: +; AVX512: ## %bb.0: +; AVX512-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0] +; AVX512-NEXT: vcomiss %xmm0, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2f,0xc8] +; AVX512-NEXT: setae %al ## encoding: [0x0f,0x93,0xc0] +; AVX512-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call i32 @llvm.x86.sse.comile.ss(<4 x float> %a0, <4 x float> 
%a1) ; <i32> [#uses=1] ret i32 %res } @@ -155,21 +158,21 @@ define i32 @test_x86_sse_comilt_ss(<4 x float> %a0, <4 x float> %a1) { ; SSE-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0] ; SSE-NEXT: comiss %xmm0, %xmm1 ## encoding: [0x0f,0x2f,0xc8] ; SSE-NEXT: seta %al ## encoding: [0x0f,0x97,0xc0] -; SSE-NEXT: retl ## encoding: [0xc3] -; -; AVX2-LABEL: test_x86_sse_comilt_ss: -; AVX2: ## %bb.0: -; AVX2-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0] -; AVX2-NEXT: vcomiss %xmm0, %xmm1 ## encoding: [0xc5,0xf8,0x2f,0xc8] -; AVX2-NEXT: seta %al ## encoding: [0x0f,0x97,0xc0] -; AVX2-NEXT: retl ## encoding: [0xc3] -; -; SKX-LABEL: test_x86_sse_comilt_ss: -; SKX: ## %bb.0: -; SKX-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0] -; SKX-NEXT: vcomiss %xmm0, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2f,0xc8] -; SKX-NEXT: seta %al ## encoding: [0x0f,0x97,0xc0] -; SKX-NEXT: retl ## encoding: [0xc3] +; SSE-NEXT: ret{{[l|q]}} ## encoding: [0xc3] +; +; AVX1-LABEL: test_x86_sse_comilt_ss: +; AVX1: ## %bb.0: +; AVX1-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0] +; AVX1-NEXT: vcomiss %xmm0, %xmm1 ## encoding: [0xc5,0xf8,0x2f,0xc8] +; AVX1-NEXT: seta %al ## encoding: [0x0f,0x97,0xc0] +; AVX1-NEXT: ret{{[l|q]}} ## encoding: [0xc3] +; +; AVX512-LABEL: test_x86_sse_comilt_ss: +; AVX512: ## %bb.0: +; AVX512-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0] +; AVX512-NEXT: vcomiss %xmm0, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2f,0xc8] +; AVX512-NEXT: seta %al ## encoding: [0x0f,0x97,0xc0] +; AVX512-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call i32 @llvm.x86.sse.comilt.ss(<4 x float> %a0, <4 x float> %a1) ; <i32> [#uses=1] ret i32 %res } @@ -184,25 +187,25 @@ define i32 @test_x86_sse_comineq_ss(<4 x float> %a0, <4 x float> %a1) { ; SSE-NEXT: setne %cl ## encoding: [0x0f,0x95,0xc1] ; SSE-NEXT: orb %al, %cl ## encoding: [0x08,0xc1] ; SSE-NEXT: movzbl %cl, %eax ## encoding: [0x0f,0xb6,0xc1] -; SSE-NEXT: retl ## encoding: [0xc3] -; -; AVX2-LABEL: test_x86_sse_comineq_ss: -; AVX2: ## %bb.0: -; AVX2-NEXT: vcomiss %xmm1, %xmm0 ## encoding: [0xc5,0xf8,0x2f,0xc1] -; AVX2-NEXT: setp %al ## encoding: [0x0f,0x9a,0xc0] -; AVX2-NEXT: setne %cl ## encoding: [0x0f,0x95,0xc1] -; AVX2-NEXT: orb %al, %cl ## encoding: [0x08,0xc1] -; AVX2-NEXT: movzbl %cl, %eax ## encoding: [0x0f,0xb6,0xc1] -; AVX2-NEXT: retl ## encoding: [0xc3] -; -; SKX-LABEL: test_x86_sse_comineq_ss: -; SKX: ## %bb.0: -; SKX-NEXT: vcomiss %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2f,0xc1] -; SKX-NEXT: setp %al ## encoding: [0x0f,0x9a,0xc0] -; SKX-NEXT: setne %cl ## encoding: [0x0f,0x95,0xc1] -; SKX-NEXT: orb %al, %cl ## encoding: [0x08,0xc1] -; SKX-NEXT: movzbl %cl, %eax ## encoding: [0x0f,0xb6,0xc1] -; SKX-NEXT: retl ## encoding: [0xc3] +; SSE-NEXT: ret{{[l|q]}} ## encoding: [0xc3] +; +; AVX1-LABEL: test_x86_sse_comineq_ss: +; AVX1: ## %bb.0: +; AVX1-NEXT: vcomiss %xmm1, %xmm0 ## encoding: [0xc5,0xf8,0x2f,0xc1] +; AVX1-NEXT: setp %al ## encoding: [0x0f,0x9a,0xc0] +; AVX1-NEXT: setne %cl ## encoding: [0x0f,0x95,0xc1] +; AVX1-NEXT: orb %al, %cl ## encoding: [0x08,0xc1] +; AVX1-NEXT: movzbl %cl, %eax ## encoding: [0x0f,0xb6,0xc1] +; AVX1-NEXT: ret{{[l|q]}} ## encoding: [0xc3] +; +; AVX512-LABEL: test_x86_sse_comineq_ss: +; AVX512: ## %bb.0: +; AVX512-NEXT: vcomiss %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2f,0xc1] +; AVX512-NEXT: setp %al ## encoding: [0x0f,0x9a,0xc0] +; AVX512-NEXT: setne %cl ## encoding: [0x0f,0x95,0xc1] +; AVX512-NEXT: orb %al, %cl ## encoding: [0x08,0xc1] +; AVX512-NEXT: 
movzbl %cl, %eax ## encoding: [0x0f,0xb6,0xc1] +; AVX512-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call i32 @llvm.x86.sse.comineq.ss(<4 x float> %a0, <4 x float> %a1) ; <i32> [#uses=1] ret i32 %res } @@ -213,17 +216,17 @@ define i32 @test_x86_sse_cvtss2si(<4 x float> %a0) { ; SSE-LABEL: test_x86_sse_cvtss2si: ; SSE: ## %bb.0: ; SSE-NEXT: cvtss2si %xmm0, %eax ## encoding: [0xf3,0x0f,0x2d,0xc0] -; SSE-NEXT: retl ## encoding: [0xc3] +; SSE-NEXT: ret{{[l|q]}} ## encoding: [0xc3] ; -; AVX2-LABEL: test_x86_sse_cvtss2si: -; AVX2: ## %bb.0: -; AVX2-NEXT: vcvtss2si %xmm0, %eax ## encoding: [0xc5,0xfa,0x2d,0xc0] -; AVX2-NEXT: retl ## encoding: [0xc3] +; AVX1-LABEL: test_x86_sse_cvtss2si: +; AVX1: ## %bb.0: +; AVX1-NEXT: vcvtss2si %xmm0, %eax ## encoding: [0xc5,0xfa,0x2d,0xc0] +; AVX1-NEXT: ret{{[l|q]}} ## encoding: [0xc3] ; -; SKX-LABEL: test_x86_sse_cvtss2si: -; SKX: ## %bb.0: -; SKX-NEXT: vcvtss2si %xmm0, %eax ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x2d,0xc0] -; SKX-NEXT: retl ## encoding: [0xc3] +; AVX512-LABEL: test_x86_sse_cvtss2si: +; AVX512: ## %bb.0: +; AVX512-NEXT: vcvtss2si %xmm0, %eax ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x2d,0xc0] +; AVX512-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call i32 @llvm.x86.sse.cvtss2si(<4 x float> %a0) ; <i32> [#uses=1] ret i32 %res } @@ -234,17 +237,17 @@ define i32 @test_x86_sse_cvttss2si(<4 x float> %a0) { ; SSE-LABEL: test_x86_sse_cvttss2si: ; SSE: ## %bb.0: ; SSE-NEXT: cvttss2si %xmm0, %eax ## encoding: [0xf3,0x0f,0x2c,0xc0] -; SSE-NEXT: retl ## encoding: [0xc3] +; SSE-NEXT: ret{{[l|q]}} ## encoding: [0xc3] ; -; AVX2-LABEL: test_x86_sse_cvttss2si: -; AVX2: ## %bb.0: -; AVX2-NEXT: vcvttss2si %xmm0, %eax ## encoding: [0xc5,0xfa,0x2c,0xc0] -; AVX2-NEXT: retl ## encoding: [0xc3] +; AVX1-LABEL: test_x86_sse_cvttss2si: +; AVX1: ## %bb.0: +; AVX1-NEXT: vcvttss2si %xmm0, %eax ## encoding: [0xc5,0xfa,0x2c,0xc0] +; AVX1-NEXT: ret{{[l|q]}} ## encoding: [0xc3] ; -; SKX-LABEL: test_x86_sse_cvttss2si: -; SKX: ## %bb.0: -; SKX-NEXT: vcvttss2si %xmm0, %eax ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x2c,0xc0] -; SKX-NEXT: retl ## encoding: [0xc3] +; AVX512-LABEL: test_x86_sse_cvttss2si: +; AVX512: ## %bb.0: +; AVX512-NEXT: vcvttss2si %xmm0, %eax ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x2c,0xc0] +; AVX512-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call i32 @llvm.x86.sse.cvttss2si(<4 x float> %a0) ; <i32> [#uses=1] ret i32 %res } @@ -252,17 +255,27 @@ declare i32 @llvm.x86.sse.cvttss2si(<4 x float>) nounwind readnone define void @test_x86_sse_ldmxcsr(i8* %a0) { -; SSE-LABEL: test_x86_sse_ldmxcsr: -; SSE: ## %bb.0: -; SSE-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] -; SSE-NEXT: ldmxcsr (%eax) ## encoding: [0x0f,0xae,0x10] -; SSE-NEXT: retl ## encoding: [0xc3] -; -; VCHECK-LABEL: test_x86_sse_ldmxcsr: -; VCHECK: ## %bb.0: -; VCHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] -; VCHECK-NEXT: vldmxcsr (%eax) ## encoding: [0xc5,0xf8,0xae,0x10] -; VCHECK-NEXT: retl ## encoding: [0xc3] +; X86-SSE-LABEL: test_x86_sse_ldmxcsr: +; X86-SSE: ## %bb.0: +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] +; X86-SSE-NEXT: ldmxcsr (%eax) ## encoding: [0x0f,0xae,0x10] +; X86-SSE-NEXT: retl ## encoding: [0xc3] +; +; X86-AVX-LABEL: test_x86_sse_ldmxcsr: +; X86-AVX: ## %bb.0: +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] +; X86-AVX-NEXT: vldmxcsr (%eax) ## encoding: [0xc5,0xf8,0xae,0x10] +; X86-AVX-NEXT: retl ## encoding: [0xc3] +; +; 
X64-SSE-LABEL: test_x86_sse_ldmxcsr: +; X64-SSE: ## %bb.0: +; X64-SSE-NEXT: ldmxcsr (%rdi) ## encoding: [0x0f,0xae,0x17] +; X64-SSE-NEXT: retq ## encoding: [0xc3] +; +; X64-AVX-LABEL: test_x86_sse_ldmxcsr: +; X64-AVX: ## %bb.0: +; X64-AVX-NEXT: vldmxcsr (%rdi) ## encoding: [0xc5,0xf8,0xae,0x17] +; X64-AVX-NEXT: retq ## encoding: [0xc3] call void @llvm.x86.sse.ldmxcsr(i8* %a0) ret void } @@ -274,17 +287,17 @@ define <4 x float> @test_x86_sse_max_ps(<4 x float> %a0, <4 x float> %a1) { ; SSE-LABEL: test_x86_sse_max_ps: ; SSE: ## %bb.0: ; SSE-NEXT: maxps %xmm1, %xmm0 ## encoding: [0x0f,0x5f,0xc1] -; SSE-NEXT: retl ## encoding: [0xc3] +; SSE-NEXT: ret{{[l|q]}} ## encoding: [0xc3] ; -; AVX2-LABEL: test_x86_sse_max_ps: -; AVX2: ## %bb.0: -; AVX2-NEXT: vmaxps %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x5f,0xc1] -; AVX2-NEXT: retl ## encoding: [0xc3] +; AVX1-LABEL: test_x86_sse_max_ps: +; AVX1: ## %bb.0: +; AVX1-NEXT: vmaxps %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x5f,0xc1] +; AVX1-NEXT: ret{{[l|q]}} ## encoding: [0xc3] ; -; SKX-LABEL: test_x86_sse_max_ps: -; SKX: ## %bb.0: -; SKX-NEXT: vmaxps %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x5f,0xc1] -; SKX-NEXT: retl ## encoding: [0xc3] +; AVX512-LABEL: test_x86_sse_max_ps: +; AVX512: ## %bb.0: +; AVX512-NEXT: vmaxps %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x5f,0xc1] +; AVX512-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %a0, <4 x float> %a1) ; <<4 x float>> [#uses=1] ret <4 x float> %res } @@ -295,17 +308,17 @@ define <4 x float> @test_x86_sse_max_ss(<4 x float> %a0, <4 x float> %a1) { ; SSE-LABEL: test_x86_sse_max_ss: ; SSE: ## %bb.0: ; SSE-NEXT: maxss %xmm1, %xmm0 ## encoding: [0xf3,0x0f,0x5f,0xc1] -; SSE-NEXT: retl ## encoding: [0xc3] +; SSE-NEXT: ret{{[l|q]}} ## encoding: [0xc3] ; -; AVX2-LABEL: test_x86_sse_max_ss: -; AVX2: ## %bb.0: -; AVX2-NEXT: vmaxss %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xfa,0x5f,0xc1] -; AVX2-NEXT: retl ## encoding: [0xc3] +; AVX1-LABEL: test_x86_sse_max_ss: +; AVX1: ## %bb.0: +; AVX1-NEXT: vmaxss %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xfa,0x5f,0xc1] +; AVX1-NEXT: ret{{[l|q]}} ## encoding: [0xc3] ; -; SKX-LABEL: test_x86_sse_max_ss: -; SKX: ## %bb.0: -; SKX-NEXT: vmaxss %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x5f,0xc1] -; SKX-NEXT: retl ## encoding: [0xc3] +; AVX512-LABEL: test_x86_sse_max_ss: +; AVX512: ## %bb.0: +; AVX512-NEXT: vmaxss %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x5f,0xc1] +; AVX512-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call <4 x float> @llvm.x86.sse.max.ss(<4 x float> %a0, <4 x float> %a1) ; <<4 x float>> [#uses=1] ret <4 x float> %res } @@ -316,17 +329,17 @@ define <4 x float> @test_x86_sse_min_ps(<4 x float> %a0, <4 x float> %a1) { ; SSE-LABEL: test_x86_sse_min_ps: ; SSE: ## %bb.0: ; SSE-NEXT: minps %xmm1, %xmm0 ## encoding: [0x0f,0x5d,0xc1] -; SSE-NEXT: retl ## encoding: [0xc3] +; SSE-NEXT: ret{{[l|q]}} ## encoding: [0xc3] ; -; AVX2-LABEL: test_x86_sse_min_ps: -; AVX2: ## %bb.0: -; AVX2-NEXT: vminps %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x5d,0xc1] -; AVX2-NEXT: retl ## encoding: [0xc3] +; AVX1-LABEL: test_x86_sse_min_ps: +; AVX1: ## %bb.0: +; AVX1-NEXT: vminps %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x5d,0xc1] +; AVX1-NEXT: ret{{[l|q]}} ## encoding: [0xc3] ; -; SKX-LABEL: test_x86_sse_min_ps: -; SKX: ## %bb.0: -; SKX-NEXT: vminps %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x5d,0xc1] -; SKX-NEXT: 
retl ## encoding: [0xc3] +; AVX512-LABEL: test_x86_sse_min_ps: +; AVX512: ## %bb.0: +; AVX512-NEXT: vminps %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x5d,0xc1] +; AVX512-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %a0, <4 x float> %a1) ; <<4 x float>> [#uses=1] ret <4 x float> %res } @@ -337,17 +350,17 @@ define <4 x float> @test_x86_sse_min_ss(<4 x float> %a0, <4 x float> %a1) { ; SSE-LABEL: test_x86_sse_min_ss: ; SSE: ## %bb.0: ; SSE-NEXT: minss %xmm1, %xmm0 ## encoding: [0xf3,0x0f,0x5d,0xc1] -; SSE-NEXT: retl ## encoding: [0xc3] +; SSE-NEXT: ret{{[l|q]}} ## encoding: [0xc3] ; -; AVX2-LABEL: test_x86_sse_min_ss: -; AVX2: ## %bb.0: -; AVX2-NEXT: vminss %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xfa,0x5d,0xc1] -; AVX2-NEXT: retl ## encoding: [0xc3] +; AVX1-LABEL: test_x86_sse_min_ss: +; AVX1: ## %bb.0: +; AVX1-NEXT: vminss %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xfa,0x5d,0xc1] +; AVX1-NEXT: ret{{[l|q]}} ## encoding: [0xc3] ; -; SKX-LABEL: test_x86_sse_min_ss: -; SKX: ## %bb.0: -; SKX-NEXT: vminss %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x5d,0xc1] -; SKX-NEXT: retl ## encoding: [0xc3] +; AVX512-LABEL: test_x86_sse_min_ss: +; AVX512: ## %bb.0: +; AVX512-NEXT: vminss %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x5d,0xc1] +; AVX512-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call <4 x float> @llvm.x86.sse.min.ss(<4 x float> %a0, <4 x float> %a1) ; <<4 x float>> [#uses=1] ret <4 x float> %res } @@ -358,12 +371,12 @@ define i32 @test_x86_sse_movmsk_ps(<4 x float> %a0) { ; SSE-LABEL: test_x86_sse_movmsk_ps: ; SSE: ## %bb.0: ; SSE-NEXT: movmskps %xmm0, %eax ## encoding: [0x0f,0x50,0xc0] -; SSE-NEXT: retl ## encoding: [0xc3] +; SSE-NEXT: ret{{[l|q]}} ## encoding: [0xc3] ; -; VCHECK-LABEL: test_x86_sse_movmsk_ps: -; VCHECK: ## %bb.0: -; VCHECK-NEXT: vmovmskps %xmm0, %eax ## encoding: [0xc5,0xf8,0x50,0xc0] -; VCHECK-NEXT: retl ## encoding: [0xc3] +; AVX-LABEL: test_x86_sse_movmsk_ps: +; AVX: ## %bb.0: +; AVX-NEXT: vmovmskps %xmm0, %eax ## encoding: [0xc5,0xf8,0x50,0xc0] +; AVX-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %a0) ; <i32> [#uses=1] ret i32 %res } @@ -375,12 +388,12 @@ define <4 x float> @test_x86_sse_rcp_ps(<4 x float> %a0) { ; SSE-LABEL: test_x86_sse_rcp_ps: ; SSE: ## %bb.0: ; SSE-NEXT: rcpps %xmm0, %xmm0 ## encoding: [0x0f,0x53,0xc0] -; SSE-NEXT: retl ## encoding: [0xc3] +; SSE-NEXT: ret{{[l|q]}} ## encoding: [0xc3] ; -; VCHECK-LABEL: test_x86_sse_rcp_ps: -; VCHECK: ## %bb.0: -; VCHECK-NEXT: vrcpps %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x53,0xc0] -; VCHECK-NEXT: retl ## encoding: [0xc3] +; AVX-LABEL: test_x86_sse_rcp_ps: +; AVX: ## %bb.0: +; AVX-NEXT: vrcpps %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x53,0xc0] +; AVX-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call <4 x float> @llvm.x86.sse.rcp.ps(<4 x float> %a0) ; <<4 x float>> [#uses=1] ret <4 x float> %res } @@ -391,12 +404,12 @@ define <4 x float> @test_x86_sse_rcp_ss(<4 x float> %a0) { ; SSE-LABEL: test_x86_sse_rcp_ss: ; SSE: ## %bb.0: ; SSE-NEXT: rcpss %xmm0, %xmm0 ## encoding: [0xf3,0x0f,0x53,0xc0] -; SSE-NEXT: retl ## encoding: [0xc3] +; SSE-NEXT: ret{{[l|q]}} ## encoding: [0xc3] ; -; VCHECK-LABEL: test_x86_sse_rcp_ss: -; VCHECK: ## %bb.0: -; VCHECK-NEXT: vrcpss %xmm0, %xmm0, %xmm0 ## encoding: [0xc5,0xfa,0x53,0xc0] -; VCHECK-NEXT: retl ## encoding: [0xc3] +; AVX-LABEL: test_x86_sse_rcp_ss: +; AVX: ## %bb.0: +; AVX-NEXT: vrcpss %xmm0, %xmm0, %xmm0 ## encoding: 
[0xc5,0xfa,0x53,0xc0] +; AVX-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> %a0) ; <<4 x float>> [#uses=1] ret <4 x float> %res } @@ -407,12 +420,12 @@ define <4 x float> @test_x86_sse_rsqrt_ps(<4 x float> %a0) { ; SSE-LABEL: test_x86_sse_rsqrt_ps: ; SSE: ## %bb.0: ; SSE-NEXT: rsqrtps %xmm0, %xmm0 ## encoding: [0x0f,0x52,0xc0] -; SSE-NEXT: retl ## encoding: [0xc3] +; SSE-NEXT: ret{{[l|q]}} ## encoding: [0xc3] ; -; VCHECK-LABEL: test_x86_sse_rsqrt_ps: -; VCHECK: ## %bb.0: -; VCHECK-NEXT: vrsqrtps %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x52,0xc0] -; VCHECK-NEXT: retl ## encoding: [0xc3] +; AVX-LABEL: test_x86_sse_rsqrt_ps: +; AVX: ## %bb.0: +; AVX-NEXT: vrsqrtps %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x52,0xc0] +; AVX-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float> %a0) ; <<4 x float>> [#uses=1] ret <4 x float> %res } @@ -423,12 +436,12 @@ define <4 x float> @test_x86_sse_rsqrt_ss(<4 x float> %a0) { ; SSE-LABEL: test_x86_sse_rsqrt_ss: ; SSE: ## %bb.0: ; SSE-NEXT: rsqrtss %xmm0, %xmm0 ## encoding: [0xf3,0x0f,0x52,0xc0] -; SSE-NEXT: retl ## encoding: [0xc3] +; SSE-NEXT: ret{{[l|q]}} ## encoding: [0xc3] ; -; VCHECK-LABEL: test_x86_sse_rsqrt_ss: -; VCHECK: ## %bb.0: -; VCHECK-NEXT: vrsqrtss %xmm0, %xmm0, %xmm0 ## encoding: [0xc5,0xfa,0x52,0xc0] -; VCHECK-NEXT: retl ## encoding: [0xc3] +; AVX-LABEL: test_x86_sse_rsqrt_ss: +; AVX: ## %bb.0: +; AVX-NEXT: vrsqrtss %xmm0, %xmm0, %xmm0 ## encoding: [0xc5,0xfa,0x52,0xc0] +; AVX-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float> %a0) ; <<4 x float>> [#uses=1] ret <4 x float> %res } @@ -439,17 +452,17 @@ define <4 x float> @test_x86_sse_sqrt_ps(<4 x float> %a0) { ; SSE-LABEL: test_x86_sse_sqrt_ps: ; SSE: ## %bb.0: ; SSE-NEXT: sqrtps %xmm0, %xmm0 ## encoding: [0x0f,0x51,0xc0] -; SSE-NEXT: retl ## encoding: [0xc3] +; SSE-NEXT: ret{{[l|q]}} ## encoding: [0xc3] ; -; AVX2-LABEL: test_x86_sse_sqrt_ps: -; AVX2: ## %bb.0: -; AVX2-NEXT: vsqrtps %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x51,0xc0] -; AVX2-NEXT: retl ## encoding: [0xc3] +; AVX1-LABEL: test_x86_sse_sqrt_ps: +; AVX1: ## %bb.0: +; AVX1-NEXT: vsqrtps %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x51,0xc0] +; AVX1-NEXT: ret{{[l|q]}} ## encoding: [0xc3] ; -; SKX-LABEL: test_x86_sse_sqrt_ps: -; SKX: ## %bb.0: -; SKX-NEXT: vsqrtps %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x51,0xc0] -; SKX-NEXT: retl ## encoding: [0xc3] +; AVX512-LABEL: test_x86_sse_sqrt_ps: +; AVX512: ## %bb.0: +; AVX512-NEXT: vsqrtps %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x51,0xc0] +; AVX512-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float> %a0) ; <<4 x float>> [#uses=1] ret <4 x float> %res } @@ -460,17 +473,17 @@ define <4 x float> @test_x86_sse_sqrt_ss(<4 x float> %a0) { ; SSE-LABEL: test_x86_sse_sqrt_ss: ; SSE: ## %bb.0: ; SSE-NEXT: sqrtss %xmm0, %xmm0 ## encoding: [0xf3,0x0f,0x51,0xc0] -; SSE-NEXT: retl ## encoding: [0xc3] +; SSE-NEXT: ret{{[l|q]}} ## encoding: [0xc3] ; -; AVX2-LABEL: test_x86_sse_sqrt_ss: -; AVX2: ## %bb.0: -; AVX2-NEXT: vsqrtss %xmm0, %xmm0, %xmm0 ## encoding: [0xc5,0xfa,0x51,0xc0] -; AVX2-NEXT: retl ## encoding: [0xc3] +; AVX1-LABEL: test_x86_sse_sqrt_ss: +; AVX1: ## %bb.0: +; AVX1-NEXT: vsqrtss %xmm0, %xmm0, %xmm0 ## encoding: [0xc5,0xfa,0x51,0xc0] +; AVX1-NEXT: ret{{[l|q]}} ## encoding: [0xc3] ; -; SKX-LABEL: test_x86_sse_sqrt_ss: -; SKX: ## %bb.0: -; SKX-NEXT: vsqrtss %xmm0, %xmm0, %xmm0 ## EVEX 
TO VEX Compression encoding: [0xc5,0xfa,0x51,0xc0] -; SKX-NEXT: retl ## encoding: [0xc3] +; AVX512-LABEL: test_x86_sse_sqrt_ss: +; AVX512: ## %bb.0: +; AVX512-NEXT: vsqrtss %xmm0, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x51,0xc0] +; AVX512-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float> %a0) ; <<4 x float>> [#uses=1] ret <4 x float> %res } @@ -478,17 +491,27 @@ declare <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float>) nounwind readnone define void @test_x86_sse_stmxcsr(i8* %a0) { -; SSE-LABEL: test_x86_sse_stmxcsr: -; SSE: ## %bb.0: -; SSE-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] -; SSE-NEXT: stmxcsr (%eax) ## encoding: [0x0f,0xae,0x18] -; SSE-NEXT: retl ## encoding: [0xc3] -; -; VCHECK-LABEL: test_x86_sse_stmxcsr: -; VCHECK: ## %bb.0: -; VCHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] -; VCHECK-NEXT: vstmxcsr (%eax) ## encoding: [0xc5,0xf8,0xae,0x18] -; VCHECK-NEXT: retl ## encoding: [0xc3] +; X86-SSE-LABEL: test_x86_sse_stmxcsr: +; X86-SSE: ## %bb.0: +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] +; X86-SSE-NEXT: stmxcsr (%eax) ## encoding: [0x0f,0xae,0x18] +; X86-SSE-NEXT: retl ## encoding: [0xc3] +; +; X86-AVX-LABEL: test_x86_sse_stmxcsr: +; X86-AVX: ## %bb.0: +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] +; X86-AVX-NEXT: vstmxcsr (%eax) ## encoding: [0xc5,0xf8,0xae,0x18] +; X86-AVX-NEXT: retl ## encoding: [0xc3] +; +; X64-SSE-LABEL: test_x86_sse_stmxcsr: +; X64-SSE: ## %bb.0: +; X64-SSE-NEXT: stmxcsr (%rdi) ## encoding: [0x0f,0xae,0x1f] +; X64-SSE-NEXT: retq ## encoding: [0xc3] +; +; X64-AVX-LABEL: test_x86_sse_stmxcsr: +; X64-AVX: ## %bb.0: +; X64-AVX-NEXT: vstmxcsr (%rdi) ## encoding: [0xc5,0xf8,0xae,0x1f] +; X64-AVX-NEXT: retq ## encoding: [0xc3] call void @llvm.x86.sse.stmxcsr(i8* %a0) ret void } @@ -503,25 +526,25 @@ define i32 @test_x86_sse_ucomieq_ss(<4 x float> %a0, <4 x float> %a1) { ; SSE-NEXT: sete %cl ## encoding: [0x0f,0x94,0xc1] ; SSE-NEXT: andb %al, %cl ## encoding: [0x20,0xc1] ; SSE-NEXT: movzbl %cl, %eax ## encoding: [0x0f,0xb6,0xc1] -; SSE-NEXT: retl ## encoding: [0xc3] -; -; AVX2-LABEL: test_x86_sse_ucomieq_ss: -; AVX2: ## %bb.0: -; AVX2-NEXT: vucomiss %xmm1, %xmm0 ## encoding: [0xc5,0xf8,0x2e,0xc1] -; AVX2-NEXT: setnp %al ## encoding: [0x0f,0x9b,0xc0] -; AVX2-NEXT: sete %cl ## encoding: [0x0f,0x94,0xc1] -; AVX2-NEXT: andb %al, %cl ## encoding: [0x20,0xc1] -; AVX2-NEXT: movzbl %cl, %eax ## encoding: [0x0f,0xb6,0xc1] -; AVX2-NEXT: retl ## encoding: [0xc3] -; -; SKX-LABEL: test_x86_sse_ucomieq_ss: -; SKX: ## %bb.0: -; SKX-NEXT: vucomiss %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2e,0xc1] -; SKX-NEXT: setnp %al ## encoding: [0x0f,0x9b,0xc0] -; SKX-NEXT: sete %cl ## encoding: [0x0f,0x94,0xc1] -; SKX-NEXT: andb %al, %cl ## encoding: [0x20,0xc1] -; SKX-NEXT: movzbl %cl, %eax ## encoding: [0x0f,0xb6,0xc1] -; SKX-NEXT: retl ## encoding: [0xc3] +; SSE-NEXT: ret{{[l|q]}} ## encoding: [0xc3] +; +; AVX1-LABEL: test_x86_sse_ucomieq_ss: +; AVX1: ## %bb.0: +; AVX1-NEXT: vucomiss %xmm1, %xmm0 ## encoding: [0xc5,0xf8,0x2e,0xc1] +; AVX1-NEXT: setnp %al ## encoding: [0x0f,0x9b,0xc0] +; AVX1-NEXT: sete %cl ## encoding: [0x0f,0x94,0xc1] +; AVX1-NEXT: andb %al, %cl ## encoding: [0x20,0xc1] +; AVX1-NEXT: movzbl %cl, %eax ## encoding: [0x0f,0xb6,0xc1] +; AVX1-NEXT: ret{{[l|q]}} ## encoding: [0xc3] +; +; AVX512-LABEL: test_x86_sse_ucomieq_ss: +; AVX512: ## %bb.0: +; AVX512-NEXT: 
vucomiss %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2e,0xc1] +; AVX512-NEXT: setnp %al ## encoding: [0x0f,0x9b,0xc0] +; AVX512-NEXT: sete %cl ## encoding: [0x0f,0x94,0xc1] +; AVX512-NEXT: andb %al, %cl ## encoding: [0x20,0xc1] +; AVX512-NEXT: movzbl %cl, %eax ## encoding: [0x0f,0xb6,0xc1] +; AVX512-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call i32 @llvm.x86.sse.ucomieq.ss(<4 x float> %a0, <4 x float> %a1) ; <i32> [#uses=1] ret i32 %res } @@ -534,21 +557,21 @@ define i32 @test_x86_sse_ucomige_ss(<4 x float> %a0, <4 x float> %a1) { ; SSE-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0] ; SSE-NEXT: ucomiss %xmm1, %xmm0 ## encoding: [0x0f,0x2e,0xc1] ; SSE-NEXT: setae %al ## encoding: [0x0f,0x93,0xc0] -; SSE-NEXT: retl ## encoding: [0xc3] -; -; AVX2-LABEL: test_x86_sse_ucomige_ss: -; AVX2: ## %bb.0: -; AVX2-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0] -; AVX2-NEXT: vucomiss %xmm1, %xmm0 ## encoding: [0xc5,0xf8,0x2e,0xc1] -; AVX2-NEXT: setae %al ## encoding: [0x0f,0x93,0xc0] -; AVX2-NEXT: retl ## encoding: [0xc3] -; -; SKX-LABEL: test_x86_sse_ucomige_ss: -; SKX: ## %bb.0: -; SKX-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0] -; SKX-NEXT: vucomiss %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2e,0xc1] -; SKX-NEXT: setae %al ## encoding: [0x0f,0x93,0xc0] -; SKX-NEXT: retl ## encoding: [0xc3] +; SSE-NEXT: ret{{[l|q]}} ## encoding: [0xc3] +; +; AVX1-LABEL: test_x86_sse_ucomige_ss: +; AVX1: ## %bb.0: +; AVX1-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0] +; AVX1-NEXT: vucomiss %xmm1, %xmm0 ## encoding: [0xc5,0xf8,0x2e,0xc1] +; AVX1-NEXT: setae %al ## encoding: [0x0f,0x93,0xc0] +; AVX1-NEXT: ret{{[l|q]}} ## encoding: [0xc3] +; +; AVX512-LABEL: test_x86_sse_ucomige_ss: +; AVX512: ## %bb.0: +; AVX512-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0] +; AVX512-NEXT: vucomiss %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2e,0xc1] +; AVX512-NEXT: setae %al ## encoding: [0x0f,0x93,0xc0] +; AVX512-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call i32 @llvm.x86.sse.ucomige.ss(<4 x float> %a0, <4 x float> %a1) ; <i32> [#uses=1] ret i32 %res } @@ -561,21 +584,21 @@ define i32 @test_x86_sse_ucomigt_ss(<4 x float> %a0, <4 x float> %a1) { ; SSE-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0] ; SSE-NEXT: ucomiss %xmm1, %xmm0 ## encoding: [0x0f,0x2e,0xc1] ; SSE-NEXT: seta %al ## encoding: [0x0f,0x97,0xc0] -; SSE-NEXT: retl ## encoding: [0xc3] -; -; AVX2-LABEL: test_x86_sse_ucomigt_ss: -; AVX2: ## %bb.0: -; AVX2-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0] -; AVX2-NEXT: vucomiss %xmm1, %xmm0 ## encoding: [0xc5,0xf8,0x2e,0xc1] -; AVX2-NEXT: seta %al ## encoding: [0x0f,0x97,0xc0] -; AVX2-NEXT: retl ## encoding: [0xc3] -; -; SKX-LABEL: test_x86_sse_ucomigt_ss: -; SKX: ## %bb.0: -; SKX-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0] -; SKX-NEXT: vucomiss %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2e,0xc1] -; SKX-NEXT: seta %al ## encoding: [0x0f,0x97,0xc0] -; SKX-NEXT: retl ## encoding: [0xc3] +; SSE-NEXT: ret{{[l|q]}} ## encoding: [0xc3] +; +; AVX1-LABEL: test_x86_sse_ucomigt_ss: +; AVX1: ## %bb.0: +; AVX1-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0] +; AVX1-NEXT: vucomiss %xmm1, %xmm0 ## encoding: [0xc5,0xf8,0x2e,0xc1] +; AVX1-NEXT: seta %al ## encoding: [0x0f,0x97,0xc0] +; AVX1-NEXT: ret{{[l|q]}} ## encoding: [0xc3] +; +; AVX512-LABEL: test_x86_sse_ucomigt_ss: +; AVX512: ## %bb.0: +; AVX512-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0] +; AVX512-NEXT: vucomiss %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2e,0xc1] +; 
AVX512-NEXT: seta %al ## encoding: [0x0f,0x97,0xc0] +; AVX512-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call i32 @llvm.x86.sse.ucomigt.ss(<4 x float> %a0, <4 x float> %a1) ; <i32> [#uses=1] ret i32 %res } @@ -588,21 +611,21 @@ define i32 @test_x86_sse_ucomile_ss(<4 x float> %a0, <4 x float> %a1) { ; SSE-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0] ; SSE-NEXT: ucomiss %xmm0, %xmm1 ## encoding: [0x0f,0x2e,0xc8] ; SSE-NEXT: setae %al ## encoding: [0x0f,0x93,0xc0] -; SSE-NEXT: retl ## encoding: [0xc3] -; -; AVX2-LABEL: test_x86_sse_ucomile_ss: -; AVX2: ## %bb.0: -; AVX2-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0] -; AVX2-NEXT: vucomiss %xmm0, %xmm1 ## encoding: [0xc5,0xf8,0x2e,0xc8] -; AVX2-NEXT: setae %al ## encoding: [0x0f,0x93,0xc0] -; AVX2-NEXT: retl ## encoding: [0xc3] -; -; SKX-LABEL: test_x86_sse_ucomile_ss: -; SKX: ## %bb.0: -; SKX-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0] -; SKX-NEXT: vucomiss %xmm0, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2e,0xc8] -; SKX-NEXT: setae %al ## encoding: [0x0f,0x93,0xc0] -; SKX-NEXT: retl ## encoding: [0xc3] +; SSE-NEXT: ret{{[l|q]}} ## encoding: [0xc3] +; +; AVX1-LABEL: test_x86_sse_ucomile_ss: +; AVX1: ## %bb.0: +; AVX1-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0] +; AVX1-NEXT: vucomiss %xmm0, %xmm1 ## encoding: [0xc5,0xf8,0x2e,0xc8] +; AVX1-NEXT: setae %al ## encoding: [0x0f,0x93,0xc0] +; AVX1-NEXT: ret{{[l|q]}} ## encoding: [0xc3] +; +; AVX512-LABEL: test_x86_sse_ucomile_ss: +; AVX512: ## %bb.0: +; AVX512-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0] +; AVX512-NEXT: vucomiss %xmm0, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2e,0xc8] +; AVX512-NEXT: setae %al ## encoding: [0x0f,0x93,0xc0] +; AVX512-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call i32 @llvm.x86.sse.ucomile.ss(<4 x float> %a0, <4 x float> %a1) ; <i32> [#uses=1] ret i32 %res } @@ -615,21 +638,21 @@ define i32 @test_x86_sse_ucomilt_ss(<4 x float> %a0, <4 x float> %a1) { ; SSE-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0] ; SSE-NEXT: ucomiss %xmm0, %xmm1 ## encoding: [0x0f,0x2e,0xc8] ; SSE-NEXT: seta %al ## encoding: [0x0f,0x97,0xc0] -; SSE-NEXT: retl ## encoding: [0xc3] -; -; AVX2-LABEL: test_x86_sse_ucomilt_ss: -; AVX2: ## %bb.0: -; AVX2-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0] -; AVX2-NEXT: vucomiss %xmm0, %xmm1 ## encoding: [0xc5,0xf8,0x2e,0xc8] -; AVX2-NEXT: seta %al ## encoding: [0x0f,0x97,0xc0] -; AVX2-NEXT: retl ## encoding: [0xc3] -; -; SKX-LABEL: test_x86_sse_ucomilt_ss: -; SKX: ## %bb.0: -; SKX-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0] -; SKX-NEXT: vucomiss %xmm0, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2e,0xc8] -; SKX-NEXT: seta %al ## encoding: [0x0f,0x97,0xc0] -; SKX-NEXT: retl ## encoding: [0xc3] +; SSE-NEXT: ret{{[l|q]}} ## encoding: [0xc3] +; +; AVX1-LABEL: test_x86_sse_ucomilt_ss: +; AVX1: ## %bb.0: +; AVX1-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0] +; AVX1-NEXT: vucomiss %xmm0, %xmm1 ## encoding: [0xc5,0xf8,0x2e,0xc8] +; AVX1-NEXT: seta %al ## encoding: [0x0f,0x97,0xc0] +; AVX1-NEXT: ret{{[l|q]}} ## encoding: [0xc3] +; +; AVX512-LABEL: test_x86_sse_ucomilt_ss: +; AVX512: ## %bb.0: +; AVX512-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0] +; AVX512-NEXT: vucomiss %xmm0, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2e,0xc8] +; AVX512-NEXT: seta %al ## encoding: [0x0f,0x97,0xc0] +; AVX512-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call i32 @llvm.x86.sse.ucomilt.ss(<4 x float> %a0, <4 x float> %a1) ; <i32> [#uses=1] ret i32 %res } @@ -644,25 +667,25 @@ define i32 
@test_x86_sse_ucomineq_ss(<4 x float> %a0, <4 x float> %a1) { ; SSE-NEXT: setne %cl ## encoding: [0x0f,0x95,0xc1] ; SSE-NEXT: orb %al, %cl ## encoding: [0x08,0xc1] ; SSE-NEXT: movzbl %cl, %eax ## encoding: [0x0f,0xb6,0xc1] -; SSE-NEXT: retl ## encoding: [0xc3] -; -; AVX2-LABEL: test_x86_sse_ucomineq_ss: -; AVX2: ## %bb.0: -; AVX2-NEXT: vucomiss %xmm1, %xmm0 ## encoding: [0xc5,0xf8,0x2e,0xc1] -; AVX2-NEXT: setp %al ## encoding: [0x0f,0x9a,0xc0] -; AVX2-NEXT: setne %cl ## encoding: [0x0f,0x95,0xc1] -; AVX2-NEXT: orb %al, %cl ## encoding: [0x08,0xc1] -; AVX2-NEXT: movzbl %cl, %eax ## encoding: [0x0f,0xb6,0xc1] -; AVX2-NEXT: retl ## encoding: [0xc3] -; -; SKX-LABEL: test_x86_sse_ucomineq_ss: -; SKX: ## %bb.0: -; SKX-NEXT: vucomiss %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2e,0xc1] -; SKX-NEXT: setp %al ## encoding: [0x0f,0x9a,0xc0] -; SKX-NEXT: setne %cl ## encoding: [0x0f,0x95,0xc1] -; SKX-NEXT: orb %al, %cl ## encoding: [0x08,0xc1] -; SKX-NEXT: movzbl %cl, %eax ## encoding: [0x0f,0xb6,0xc1] -; SKX-NEXT: retl ## encoding: [0xc3] +; SSE-NEXT: ret{{[l|q]}} ## encoding: [0xc3] +; +; AVX1-LABEL: test_x86_sse_ucomineq_ss: +; AVX1: ## %bb.0: +; AVX1-NEXT: vucomiss %xmm1, %xmm0 ## encoding: [0xc5,0xf8,0x2e,0xc1] +; AVX1-NEXT: setp %al ## encoding: [0x0f,0x9a,0xc0] +; AVX1-NEXT: setne %cl ## encoding: [0x0f,0x95,0xc1] +; AVX1-NEXT: orb %al, %cl ## encoding: [0x08,0xc1] +; AVX1-NEXT: movzbl %cl, %eax ## encoding: [0x0f,0xb6,0xc1] +; AVX1-NEXT: ret{{[l|q]}} ## encoding: [0xc3] +; +; AVX512-LABEL: test_x86_sse_ucomineq_ss: +; AVX512: ## %bb.0: +; AVX512-NEXT: vucomiss %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2e,0xc1] +; AVX512-NEXT: setp %al ## encoding: [0x0f,0x9a,0xc0] +; AVX512-NEXT: setne %cl ## encoding: [0x0f,0x95,0xc1] +; AVX512-NEXT: orb %al, %cl ## encoding: [0x08,0xc1] +; AVX512-NEXT: movzbl %cl, %eax ## encoding: [0x0f,0xb6,0xc1] +; AVX512-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call i32 @llvm.x86.sse.ucomineq.ss(<4 x float> %a0, <4 x float> %a1) ; <i32> [#uses=1] ret i32 %res } @@ -670,15 +693,10 @@ declare i32 @llvm.x86.sse.ucomineq.ss(<4 x float>, <4 x float>) nounwind readnon define void @sfence() nounwind { -; SSE-LABEL: sfence: -; SSE: ## %bb.0: -; SSE-NEXT: sfence ## encoding: [0x0f,0xae,0xf8] -; SSE-NEXT: retl ## encoding: [0xc3] -; -; VCHECK-LABEL: sfence: -; VCHECK: ## %bb.0: -; VCHECK-NEXT: sfence ## encoding: [0x0f,0xae,0xf8] -; VCHECK-NEXT: retl ## encoding: [0xc3] +; CHECK-LABEL: sfence: +; CHECK: ## %bb.0: +; CHECK-NEXT: sfence ## encoding: [0x0f,0xae,0xf8] +; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] tail call void @llvm.x86.sse.sfence() ret void } diff --git a/llvm/test/CodeGen/X86/sse-intrinsics-x86_64-upgrade.ll b/llvm/test/CodeGen/X86/sse-intrinsics-x86_64-upgrade.ll index 1ce5bbf94dd..3de61c5e55d 100644 --- a/llvm/test/CodeGen/X86/sse-intrinsics-x86_64-upgrade.ll +++ b/llvm/test/CodeGen/X86/sse-intrinsics-x86_64-upgrade.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=-avx,+sse -show-mc-encoding | FileCheck %s --check-prefix=SSE -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx2 -show-mc-encoding | FileCheck %s --check-prefix=VCHECK --check-prefix=AVX2 -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=skx -show-mc-encoding | FileCheck %s --check-prefix=VCHECK --check-prefix=SKX +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=-sse2 -show-mc-encoding | FileCheck %s --check-prefixes=CHECK,SSE +; RUN: llc < %s 
@@ -9,15 +9,15 @@ define <4 x float> @test_x86_sse_cvtsi642ss(<4 x float> %a0, i64 %a1) {
; SSE-NEXT: cvtsi2ssq %rdi, %xmm0 ## encoding: [0xf3,0x48,0x0f,0x2a,0xc7]
; SSE-NEXT: retq ## encoding: [0xc3]
;
-; AVX2-LABEL: test_x86_sse_cvtsi642ss:
-; AVX2: ## %bb.0:
-; AVX2-NEXT: vcvtsi2ssq %rdi, %xmm0, %xmm0 ## encoding: [0xc4,0xe1,0xfa,0x2a,0xc7]
-; AVX2-NEXT: retq ## encoding: [0xc3]
+; AVX1-LABEL: test_x86_sse_cvtsi642ss:
+; AVX1: ## %bb.0:
+; AVX1-NEXT: vcvtsi2ssq %rdi, %xmm0, %xmm0 ## encoding: [0xc4,0xe1,0xfa,0x2a,0xc7]
+; AVX1-NEXT: retq ## encoding: [0xc3]
;
-; SKX-LABEL: test_x86_sse_cvtsi642ss:
-; SKX: ## %bb.0:
-; SKX-NEXT: vcvtsi2ssq %rdi, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe1,0xfa,0x2a,0xc7]
-; SKX-NEXT: retq ## encoding: [0xc3]
+; AVX512-LABEL: test_x86_sse_cvtsi642ss:
+; AVX512: ## %bb.0:
+; AVX512-NEXT: vcvtsi2ssq %rdi, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe1,0xfa,0x2a,0xc7]
+; AVX512-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.sse.cvtsi642ss(<4 x float> %a0, i64 %a1) ; <<4 x float>> [#uses=1]
ret <4 x float> %res
}
diff --git a/llvm/test/CodeGen/X86/sse-intrinsics-x86_64.ll b/llvm/test/CodeGen/X86/sse-intrinsics-x86_64.ll
index 161047ccfe9..6851abc286a 100644
--- a/llvm/test/CodeGen/X86/sse-intrinsics-x86_64.ll
+++ b/llvm/test/CodeGen/X86/sse-intrinsics-x86_64.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=-avx,+sse -show-mc-encoding | FileCheck %s --check-prefix=SSE
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx2 -show-mc-encoding | FileCheck %s --check-prefix=VCHECK --check-prefix=AVX2
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=skx -show-mc-encoding | FileCheck %s --check-prefix=VCHECK --check-prefix=SKX
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=-sse2 -show-mc-encoding | FileCheck %s --check-prefixes=CHECK,SSE
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx -show-mc-encoding | FileCheck %s --check-prefixes=CHECK,AVX,AVX1
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f,+avx512bw,+avx512dq,+avx512vl -show-mc-encoding | FileCheck %s --check-prefixes=CHECK,AVX,AVX512
define i64 @test_x86_sse_cvtss2si64(<4 x float> %a0) {
; SSE-LABEL: test_x86_sse_cvtss2si64:
@@ -9,15 +9,15 @@ define i64 @test_x86_sse_cvtss2si64(<4 x float> %a0) {
; SSE-NEXT: cvtss2si %xmm0, %rax ## encoding: [0xf3,0x48,0x0f,0x2d,0xc0]
; SSE-NEXT: retq ## encoding: [0xc3]
;
-; AVX2-LABEL: test_x86_sse_cvtss2si64:
-; AVX2: ## %bb.0:
-; AVX2-NEXT: vcvtss2si %xmm0, %rax ## encoding: [0xc4,0xe1,0xfa,0x2d,0xc0]
-; AVX2-NEXT: retq ## encoding: [0xc3]
+; AVX1-LABEL: test_x86_sse_cvtss2si64:
+; AVX1: ## %bb.0:
+; AVX1-NEXT: vcvtss2si %xmm0, %rax ## encoding: [0xc4,0xe1,0xfa,0x2d,0xc0]
+; AVX1-NEXT: retq ## encoding: [0xc3]
;
-; SKX-LABEL: test_x86_sse_cvtss2si64:
-; SKX: ## %bb.0:
-; SKX-NEXT: vcvtss2si %xmm0, %rax ## EVEX TO VEX Compression encoding: [0xc4,0xe1,0xfa,0x2d,0xc0]
-; SKX-NEXT: retq ## encoding: [0xc3]
+; AVX512-LABEL: test_x86_sse_cvtss2si64:
+; AVX512: ## %bb.0:
+; AVX512-NEXT: vcvtss2si %xmm0, %rax ## EVEX TO VEX Compression encoding: [0xc4,0xe1,0xfa,0x2d,0xc0]
+; AVX512-NEXT: retq ## encoding: [0xc3]
%res = call i64 @llvm.x86.sse.cvtss2si64(<4 x float> %a0) ; <i64> [#uses=1]
ret i64 %res
}
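The two conversions exercised by this file differ only in rounding semantics: cvtss2si converts using the current MXCSR rounding mode (round-to-nearest-even by default), while cvttss2si, with the extra t, always truncates toward zero. A hypothetical sketch, reusing the intrinsic declarations from these tests; the function name is illustrative, and for an input lane of 2.7 the rounded result is 3 while the truncated result is 2:

declare i64 @llvm.x86.sse.cvtss2si64(<4 x float>) nounwind readnone
declare i64 @llvm.x86.sse.cvttss2si64(<4 x float>) nounwind readnone

define i64 @round_minus_truncate(<4 x float> %v) {
  ; cvtss2si: lane 0 of %v, rounded per MXCSR (2.7 -> 3)
  %rounded = call i64 @llvm.x86.sse.cvtss2si64(<4 x float> %v)
  ; cvttss2si: lane 0 of %v, truncated toward zero (2.7 -> 2)
  %truncated = call i64 @llvm.x86.sse.cvttss2si64(<4 x float> %v)
  %diff = sub i64 %rounded, %truncated
  ret i64 %diff
}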
@@ -30,15 +30,15 @@ define i64 @test_x86_sse_cvttss2si64(<4 x float> %a0) {
; SSE-NEXT: cvttss2si %xmm0, %rax ## encoding: [0xf3,0x48,0x0f,0x2c,0xc0]
; SSE-NEXT: retq ## encoding: [0xc3]
;
-; AVX2-LABEL: test_x86_sse_cvttss2si64:
-; AVX2: ## %bb.0:
-; AVX2-NEXT: vcvttss2si %xmm0, %rax ## encoding: [0xc4,0xe1,0xfa,0x2c,0xc0]
-; AVX2-NEXT: retq ## encoding: [0xc3]
+; AVX1-LABEL: test_x86_sse_cvttss2si64:
+; AVX1: ## %bb.0:
+; AVX1-NEXT: vcvttss2si %xmm0, %rax ## encoding: [0xc4,0xe1,0xfa,0x2c,0xc0]
+; AVX1-NEXT: retq ## encoding: [0xc3]
;
-; SKX-LABEL: test_x86_sse_cvttss2si64:
-; SKX: ## %bb.0:
-; SKX-NEXT: vcvttss2si %xmm0, %rax ## EVEX TO VEX Compression encoding: [0xc4,0xe1,0xfa,0x2c,0xc0]
-; SKX-NEXT: retq ## encoding: [0xc3]
+; AVX512-LABEL: test_x86_sse_cvttss2si64:
+; AVX512: ## %bb.0:
+; AVX512-NEXT: vcvttss2si %xmm0, %rax ## EVEX TO VEX Compression encoding: [0xc4,0xe1,0xfa,0x2c,0xc0]
+; AVX512-NEXT: retq ## encoding: [0xc3]
%res = call i64 @llvm.x86.sse.cvttss2si64(<4 x float> %a0) ; <i64> [#uses=1]
ret i64 %res
}
diff --git a/llvm/test/CodeGen/X86/sse-scalar-fp-arith-unary.ll b/llvm/test/CodeGen/X86/sse-scalar-fp-arith-unary.ll
index 1ed4d3401ca..629e0aabbe1 100644
--- a/llvm/test/CodeGen/X86/sse-scalar-fp-arith-unary.ll
+++ b/llvm/test/CodeGen/X86/sse-scalar-fp-arith-unary.ll
@@ -1,7 +1,12 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=sse2 < %s | FileCheck --check-prefix=SSE %s
-; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=sse4.1 < %s | FileCheck --check-prefix=SSE %s
-; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=avx < %s | FileCheck --check-prefix=AVX %s
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,X86,SSE,X86-SSE,SSE2,X86-SSE2
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=CHECK,X86,SSE,X86-SSE,SSE41,X86-SSE41
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=CHECK,X86,AVX,X86-AVX,AVX1,X86-AVX1
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,X86,AVX,X86-AVX,AVX512,X86-AVX512
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,X64,SSE,X64-SSE,SSE2,X64-SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=CHECK,X64,SSE,X64-SSE,SSE41,X64-SSE41
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=CHECK,X64,AVX,X64-AVX,AVX1,X64-AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,X64,AVX,X64-AVX,AVX512,X64-AVX512
; PR21507 - https://llvm.org/bugs/show_bug.cgi?id=21507
; Each function should be a single math op; no extra moves.
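The ret{{[l|q]}} rewrites that follow are what let the new 32-bit and 64-bit RUN lines share one set of checks: the autogeneration script emits the token as a FileCheck regex, so it matches retl when targeting i686 and retq when targeting x86_64. A minimal, hypothetical illustration (not part of this patch):

; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse | FileCheck %s
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse | FileCheck %s
define <4 x float> @example_rcp(<4 x float> %x) {
; One body serves both RUN lines; only the return opcode differs.
; CHECK-LABEL: example_rcp:
; CHECK: rcpps %xmm0, %xmm0
; CHECK-NEXT: ret{{[l|q]}}
  %y = call <4 x float> @llvm.x86.sse.rcp.ps(<4 x float> %x)
  ret <4 x float> %y
}
declare <4 x float> @llvm.x86.sse.rcp.ps(<4 x float>) nounwind readnone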
@@ -11,12 +16,12 @@ define <4 x float> @recip(<4 x float> %x) { ; SSE-LABEL: recip: ; SSE: # %bb.0: ; SSE-NEXT: rcpss %xmm0, %xmm0 -; SSE-NEXT: retq +; SSE-NEXT: ret{{[l|q]}} ; ; AVX-LABEL: recip: ; AVX: # %bb.0: ; AVX-NEXT: vrcpss %xmm0, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX-NEXT: ret{{[l|q]}} %y = tail call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> %x) %shuf = shufflevector <4 x float> %y, <4 x float> %x, <4 x i32> <i32 0, i32 5, i32 6, i32 7> ret <4 x float> %shuf @@ -26,12 +31,12 @@ define <4 x float> @recip_square_root(<4 x float> %x) { ; SSE-LABEL: recip_square_root: ; SSE: # %bb.0: ; SSE-NEXT: rsqrtss %xmm0, %xmm0 -; SSE-NEXT: retq +; SSE-NEXT: ret{{[l|q]}} ; ; AVX-LABEL: recip_square_root: ; AVX: # %bb.0: ; AVX-NEXT: vrsqrtss %xmm0, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX-NEXT: ret{{[l|q]}} %y = tail call <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float> %x) %shuf = shufflevector <4 x float> %y, <4 x float> %x, <4 x i32> <i32 0, i32 5, i32 6, i32 7> ret <4 x float> %shuf @@ -41,12 +46,12 @@ define <4 x float> @square_root(<4 x float> %x) { ; SSE-LABEL: square_root: ; SSE: # %bb.0: ; SSE-NEXT: sqrtss %xmm0, %xmm0 -; SSE-NEXT: retq +; SSE-NEXT: ret{{[l|q]}} ; ; AVX-LABEL: square_root: ; AVX: # %bb.0: ; AVX-NEXT: vsqrtss %xmm0, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX-NEXT: ret{{[l|q]}} %y = tail call <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float> %x) %shuf = shufflevector <4 x float> %y, <4 x float> %x, <4 x i32> <i32 0, i32 5, i32 6, i32 7> ret <4 x float> %shuf @@ -56,12 +61,12 @@ define <2 x double> @square_root_double(<2 x double> %x) { ; SSE-LABEL: square_root_double: ; SSE: # %bb.0: ; SSE-NEXT: sqrtsd %xmm0, %xmm0 -; SSE-NEXT: retq +; SSE-NEXT: ret{{[l|q]}} ; ; AVX-LABEL: square_root_double: ; AVX: # %bb.0: ; AVX-NEXT: vsqrtsd %xmm0, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX-NEXT: ret{{[l|q]}} %y = tail call <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double> %x) %shuf = shufflevector <2 x double> %y, <2 x double> %x, <2 x i32> <i32 0, i32 3> ret <2 x double> %shuf diff --git a/llvm/test/CodeGen/X86/sse-scalar-fp-arith.ll b/llvm/test/CodeGen/X86/sse-scalar-fp-arith.ll index 476d1befe1d..1a294daf1ea 100644 --- a/llvm/test/CodeGen/X86/sse-scalar-fp-arith.ll +++ b/llvm/test/CodeGen/X86/sse-scalar-fp-arith.ll @@ -1,8 +1,12 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE2 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE --check-prefix=SSE41 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,X86,SSE,X86-SSE,SSE2,X86-SSE2 +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=CHECK,X86,SSE,X86-SSE,SSE41,X86-SSE41 +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=CHECK,X86,AVX,X86-AVX,AVX1,X86-AVX1 +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,X86,AVX,X86-AVX,AVX512,X86-AVX512 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,X64,SSE,X64-SSE,SSE2,X64-SSE2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s 
--check-prefixes=CHECK,X64,SSE,X64-SSE,SSE41,X64-SSE41 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=CHECK,X64,AVX,X64-AVX,AVX1,X64-AVX1 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,X64,AVX,X64-AVX,AVX512,X64-AVX512 ; Ensure that the backend no longer emits unnecessary vector insert ; instructions immediately after SSE scalar fp instructions @@ -12,12 +16,12 @@ define <4 x float> @test_add_ss(<4 x float> %a, <4 x float> %b) { ; SSE-LABEL: test_add_ss: ; SSE: # %bb.0: ; SSE-NEXT: addss %xmm1, %xmm0 -; SSE-NEXT: retq +; SSE-NEXT: ret{{[l|q]}} ; ; AVX-LABEL: test_add_ss: ; AVX: # %bb.0: ; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX-NEXT: ret{{[l|q]}} %1 = extractelement <4 x float> %b, i32 0 %2 = extractelement <4 x float> %a, i32 0 %add = fadd float %2, %1 @@ -29,12 +33,12 @@ define <4 x float> @test_sub_ss(<4 x float> %a, <4 x float> %b) { ; SSE-LABEL: test_sub_ss: ; SSE: # %bb.0: ; SSE-NEXT: subss %xmm1, %xmm0 -; SSE-NEXT: retq +; SSE-NEXT: ret{{[l|q]}} ; ; AVX-LABEL: test_sub_ss: ; AVX: # %bb.0: ; AVX-NEXT: vsubss %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX-NEXT: ret{{[l|q]}} %1 = extractelement <4 x float> %b, i32 0 %2 = extractelement <4 x float> %a, i32 0 %sub = fsub float %2, %1 @@ -46,12 +50,12 @@ define <4 x float> @test_mul_ss(<4 x float> %a, <4 x float> %b) { ; SSE-LABEL: test_mul_ss: ; SSE: # %bb.0: ; SSE-NEXT: mulss %xmm1, %xmm0 -; SSE-NEXT: retq +; SSE-NEXT: ret{{[l|q]}} ; ; AVX-LABEL: test_mul_ss: ; AVX: # %bb.0: ; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX-NEXT: ret{{[l|q]}} %1 = extractelement <4 x float> %b, i32 0 %2 = extractelement <4 x float> %a, i32 0 %mul = fmul float %2, %1 @@ -63,12 +67,12 @@ define <4 x float> @test_div_ss(<4 x float> %a, <4 x float> %b) { ; SSE-LABEL: test_div_ss: ; SSE: # %bb.0: ; SSE-NEXT: divss %xmm1, %xmm0 -; SSE-NEXT: retq +; SSE-NEXT: ret{{[l|q]}} ; ; AVX-LABEL: test_div_ss: ; AVX: # %bb.0: ; AVX-NEXT: vdivss %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX-NEXT: ret{{[l|q]}} %1 = extractelement <4 x float> %b, i32 0 %2 = extractelement <4 x float> %a, i32 0 %div = fdiv float %2, %1 @@ -81,25 +85,25 @@ define <4 x float> @test_sqrt_ss(<4 x float> %a) { ; SSE2: # %bb.0: ; SSE2-NEXT: sqrtss %xmm0, %xmm1 ; SSE2-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] -; SSE2-NEXT: retq +; SSE2-NEXT: ret{{[l|q]}} ; ; SSE41-LABEL: test_sqrt_ss: ; SSE41: # %bb.0: ; SSE41-NEXT: sqrtss %xmm0, %xmm1 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] -; SSE41-NEXT: retq +; SSE41-NEXT: ret{{[l|q]}} ; ; AVX1-LABEL: test_sqrt_ss: ; AVX1: # %bb.0: ; AVX1-NEXT: vsqrtss %xmm0, %xmm0, %xmm1 ; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] -; AVX1-NEXT: retq +; AVX1-NEXT: ret{{[l|q]}} ; ; AVX512-LABEL: test_sqrt_ss: ; AVX512: # %bb.0: ; AVX512-NEXT: vsqrtss %xmm0, %xmm0, %xmm1 ; AVX512-NEXT: vmovss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] -; AVX512-NEXT: retq +; AVX512-NEXT: ret{{[l|q]}} %1 = extractelement <4 x float> %a, i32 0 %2 = call float @llvm.sqrt.f32(float %1) %3 = insertelement <4 x float> %a, float %2, i32 0 @@ -111,12 +115,12 @@ define <2 x double> @test_add_sd(<2 x double> %a, <2 x double> %b) { ; SSE-LABEL: test_add_sd: ; SSE: # %bb.0: ; SSE-NEXT: addsd %xmm1, %xmm0 -; SSE-NEXT: retq +; SSE-NEXT: ret{{[l|q]}} ; ; AVX-LABEL: test_add_sd: ; AVX: # %bb.0: ; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX-NEXT: ret{{[l|q]}} %1 = extractelement <2 x double> %b, i32 0 %2 = extractelement <2 x double> 
%a, i32 0 %add = fadd double %2, %1 @@ -128,12 +132,12 @@ define <2 x double> @test_sub_sd(<2 x double> %a, <2 x double> %b) { ; SSE-LABEL: test_sub_sd: ; SSE: # %bb.0: ; SSE-NEXT: subsd %xmm1, %xmm0 -; SSE-NEXT: retq +; SSE-NEXT: ret{{[l|q]}} ; ; AVX-LABEL: test_sub_sd: ; AVX: # %bb.0: ; AVX-NEXT: vsubsd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX-NEXT: ret{{[l|q]}} %1 = extractelement <2 x double> %b, i32 0 %2 = extractelement <2 x double> %a, i32 0 %sub = fsub double %2, %1 @@ -145,12 +149,12 @@ define <2 x double> @test_mul_sd(<2 x double> %a, <2 x double> %b) { ; SSE-LABEL: test_mul_sd: ; SSE: # %bb.0: ; SSE-NEXT: mulsd %xmm1, %xmm0 -; SSE-NEXT: retq +; SSE-NEXT: ret{{[l|q]}} ; ; AVX-LABEL: test_mul_sd: ; AVX: # %bb.0: ; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX-NEXT: ret{{[l|q]}} %1 = extractelement <2 x double> %b, i32 0 %2 = extractelement <2 x double> %a, i32 0 %mul = fmul double %2, %1 @@ -162,12 +166,12 @@ define <2 x double> @test_div_sd(<2 x double> %a, <2 x double> %b) { ; SSE-LABEL: test_div_sd: ; SSE: # %bb.0: ; SSE-NEXT: divsd %xmm1, %xmm0 -; SSE-NEXT: retq +; SSE-NEXT: ret{{[l|q]}} ; ; AVX-LABEL: test_div_sd: ; AVX: # %bb.0: ; AVX-NEXT: vdivsd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX-NEXT: ret{{[l|q]}} %1 = extractelement <2 x double> %b, i32 0 %2 = extractelement <2 x double> %a, i32 0 %div = fdiv double %2, %1 @@ -180,25 +184,25 @@ define <2 x double> @test_sqrt_sd(<2 x double> %a) { ; SSE2: # %bb.0: ; SSE2-NEXT: sqrtsd %xmm0, %xmm1 ; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] -; SSE2-NEXT: retq +; SSE2-NEXT: ret{{[l|q]}} ; ; SSE41-LABEL: test_sqrt_sd: ; SSE41: # %bb.0: ; SSE41-NEXT: sqrtsd %xmm0, %xmm1 ; SSE41-NEXT: blendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1] -; SSE41-NEXT: retq +; SSE41-NEXT: ret{{[l|q]}} ; ; AVX1-LABEL: test_sqrt_sd: ; AVX1: # %bb.0: ; AVX1-NEXT: vsqrtsd %xmm0, %xmm0, %xmm1 ; AVX1-NEXT: vblendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1] -; AVX1-NEXT: retq +; AVX1-NEXT: ret{{[l|q]}} ; ; AVX512-LABEL: test_sqrt_sd: ; AVX512: # %bb.0: ; AVX512-NEXT: vsqrtsd %xmm0, %xmm0, %xmm1 ; AVX512-NEXT: vmovsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] -; AVX512-NEXT: retq +; AVX512-NEXT: ret{{[l|q]}} %1 = extractelement <2 x double> %a, i32 0 %2 = call double @llvm.sqrt.f64(double %1) %3 = insertelement <2 x double> %a, double %2, i32 0 @@ -211,12 +215,12 @@ define <4 x float> @test2_add_ss(<4 x float> %a, <4 x float> %b) { ; SSE: # %bb.0: ; SSE-NEXT: addss %xmm0, %xmm1 ; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: retq +; SSE-NEXT: ret{{[l|q]}} ; ; AVX-LABEL: test2_add_ss: ; AVX: # %bb.0: ; AVX-NEXT: vaddss %xmm0, %xmm1, %xmm0 -; AVX-NEXT: retq +; AVX-NEXT: ret{{[l|q]}} %1 = extractelement <4 x float> %a, i32 0 %2 = extractelement <4 x float> %b, i32 0 %add = fadd float %1, %2 @@ -229,12 +233,12 @@ define <4 x float> @test2_sub_ss(<4 x float> %a, <4 x float> %b) { ; SSE: # %bb.0: ; SSE-NEXT: subss %xmm0, %xmm1 ; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: retq +; SSE-NEXT: ret{{[l|q]}} ; ; AVX-LABEL: test2_sub_ss: ; AVX: # %bb.0: ; AVX-NEXT: vsubss %xmm0, %xmm1, %xmm0 -; AVX-NEXT: retq +; AVX-NEXT: ret{{[l|q]}} %1 = extractelement <4 x float> %a, i32 0 %2 = extractelement <4 x float> %b, i32 0 %sub = fsub float %2, %1 @@ -247,12 +251,12 @@ define <4 x float> @test2_mul_ss(<4 x float> %a, <4 x float> %b) { ; SSE: # %bb.0: ; SSE-NEXT: mulss %xmm0, %xmm1 ; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: retq +; SSE-NEXT: ret{{[l|q]}} ; ; AVX-LABEL: test2_mul_ss: ; AVX: # %bb.0: ; AVX-NEXT: vmulss %xmm0, %xmm1, %xmm0 -; AVX-NEXT: retq +; AVX-NEXT: ret{{[l|q]}} %1 = 
extractelement <4 x float> %a, i32 0 %2 = extractelement <4 x float> %b, i32 0 %mul = fmul float %1, %2 @@ -265,12 +269,12 @@ define <4 x float> @test2_div_ss(<4 x float> %a, <4 x float> %b) { ; SSE: # %bb.0: ; SSE-NEXT: divss %xmm0, %xmm1 ; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: retq +; SSE-NEXT: ret{{[l|q]}} ; ; AVX-LABEL: test2_div_ss: ; AVX: # %bb.0: ; AVX-NEXT: vdivss %xmm0, %xmm1, %xmm0 -; AVX-NEXT: retq +; AVX-NEXT: ret{{[l|q]}} %1 = extractelement <4 x float> %a, i32 0 %2 = extractelement <4 x float> %b, i32 0 %div = fdiv float %2, %1 @@ -283,12 +287,12 @@ define <2 x double> @test2_add_sd(<2 x double> %a, <2 x double> %b) { ; SSE: # %bb.0: ; SSE-NEXT: addsd %xmm0, %xmm1 ; SSE-NEXT: movapd %xmm1, %xmm0 -; SSE-NEXT: retq +; SSE-NEXT: ret{{[l|q]}} ; ; AVX-LABEL: test2_add_sd: ; AVX: # %bb.0: ; AVX-NEXT: vaddsd %xmm0, %xmm1, %xmm0 -; AVX-NEXT: retq +; AVX-NEXT: ret{{[l|q]}} %1 = extractelement <2 x double> %a, i32 0 %2 = extractelement <2 x double> %b, i32 0 %add = fadd double %1, %2 @@ -301,12 +305,12 @@ define <2 x double> @test2_sub_sd(<2 x double> %a, <2 x double> %b) { ; SSE: # %bb.0: ; SSE-NEXT: subsd %xmm0, %xmm1 ; SSE-NEXT: movapd %xmm1, %xmm0 -; SSE-NEXT: retq +; SSE-NEXT: ret{{[l|q]}} ; ; AVX-LABEL: test2_sub_sd: ; AVX: # %bb.0: ; AVX-NEXT: vsubsd %xmm0, %xmm1, %xmm0 -; AVX-NEXT: retq +; AVX-NEXT: ret{{[l|q]}} %1 = extractelement <2 x double> %a, i32 0 %2 = extractelement <2 x double> %b, i32 0 %sub = fsub double %2, %1 @@ -319,12 +323,12 @@ define <2 x double> @test2_mul_sd(<2 x double> %a, <2 x double> %b) { ; SSE: # %bb.0: ; SSE-NEXT: mulsd %xmm0, %xmm1 ; SSE-NEXT: movapd %xmm1, %xmm0 -; SSE-NEXT: retq +; SSE-NEXT: ret{{[l|q]}} ; ; AVX-LABEL: test2_mul_sd: ; AVX: # %bb.0: ; AVX-NEXT: vmulsd %xmm0, %xmm1, %xmm0 -; AVX-NEXT: retq +; AVX-NEXT: ret{{[l|q]}} %1 = extractelement <2 x double> %a, i32 0 %2 = extractelement <2 x double> %b, i32 0 %mul = fmul double %1, %2 @@ -337,12 +341,12 @@ define <2 x double> @test2_div_sd(<2 x double> %a, <2 x double> %b) { ; SSE: # %bb.0: ; SSE-NEXT: divsd %xmm0, %xmm1 ; SSE-NEXT: movapd %xmm1, %xmm0 -; SSE-NEXT: retq +; SSE-NEXT: ret{{[l|q]}} ; ; AVX-LABEL: test2_div_sd: ; AVX: # %bb.0: ; AVX-NEXT: vdivsd %xmm0, %xmm1, %xmm0 -; AVX-NEXT: retq +; AVX-NEXT: ret{{[l|q]}} %1 = extractelement <2 x double> %a, i32 0 %2 = extractelement <2 x double> %b, i32 0 %div = fdiv double %2, %1 @@ -355,13 +359,13 @@ define <4 x float> @test_multiple_add_ss(<4 x float> %a, <4 x float> %b) { ; SSE: # %bb.0: ; SSE-NEXT: addss %xmm0, %xmm1 ; SSE-NEXT: addss %xmm1, %xmm0 -; SSE-NEXT: retq +; SSE-NEXT: ret{{[l|q]}} ; ; AVX-LABEL: test_multiple_add_ss: ; AVX: # %bb.0: ; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm1 ; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX-NEXT: ret{{[l|q]}} %1 = extractelement <4 x float> %b, i32 0 %2 = extractelement <4 x float> %a, i32 0 %add = fadd float %2, %1 @@ -376,13 +380,13 @@ define <4 x float> @test_multiple_sub_ss(<4 x float> %a, <4 x float> %b) { ; SSE-NEXT: movaps %xmm0, %xmm2 ; SSE-NEXT: subss %xmm1, %xmm2 ; SSE-NEXT: subss %xmm2, %xmm0 -; SSE-NEXT: retq +; SSE-NEXT: ret{{[l|q]}} ; ; AVX-LABEL: test_multiple_sub_ss: ; AVX: # %bb.0: ; AVX-NEXT: vsubss %xmm1, %xmm0, %xmm1 ; AVX-NEXT: vsubss %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX-NEXT: ret{{[l|q]}} %1 = extractelement <4 x float> %b, i32 0 %2 = extractelement <4 x float> %a, i32 0 %sub = fsub float %2, %1 @@ -396,13 +400,13 @@ define <4 x float> @test_multiple_mul_ss(<4 x float> %a, <4 x float> %b) { ; SSE: # %bb.0: ; SSE-NEXT: mulss %xmm0, %xmm1 ; SSE-NEXT: 
mulss %xmm1, %xmm0 -; SSE-NEXT: retq +; SSE-NEXT: ret{{[l|q]}} ; ; AVX-LABEL: test_multiple_mul_ss: ; AVX: # %bb.0: ; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm1 ; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX-NEXT: ret{{[l|q]}} %1 = extractelement <4 x float> %b, i32 0 %2 = extractelement <4 x float> %a, i32 0 %mul = fmul float %2, %1 @@ -417,13 +421,13 @@ define <4 x float> @test_multiple_div_ss(<4 x float> %a, <4 x float> %b) { ; SSE-NEXT: movaps %xmm0, %xmm2 ; SSE-NEXT: divss %xmm1, %xmm2 ; SSE-NEXT: divss %xmm2, %xmm0 -; SSE-NEXT: retq +; SSE-NEXT: ret{{[l|q]}} ; ; AVX-LABEL: test_multiple_div_ss: ; AVX: # %bb.0: ; AVX-NEXT: vdivss %xmm1, %xmm0, %xmm1 ; AVX-NEXT: vdivss %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX-NEXT: ret{{[l|q]}} %1 = extractelement <4 x float> %b, i32 0 %2 = extractelement <4 x float> %a, i32 0 %div = fdiv float %2, %1 @@ -436,15 +440,27 @@ define <4 x float> @test_multiple_div_ss(<4 x float> %a, <4 x float> %b) { ; be lowered to X86Blendi nodes. define <4 x float> @blend_add_ss(<4 x float> %a, float %b) { -; SSE-LABEL: blend_add_ss: -; SSE: # %bb.0: -; SSE-NEXT: addss %xmm1, %xmm0 -; SSE-NEXT: retq -; -; AVX-LABEL: blend_add_ss: -; AVX: # %bb.0: -; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq +; X86-SSE-LABEL: blend_add_ss: +; X86-SSE: # %bb.0: +; X86-SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X86-SSE-NEXT: addss %xmm1, %xmm0 +; X86-SSE-NEXT: retl +; +; X86-AVX-LABEL: blend_add_ss: +; X86-AVX: # %bb.0: +; X86-AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X86-AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 +; X86-AVX-NEXT: retl +; +; X64-SSE-LABEL: blend_add_ss: +; X64-SSE: # %bb.0: +; X64-SSE-NEXT: addss %xmm1, %xmm0 +; X64-SSE-NEXT: retq +; +; X64-AVX-LABEL: blend_add_ss: +; X64-AVX: # %bb.0: +; X64-AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 +; X64-AVX-NEXT: retq %ext = extractelement <4 x float> %a, i32 0 %op = fadd float %b, %ext @@ -454,15 +470,27 @@ define <4 x float> @blend_add_ss(<4 x float> %a, float %b) { } define <4 x float> @blend_sub_ss(<4 x float> %a, float %b) { -; SSE-LABEL: blend_sub_ss: -; SSE: # %bb.0: -; SSE-NEXT: subss %xmm1, %xmm0 -; SSE-NEXT: retq -; -; AVX-LABEL: blend_sub_ss: -; AVX: # %bb.0: -; AVX-NEXT: vsubss %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq +; X86-SSE-LABEL: blend_sub_ss: +; X86-SSE: # %bb.0: +; X86-SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X86-SSE-NEXT: subss %xmm1, %xmm0 +; X86-SSE-NEXT: retl +; +; X86-AVX-LABEL: blend_sub_ss: +; X86-AVX: # %bb.0: +; X86-AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X86-AVX-NEXT: vsubss %xmm1, %xmm0, %xmm0 +; X86-AVX-NEXT: retl +; +; X64-SSE-LABEL: blend_sub_ss: +; X64-SSE: # %bb.0: +; X64-SSE-NEXT: subss %xmm1, %xmm0 +; X64-SSE-NEXT: retq +; +; X64-AVX-LABEL: blend_sub_ss: +; X64-AVX: # %bb.0: +; X64-AVX-NEXT: vsubss %xmm1, %xmm0, %xmm0 +; X64-AVX-NEXT: retq %ext = extractelement <4 x float> %a, i32 0 %op = fsub float %ext, %b @@ -472,15 +500,27 @@ define <4 x float> @blend_sub_ss(<4 x float> %a, float %b) { } define <4 x float> @blend_mul_ss(<4 x float> %a, float %b) { -; SSE-LABEL: blend_mul_ss: -; SSE: # %bb.0: -; SSE-NEXT: mulss %xmm1, %xmm0 -; SSE-NEXT: retq -; -; AVX-LABEL: blend_mul_ss: -; AVX: # %bb.0: -; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq +; X86-SSE-LABEL: blend_mul_ss: +; X86-SSE: # %bb.0: +; X86-SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X86-SSE-NEXT: mulss %xmm1, %xmm0 +; X86-SSE-NEXT: retl +; +; X86-AVX-LABEL: blend_mul_ss: +; X86-AVX: # %bb.0: +; X86-AVX-NEXT: vmovss {{.*#+}} xmm1 = 
mem[0],zero,zero,zero +; X86-AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0 +; X86-AVX-NEXT: retl +; +; X64-SSE-LABEL: blend_mul_ss: +; X64-SSE: # %bb.0: +; X64-SSE-NEXT: mulss %xmm1, %xmm0 +; X64-SSE-NEXT: retq +; +; X64-AVX-LABEL: blend_mul_ss: +; X64-AVX: # %bb.0: +; X64-AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0 +; X64-AVX-NEXT: retq %ext = extractelement <4 x float> %a, i32 0 %op = fmul float %b, %ext @@ -490,15 +530,27 @@ define <4 x float> @blend_mul_ss(<4 x float> %a, float %b) { } define <4 x float> @blend_div_ss(<4 x float> %a, float %b) { -; SSE-LABEL: blend_div_ss: -; SSE: # %bb.0: -; SSE-NEXT: divss %xmm1, %xmm0 -; SSE-NEXT: retq -; -; AVX-LABEL: blend_div_ss: -; AVX: # %bb.0: -; AVX-NEXT: vdivss %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq +; X86-SSE-LABEL: blend_div_ss: +; X86-SSE: # %bb.0: +; X86-SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X86-SSE-NEXT: divss %xmm1, %xmm0 +; X86-SSE-NEXT: retl +; +; X86-AVX-LABEL: blend_div_ss: +; X86-AVX: # %bb.0: +; X86-AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X86-AVX-NEXT: vdivss %xmm1, %xmm0, %xmm0 +; X86-AVX-NEXT: retl +; +; X64-SSE-LABEL: blend_div_ss: +; X64-SSE: # %bb.0: +; X64-SSE-NEXT: divss %xmm1, %xmm0 +; X64-SSE-NEXT: retq +; +; X64-AVX-LABEL: blend_div_ss: +; X64-AVX: # %bb.0: +; X64-AVX-NEXT: vdivss %xmm1, %xmm0, %xmm0 +; X64-AVX-NEXT: retq %ext = extractelement <4 x float> %a, i32 0 %op = fdiv float %ext, %b @@ -508,15 +560,27 @@ define <4 x float> @blend_div_ss(<4 x float> %a, float %b) { } define <2 x double> @blend_add_sd(<2 x double> %a, double %b) { -; SSE-LABEL: blend_add_sd: -; SSE: # %bb.0: -; SSE-NEXT: addsd %xmm1, %xmm0 -; SSE-NEXT: retq -; -; AVX-LABEL: blend_add_sd: -; AVX: # %bb.0: -; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq +; X86-SSE-LABEL: blend_add_sd: +; X86-SSE: # %bb.0: +; X86-SSE-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero +; X86-SSE-NEXT: addsd %xmm1, %xmm0 +; X86-SSE-NEXT: retl +; +; X86-AVX-LABEL: blend_add_sd: +; X86-AVX: # %bb.0: +; X86-AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; X86-AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 +; X86-AVX-NEXT: retl +; +; X64-SSE-LABEL: blend_add_sd: +; X64-SSE: # %bb.0: +; X64-SSE-NEXT: addsd %xmm1, %xmm0 +; X64-SSE-NEXT: retq +; +; X64-AVX-LABEL: blend_add_sd: +; X64-AVX: # %bb.0: +; X64-AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 +; X64-AVX-NEXT: retq %ext = extractelement <2 x double> %a, i32 0 %op = fadd double %b, %ext @@ -526,15 +590,27 @@ define <2 x double> @blend_add_sd(<2 x double> %a, double %b) { } define <2 x double> @blend_sub_sd(<2 x double> %a, double %b) { -; SSE-LABEL: blend_sub_sd: -; SSE: # %bb.0: -; SSE-NEXT: subsd %xmm1, %xmm0 -; SSE-NEXT: retq -; -; AVX-LABEL: blend_sub_sd: -; AVX: # %bb.0: -; AVX-NEXT: vsubsd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq +; X86-SSE-LABEL: blend_sub_sd: +; X86-SSE: # %bb.0: +; X86-SSE-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero +; X86-SSE-NEXT: subsd %xmm1, %xmm0 +; X86-SSE-NEXT: retl +; +; X86-AVX-LABEL: blend_sub_sd: +; X86-AVX: # %bb.0: +; X86-AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; X86-AVX-NEXT: vsubsd %xmm1, %xmm0, %xmm0 +; X86-AVX-NEXT: retl +; +; X64-SSE-LABEL: blend_sub_sd: +; X64-SSE: # %bb.0: +; X64-SSE-NEXT: subsd %xmm1, %xmm0 +; X64-SSE-NEXT: retq +; +; X64-AVX-LABEL: blend_sub_sd: +; X64-AVX: # %bb.0: +; X64-AVX-NEXT: vsubsd %xmm1, %xmm0, %xmm0 +; X64-AVX-NEXT: retq %ext = extractelement <2 x double> %a, i32 0 %op = fsub double %ext, %b @@ -544,15 +620,27 @@ define <2 x double> @blend_sub_sd(<2 x double> %a, double %b) { } define <2 x double> @blend_mul_sd(<2 x double> %a, double %b) { -; 
SSE-LABEL: blend_mul_sd: -; SSE: # %bb.0: -; SSE-NEXT: mulsd %xmm1, %xmm0 -; SSE-NEXT: retq -; -; AVX-LABEL: blend_mul_sd: -; AVX: # %bb.0: -; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq +; X86-SSE-LABEL: blend_mul_sd: +; X86-SSE: # %bb.0: +; X86-SSE-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero +; X86-SSE-NEXT: mulsd %xmm1, %xmm0 +; X86-SSE-NEXT: retl +; +; X86-AVX-LABEL: blend_mul_sd: +; X86-AVX: # %bb.0: +; X86-AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; X86-AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0 +; X86-AVX-NEXT: retl +; +; X64-SSE-LABEL: blend_mul_sd: +; X64-SSE: # %bb.0: +; X64-SSE-NEXT: mulsd %xmm1, %xmm0 +; X64-SSE-NEXT: retq +; +; X64-AVX-LABEL: blend_mul_sd: +; X64-AVX: # %bb.0: +; X64-AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0 +; X64-AVX-NEXT: retq %ext = extractelement <2 x double> %a, i32 0 %op = fmul double %b, %ext @@ -562,15 +650,27 @@ define <2 x double> @blend_mul_sd(<2 x double> %a, double %b) { } define <2 x double> @blend_div_sd(<2 x double> %a, double %b) { -; SSE-LABEL: blend_div_sd: -; SSE: # %bb.0: -; SSE-NEXT: divsd %xmm1, %xmm0 -; SSE-NEXT: retq -; -; AVX-LABEL: blend_div_sd: -; AVX: # %bb.0: -; AVX-NEXT: vdivsd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq +; X86-SSE-LABEL: blend_div_sd: +; X86-SSE: # %bb.0: +; X86-SSE-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero +; X86-SSE-NEXT: divsd %xmm1, %xmm0 +; X86-SSE-NEXT: retl +; +; X86-AVX-LABEL: blend_div_sd: +; X86-AVX: # %bb.0: +; X86-AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; X86-AVX-NEXT: vdivsd %xmm1, %xmm0, %xmm0 +; X86-AVX-NEXT: retl +; +; X64-SSE-LABEL: blend_div_sd: +; X64-SSE: # %bb.0: +; X64-SSE-NEXT: divsd %xmm1, %xmm0 +; X64-SSE-NEXT: retq +; +; X64-AVX-LABEL: blend_div_sd: +; X64-AVX: # %bb.0: +; X64-AVX-NEXT: vdivsd %xmm1, %xmm0, %xmm0 +; X64-AVX-NEXT: retq %ext = extractelement <2 x double> %a, i32 0 %op = fdiv double %ext, %b @@ -586,12 +686,12 @@ define <4 x float> @insert_test_add_ss(<4 x float> %a, <4 x float> %b) { ; SSE-LABEL: insert_test_add_ss: ; SSE: # %bb.0: ; SSE-NEXT: addss %xmm1, %xmm0 -; SSE-NEXT: retq +; SSE-NEXT: ret{{[l|q]}} ; ; AVX-LABEL: insert_test_add_ss: ; AVX: # %bb.0: ; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX-NEXT: ret{{[l|q]}} %1 = fadd <4 x float> %a, %b %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 6, i32 7> ret <4 x float> %2 @@ -601,12 +701,12 @@ define <4 x float> @insert_test_sub_ss(<4 x float> %a, <4 x float> %b) { ; SSE-LABEL: insert_test_sub_ss: ; SSE: # %bb.0: ; SSE-NEXT: subss %xmm1, %xmm0 -; SSE-NEXT: retq +; SSE-NEXT: ret{{[l|q]}} ; ; AVX-LABEL: insert_test_sub_ss: ; AVX: # %bb.0: ; AVX-NEXT: vsubss %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX-NEXT: ret{{[l|q]}} %1 = fsub <4 x float> %a, %b %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 6, i32 7> ret <4 x float> %2 @@ -616,12 +716,12 @@ define <4 x float> @insert_test_mul_ss(<4 x float> %a, <4 x float> %b) { ; SSE-LABEL: insert_test_mul_ss: ; SSE: # %bb.0: ; SSE-NEXT: mulss %xmm1, %xmm0 -; SSE-NEXT: retq +; SSE-NEXT: ret{{[l|q]}} ; ; AVX-LABEL: insert_test_mul_ss: ; AVX: # %bb.0: ; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX-NEXT: ret{{[l|q]}} %1 = fmul <4 x float> %a, %b %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 6, i32 7> ret <4 x float> %2 @@ -631,12 +731,12 @@ define <4 x float> @insert_test_div_ss(<4 x float> %a, <4 x float> %b) { ; SSE-LABEL: insert_test_div_ss: ; SSE: # %bb.0: ; SSE-NEXT: divss %xmm1, %xmm0 -; SSE-NEXT: retq +; SSE-NEXT: ret{{[l|q]}} ; ; AVX-LABEL: 
insert_test_div_ss: ; AVX: # %bb.0: ; AVX-NEXT: vdivss %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX-NEXT: ret{{[l|q]}} %1 = fdiv <4 x float> %a, %b %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 6, i32 7> ret <4 x float> %2 @@ -646,12 +746,12 @@ define <2 x double> @insert_test_add_sd(<2 x double> %a, <2 x double> %b) { ; SSE-LABEL: insert_test_add_sd: ; SSE: # %bb.0: ; SSE-NEXT: addsd %xmm1, %xmm0 -; SSE-NEXT: retq +; SSE-NEXT: ret{{[l|q]}} ; ; AVX-LABEL: insert_test_add_sd: ; AVX: # %bb.0: ; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX-NEXT: ret{{[l|q]}} %1 = fadd <2 x double> %a, %b %2 = shufflevector <2 x double> %1, <2 x double> %a, <2 x i32> <i32 0, i32 3> ret <2 x double> %2 @@ -661,12 +761,12 @@ define <2 x double> @insert_test_sub_sd(<2 x double> %a, <2 x double> %b) { ; SSE-LABEL: insert_test_sub_sd: ; SSE: # %bb.0: ; SSE-NEXT: subsd %xmm1, %xmm0 -; SSE-NEXT: retq +; SSE-NEXT: ret{{[l|q]}} ; ; AVX-LABEL: insert_test_sub_sd: ; AVX: # %bb.0: ; AVX-NEXT: vsubsd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX-NEXT: ret{{[l|q]}} %1 = fsub <2 x double> %a, %b %2 = shufflevector <2 x double> %1, <2 x double> %a, <2 x i32> <i32 0, i32 3> ret <2 x double> %2 @@ -676,12 +776,12 @@ define <2 x double> @insert_test_mul_sd(<2 x double> %a, <2 x double> %b) { ; SSE-LABEL: insert_test_mul_sd: ; SSE: # %bb.0: ; SSE-NEXT: mulsd %xmm1, %xmm0 -; SSE-NEXT: retq +; SSE-NEXT: ret{{[l|q]}} ; ; AVX-LABEL: insert_test_mul_sd: ; AVX: # %bb.0: ; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX-NEXT: ret{{[l|q]}} %1 = fmul <2 x double> %a, %b %2 = shufflevector <2 x double> %1, <2 x double> %a, <2 x i32> <i32 0, i32 3> ret <2 x double> %2 @@ -691,12 +791,12 @@ define <2 x double> @insert_test_div_sd(<2 x double> %a, <2 x double> %b) { ; SSE-LABEL: insert_test_div_sd: ; SSE: # %bb.0: ; SSE-NEXT: divsd %xmm1, %xmm0 -; SSE-NEXT: retq +; SSE-NEXT: ret{{[l|q]}} ; ; AVX-LABEL: insert_test_div_sd: ; AVX: # %bb.0: ; AVX-NEXT: vdivsd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX-NEXT: ret{{[l|q]}} %1 = fdiv <2 x double> %a, %b %2 = shufflevector <2 x double> %1, <2 x double> %a, <2 x i32> <i32 0, i32 3> ret <2 x double> %2 @@ -707,12 +807,12 @@ define <4 x float> @insert_test2_add_ss(<4 x float> %a, <4 x float> %b) { ; SSE: # %bb.0: ; SSE-NEXT: addss %xmm0, %xmm1 ; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: retq +; SSE-NEXT: ret{{[l|q]}} ; ; AVX-LABEL: insert_test2_add_ss: ; AVX: # %bb.0: ; AVX-NEXT: vaddss %xmm0, %xmm1, %xmm0 -; AVX-NEXT: retq +; AVX-NEXT: ret{{[l|q]}} %1 = fadd <4 x float> %b, %a %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 6, i32 7> ret <4 x float> %2 @@ -723,12 +823,12 @@ define <4 x float> @insert_test2_sub_ss(<4 x float> %a, <4 x float> %b) { ; SSE: # %bb.0: ; SSE-NEXT: subss %xmm0, %xmm1 ; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: retq +; SSE-NEXT: ret{{[l|q]}} ; ; AVX-LABEL: insert_test2_sub_ss: ; AVX: # %bb.0: ; AVX-NEXT: vsubss %xmm0, %xmm1, %xmm0 -; AVX-NEXT: retq +; AVX-NEXT: ret{{[l|q]}} %1 = fsub <4 x float> %b, %a %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 6, i32 7> ret <4 x float> %2 @@ -739,12 +839,12 @@ define <4 x float> @insert_test2_mul_ss(<4 x float> %a, <4 x float> %b) { ; SSE: # %bb.0: ; SSE-NEXT: mulss %xmm0, %xmm1 ; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: retq +; SSE-NEXT: ret{{[l|q]}} ; ; AVX-LABEL: insert_test2_mul_ss: ; AVX: # %bb.0: ; AVX-NEXT: vmulss %xmm0, %xmm1, %xmm0 -; AVX-NEXT: retq +; AVX-NEXT: ret{{[l|q]}} %1 = fmul <4 x float> 
%b, %a %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 6, i32 7> ret <4 x float> %2 @@ -755,12 +855,12 @@ define <4 x float> @insert_test2_div_ss(<4 x float> %a, <4 x float> %b) { ; SSE: # %bb.0: ; SSE-NEXT: divss %xmm0, %xmm1 ; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: retq +; SSE-NEXT: ret{{[l|q]}} ; ; AVX-LABEL: insert_test2_div_ss: ; AVX: # %bb.0: ; AVX-NEXT: vdivss %xmm0, %xmm1, %xmm0 -; AVX-NEXT: retq +; AVX-NEXT: ret{{[l|q]}} %1 = fdiv <4 x float> %b, %a %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 6, i32 7> ret <4 x float> %2 @@ -771,12 +871,12 @@ define <2 x double> @insert_test2_add_sd(<2 x double> %a, <2 x double> %b) { ; SSE: # %bb.0: ; SSE-NEXT: addsd %xmm0, %xmm1 ; SSE-NEXT: movapd %xmm1, %xmm0 -; SSE-NEXT: retq +; SSE-NEXT: ret{{[l|q]}} ; ; AVX-LABEL: insert_test2_add_sd: ; AVX: # %bb.0: ; AVX-NEXT: vaddsd %xmm0, %xmm1, %xmm0 -; AVX-NEXT: retq +; AVX-NEXT: ret{{[l|q]}} %1 = fadd <2 x double> %b, %a %2 = shufflevector <2 x double> %1, <2 x double> %b, <2 x i32> <i32 0, i32 3> ret <2 x double> %2 @@ -787,12 +887,12 @@ define <2 x double> @insert_test2_sub_sd(<2 x double> %a, <2 x double> %b) { ; SSE: # %bb.0: ; SSE-NEXT: subsd %xmm0, %xmm1 ; SSE-NEXT: movapd %xmm1, %xmm0 -; SSE-NEXT: retq +; SSE-NEXT: ret{{[l|q]}} ; ; AVX-LABEL: insert_test2_sub_sd: ; AVX: # %bb.0: ; AVX-NEXT: vsubsd %xmm0, %xmm1, %xmm0 -; AVX-NEXT: retq +; AVX-NEXT: ret{{[l|q]}} %1 = fsub <2 x double> %b, %a %2 = shufflevector <2 x double> %1, <2 x double> %b, <2 x i32> <i32 0, i32 3> ret <2 x double> %2 @@ -803,12 +903,12 @@ define <2 x double> @insert_test2_mul_sd(<2 x double> %a, <2 x double> %b) { ; SSE: # %bb.0: ; SSE-NEXT: mulsd %xmm0, %xmm1 ; SSE-NEXT: movapd %xmm1, %xmm0 -; SSE-NEXT: retq +; SSE-NEXT: ret{{[l|q]}} ; ; AVX-LABEL: insert_test2_mul_sd: ; AVX: # %bb.0: ; AVX-NEXT: vmulsd %xmm0, %xmm1, %xmm0 -; AVX-NEXT: retq +; AVX-NEXT: ret{{[l|q]}} %1 = fmul <2 x double> %b, %a %2 = shufflevector <2 x double> %1, <2 x double> %b, <2 x i32> <i32 0, i32 3> ret <2 x double> %2 @@ -819,12 +919,12 @@ define <2 x double> @insert_test2_div_sd(<2 x double> %a, <2 x double> %b) { ; SSE: # %bb.0: ; SSE-NEXT: divsd %xmm0, %xmm1 ; SSE-NEXT: movapd %xmm1, %xmm0 -; SSE-NEXT: retq +; SSE-NEXT: ret{{[l|q]}} ; ; AVX-LABEL: insert_test2_div_sd: ; AVX: # %bb.0: ; AVX-NEXT: vdivsd %xmm0, %xmm1, %xmm0 -; AVX-NEXT: retq +; AVX-NEXT: ret{{[l|q]}} %1 = fdiv <2 x double> %b, %a %2 = shufflevector <2 x double> %1, <2 x double> %b, <2 x i32> <i32 0, i32 3> ret <2 x double> %2 @@ -834,12 +934,12 @@ define <4 x float> @insert_test3_add_ss(<4 x float> %a, <4 x float> %b) { ; SSE-LABEL: insert_test3_add_ss: ; SSE: # %bb.0: ; SSE-NEXT: addss %xmm1, %xmm0 -; SSE-NEXT: retq +; SSE-NEXT: ret{{[l|q]}} ; ; AVX-LABEL: insert_test3_add_ss: ; AVX: # %bb.0: ; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX-NEXT: ret{{[l|q]}} %1 = fadd <4 x float> %a, %b %2 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %a, <4 x float> %1 ret <4 x float> %2 @@ -849,12 +949,12 @@ define <4 x float> @insert_test3_sub_ss(<4 x float> %a, <4 x float> %b) { ; SSE-LABEL: insert_test3_sub_ss: ; SSE: # %bb.0: ; SSE-NEXT: subss %xmm1, %xmm0 -; SSE-NEXT: retq +; SSE-NEXT: ret{{[l|q]}} ; ; AVX-LABEL: insert_test3_sub_ss: ; AVX: # %bb.0: ; AVX-NEXT: vsubss %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX-NEXT: ret{{[l|q]}} %1 = fsub <4 x float> %a, %b %2 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %a, <4 x float> %1 ret <4 x float> %2 @@ -864,12 
+964,12 @@ define <4 x float> @insert_test3_mul_ss(<4 x float> %a, <4 x float> %b) { ; SSE-LABEL: insert_test3_mul_ss: ; SSE: # %bb.0: ; SSE-NEXT: mulss %xmm1, %xmm0 -; SSE-NEXT: retq +; SSE-NEXT: ret{{[l|q]}} ; ; AVX-LABEL: insert_test3_mul_ss: ; AVX: # %bb.0: ; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX-NEXT: ret{{[l|q]}} %1 = fmul <4 x float> %a, %b %2 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %a, <4 x float> %1 ret <4 x float> %2 @@ -879,12 +979,12 @@ define <4 x float> @insert_test3_div_ss(<4 x float> %a, <4 x float> %b) { ; SSE-LABEL: insert_test3_div_ss: ; SSE: # %bb.0: ; SSE-NEXT: divss %xmm1, %xmm0 -; SSE-NEXT: retq +; SSE-NEXT: ret{{[l|q]}} ; ; AVX-LABEL: insert_test3_div_ss: ; AVX: # %bb.0: ; AVX-NEXT: vdivss %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX-NEXT: ret{{[l|q]}} %1 = fdiv <4 x float> %a, %b %2 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %a, <4 x float> %1 ret <4 x float> %2 @@ -894,12 +994,12 @@ define <2 x double> @insert_test3_add_sd(<2 x double> %a, <2 x double> %b) { ; SSE-LABEL: insert_test3_add_sd: ; SSE: # %bb.0: ; SSE-NEXT: addsd %xmm1, %xmm0 -; SSE-NEXT: retq +; SSE-NEXT: ret{{[l|q]}} ; ; AVX-LABEL: insert_test3_add_sd: ; AVX: # %bb.0: ; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX-NEXT: ret{{[l|q]}} %1 = fadd <2 x double> %a, %b %2 = select <2 x i1> <i1 false, i1 true>, <2 x double> %a, <2 x double> %1 ret <2 x double> %2 @@ -909,12 +1009,12 @@ define <2 x double> @insert_test3_sub_sd(<2 x double> %a, <2 x double> %b) { ; SSE-LABEL: insert_test3_sub_sd: ; SSE: # %bb.0: ; SSE-NEXT: subsd %xmm1, %xmm0 -; SSE-NEXT: retq +; SSE-NEXT: ret{{[l|q]}} ; ; AVX-LABEL: insert_test3_sub_sd: ; AVX: # %bb.0: ; AVX-NEXT: vsubsd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX-NEXT: ret{{[l|q]}} %1 = fsub <2 x double> %a, %b %2 = select <2 x i1> <i1 false, i1 true>, <2 x double> %a, <2 x double> %1 ret <2 x double> %2 @@ -924,12 +1024,12 @@ define <2 x double> @insert_test3_mul_sd(<2 x double> %a, <2 x double> %b) { ; SSE-LABEL: insert_test3_mul_sd: ; SSE: # %bb.0: ; SSE-NEXT: mulsd %xmm1, %xmm0 -; SSE-NEXT: retq +; SSE-NEXT: ret{{[l|q]}} ; ; AVX-LABEL: insert_test3_mul_sd: ; AVX: # %bb.0: ; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX-NEXT: ret{{[l|q]}} %1 = fmul <2 x double> %a, %b %2 = select <2 x i1> <i1 false, i1 true>, <2 x double> %a, <2 x double> %1 ret <2 x double> %2 @@ -939,12 +1039,12 @@ define <2 x double> @insert_test3_div_sd(<2 x double> %a, <2 x double> %b) { ; SSE-LABEL: insert_test3_div_sd: ; SSE: # %bb.0: ; SSE-NEXT: divsd %xmm1, %xmm0 -; SSE-NEXT: retq +; SSE-NEXT: ret{{[l|q]}} ; ; AVX-LABEL: insert_test3_div_sd: ; AVX: # %bb.0: ; AVX-NEXT: vdivsd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX-NEXT: ret{{[l|q]}} %1 = fdiv <2 x double> %a, %b %2 = select <2 x i1> <i1 false, i1 true>, <2 x double> %a, <2 x double> %1 ret <2 x double> %2 @@ -955,12 +1055,12 @@ define <4 x float> @insert_test4_add_ss(<4 x float> %a, <4 x float> %b) { ; SSE: # %bb.0: ; SSE-NEXT: addss %xmm0, %xmm1 ; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: retq +; SSE-NEXT: ret{{[l|q]}} ; ; AVX-LABEL: insert_test4_add_ss: ; AVX: # %bb.0: ; AVX-NEXT: vaddss %xmm0, %xmm1, %xmm0 -; AVX-NEXT: retq +; AVX-NEXT: ret{{[l|q]}} %1 = fadd <4 x float> %b, %a %2 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %b, <4 x float> %1 ret <4 x float> %2 @@ -971,12 +1071,12 @@ define <4 x float> @insert_test4_sub_ss(<4 x float> %a, <4 x float> %b) { ; SSE: # %bb.0: ; SSE-NEXT: subss %xmm0, 
%xmm1 ; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: retq +; SSE-NEXT: ret{{[l|q]}} ; ; AVX-LABEL: insert_test4_sub_ss: ; AVX: # %bb.0: ; AVX-NEXT: vsubss %xmm0, %xmm1, %xmm0 -; AVX-NEXT: retq +; AVX-NEXT: ret{{[l|q]}} %1 = fsub <4 x float> %b, %a %2 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %b, <4 x float> %1 ret <4 x float> %2 @@ -987,12 +1087,12 @@ define <4 x float> @insert_test4_mul_ss(<4 x float> %a, <4 x float> %b) { ; SSE: # %bb.0: ; SSE-NEXT: mulss %xmm0, %xmm1 ; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: retq +; SSE-NEXT: ret{{[l|q]}} ; ; AVX-LABEL: insert_test4_mul_ss: ; AVX: # %bb.0: ; AVX-NEXT: vmulss %xmm0, %xmm1, %xmm0 -; AVX-NEXT: retq +; AVX-NEXT: ret{{[l|q]}} %1 = fmul <4 x float> %b, %a %2 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %b, <4 x float> %1 ret <4 x float> %2 @@ -1003,12 +1103,12 @@ define <4 x float> @insert_test4_div_ss(<4 x float> %a, <4 x float> %b) { ; SSE: # %bb.0: ; SSE-NEXT: divss %xmm0, %xmm1 ; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: retq +; SSE-NEXT: ret{{[l|q]}} ; ; AVX-LABEL: insert_test4_div_ss: ; AVX: # %bb.0: ; AVX-NEXT: vdivss %xmm0, %xmm1, %xmm0 -; AVX-NEXT: retq +; AVX-NEXT: ret{{[l|q]}} %1 = fdiv <4 x float> %b, %a %2 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %b, <4 x float> %1 ret <4 x float> %2 @@ -1019,12 +1119,12 @@ define <2 x double> @insert_test4_add_sd(<2 x double> %a, <2 x double> %b) { ; SSE: # %bb.0: ; SSE-NEXT: addsd %xmm0, %xmm1 ; SSE-NEXT: movapd %xmm1, %xmm0 -; SSE-NEXT: retq +; SSE-NEXT: ret{{[l|q]}} ; ; AVX-LABEL: insert_test4_add_sd: ; AVX: # %bb.0: ; AVX-NEXT: vaddsd %xmm0, %xmm1, %xmm0 -; AVX-NEXT: retq +; AVX-NEXT: ret{{[l|q]}} %1 = fadd <2 x double> %b, %a %2 = select <2 x i1> <i1 false, i1 true>, <2 x double> %b, <2 x double> %1 ret <2 x double> %2 @@ -1035,12 +1135,12 @@ define <2 x double> @insert_test4_sub_sd(<2 x double> %a, <2 x double> %b) { ; SSE: # %bb.0: ; SSE-NEXT: subsd %xmm0, %xmm1 ; SSE-NEXT: movapd %xmm1, %xmm0 -; SSE-NEXT: retq +; SSE-NEXT: ret{{[l|q]}} ; ; AVX-LABEL: insert_test4_sub_sd: ; AVX: # %bb.0: ; AVX-NEXT: vsubsd %xmm0, %xmm1, %xmm0 -; AVX-NEXT: retq +; AVX-NEXT: ret{{[l|q]}} %1 = fsub <2 x double> %b, %a %2 = select <2 x i1> <i1 false, i1 true>, <2 x double> %b, <2 x double> %1 ret <2 x double> %2 @@ -1051,12 +1151,12 @@ define <2 x double> @insert_test4_mul_sd(<2 x double> %a, <2 x double> %b) { ; SSE: # %bb.0: ; SSE-NEXT: mulsd %xmm0, %xmm1 ; SSE-NEXT: movapd %xmm1, %xmm0 -; SSE-NEXT: retq +; SSE-NEXT: ret{{[l|q]}} ; ; AVX-LABEL: insert_test4_mul_sd: ; AVX: # %bb.0: ; AVX-NEXT: vmulsd %xmm0, %xmm1, %xmm0 -; AVX-NEXT: retq +; AVX-NEXT: ret{{[l|q]}} %1 = fmul <2 x double> %b, %a %2 = select <2 x i1> <i1 false, i1 true>, <2 x double> %b, <2 x double> %1 ret <2 x double> %2 @@ -1067,58 +1167,100 @@ define <2 x double> @insert_test4_div_sd(<2 x double> %a, <2 x double> %b) { ; SSE: # %bb.0: ; SSE-NEXT: divsd %xmm0, %xmm1 ; SSE-NEXT: movapd %xmm1, %xmm0 -; SSE-NEXT: retq +; SSE-NEXT: ret{{[l|q]}} ; ; AVX-LABEL: insert_test4_div_sd: ; AVX: # %bb.0: ; AVX-NEXT: vdivsd %xmm0, %xmm1, %xmm0 -; AVX-NEXT: retq +; AVX-NEXT: ret{{[l|q]}} %1 = fdiv <2 x double> %b, %a %2 = select <2 x i1> <i1 false, i1 true>, <2 x double> %b, <2 x double> %1 ret <2 x double> %2 } define <4 x float> @add_ss_mask(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) { -; SSE2-LABEL: add_ss_mask: -; SSE2: # %bb.0: -; SSE2-NEXT: testb $1, %dil -; SSE2-NEXT: jne .LBB62_1 -; SSE2-NEXT: # %bb.2: -; SSE2-NEXT: movss {{.*#+}} xmm0 = 
xmm2[0],xmm0[1,2,3] -; SSE2-NEXT: retq -; SSE2-NEXT: .LBB62_1: -; SSE2-NEXT: addss %xmm0, %xmm1 -; SSE2-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] -; SSE2-NEXT: retq -; -; SSE41-LABEL: add_ss_mask: -; SSE41: # %bb.0: -; SSE41-NEXT: testb $1, %dil -; SSE41-NEXT: jne .LBB62_1 -; SSE41-NEXT: # %bb.2: -; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3] -; SSE41-NEXT: retq -; SSE41-NEXT: .LBB62_1: -; SSE41-NEXT: addss %xmm0, %xmm1 -; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] -; SSE41-NEXT: retq -; -; AVX1-LABEL: add_ss_mask: -; AVX1: # %bb.0: -; AVX1-NEXT: testb $1, %dil -; AVX1-NEXT: je .LBB62_2 -; AVX1-NEXT: # %bb.1: -; AVX1-NEXT: vaddss %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: .LBB62_2: -; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3] -; AVX1-NEXT: retq -; -; AVX512-LABEL: add_ss_mask: -; AVX512: # %bb.0: -; AVX512-NEXT: kmovw %edi, %k1 -; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm2 {%k1} -; AVX512-NEXT: vmovaps %xmm2, %xmm0 -; AVX512-NEXT: retq +; X86-SSE2-LABEL: add_ss_mask: +; X86-SSE2: # %bb.0: +; X86-SSE2-NEXT: testb $1, {{[0-9]+}}(%esp) +; X86-SSE2-NEXT: jne .LBB62_1 +; X86-SSE2-NEXT: # %bb.2: +; X86-SSE2-NEXT: movss {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3] +; X86-SSE2-NEXT: retl +; X86-SSE2-NEXT: .LBB62_1: +; X86-SSE2-NEXT: addss %xmm0, %xmm1 +; X86-SSE2-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] +; X86-SSE2-NEXT: retl +; +; X86-SSE41-LABEL: add_ss_mask: +; X86-SSE41: # %bb.0: +; X86-SSE41-NEXT: testb $1, {{[0-9]+}}(%esp) +; X86-SSE41-NEXT: jne .LBB62_1 +; X86-SSE41-NEXT: # %bb.2: +; X86-SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3] +; X86-SSE41-NEXT: retl +; X86-SSE41-NEXT: .LBB62_1: +; X86-SSE41-NEXT: addss %xmm0, %xmm1 +; X86-SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] +; X86-SSE41-NEXT: retl +; +; X86-AVX1-LABEL: add_ss_mask: +; X86-AVX1: # %bb.0: +; X86-AVX1-NEXT: testb $1, {{[0-9]+}}(%esp) +; X86-AVX1-NEXT: je .LBB62_2 +; X86-AVX1-NEXT: # %bb.1: +; X86-AVX1-NEXT: vaddss %xmm1, %xmm0, %xmm2 +; X86-AVX1-NEXT: .LBB62_2: +; X86-AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3] +; X86-AVX1-NEXT: retl +; +; X86-AVX512-LABEL: add_ss_mask: +; X86-AVX512: # %bb.0: +; X86-AVX512-NEXT: movb {{[0-9]+}}(%esp), %al +; X86-AVX512-NEXT: kmovw %eax, %k1 +; X86-AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm2 {%k1} +; X86-AVX512-NEXT: vmovaps %xmm2, %xmm0 +; X86-AVX512-NEXT: retl +; +; X64-SSE2-LABEL: add_ss_mask: +; X64-SSE2: # %bb.0: +; X64-SSE2-NEXT: testb $1, %dil +; X64-SSE2-NEXT: jne .LBB62_1 +; X64-SSE2-NEXT: # %bb.2: +; X64-SSE2-NEXT: movss {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3] +; X64-SSE2-NEXT: retq +; X64-SSE2-NEXT: .LBB62_1: +; X64-SSE2-NEXT: addss %xmm0, %xmm1 +; X64-SSE2-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] +; X64-SSE2-NEXT: retq +; +; X64-SSE41-LABEL: add_ss_mask: +; X64-SSE41: # %bb.0: +; X64-SSE41-NEXT: testb $1, %dil +; X64-SSE41-NEXT: jne .LBB62_1 +; X64-SSE41-NEXT: # %bb.2: +; X64-SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3] +; X64-SSE41-NEXT: retq +; X64-SSE41-NEXT: .LBB62_1: +; X64-SSE41-NEXT: addss %xmm0, %xmm1 +; X64-SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] +; X64-SSE41-NEXT: retq +; +; X64-AVX1-LABEL: add_ss_mask: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: testb $1, %dil +; X64-AVX1-NEXT: je .LBB62_2 +; X64-AVX1-NEXT: # %bb.1: +; X64-AVX1-NEXT: vaddss %xmm1, %xmm0, %xmm2 +; X64-AVX1-NEXT: .LBB62_2: +; X64-AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3] +; X64-AVX1-NEXT: retq +; +; X64-AVX512-LABEL: add_ss_mask: +; X64-AVX512: # %bb.0: +; X64-AVX512-NEXT: kmovw %edi, %k1 +; X64-AVX512-NEXT: 
vaddss %xmm1, %xmm0, %xmm2 {%k1} +; X64-AVX512-NEXT: vmovaps %xmm2, %xmm0 +; X64-AVX512-NEXT: retq %1 = extractelement <4 x float> %a, i64 0 %2 = extractelement <4 x float> %b, i64 0 %3 = fadd float %1, %2 @@ -1131,46 +1273,88 @@ define <4 x float> @add_ss_mask(<4 x float> %a, <4 x float> %b, <4 x float> %c, } define <2 x double> @add_sd_mask(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) { -; SSE2-LABEL: add_sd_mask: -; SSE2: # %bb.0: -; SSE2-NEXT: testb $1, %dil -; SSE2-NEXT: jne .LBB63_1 -; SSE2-NEXT: # %bb.2: -; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] -; SSE2-NEXT: retq -; SSE2-NEXT: .LBB63_1: -; SSE2-NEXT: addsd %xmm0, %xmm1 -; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] -; SSE2-NEXT: retq -; -; SSE41-LABEL: add_sd_mask: -; SSE41: # %bb.0: -; SSE41-NEXT: testb $1, %dil -; SSE41-NEXT: jne .LBB63_1 -; SSE41-NEXT: # %bb.2: -; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] -; SSE41-NEXT: retq -; SSE41-NEXT: .LBB63_1: -; SSE41-NEXT: addsd %xmm0, %xmm1 -; SSE41-NEXT: blendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1] -; SSE41-NEXT: retq -; -; AVX1-LABEL: add_sd_mask: -; AVX1: # %bb.0: -; AVX1-NEXT: testb $1, %dil -; AVX1-NEXT: je .LBB63_2 -; AVX1-NEXT: # %bb.1: -; AVX1-NEXT: vaddsd %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: .LBB63_2: -; AVX1-NEXT: vblendpd {{.*#+}} xmm0 = xmm2[0],xmm0[1] -; AVX1-NEXT: retq -; -; AVX512-LABEL: add_sd_mask: -; AVX512: # %bb.0: -; AVX512-NEXT: kmovw %edi, %k1 -; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm2 {%k1} -; AVX512-NEXT: vmovapd %xmm2, %xmm0 -; AVX512-NEXT: retq +; X86-SSE2-LABEL: add_sd_mask: +; X86-SSE2: # %bb.0: +; X86-SSE2-NEXT: testb $1, {{[0-9]+}}(%esp) +; X86-SSE2-NEXT: jne .LBB63_1 +; X86-SSE2-NEXT: # %bb.2: +; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] +; X86-SSE2-NEXT: retl +; X86-SSE2-NEXT: .LBB63_1: +; X86-SSE2-NEXT: addsd %xmm0, %xmm1 +; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] +; X86-SSE2-NEXT: retl +; +; X86-SSE41-LABEL: add_sd_mask: +; X86-SSE41: # %bb.0: +; X86-SSE41-NEXT: testb $1, {{[0-9]+}}(%esp) +; X86-SSE41-NEXT: jne .LBB63_1 +; X86-SSE41-NEXT: # %bb.2: +; X86-SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] +; X86-SSE41-NEXT: retl +; X86-SSE41-NEXT: .LBB63_1: +; X86-SSE41-NEXT: addsd %xmm0, %xmm1 +; X86-SSE41-NEXT: blendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1] +; X86-SSE41-NEXT: retl +; +; X86-AVX1-LABEL: add_sd_mask: +; X86-AVX1: # %bb.0: +; X86-AVX1-NEXT: testb $1, {{[0-9]+}}(%esp) +; X86-AVX1-NEXT: je .LBB63_2 +; X86-AVX1-NEXT: # %bb.1: +; X86-AVX1-NEXT: vaddsd %xmm1, %xmm0, %xmm2 +; X86-AVX1-NEXT: .LBB63_2: +; X86-AVX1-NEXT: vblendpd {{.*#+}} xmm0 = xmm2[0],xmm0[1] +; X86-AVX1-NEXT: retl +; +; X86-AVX512-LABEL: add_sd_mask: +; X86-AVX512: # %bb.0: +; X86-AVX512-NEXT: movb {{[0-9]+}}(%esp), %al +; X86-AVX512-NEXT: kmovw %eax, %k1 +; X86-AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm2 {%k1} +; X86-AVX512-NEXT: vmovapd %xmm2, %xmm0 +; X86-AVX512-NEXT: retl +; +; X64-SSE2-LABEL: add_sd_mask: +; X64-SSE2: # %bb.0: +; X64-SSE2-NEXT: testb $1, %dil +; X64-SSE2-NEXT: jne .LBB63_1 +; X64-SSE2-NEXT: # %bb.2: +; X64-SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] +; X64-SSE2-NEXT: retq +; X64-SSE2-NEXT: .LBB63_1: +; X64-SSE2-NEXT: addsd %xmm0, %xmm1 +; X64-SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] +; X64-SSE2-NEXT: retq +; +; X64-SSE41-LABEL: add_sd_mask: +; X64-SSE41: # %bb.0: +; X64-SSE41-NEXT: testb $1, %dil +; X64-SSE41-NEXT: jne .LBB63_1 +; X64-SSE41-NEXT: # %bb.2: +; X64-SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] +; X64-SSE41-NEXT: retq +; X64-SSE41-NEXT: .LBB63_1: +; X64-SSE41-NEXT: 
addsd %xmm0, %xmm1 +; X64-SSE41-NEXT: blendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1] +; X64-SSE41-NEXT: retq +; +; X64-AVX1-LABEL: add_sd_mask: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: testb $1, %dil +; X64-AVX1-NEXT: je .LBB63_2 +; X64-AVX1-NEXT: # %bb.1: +; X64-AVX1-NEXT: vaddsd %xmm1, %xmm0, %xmm2 +; X64-AVX1-NEXT: .LBB63_2: +; X64-AVX1-NEXT: vblendpd {{.*#+}} xmm0 = xmm2[0],xmm0[1] +; X64-AVX1-NEXT: retq +; +; X64-AVX512-LABEL: add_sd_mask: +; X64-AVX512: # %bb.0: +; X64-AVX512-NEXT: kmovw %edi, %k1 +; X64-AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm2 {%k1} +; X64-AVX512-NEXT: vmovapd %xmm2, %xmm0 +; X64-AVX512-NEXT: retq %1 = extractelement <2 x double> %a, i64 0 %2 = extractelement <2 x double> %b, i64 0 %3 = fadd double %1, %2 diff --git a/llvm/test/CodeGen/X86/sse1.ll b/llvm/test/CodeGen/X86/sse1.ll index b405b8aa2f5..2859387de04 100644 --- a/llvm/test/CodeGen/X86/sse1.ll +++ b/llvm/test/CodeGen/X86/sse1.ll @@ -1,7 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=i386-unknown-unknown -mattr=+sse -O3 | FileCheck %s --check-prefixes=CHECK,X86 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=-sse2,+sse -O3 | FileCheck %s --check-prefixes=CHECK,X64 + ; Tests for SSE1 and below, without SSE2+. -; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=pentium3 -O3 | FileCheck %s --check-prefix=X32 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=-sse2,+sse -O3 | FileCheck %s --check-prefix=X64 ; PR7993 ;define <4 x i32> @test3(<4 x i16> %a) nounwind { @@ -13,25 +14,15 @@ ; vector that this ends up returning. ; rdar://8368414 define <2 x float> @test4(<2 x float> %A, <2 x float> %B) nounwind { -; X32-LABEL: test4: -; X32: # %bb.0: # %entry -; X32-NEXT: movaps %xmm0, %xmm2 -; X32-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm0[2,3] -; X32-NEXT: addss %xmm1, %xmm0 -; X32-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,2,3] -; X32-NEXT: subss %xmm1, %xmm2 -; X32-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; X32-NEXT: retl -; -; X64-LABEL: test4: -; X64: # %bb.0: # %entry -; X64-NEXT: movaps %xmm0, %xmm2 -; X64-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm0[2,3] -; X64-NEXT: addss %xmm1, %xmm0 -; X64-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,2,3] -; X64-NEXT: subss %xmm1, %xmm2 -; X64-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; X64-NEXT: retq +; CHECK-LABEL: test4: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movaps %xmm0, %xmm2 +; CHECK-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm0[2,3] +; CHECK-NEXT: addss %xmm1, %xmm0 +; CHECK-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,2,3] +; CHECK-NEXT: subss %xmm1, %xmm2 +; CHECK-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; CHECK-NEXT: ret{{[l|q]}} entry: %tmp7 = extractelement <2 x float> %A, i32 0 %tmp5 = extractelement <2 x float> %A, i32 1 @@ -51,44 +42,44 @@ entry: ; PR18036 define <4 x float> @vselect(<4 x float>*%p, <4 x i32> %q) { -; X32-LABEL: vselect: -; X32: # %bb.0: # %entry -; X32-NEXT: cmpl $0, {{[0-9]+}}(%esp) -; X32-NEXT: xorps %xmm0, %xmm0 -; X32-NEXT: je .LBB1_1 -; X32-NEXT: # %bb.2: # %entry -; X32-NEXT: xorps %xmm1, %xmm1 -; X32-NEXT: cmpl $0, {{[0-9]+}}(%esp) -; X32-NEXT: jne .LBB1_5 -; X32-NEXT: .LBB1_4: -; X32-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; X32-NEXT: cmpl $0, {{[0-9]+}}(%esp) -; X32-NEXT: jne .LBB1_8 -; X32-NEXT: .LBB1_7: -; X32-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero -; X32-NEXT: cmpl $0, {{[0-9]+}}(%esp) -; X32-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; X32-NEXT: je .LBB1_10 -; 
X32-NEXT: jmp .LBB1_11 -; X32-NEXT: .LBB1_1: -; X32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; X32-NEXT: cmpl $0, {{[0-9]+}}(%esp) -; X32-NEXT: je .LBB1_4 -; X32-NEXT: .LBB1_5: # %entry -; X32-NEXT: xorps %xmm2, %xmm2 -; X32-NEXT: cmpl $0, {{[0-9]+}}(%esp) -; X32-NEXT: je .LBB1_7 -; X32-NEXT: .LBB1_8: # %entry -; X32-NEXT: xorps %xmm3, %xmm3 -; X32-NEXT: cmpl $0, {{[0-9]+}}(%esp) -; X32-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; X32-NEXT: jne .LBB1_11 -; X32-NEXT: .LBB1_10: -; X32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X32-NEXT: .LBB1_11: # %entry -; X32-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; X32-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; X32-NEXT: retl +; X86-LABEL: vselect: +; X86: # %bb.0: # %entry +; X86-NEXT: cmpl $0, {{[0-9]+}}(%esp) +; X86-NEXT: xorps %xmm0, %xmm0 +; X86-NEXT: je .LBB1_1 +; X86-NEXT: # %bb.2: # %entry +; X86-NEXT: xorps %xmm1, %xmm1 +; X86-NEXT: cmpl $0, {{[0-9]+}}(%esp) +; X86-NEXT: jne .LBB1_5 +; X86-NEXT: .LBB1_4: +; X86-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; X86-NEXT: cmpl $0, {{[0-9]+}}(%esp) +; X86-NEXT: jne .LBB1_8 +; X86-NEXT: .LBB1_7: +; X86-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero +; X86-NEXT: cmpl $0, {{[0-9]+}}(%esp) +; X86-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; X86-NEXT: je .LBB1_10 +; X86-NEXT: jmp .LBB1_11 +; X86-NEXT: .LBB1_1: +; X86-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X86-NEXT: cmpl $0, {{[0-9]+}}(%esp) +; X86-NEXT: je .LBB1_4 +; X86-NEXT: .LBB1_5: # %entry +; X86-NEXT: xorps %xmm2, %xmm2 +; X86-NEXT: cmpl $0, {{[0-9]+}}(%esp) +; X86-NEXT: je .LBB1_7 +; X86-NEXT: .LBB1_8: # %entry +; X86-NEXT: xorps %xmm3, %xmm3 +; X86-NEXT: cmpl $0, {{[0-9]+}}(%esp) +; X86-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; X86-NEXT: jne .LBB1_11 +; X86-NEXT: .LBB1_10: +; X86-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-NEXT: .LBB1_11: # %entry +; X86-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; X86-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; X86-NEXT: retl ; ; X64-LABEL: vselect: ; X64: # %bb.0: # %entry @@ -137,15 +128,10 @@ entry: ; v4i32 isn't legal for SSE1, but this should be cmpps. 
 define <4 x float> @PR28044(<4 x float> %a0, <4 x float> %a1) nounwind {
-; X32-LABEL: PR28044:
-; X32: # %bb.0:
-; X32-NEXT: cmpeqps %xmm1, %xmm0
-; X32-NEXT: retl
-;
-; X64-LABEL: PR28044:
-; X64: # %bb.0:
-; X64-NEXT: cmpeqps %xmm1, %xmm0
-; X64-NEXT: retq
+; CHECK-LABEL: PR28044:
+; CHECK: # %bb.0:
+; CHECK-NEXT: cmpeqps %xmm1, %xmm0
+; CHECK-NEXT: ret{{[l|q]}}
  %cmp = fcmp oeq <4 x float> %a0, %a1
  %sext = sext <4 x i1> %cmp to <4 x i32>
  %res = bitcast <4 x i32> %sext to <4 x float>
@@ -156,51 +142,51 @@ define <4 x float> @PR28044(<4 x float> %a0, <4 x float> %a1) nounwind {
 ; https://llvm.org/bugs/show_bug.cgi?id=30512
 define <4 x i32> @PR30512(<4 x i32> %x, <4 x i32> %y) nounwind {
-; X32-LABEL: PR30512:
-; X32: # %bb.0:
-; X32-NEXT: pushl %ebx
-; X32-NEXT: pushl %edi
-; X32-NEXT: pushl %esi
-; X32-NEXT: subl $16, %esp
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X32-NEXT: xorl %ebx, %ebx
-; X32-NEXT: cmpl {{[0-9]+}}(%esp), %edi
-; X32-NEXT: sete %bl
-; X32-NEXT: negl %ebx
-; X32-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; X32-NEXT: xorl %ebx, %ebx
-; X32-NEXT: cmpl {{[0-9]+}}(%esp), %esi
-; X32-NEXT: sete %bl
-; X32-NEXT: negl %ebx
-; X32-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; X32-NEXT: xorl %ebx, %ebx
-; X32-NEXT: cmpl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: sete %bl
-; X32-NEXT: negl %ebx
-; X32-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; X32-NEXT: xorl %edx, %edx
-; X32-NEXT: cmpl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: sete %dl
-; X32-NEXT: negl %edx
-; X32-NEXT: movl %edx, (%esp)
-; X32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X32-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; X32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X32-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; X32-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
-; X32-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0]
-; X32-NEXT: andps {{\.LCPI.*}}, %xmm2
-; X32-NEXT: movaps %xmm2, (%eax)
-; X32-NEXT: addl $16, %esp
-; X32-NEXT: popl %esi
-; X32-NEXT: popl %edi
-; X32-NEXT: popl %ebx
-; X32-NEXT: retl $4
+; X86-LABEL: PR30512:
+; X86: # %bb.0:
+; X86-NEXT: pushl %ebx
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl %esi
+; X86-NEXT: subl $16, %esp
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: xorl %ebx, %ebx
+; X86-NEXT: cmpl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: sete %bl
+; X86-NEXT: negl %ebx
+; X86-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; X86-NEXT: xorl %ebx, %ebx
+; X86-NEXT: cmpl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: sete %bl
+; X86-NEXT: negl %ebx
+; X86-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; X86-NEXT: xorl %ebx, %ebx
+; X86-NEXT: cmpl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: sete %bl
+; X86-NEXT: negl %ebx
+; X86-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; X86-NEXT: xorl %edx, %edx
+; X86-NEXT: cmpl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: sete %dl
+; X86-NEXT: negl %edx
+; X86-NEXT: movl %edx, (%esp)
+; X86-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X86-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; X86-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; X86-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; X86-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0]
+; X86-NEXT: andps {{\.LCPI.*}}, %xmm2
+; X86-NEXT: movaps %xmm2, (%eax)
+; X86-NEXT: addl $16, %esp
+; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
+; X86-NEXT: popl %ebx
+; X86-NEXT: retl $4
 ;
 ; X64-LABEL: PR30512:
 ; X64: # %bb.0:
@@ -250,10 +236,10 @@ define <4 x i32> @PR30512(<4 x i32> %x, <4 x i32> %y) nounwind {
 ; scalarizing it anyway.
 
 define <2 x float> @PR31672() #0 {
-; X32-LABEL: PR31672:
-; X32: # %bb.0:
-; X32-NEXT: sqrtps {{\.LCPI.*}}, %xmm0
-; X32-NEXT: retl
+; X86-LABEL: PR31672:
+; X86: # %bb.0:
+; X86-NEXT: sqrtps {{\.LCPI.*}}, %xmm0
+; X86-NEXT: retl
 ;
 ; X64-LABEL: PR31672:
 ; X64: # %bb.0:
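
The add_sd_mask hunk near the top of this change shows only the first three IR lines of the test body; the mask/select tail is cut off by the diff context. For reference, IR of roughly the following shape produces the checked assembly: bit 0 of the i8 mask selects between the scalar sum and the low element of a passthrough operand, and the result is reinserted into lane 0 of the first source vector. This is a reconstruction from the CHECK lines, not a quote of the test; treat %c, %mask, and the %4..%8 tail as assumptions.

define <2 x double> @add_sd_mask(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) {
  ; Scalar add of the low elements (these three lines are visible in the hunk).
  %1 = extractelement <2 x double> %a, i64 0
  %2 = extractelement <2 x double> %b, i64 0
  %3 = fadd double %1, %2
  ; Reconstructed tail (assumption): use mask bit 0 to pick either the sum or
  ; the passthrough element from %c, then put the result back into lane 0 of %a.
  %4 = bitcast i8 %mask to <8 x i1>
  %5 = extractelement <8 x i1> %4, i64 0
  %6 = extractelement <2 x double> %c, i64 0
  %7 = select i1 %5, double %3, double %6
  %8 = insertelement <2 x double> %a, double %7, i64 0
  ret <2 x double> %8
}

Under that reading the three check blocks line up: AVX512 folds the whole pattern into a single merge-masked vaddsd with %c preloaded into the destination register, while SSE4.1 and AVX1 have no masked scalar ops and fall back on a testb branch plus a blendpd.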
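Similarly, the PR30512 hunks show only the generated assembly, not the IR. Assembly of this shape (a scalar cmpl/sete/negl per lane, the lanes repacked with movss/unpcklps/movlhps, then andps against a constant-pool vector) is what an equality compare of two <4 x i32> values lowers to when only SSE1 is available; a sketch of IR consistent with it, again an assumption rather than a quote:

define <4 x i32> @PR30512(<4 x i32> %x, <4 x i32> %y) nounwind {
  ; Without SSE2 there is no pcmpeqd, so each lane is compared with scalar
  ; code: sete/negl yields 0 or -1 per lane, and the final andps with a
  ; {1,1,1,1} constant narrows that to the 0/1 values a zext requires.
  %cmp = icmp eq <4 x i32> %x, %y
  %zext = zext <4 x i1> %cmp to <4 x i32>
  ret <4 x i32> %zext
}

The retl $4 and the store through (%eax) in the X86 version come from the i386 ABI: a <4 x i32> return value is passed through a hidden sret pointer, which the callee pops on return.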

