| author | Simon Pilgrim <llvm-dev@redking.me.uk> | 2018-06-02 20:25:56 +0000 |
|---|---|---|
| committer | Simon Pilgrim <llvm-dev@redking.me.uk> | 2018-06-02 20:25:56 +0000 |
| commit | 58ff2ecc4be562b7302309338a5c0bf980eb93f9 (patch) | |
| tree | 2b2cd63e486b5f5f5e818d3fd1a1b5aa30220878 | |
| parent | 87908448486b6d076bad6802d82ae544577acdcf (diff) | |
[X86][SSE] Cleanup SSE1 intrinsics tests
Ensure we cover 32/64-bit targets for SSE/AVX/AVX512 cases as necessary
llvm-svn: 333833
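The cleanup standardizes these tests on multi-prefix RUN lines: one IR body is compiled for several feature sets, and FileCheck prefixes (CHECK, SSE, AVX, AVX1, AVX512, plus X86-/X64- variants) select the per-target assertions. A minimal sketch of the pattern — a hypothetical test written for illustration, not taken from the patch:

```llvm
; Hypothetical .ll test showing the prefix scheme used throughout this patch.
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse | FileCheck %s --check-prefixes=CHECK,SSE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX,AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefixes=CHECK,AVX,AVX512

define <4 x float> @mul_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
; SSE-LABEL: mul_ps:
; SSE:       mulps %xmm1, %xmm0
;
; AVX-LABEL: mul_ps:
; AVX:       vmulps %xmm1, %xmm0, %xmm0
  %res = fmul <4 x float> %a0, %a1
  ret <4 x float> %res
}
```

When AVX1 and AVX512 produce the same instruction, the shared AVX (or CHECK) prefix keeps one check block instead of three; the prefixes split only where codegen actually diverges, as in the cmp*_ps tests in the diff below.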
| -rw-r--r-- | llvm/test/CodeGen/X86/sse-intrinsics-fast-isel-x86_64.ll | 43 |
| -rw-r--r-- | llvm/test/CodeGen/X86/sse-intrinsics-fast-isel.ll | 3277 |
| -rw-r--r-- | llvm/test/CodeGen/X86/sse-intrinsics-x86-upgrade.ll | 149 |
| -rw-r--r-- | llvm/test/CodeGen/X86/sse-intrinsics-x86.ll | 692 |
| -rw-r--r-- | llvm/test/CodeGen/X86/sse-intrinsics-x86_64-upgrade.ll | 22 |
| -rw-r--r-- | llvm/test/CodeGen/X86/sse-intrinsics-x86_64.ll | 38 |
| -rw-r--r-- | llvm/test/CodeGen/X86/sse-scalar-fp-arith-unary.ll | 27 |
| -rw-r--r-- | llvm/test/CodeGen/X86/sse-scalar-fp-arith.ll | 720 |
| -rw-r--r-- | llvm/test/CodeGen/X86/sse1.ll | 220 |
9 files changed, 3097 insertions, 2091 deletions
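A note on the regenerated assertions below (autogenerated with utils/update_llc_test_checks.py, per the NOTE lines in each file): in the combined 32/64-bit test files, the return instruction is often the only difference between the i386 and x86_64 runs, so the checks use a FileCheck regex that lets both runs share one block. A hypothetical fragment illustrating the idiom:

```llvm
; ret{{[l|q]}} is a FileCheck regex: it accepts retl (i386) and retq (x86_64),
; so a single SSE block can serve both -mtriple=i386 and -mtriple=x86_64 runs.
; SSE-LABEL: test_add:
; SSE:       addps %xmm1, %xmm0
; SSE-NEXT:  ret{{[l|q]}}
```

Where the 32-bit and 64-bit output differs beyond the return (stack arguments vs. %edi, stack-pointer fixups), the tests fall back to the X86-*/X64-* prefixes instead.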
diff --git a/llvm/test/CodeGen/X86/sse-intrinsics-fast-isel-x86_64.ll b/llvm/test/CodeGen/X86/sse-intrinsics-fast-isel-x86_64.ll
index 753f787e2d9..78335b6551c 100644
--- a/llvm/test/CodeGen/X86/sse-intrinsics-fast-isel-x86_64.ll
+++ b/llvm/test/CodeGen/X86/sse-intrinsics-fast-isel-x86_64.ll
@@ -1,33 +1,50 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+sse | FileCheck %s --check-prefix=X64
+; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+sse | FileCheck %s --check-prefixes=CHECK,SSE
+; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX,AVX1
+; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512dq,+avx512vl | FileCheck %s --check-prefixes=CHECK,AVX,AVX512
 ; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/sse-builtins.c

 define <4 x float> @test_mm_cvtsi64_ss(<4 x float> %a0, i64 %a1) nounwind {
-; X64-LABEL: test_mm_cvtsi64_ss:
-; X64: # %bb.0:
-; X64-NEXT: cvtsi2ssq %rdi, %xmm0
-; X64-NEXT: retq
+; SSE-LABEL: test_mm_cvtsi64_ss:
+; SSE: # %bb.0:
+; SSE-NEXT: cvtsi2ssq %rdi, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_mm_cvtsi64_ss:
+; AVX: # %bb.0:
+; AVX-NEXT: vcvtsi2ssq %rdi, %xmm0, %xmm0
+; AVX-NEXT: retq
 %res = call <4 x float> @llvm.x86.sse.cvtsi642ss(<4 x float> %a0, i64 %a1)
 ret <4 x float> %res
 }
 declare <4 x float> @llvm.x86.sse.cvtsi642ss(<4 x float>, i64) nounwind readnone

 define i64 @test_mm_cvtss_si64(<4 x float> %a0) nounwind {
-; X64-LABEL: test_mm_cvtss_si64:
-; X64: # %bb.0:
-; X64-NEXT: cvtss2si %xmm0, %rax
-; X64-NEXT: retq
+; SSE-LABEL: test_mm_cvtss_si64:
+; SSE: # %bb.0:
+; SSE-NEXT: cvtss2si %xmm0, %rax
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_mm_cvtss_si64:
+; AVX: # %bb.0:
+; AVX-NEXT: vcvtss2si %xmm0, %rax
+; AVX-NEXT: retq
 %res = call i64 @llvm.x86.sse.cvtss2si64(<4 x float> %a0)
 ret i64 %res
 }
 declare i64 @llvm.x86.sse.cvtss2si64(<4 x float>) nounwind readnone

 define i64 @test_mm_cvttss_si64(<4 x float> %a0) nounwind {
-; X64-LABEL: test_mm_cvttss_si64:
-; X64: # %bb.0:
-; X64-NEXT: cvttss2si %xmm0, %rax
-; X64-NEXT: retq
+; SSE-LABEL: test_mm_cvttss_si64:
+; SSE: # %bb.0:
+; SSE-NEXT: cvttss2si %xmm0, %rax
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_mm_cvttss_si64:
+; AVX: # %bb.0:
+; AVX-NEXT: vcvttss2si %xmm0, %rax
+; AVX-NEXT: retq
 %res = call i64 @llvm.x86.sse.cvttss2si64(<4 x float> %a0)
 ret i64 %res
 }
diff --git a/llvm/test/CodeGen/X86/sse-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/sse-intrinsics-fast-isel.ll
index f592ea0b381..d47bf63f9e5 100644
--- a/llvm/test/CodeGen/X86/sse-intrinsics-fast-isel.ll
+++ b/llvm/test/CodeGen/X86/sse-intrinsics-fast-isel.ll
@@ -1,33 +1,37 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -fast-isel-sink-local-values < %s -fast-isel -mtriple=i386-unknown-unknown -mattr=+sse | FileCheck %s --check-prefix=ALL --check-prefix=X32
-; RUN: llc -fast-isel-sink-local-values < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+sse,-sse2 | FileCheck %s --check-prefix=ALL --check-prefix=X64
+; RUN: llc -fast-isel-sink-local-values < %s -fast-isel -mtriple=i386-unknown-unknown -mattr=+sse | FileCheck %s --check-prefixes=CHECK,X86,SSE,X86-SSE
+; RUN: llc -fast-isel-sink-local-values < %s -fast-isel -mtriple=i386-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=CHECK,X86,AVX,X86-AVX,AVX1,X86-AVX1
+; RUN: llc -fast-isel-sink-local-values < %s -fast-isel -mtriple=i386-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512dq,+avx512vl | FileCheck %s --check-prefixes=CHECK,X86,AVX,X86-AVX,AVX512,X86-AVX512
+; RUN: llc -fast-isel-sink-local-values < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+sse,-sse2 | FileCheck %s --check-prefixes=CHECK,X64,SSE,X64-SSE
+; RUN: llc -fast-isel-sink-local-values < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=CHECK,X64,AVX,X64-AVX,AVX1,X64-AVX1
+; RUN: llc -fast-isel-sink-local-values < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512dq,+avx512vl | FileCheck %s --check-prefixes=CHECK,X64,AVX,X64-AVX,AVX512,X64-AVX512
 ; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/sse-builtins.c

 define <4 x float> @test_mm_add_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
-; X32-LABEL: test_mm_add_ps:
-; X32: # %bb.0:
-; X32-NEXT: addps %xmm1, %xmm0
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm_add_ps:
-; X64: # %bb.0:
-; X64-NEXT: addps %xmm1, %xmm0
-; X64-NEXT: retq
+; SSE-LABEL: test_mm_add_ps:
+; SSE: # %bb.0:
+; SSE-NEXT: addps %xmm1, %xmm0
+; SSE-NEXT: ret{{[l|q]}}
+;
+; AVX-LABEL: test_mm_add_ps:
+; AVX: # %bb.0:
+; AVX-NEXT: vaddps %xmm1, %xmm0, %xmm0
+; AVX-NEXT: ret{{[l|q]}}
 %res = fadd <4 x float> %a0, %a1
 ret <4 x float> %res
 }

 define <4 x float> @test_mm_add_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
-; X32-LABEL: test_mm_add_ss:
-; X32: # %bb.0:
-; X32-NEXT: addss %xmm1, %xmm0
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm_add_ss:
-; X64: # %bb.0:
-; X64-NEXT: addss %xmm1, %xmm0
-; X64-NEXT: retq
+; SSE-LABEL: test_mm_add_ss:
+; SSE: # %bb.0:
+; SSE-NEXT: addss %xmm1, %xmm0
+; SSE-NEXT: ret{{[l|q]}}
+;
+; AVX-LABEL: test_mm_add_ss:
+; AVX: # %bb.0:
+; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0
+; AVX-NEXT: ret{{[l|q]}}
 %ext0 = extractelement <4 x float> %a0, i32 0
 %ext1 = extractelement <4 x float> %a1, i32 0
 %fadd = fadd float %ext0, %ext1
@@ -36,15 +40,15 @@ define <4 x float> @test_mm_add_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
 }

 define <4 x float> @test_mm_and_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
-; X32-LABEL: test_mm_and_ps:
-; X32: # %bb.0:
-; X32-NEXT: andps %xmm1, %xmm0
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm_and_ps:
-; X64: # %bb.0:
-; X64-NEXT: andps %xmm1, %xmm0
-; X64-NEXT: retq
+; SSE-LABEL: test_mm_and_ps:
+; SSE: # %bb.0:
+; SSE-NEXT: andps %xmm1, %xmm0
+; SSE-NEXT: ret{{[l|q]}}
+;
+; AVX-LABEL: test_mm_and_ps:
+; AVX: # %bb.0:
+; AVX-NEXT: vandps %xmm1, %xmm0, %xmm0
+; AVX-NEXT: ret{{[l|q]}}
 %arg0 = bitcast <4 x float> %a0 to <4 x i32>
 %arg1 = bitcast <4 x float> %a1 to <4 x i32>
 %res = and <4 x i32> %arg0, %arg1
@@ -53,15 +57,15 @@ define <4 x float> @test_mm_and_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
 }

 define <4 x float> @test_mm_andnot_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
-; X32-LABEL: test_mm_andnot_ps:
-; X32: # %bb.0:
-; X32-NEXT: andnps %xmm1, %xmm0
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm_andnot_ps:
-; X64: # %bb.0:
-; X64-NEXT: andnps %xmm1, %xmm0
-; X64-NEXT: retq
+; SSE-LABEL: test_mm_andnot_ps:
+; SSE: # %bb.0:
+; SSE-NEXT: andnps %xmm1, %xmm0
+; SSE-NEXT: ret{{[l|q]}}
+;
+; AVX-LABEL: test_mm_andnot_ps:
+; AVX: # %bb.0:
+; AVX-NEXT: vandnps %xmm1, %xmm0, %xmm0
+; AVX-NEXT: ret{{[l|q]}}
 %arg0 = bitcast <4 x float> %a0 to <4 x i32>
 %arg1 = bitcast <4 x float> %a1 to <4 x i32>
 %not = xor <4 x i32> %arg0, <i32 -1, i32 -1, i32 -1, i32 -1>
@@ -71,15 +75,21 @@ define <4 x float> @test_mm_andnot_ps(<4 x float> %a0, <4 x float> %a1) nounwind
 }

 define <4 x float> @test_mm_cmpeq_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
-; X32-LABEL: test_mm_cmpeq_ps:
-; X32: # %bb.0:
-; X32-NEXT: cmpeqps %xmm1, %xmm0
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm_cmpeq_ps:
-; X64: # %bb.0:
-; X64-NEXT: cmpeqps %xmm1, %xmm0
-; X64-NEXT: retq
+; SSE-LABEL: test_mm_cmpeq_ps:
+; SSE: # %bb.0:
+; SSE-NEXT: cmpeqps %xmm1, %xmm0
+; SSE-NEXT: ret{{[l|q]}}
+;
+; AVX1-LABEL: test_mm_cmpeq_ps:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vcmpeqps %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: ret{{[l|q]}}
+;
+; AVX512-LABEL: test_mm_cmpeq_ps:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vcmpeqps %xmm1, %xmm0, %k0
+; AVX512-NEXT: vpmovm2d %k0, %xmm0
+; AVX512-NEXT: ret{{[l|q]}}
 %cmp = fcmp oeq <4 x float> %a0, %a1
 %sext = sext <4 x i1> %cmp to <4 x i32>
 %res = bitcast <4 x i32> %sext to <4 x float>
@@ -87,32 +97,37 @@ define <4 x float> @test_mm_cmpeq_ps(<4 x float> %a0, <4 x float> %a1) nounwind
 }

 define <4 x float> @test_mm_cmpeq_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
-; X32-LABEL: test_mm_cmpeq_ss:
-; X32: # %bb.0:
-; X32-NEXT: cmpeqss %xmm1, %xmm0
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm_cmpeq_ss:
-; X64: # %bb.0:
-; X64-NEXT: cmpeqss %xmm1, %xmm0
-; X64-NEXT: retq
+; SSE-LABEL: test_mm_cmpeq_ss:
+; SSE: # %bb.0:
+; SSE-NEXT: cmpeqss %xmm1, %xmm0
+; SSE-NEXT: ret{{[l|q]}}
+;
+; AVX-LABEL: test_mm_cmpeq_ss:
+; AVX: # %bb.0:
+; AVX-NEXT: vcmpeqss %xmm1, %xmm0, %xmm0
+; AVX-NEXT: ret{{[l|q]}}
 %res = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a0, <4 x float> %a1, i8 0)
 ret <4 x float> %res
 }
 declare <4 x float> @llvm.x86.sse.cmp.ss(<4 x float>, <4 x float>, i8) nounwind readnone

 define <4 x float> @test_mm_cmpge_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
-; X32-LABEL: test_mm_cmpge_ps:
-; X32: # %bb.0:
-; X32-NEXT: cmpleps %xmm0, %xmm1
-; X32-NEXT: movaps %xmm1, %xmm0
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm_cmpge_ps:
-; X64: # %bb.0:
-; X64-NEXT: cmpleps %xmm0, %xmm1
-; X64-NEXT: movaps %xmm1, %xmm0
-; X64-NEXT: retq
+; SSE-LABEL: test_mm_cmpge_ps:
+; SSE: # %bb.0:
+; SSE-NEXT: cmpleps %xmm0, %xmm1
+; SSE-NEXT: movaps %xmm1, %xmm0
+; SSE-NEXT: ret{{[l|q]}}
+;
+; AVX1-LABEL: test_mm_cmpge_ps:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vcmpleps %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: ret{{[l|q]}}
+;
+; AVX512-LABEL: test_mm_cmpge_ps:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vcmpleps %xmm0, %xmm1, %k0
+; AVX512-NEXT: vpmovm2d %k0, %xmm0
+; AVX512-NEXT: ret{{[l|q]}}
 %cmp = fcmp ole <4 x float> %a1, %a0
 %sext = sext <4 x i1> %cmp to <4 x i32>
 %res = bitcast <4 x i32> %sext to <4 x float>
@@ -120,34 +135,45 @@ define <4 x float> @test_mm_cmpge_ps(<4 x float> %a0, <4 x float> %a1) nounwind
 }

 define <4 x float> @test_mm_cmpge_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
-; X32-LABEL: test_mm_cmpge_ss:
-; X32: # %bb.0:
-; X32-NEXT: cmpless %xmm0, %xmm1
-; X32-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm_cmpge_ss:
-; X64: # %bb.0:
-; X64-NEXT: cmpless %xmm0, %xmm1
-; X64-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
-; X64-NEXT: retq
+; SSE-LABEL: test_mm_cmpge_ss:
+; SSE: # %bb.0:
+; SSE-NEXT: cmpless %xmm0, %xmm1
+; SSE-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; SSE-NEXT: ret{{[l|q]}}
+;
+; AVX1-LABEL: test_mm_cmpge_ss:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vcmpless %xmm0, %xmm1, %xmm1
+; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; AVX1-NEXT: ret{{[l|q]}}
+;
+; AVX512-LABEL: test_mm_cmpge_ss:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vcmpless %xmm0, %xmm1, %xmm1
+; AVX512-NEXT: vmovss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; AVX512-NEXT: ret{{[l|q]}}
 %cmp = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a1, <4 x float> %a0, i8 2)
 %res = shufflevector <4 x float> %a0, <4 x float> %cmp, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
 ret <4 x float> %res
 }

 define <4 x float> @test_mm_cmpgt_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
-; X32-LABEL: test_mm_cmpgt_ps:
-; X32: # %bb.0:
-; X32-NEXT: cmpltps %xmm0, %xmm1
-; X32-NEXT: movaps %xmm1, %xmm0
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm_cmpgt_ps:
-; X64: # %bb.0:
-; X64-NEXT: cmpltps %xmm0, %xmm1
-; X64-NEXT: movaps %xmm1, %xmm0
-; X64-NEXT: retq
+; SSE-LABEL: test_mm_cmpgt_ps:
+; SSE: # %bb.0:
+; SSE-NEXT: cmpltps %xmm0, %xmm1
+; SSE-NEXT: movaps %xmm1, %xmm0
+; SSE-NEXT: ret{{[l|q]}}
+;
+; AVX1-LABEL: test_mm_cmpgt_ps:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vcmpltps %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: ret{{[l|q]}}
+;
+; AVX512-LABEL: test_mm_cmpgt_ps:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vcmpltps %xmm0, %xmm1, %k0
+; AVX512-NEXT: vpmovm2d %k0, %xmm0
+; AVX512-NEXT: ret{{[l|q]}}
 %cmp = fcmp olt <4 x float> %a1, %a0
 %sext = sext <4 x i1> %cmp to <4 x i32>
 %res = bitcast <4 x i32> %sext to <4 x float>
@@ -155,32 +181,44 @@ define <4 x float> @test_mm_cmpgt_ps(<4 x float> %a0, <4 x float> %a1) nounwind
 }

 define <4 x float> @test_mm_cmpgt_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
-; X32-LABEL: test_mm_cmpgt_ss:
-; X32: # %bb.0:
-; X32-NEXT: cmpltss %xmm0, %xmm1
-; X32-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm_cmpgt_ss:
-; X64: # %bb.0:
-; X64-NEXT: cmpltss %xmm0, %xmm1
-; X64-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
-; X64-NEXT: retq
+; SSE-LABEL: test_mm_cmpgt_ss:
+; SSE: # %bb.0:
+; SSE-NEXT: cmpltss %xmm0, %xmm1
+; SSE-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; SSE-NEXT: ret{{[l|q]}}
+;
+; AVX1-LABEL: test_mm_cmpgt_ss:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vcmpltss %xmm0, %xmm1, %xmm1
+; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; AVX1-NEXT: ret{{[l|q]}}
+;
+; AVX512-LABEL: test_mm_cmpgt_ss:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vcmpltss %xmm0, %xmm1, %xmm1
+; AVX512-NEXT: vmovss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; AVX512-NEXT: ret{{[l|q]}}
 %cmp = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a1, <4 x float> %a0, i8 1)
 %res = shufflevector <4 x float> %a0, <4 x float> %cmp, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
 ret <4 x float> %res
 }

 define <4 x float> @test_mm_cmple_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
-; X32-LABEL: test_mm_cmple_ps:
-; X32: # %bb.0:
-; X32-NEXT: cmpleps %xmm1, %xmm0
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm_cmple_ps:
-; X64: # %bb.0:
-; X64-NEXT: cmpleps %xmm1, %xmm0
-; X64-NEXT: retq
+; SSE-LABEL: test_mm_cmple_ps:
+; SSE: # %bb.0:
+; SSE-NEXT: cmpleps %xmm1, %xmm0
+; SSE-NEXT: ret{{[l|q]}}
+;
+; AVX1-LABEL: test_mm_cmple_ps:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vcmpleps %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: ret{{[l|q]}}
+;
+; AVX512-LABEL: test_mm_cmple_ps:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vcmpleps %xmm1, %xmm0, %k0
+; AVX512-NEXT: vpmovm2d %k0, %xmm0
+; AVX512-NEXT: ret{{[l|q]}}
 %cmp = fcmp ole <4 x float> %a0, %a1
 %sext = sext <4 x i1> %cmp to <4 x i32>
 %res = bitcast <4 x i32> %sext to <4 x float>
@@ -188,29 +226,35 @@ define <4 x float> @test_mm_cmple_ps(<4 x float> %a0, <4 x float> %a1) nounwind
 }

 define <4 x float> @test_mm_cmple_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
-; X32-LABEL: test_mm_cmple_ss:
-; X32: # %bb.0:
-; X32-NEXT: cmpless %xmm1, %xmm0
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm_cmple_ss:
-; X64: # %bb.0:
-; X64-NEXT: cmpless %xmm1, %xmm0
-; X64-NEXT: retq
+; SSE-LABEL: test_mm_cmple_ss:
+; SSE: # %bb.0:
+; SSE-NEXT: cmpless %xmm1, %xmm0
+; SSE-NEXT: ret{{[l|q]}}
+;
+; AVX-LABEL: test_mm_cmple_ss:
+; AVX: # %bb.0:
+; AVX-NEXT: vcmpless %xmm1, %xmm0, %xmm0
+; AVX-NEXT: ret{{[l|q]}}
 %res = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a0, <4 x float> %a1, i8 2)
 ret <4 x float> %res
 }

 define <4 x float> @test_mm_cmplt_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
-; X32-LABEL: test_mm_cmplt_ps:
-; X32: # %bb.0:
-; X32-NEXT: cmpltps %xmm1, %xmm0
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm_cmplt_ps:
-; X64: # %bb.0:
-; X64-NEXT: cmpltps %xmm1, %xmm0
-; X64-NEXT: retq
+; SSE-LABEL: test_mm_cmplt_ps:
+; SSE: # %bb.0:
+; SSE-NEXT: cmpltps %xmm1, %xmm0
+; SSE-NEXT: ret{{[l|q]}}
+;
+; AVX1-LABEL: test_mm_cmplt_ps:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vcmpltps %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: ret{{[l|q]}}
+;
+; AVX512-LABEL: test_mm_cmplt_ps:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vcmpltps %xmm1, %xmm0, %k0
+; AVX512-NEXT: vpmovm2d %k0, %xmm0
+; AVX512-NEXT: ret{{[l|q]}}
 %cmp = fcmp olt <4 x float> %a0, %a1
 %sext = sext <4 x i1> %cmp to <4 x i32>
 %res = bitcast <4 x i32> %sext to <4 x float>
@@ -218,29 +262,35 @@ define <4 x float> @test_mm_cmplt_ps(<4 x float> %a0, <4 x float> %a1) nounwind
 }

 define <4 x float> @test_mm_cmplt_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
-; X32-LABEL: test_mm_cmplt_ss:
-; X32: # %bb.0:
-; X32-NEXT: cmpltss %xmm1, %xmm0
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm_cmplt_ss:
-; X64: # %bb.0:
-; X64-NEXT: cmpltss %xmm1, %xmm0
-; X64-NEXT: retq
+; SSE-LABEL: test_mm_cmplt_ss:
+; SSE: # %bb.0:
+; SSE-NEXT: cmpltss %xmm1, %xmm0
+; SSE-NEXT: ret{{[l|q]}}
+;
+; AVX-LABEL: test_mm_cmplt_ss:
+; AVX: # %bb.0:
+; AVX-NEXT: vcmpltss %xmm1, %xmm0, %xmm0
+; AVX-NEXT: ret{{[l|q]}}
 %res = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a0, <4 x float> %a1, i8 1)
 ret <4 x float> %res
 }

 define <4 x float> @test_mm_cmpneq_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
-; X32-LABEL: test_mm_cmpneq_ps:
-; X32: # %bb.0:
-; X32-NEXT: cmpneqps %xmm1, %xmm0
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm_cmpneq_ps:
-; X64: # %bb.0:
-; X64-NEXT: cmpneqps %xmm1, %xmm0
-; X64-NEXT: retq
+; SSE-LABEL: test_mm_cmpneq_ps:
+; SSE: # %bb.0:
+; SSE-NEXT: cmpneqps %xmm1, %xmm0
+; SSE-NEXT: ret{{[l|q]}}
+;
+; AVX1-LABEL: test_mm_cmpneq_ps:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vcmpneqps %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: ret{{[l|q]}}
+;
+; AVX512-LABEL: test_mm_cmpneq_ps:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vcmpneqps %xmm1, %xmm0, %k0
+; AVX512-NEXT: vpmovm2d %k0, %xmm0
+; AVX512-NEXT: ret{{[l|q]}}
 %cmp = fcmp une <4 x float> %a0, %a1
 %sext = sext <4 x i1> %cmp to <4 x i32>
 %res = bitcast <4 x i32> %sext to <4 x float>
@@ -248,31 +298,36 @@ define <4 x float> @test_mm_cmpneq_ps(<4 x float> %a0, <4 x float> %a1) nounwind
 }

 define <4 x float> @test_mm_cmpneq_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
-; X32-LABEL: test_mm_cmpneq_ss:
-; X32: # %bb.0:
-; X32-NEXT: cmpneqss %xmm1, %xmm0
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm_cmpneq_ss:
-; X64: # %bb.0:
-; X64-NEXT: cmpneqss %xmm1, %xmm0
-; X64-NEXT: retq
+; SSE-LABEL: test_mm_cmpneq_ss:
+; SSE: # %bb.0:
+; SSE-NEXT: cmpneqss %xmm1, %xmm0
+; SSE-NEXT: ret{{[l|q]}}
+;
+; AVX-LABEL: test_mm_cmpneq_ss:
+; AVX: # %bb.0:
+; AVX-NEXT: vcmpneqss %xmm1, %xmm0, %xmm0
+; AVX-NEXT: ret{{[l|q]}}
 %res = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a0, <4 x float> %a1, i8 4)
 ret <4 x float> %res
 }

 define <4 x float> @test_mm_cmpnge_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
-; X32-LABEL: test_mm_cmpnge_ps:
-; X32: # %bb.0:
-; X32-NEXT: cmpnleps %xmm0, %xmm1
-; X32-NEXT: movaps %xmm1, %xmm0
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm_cmpnge_ps:
-; X64: # %bb.0:
-; X64-NEXT: cmpnleps %xmm0, %xmm1
-; X64-NEXT: movaps %xmm1, %xmm0
-; X64-NEXT: retq
+; SSE-LABEL: test_mm_cmpnge_ps:
+; SSE: # %bb.0:
+; SSE-NEXT: cmpnleps %xmm0, %xmm1
+; SSE-NEXT: movaps %xmm1, %xmm0
+; SSE-NEXT: ret{{[l|q]}}
+;
+; AVX1-LABEL: test_mm_cmpnge_ps:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vcmpnleps %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: ret{{[l|q]}}
+;
+; AVX512-LABEL: test_mm_cmpnge_ps:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vcmpnleps %xmm0, %xmm1, %k0
+; AVX512-NEXT: vpmovm2d %k0, %xmm0
+; AVX512-NEXT: ret{{[l|q]}}
 %cmp = fcmp ugt <4 x float> %a1, %a0
 %sext = sext <4 x i1> %cmp to <4 x i32>
 %res = bitcast <4 x i32> %sext to <4 x float>
@@ -280,34 +335,45 @@ define <4 x float> @test_mm_cmpnge_ps(<4 x float> %a0, <4 x float> %a1) nounwind
 }

 define <4 x float> @test_mm_cmpnge_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
-; X32-LABEL: test_mm_cmpnge_ss:
-; X32: # %bb.0:
-; X32-NEXT: cmpnless %xmm0, %xmm1
-; X32-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm_cmpnge_ss:
-; X64: # %bb.0:
-; X64-NEXT: cmpnless %xmm0, %xmm1
-; X64-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
-; X64-NEXT: retq
+; SSE-LABEL: test_mm_cmpnge_ss:
+; SSE: # %bb.0:
+; SSE-NEXT: cmpnless %xmm0, %xmm1
+; SSE-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; SSE-NEXT: ret{{[l|q]}}
+;
+; AVX1-LABEL: test_mm_cmpnge_ss:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vcmpnless %xmm0, %xmm1, %xmm1
+; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; AVX1-NEXT: ret{{[l|q]}}
+;
+; AVX512-LABEL: test_mm_cmpnge_ss:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vcmpnless %xmm0, %xmm1, %xmm1
+; AVX512-NEXT: vmovss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; AVX512-NEXT: ret{{[l|q]}}
 %cmp = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a1, <4 x float> %a0, i8 6)
 %res = shufflevector <4 x float> %a0, <4 x float> %cmp, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
 ret <4 x float> %res
 }

 define <4 x float> @test_mm_cmpngt_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
-; X32-LABEL: test_mm_cmpngt_ps:
-; X32: # %bb.0:
-; X32-NEXT: cmpnltps %xmm0, %xmm1
-; X32-NEXT: movaps %xmm1, %xmm0
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm_cmpngt_ps:
-; X64: # %bb.0:
-; X64-NEXT: cmpnltps %xmm0, %xmm1
-; X64-NEXT: movaps %xmm1, %xmm0
-; X64-NEXT: retq
+; SSE-LABEL: test_mm_cmpngt_ps:
+; SSE: # %bb.0:
+; SSE-NEXT: cmpnltps %xmm0, %xmm1
+; SSE-NEXT: movaps %xmm1, %xmm0
+; SSE-NEXT: ret{{[l|q]}}
+;
+; AVX1-LABEL: test_mm_cmpngt_ps:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vcmpnltps %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: ret{{[l|q]}}
+;
+; AVX512-LABEL: test_mm_cmpngt_ps:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vcmpnltps %xmm0, %xmm1, %k0
+; AVX512-NEXT: vpmovm2d %k0, %xmm0
+; AVX512-NEXT: ret{{[l|q]}}
 %cmp = fcmp uge <4 x float> %a1, %a0
 %sext = sext <4 x i1> %cmp to <4 x i32>
 %res = bitcast <4 x i32> %sext to <4 x float>
@@ -315,32 +381,44 @@ define <4 x float> @test_mm_cmpngt_ps(<4 x float> %a0, <4 x float> %a1) nounwind
 }

 define <4 x float> @test_mm_cmpngt_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
-; X32-LABEL: test_mm_cmpngt_ss:
-; X32: # %bb.0:
-; X32-NEXT: cmpnltss %xmm0, %xmm1
-; X32-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm_cmpngt_ss:
-; X64: # %bb.0:
-; X64-NEXT: cmpnltss %xmm0, %xmm1
-; X64-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
-; X64-NEXT: retq
+; SSE-LABEL: test_mm_cmpngt_ss:
+; SSE: # %bb.0:
+; SSE-NEXT: cmpnltss %xmm0, %xmm1
+; SSE-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; SSE-NEXT: ret{{[l|q]}}
+;
+; AVX1-LABEL: test_mm_cmpngt_ss:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vcmpnltss %xmm0, %xmm1, %xmm1
+; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; AVX1-NEXT: ret{{[l|q]}}
+;
+; AVX512-LABEL: test_mm_cmpngt_ss:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vcmpnltss %xmm0, %xmm1, %xmm1
+; AVX512-NEXT: vmovss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; AVX512-NEXT: ret{{[l|q]}}
 %cmp = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a1, <4 x float> %a0, i8 5)
 %res = shufflevector <4 x float> %a0, <4 x float> %cmp, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
 ret <4 x float> %res
 }

 define <4 x float> @test_mm_cmpnle_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
-; X32-LABEL: test_mm_cmpnle_ps:
-; X32: # %bb.0:
-; X32-NEXT: cmpnleps %xmm1, %xmm0
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm_cmpnle_ps:
-; X64: # %bb.0:
-; X64-NEXT: cmpnleps %xmm1, %xmm0
-; X64-NEXT: retq
+; SSE-LABEL: test_mm_cmpnle_ps:
+; SSE: # %bb.0:
+; SSE-NEXT: cmpnleps %xmm1, %xmm0
+; SSE-NEXT: ret{{[l|q]}}
+;
+; AVX1-LABEL: test_mm_cmpnle_ps:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vcmpnleps %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: ret{{[l|q]}}
+;
+; AVX512-LABEL: test_mm_cmpnle_ps:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vcmpnleps %xmm1, %xmm0, %k0
+; AVX512-NEXT: vpmovm2d %k0, %xmm0
+; AVX512-NEXT: ret{{[l|q]}}
 %cmp = fcmp ugt <4 x float> %a0, %a1
 %sext = sext <4 x i1> %cmp to <4 x i32>
 %res = bitcast <4 x i32> %sext to <4 x float>
@@ -348,29 +426,35 @@ define <4 x float> @test_mm_cmpnle_ps(<4 x float> %a0, <4 x float> %a1) nounwind
 }

 define <4 x float> @test_mm_cmpnle_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
-; X32-LABEL: test_mm_cmpnle_ss:
-; X32: # %bb.0:
-; X32-NEXT: cmpnless %xmm1, %xmm0
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm_cmpnle_ss:
-; X64: # %bb.0:
-; X64-NEXT: cmpnless %xmm1, %xmm0
-; X64-NEXT: retq
+; SSE-LABEL: test_mm_cmpnle_ss:
+; SSE: # %bb.0:
+; SSE-NEXT: cmpnless %xmm1, %xmm0
+; SSE-NEXT: ret{{[l|q]}}
+;
+; AVX-LABEL: test_mm_cmpnle_ss:
+; AVX: # %bb.0:
+; AVX-NEXT: vcmpnless %xmm1, %xmm0, %xmm0
+; AVX-NEXT: ret{{[l|q]}}
 %res = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a0, <4 x float> %a1, i8 6)
 ret <4 x float> %res
 }

 define <4 x float> @test_mm_cmpnlt_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
-; X32-LABEL: test_mm_cmpnlt_ps:
-; X32: # %bb.0:
-; X32-NEXT: cmpnltps %xmm1, %xmm0
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm_cmpnlt_ps:
-; X64: # %bb.0:
-; X64-NEXT: cmpnltps %xmm1, %xmm0
-; X64-NEXT: retq
+; SSE-LABEL: test_mm_cmpnlt_ps:
+; SSE: # %bb.0:
+; SSE-NEXT: cmpnltps %xmm1, %xmm0
+; SSE-NEXT: ret{{[l|q]}}
+;
+; AVX1-LABEL: test_mm_cmpnlt_ps:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vcmpnltps %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: ret{{[l|q]}}
+;
+; AVX512-LABEL: test_mm_cmpnlt_ps:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vcmpnltps %xmm1, %xmm0, %k0
+; AVX512-NEXT: vpmovm2d %k0, %xmm0
+; AVX512-NEXT: ret{{[l|q]}}
 %cmp = fcmp uge <4 x float> %a0, %a1
 %sext = sext <4 x i1> %cmp to <4 x i32>
 %res = bitcast <4 x i32> %sext to <4 x float>
@@ -378,29 +462,35 @@ define <4 x float> @test_mm_cmpnlt_ps(<4 x float> %a0, <4 x float> %a1) nounwind
 }

 define <4 x float> @test_mm_cmpnlt_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
-; X32-LABEL: test_mm_cmpnlt_ss:
-; X32: # %bb.0:
-; X32-NEXT: cmpnltss %xmm1, %xmm0
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm_cmpnlt_ss:
-; X64: # %bb.0:
-; X64-NEXT: cmpnltss %xmm1, %xmm0
-; X64-NEXT: retq
+; SSE-LABEL: test_mm_cmpnlt_ss:
+; SSE: # %bb.0:
+; SSE-NEXT: cmpnltss %xmm1, %xmm0
+; SSE-NEXT: ret{{[l|q]}}
+;
+; AVX-LABEL: test_mm_cmpnlt_ss:
+; AVX: # %bb.0:
+; AVX-NEXT: vcmpnltss %xmm1, %xmm0, %xmm0
+; AVX-NEXT: ret{{[l|q]}}
 %res = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a0, <4 x float> %a1, i8 5)
 ret <4 x float> %res
 }

 define <4 x float> @test_mm_cmpord_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
-; X32-LABEL: test_mm_cmpord_ps:
-; X32: # %bb.0:
-; X32-NEXT: cmpordps %xmm1, %xmm0
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm_cmpord_ps:
-; X64: # %bb.0:
-; X64-NEXT: cmpordps %xmm1, %xmm0
-; X64-NEXT: retq
+; SSE-LABEL: test_mm_cmpord_ps:
+; SSE: # %bb.0:
+; SSE-NEXT: cmpordps %xmm1, %xmm0
+; SSE-NEXT: ret{{[l|q]}}
+;
+; AVX1-LABEL: test_mm_cmpord_ps:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vcmpordps %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: ret{{[l|q]}}
+;
+; AVX512-LABEL: test_mm_cmpord_ps:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vcmpordps %xmm1, %xmm0, %k0
+; AVX512-NEXT: vpmovm2d %k0, %xmm0
+; AVX512-NEXT: ret{{[l|q]}}
 %cmp = fcmp ord <4 x float> %a0, %a1
 %sext = sext <4 x i1> %cmp to <4 x i32>
 %res = bitcast <4 x i32> %sext to <4 x float>
@@ -408,29 +498,35 @@ define <4 x float> @test_mm_cmpord_ps(<4 x float> %a0, <4 x float> %a1) nounwind
 }

 define <4 x float> @test_mm_cmpord_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
-; X32-LABEL: test_mm_cmpord_ss:
-; X32: # %bb.0:
-; X32-NEXT: cmpordss %xmm1, %xmm0
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm_cmpord_ss:
-; X64: # %bb.0:
-; X64-NEXT: cmpordss %xmm1, %xmm0
-; X64-NEXT: retq
+; SSE-LABEL: test_mm_cmpord_ss:
+; SSE: # %bb.0:
+; SSE-NEXT: cmpordss %xmm1, %xmm0
+; SSE-NEXT: ret{{[l|q]}}
+;
+; AVX-LABEL: test_mm_cmpord_ss:
+; AVX: # %bb.0:
+; AVX-NEXT: vcmpordss %xmm1, %xmm0, %xmm0
+; AVX-NEXT: ret{{[l|q]}}
 %res = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a0, <4 x float> %a1, i8 7)
 ret <4 x float> %res
 }

 define <4 x float> @test_mm_cmpunord_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
-; X32-LABEL: test_mm_cmpunord_ps:
-; X32: # %bb.0:
-; X32-NEXT: cmpunordps %xmm1, %xmm0
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm_cmpunord_ps:
-; X64: # %bb.0:
-; X64-NEXT: cmpunordps %xmm1, %xmm0
-; X64-NEXT: retq
+; SSE-LABEL: test_mm_cmpunord_ps:
+; SSE: # %bb.0:
+; SSE-NEXT: cmpunordps %xmm1, %xmm0
+; SSE-NEXT: ret{{[l|q]}}
+;
+; AVX1-LABEL: test_mm_cmpunord_ps:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vcmpunordps %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: ret{{[l|q]}}
+;
+; AVX512-LABEL: test_mm_cmpunord_ps:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vcmpunordps %xmm1, %xmm0, %k0
+; AVX512-NEXT: vpmovm2d %k0, %xmm0
+; AVX512-NEXT: ret{{[l|q]}}
 %cmp = fcmp uno <4 x float> %a0, %a1
 %sext = sext <4 x i1> %cmp to <4 x i32>
 %res = bitcast <4 x i32> %sext to <4 x float>
@@ -438,179 +534,203 @@ define <4 x float> @test_mm_cmpunord_ps(<4 x float> %a0, <4 x float> %a1) nounwi
 }

 define <4 x float> @test_mm_cmpunord_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
-; X32-LABEL: test_mm_cmpunord_ss:
-; X32: # %bb.0:
-; X32-NEXT: cmpunordss %xmm1, %xmm0
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm_cmpunord_ss:
-; X64: # %bb.0:
-; X64-NEXT: cmpunordss %xmm1, %xmm0
-; X64-NEXT: retq
+; SSE-LABEL: test_mm_cmpunord_ss:
+; SSE: # %bb.0:
+; SSE-NEXT: cmpunordss %xmm1, %xmm0
+; SSE-NEXT: ret{{[l|q]}}
+;
+; AVX-LABEL: test_mm_cmpunord_ss:
+; AVX: # %bb.0:
+; AVX-NEXT: vcmpunordss %xmm1, %xmm0, %xmm0
+; AVX-NEXT: ret{{[l|q]}}
 %res = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a0, <4 x float> %a1, i8 3)
 ret <4 x float> %res
 }

 define i32 @test_mm_comieq_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
-; X32-LABEL: test_mm_comieq_ss:
-; X32: # %bb.0:
-; X32-NEXT: comiss %xmm1, %xmm0
-; X32-NEXT: setnp %al
-; X32-NEXT: sete %cl
-; X32-NEXT: andb %al, %cl
-; X32-NEXT: movzbl %cl, %eax
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm_comieq_ss:
-; X64: # %bb.0:
-; X64-NEXT: comiss %xmm1, %xmm0
-; X64-NEXT: setnp %al
-; X64-NEXT: sete %cl
-; X64-NEXT: andb %al, %cl
-; X64-NEXT: movzbl %cl, %eax
-; X64-NEXT: retq
+; SSE-LABEL: test_mm_comieq_ss:
+; SSE: # %bb.0:
+; SSE-NEXT: comiss %xmm1, %xmm0
+; SSE-NEXT: setnp %al
+; SSE-NEXT: sete %cl
+; SSE-NEXT: andb %al, %cl
+; SSE-NEXT: movzbl %cl, %eax
+; SSE-NEXT: ret{{[l|q]}}
+;
+; AVX-LABEL: test_mm_comieq_ss:
+; AVX: # %bb.0:
+; AVX-NEXT: vcomiss %xmm1, %xmm0
+; AVX-NEXT: setnp %al
+; AVX-NEXT: sete %cl
+; AVX-NEXT: andb %al, %cl
+; AVX-NEXT: movzbl %cl, %eax
+; AVX-NEXT: ret{{[l|q]}}
 %res = call i32 @llvm.x86.sse.comieq.ss(<4 x float> %a0, <4 x float> %a1)
 ret i32 %res
 }
 declare i32 @llvm.x86.sse.comieq.ss(<4 x float>, <4 x float>) nounwind readnone

 define i32 @test_mm_comige_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
-; X32-LABEL: test_mm_comige_ss:
-; X32: # %bb.0:
-; X32-NEXT: xorl %eax, %eax
-; X32-NEXT: comiss %xmm1, %xmm0
-; X32-NEXT: setae %al
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm_comige_ss:
-; X64: # %bb.0:
-; X64-NEXT: xorl %eax, %eax
-; X64-NEXT: comiss %xmm1, %xmm0
-; X64-NEXT: setae %al
-; X64-NEXT: retq
+; SSE-LABEL: test_mm_comige_ss:
+; SSE: # %bb.0:
+; SSE-NEXT: xorl %eax, %eax
+; SSE-NEXT: comiss %xmm1, %xmm0
+; SSE-NEXT: setae %al
+; SSE-NEXT: ret{{[l|q]}}
+;
+; AVX-LABEL: test_mm_comige_ss:
+; AVX: # %bb.0:
+; AVX-NEXT: xorl %eax, %eax
+; AVX-NEXT: vcomiss %xmm1, %xmm0
+; AVX-NEXT: setae %al
+; AVX-NEXT: ret{{[l|q]}}
 %res = call i32 @llvm.x86.sse.comige.ss(<4 x float> %a0, <4 x float> %a1)
 ret i32 %res
 }
 declare i32 @llvm.x86.sse.comige.ss(<4 x float>, <4 x float>) nounwind readnone

 define i32 @test_mm_comigt_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
-; X32-LABEL: test_mm_comigt_ss:
-; X32: # %bb.0:
-; X32-NEXT: xorl %eax, %eax
-; X32-NEXT: comiss %xmm1, %xmm0
-; X32-NEXT: seta %al
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm_comigt_ss:
-; X64: # %bb.0:
-; X64-NEXT: xorl %eax, %eax
-; X64-NEXT: comiss %xmm1, %xmm0
-; X64-NEXT: seta %al
-; X64-NEXT: retq
+; SSE-LABEL: test_mm_comigt_ss:
+; SSE: # %bb.0:
+; SSE-NEXT: xorl %eax, %eax
+; SSE-NEXT: comiss %xmm1, %xmm0
+; SSE-NEXT: seta %al
+; SSE-NEXT: ret{{[l|q]}}
+;
+; AVX-LABEL: test_mm_comigt_ss:
+; AVX: # %bb.0:
+; AVX-NEXT: xorl %eax, %eax
+; AVX-NEXT: vcomiss %xmm1, %xmm0
+; AVX-NEXT: seta %al
+; AVX-NEXT: ret{{[l|q]}}
 %res = call i32 @llvm.x86.sse.comigt.ss(<4 x float> %a0, <4 x float> %a1)
 ret i32 %res
 }
 declare i32 @llvm.x86.sse.comigt.ss(<4 x float>, <4 x float>) nounwind readnone

 define i32 @test_mm_comile_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
-; X32-LABEL: test_mm_comile_ss:
-; X32: # %bb.0:
-; X32-NEXT: xorl %eax, %eax
-; X32-NEXT: comiss %xmm0, %xmm1
-; X32-NEXT: setae %al
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm_comile_ss:
-; X64: # %bb.0:
-; X64-NEXT: xorl %eax, %eax
-; X64-NEXT: comiss %xmm0, %xmm1
-; X64-NEXT: setae %al
-; X64-NEXT: retq
+; SSE-LABEL: test_mm_comile_ss:
+; SSE: # %bb.0:
+; SSE-NEXT: xorl %eax, %eax
+; SSE-NEXT: comiss %xmm0, %xmm1
+; SSE-NEXT: setae %al
+; SSE-NEXT: ret{{[l|q]}}
+;
+; AVX-LABEL: test_mm_comile_ss:
+; AVX: # %bb.0:
+; AVX-NEXT: xorl %eax, %eax
+; AVX-NEXT: vcomiss %xmm0, %xmm1
+; AVX-NEXT: setae %al
+; AVX-NEXT: ret{{[l|q]}}
 %res = call i32 @llvm.x86.sse.comile.ss(<4 x float> %a0, <4 x float> %a1)
 ret i32 %res
 }
 declare i32 @llvm.x86.sse.comile.ss(<4 x float>, <4 x float>) nounwind readnone

 define i32 @test_mm_comilt_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
-; X32-LABEL: test_mm_comilt_ss:
-; X32: # %bb.0:
-; X32-NEXT: xorl %eax, %eax
-; X32-NEXT: comiss %xmm0, %xmm1
-; X32-NEXT: seta %al
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm_comilt_ss:
-; X64: # %bb.0:
-; X64-NEXT: xorl %eax, %eax
-; X64-NEXT: comiss %xmm0, %xmm1
-; X64-NEXT: seta %al
-; X64-NEXT: retq
+; SSE-LABEL: test_mm_comilt_ss:
+; SSE: # %bb.0:
+; SSE-NEXT: xorl %eax, %eax
+; SSE-NEXT: comiss %xmm0, %xmm1
+; SSE-NEXT: seta %al
+; SSE-NEXT: ret{{[l|q]}}
+;
+; AVX-LABEL: test_mm_comilt_ss:
+; AVX: # %bb.0:
+; AVX-NEXT: xorl %eax, %eax
+; AVX-NEXT: vcomiss %xmm0, %xmm1
+; AVX-NEXT: seta %al
+; AVX-NEXT: ret{{[l|q]}}
 %res = call i32 @llvm.x86.sse.comilt.ss(<4 x float> %a0, <4 x float> %a1)
 ret i32 %res
 }
 declare i32 @llvm.x86.sse.comilt.ss(<4 x float>, <4 x float>) nounwind readnone

 define i32 @test_mm_comineq_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
-; X32-LABEL: test_mm_comineq_ss:
-; X32: # %bb.0:
-; X32-NEXT: comiss %xmm1, %xmm0
-; X32-NEXT: setp %al
-; X32-NEXT: setne %cl
-; X32-NEXT: orb %al, %cl
-; X32-NEXT: movzbl %cl, %eax
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm_comineq_ss:
-; X64: # %bb.0:
-; X64-NEXT: comiss %xmm1, %xmm0
-; X64-NEXT: setp %al
-; X64-NEXT: setne %cl
-; X64-NEXT: orb %al, %cl
-; X64-NEXT: movzbl %cl, %eax
-; X64-NEXT: retq
+; SSE-LABEL: test_mm_comineq_ss:
+; SSE: # %bb.0:
+; SSE-NEXT: comiss %xmm1, %xmm0
+; SSE-NEXT: setp %al
+; SSE-NEXT: setne %cl
+; SSE-NEXT: orb %al, %cl
+; SSE-NEXT: movzbl %cl, %eax
+; SSE-NEXT: ret{{[l|q]}}
+;
+; AVX-LABEL: test_mm_comineq_ss:
+; AVX: # %bb.0:
+; AVX-NEXT: vcomiss %xmm1, %xmm0
+; AVX-NEXT: setp %al
+; AVX-NEXT: setne %cl
+; AVX-NEXT: orb %al, %cl
+; AVX-NEXT: movzbl %cl, %eax
+; AVX-NEXT: ret{{[l|q]}}
 %res = call i32 @llvm.x86.sse.comineq.ss(<4 x float> %a0, <4 x float> %a1)
 ret i32 %res
 }
 declare i32 @llvm.x86.sse.comineq.ss(<4 x float>, <4 x float>) nounwind readnone

 define i32 @test_mm_cvt_ss2si(<4 x float> %a0) nounwind {
-; X32-LABEL: test_mm_cvt_ss2si:
-; X32: # %bb.0:
-; X32-NEXT: cvtss2si %xmm0, %eax
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm_cvt_ss2si:
-; X64: # %bb.0:
-; X64-NEXT: cvtss2si %xmm0, %eax
-; X64-NEXT: retq
+; SSE-LABEL: test_mm_cvt_ss2si:
+; SSE: # %bb.0:
+; SSE-NEXT: cvtss2si %xmm0, %eax
+; SSE-NEXT: ret{{[l|q]}}
+;
+; AVX-LABEL: test_mm_cvt_ss2si:
+; AVX: # %bb.0:
+; AVX-NEXT: vcvtss2si %xmm0, %eax
+; AVX-NEXT: ret{{[l|q]}}
 %res = call i32 @llvm.x86.sse.cvtss2si(<4 x float> %a0)
 ret i32 %res
 }
 declare i32 @llvm.x86.sse.cvtss2si(<4 x float>) nounwind readnone

 define <4 x float> @test_mm_cvtsi32_ss(<4 x float> %a0, i32 %a1) nounwind {
-; X32-LABEL: test_mm_cvtsi32_ss:
-; X32: # %bb.0:
-; X32-NEXT: cvtsi2ssl {{[0-9]+}}(%esp), %xmm0
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm_cvtsi32_ss:
-; X64: # %bb.0:
-; X64-NEXT: cvtsi2ssl %edi, %xmm0
-; X64-NEXT: retq
+; X86-SSE-LABEL: test_mm_cvtsi32_ss:
+; X86-SSE: # %bb.0:
+; X86-SSE-NEXT: cvtsi2ssl {{[0-9]+}}(%esp), %xmm0
+; X86-SSE-NEXT: retl
+;
+; X86-AVX1-LABEL: test_mm_cvtsi32_ss:
+; X86-AVX1: # %bb.0:
+; X86-AVX1-NEXT: vcvtsi2ssl {{[0-9]+}}(%esp), %xmm0, %xmm0
+; X86-AVX1-NEXT: retl
+;
+; X86-AVX512-LABEL: test_mm_cvtsi32_ss:
+; X86-AVX512: # %bb.0:
+; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX512-NEXT: vcvtsi2ssl %eax, %xmm0, %xmm0
+; X86-AVX512-NEXT: retl
+;
+; X64-SSE-LABEL: test_mm_cvtsi32_ss:
+; X64-SSE: # %bb.0:
+; X64-SSE-NEXT: cvtsi2ssl %edi, %xmm0
+; X64-SSE-NEXT: retq
+;
+; X64-AVX-LABEL: test_mm_cvtsi32_ss:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: vcvtsi2ssl %edi, %xmm0, %xmm0
+; X64-AVX-NEXT: retq
 %res = call <4 x float> @llvm.x86.sse.cvtsi2ss(<4 x float> %a0, i32 %a1)
 ret <4 x float> %res
 }
 declare <4 x float> @llvm.x86.sse.cvtsi2ss(<4 x float>, i32) nounwind readnone

 define float @test_mm_cvtss_f32(<4 x float> %a0) nounwind {
-; X32-LABEL: test_mm_cvtss_f32:
-; X32: # %bb.0:
-; X32-NEXT: pushl %eax
-; X32-NEXT: movss %xmm0, (%esp)
-; X32-NEXT: flds (%esp)
-; X32-NEXT: popl %eax
-; X32-NEXT: retl
+; X86-SSE-LABEL: test_mm_cvtss_f32:
+; X86-SSE: # %bb.0:
+; X86-SSE-NEXT: pushl %eax
+; X86-SSE-NEXT: movss %xmm0, (%esp)
+; X86-SSE-NEXT: flds (%esp)
+; X86-SSE-NEXT: popl %eax
+; X86-SSE-NEXT: retl
+;
+; X86-AVX-LABEL: test_mm_cvtss_f32:
+; X86-AVX: # %bb.0:
+; X86-AVX-NEXT: pushl %eax
+; X86-AVX-NEXT: vmovss %xmm0, (%esp)
+; X86-AVX-NEXT: flds (%esp)
+; X86-AVX-NEXT: popl %eax
+; X86-AVX-NEXT: retl
 ;
 ; X64-LABEL: test_mm_cvtss_f32:
 ; X64: # %bb.0:
@@ -620,72 +740,72 @@ define float @test_mm_cvtss_f32(<4 x float> %a0) nounwind {
 }

 define i32 @test_mm_cvtss_si32(<4 x float> %a0) nounwind {
-; X32-LABEL: test_mm_cvtss_si32:
-; X32: # %bb.0:
-; X32-NEXT: cvtss2si %xmm0, %eax
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm_cvtss_si32:
-; X64: # %bb.0:
-; X64-NEXT: cvtss2si %xmm0, %eax
-; X64-NEXT: retq
+; SSE-LABEL: test_mm_cvtss_si32:
+; SSE: # %bb.0:
+; SSE-NEXT: cvtss2si %xmm0, %eax
+; SSE-NEXT: ret{{[l|q]}}
+;
+; AVX-LABEL: test_mm_cvtss_si32:
+; AVX: # %bb.0:
+; AVX-NEXT: vcvtss2si %xmm0, %eax
+; AVX-NEXT: ret{{[l|q]}}
 %res = call i32 @llvm.x86.sse.cvtss2si(<4 x float> %a0)
 ret i32 %res
 }

 define i32 @test_mm_cvttss_si(<4 x float> %a0) nounwind {
-; X32-LABEL: test_mm_cvttss_si:
-; X32: # %bb.0:
-; X32-NEXT: cvttss2si %xmm0, %eax
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm_cvttss_si:
-; X64: # %bb.0:
-; X64-NEXT: cvttss2si %xmm0, %eax
-; X64-NEXT: retq
+; SSE-LABEL: test_mm_cvttss_si:
+; SSE: # %bb.0:
+; SSE-NEXT: cvttss2si %xmm0, %eax
+; SSE-NEXT: ret{{[l|q]}}
+;
+; AVX-LABEL: test_mm_cvttss_si:
+; AVX: # %bb.0:
+; AVX-NEXT: vcvttss2si %xmm0, %eax
+; AVX-NEXT: ret{{[l|q]}}
 %res = call i32 @llvm.x86.sse.cvttss2si(<4 x float> %a0)
 ret i32 %res
 }
 declare i32 @llvm.x86.sse.cvttss2si(<4 x float>) nounwind readnone

 define i32 @test_mm_cvttss_si32(<4 x float> %a0) nounwind {
-; X32-LABEL: test_mm_cvttss_si32:
-; X32: # %bb.0:
-; X32-NEXT: cvttss2si %xmm0, %eax
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm_cvttss_si32:
-; X64: # %bb.0:
-; X64-NEXT: cvttss2si %xmm0, %eax
-; X64-NEXT: retq
+; SSE-LABEL: test_mm_cvttss_si32:
+; SSE: # %bb.0:
+; SSE-NEXT: cvttss2si %xmm0, %eax
+; SSE-NEXT: ret{{[l|q]}}
+;
+; AVX-LABEL: test_mm_cvttss_si32:
+; AVX: # %bb.0:
+; AVX-NEXT: vcvttss2si %xmm0, %eax
+; AVX-NEXT: ret{{[l|q]}}
 %res = call i32 @llvm.x86.sse.cvttss2si(<4 x float> %a0)
 ret i32 %res
 }

 define <4 x float> @test_mm_div_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
-; X32-LABEL: test_mm_div_ps:
-; X32: # %bb.0:
-; X32-NEXT: divps %xmm1, %xmm0
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm_div_ps:
-; X64: # %bb.0:
-; X64-NEXT: divps %xmm1, %xmm0
-; X64-NEXT: retq
+; SSE-LABEL: test_mm_div_ps:
+; SSE: # %bb.0:
+; SSE-NEXT: divps %xmm1, %xmm0
+; SSE-NEXT: ret{{[l|q]}}
+;
+; AVX-LABEL: test_mm_div_ps:
+; AVX: # %bb.0:
+; AVX-NEXT: vdivps %xmm1, %xmm0, %xmm0
+; AVX-NEXT: ret{{[l|q]}}
 %res = fdiv <4 x float> %a0, %a1
 ret <4 x float> %res
 }

 define <4 x float> @test_mm_div_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
-; X32-LABEL: test_mm_div_ss:
-; X32: # %bb.0:
-; X32-NEXT: divss %xmm1, %xmm0
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm_div_ss:
-; X64: # %bb.0:
-; X64-NEXT: divss %xmm1, %xmm0
-; X64-NEXT: retq
+; SSE-LABEL: test_mm_div_ss:
+; SSE: # %bb.0:
+; SSE-NEXT: divss %xmm1, %xmm0
+; SSE-NEXT: ret{{[l|q]}}
+;
+; AVX-LABEL: test_mm_div_ss:
+; AVX: # %bb.0:
+; AVX-NEXT: vdivss %xmm1, %xmm0, %xmm0
+; AVX-NEXT: ret{{[l|q]}}
 %ext0 = extractelement <4 x float> %a0, i32 0
 %ext1 = extractelement <4 x float> %a1, i32 0
 %fdiv = fdiv float %ext0, %ext1
@@ -694,23 +814,41 @@ define <4 x float> @test_mm_div_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
 }

 define i32 @test_MM_GET_EXCEPTION_MASK() nounwind {
-; X32-LABEL: test_MM_GET_EXCEPTION_MASK:
-; X32: # %bb.0:
-; X32-NEXT: pushl %eax
-; X32-NEXT: movl %esp, %eax
-; X32-NEXT: stmxcsr (%eax)
-; X32-NEXT: movl (%esp), %eax
-; X32-NEXT: andl $8064, %eax # imm = 0x1F80
-; X32-NEXT: popl %ecx
-; X32-NEXT: retl
-;
-; X64-LABEL: test_MM_GET_EXCEPTION_MASK:
-; X64: # %bb.0:
-; X64-NEXT: leaq -{{[0-9]+}}(%rsp), %rax
-; X64-NEXT: stmxcsr (%rax)
-; X64-NEXT: movl -{{[0-9]+}}(%rsp), %eax
-; X64-NEXT: andl $8064, %eax # imm = 0x1F80
-; X64-NEXT: retq
+; X86-SSE-LABEL: test_MM_GET_EXCEPTION_MASK:
+; X86-SSE: # %bb.0:
+; X86-SSE-NEXT: pushl %eax
+; X86-SSE-NEXT: movl %esp, %eax
+; X86-SSE-NEXT: stmxcsr (%eax)
+; X86-SSE-NEXT: movl (%esp), %eax
+; X86-SSE-NEXT: andl $8064, %eax # imm = 0x1F80
+; X86-SSE-NEXT: popl %ecx
+; X86-SSE-NEXT: retl
+;
+; X86-AVX-LABEL: test_MM_GET_EXCEPTION_MASK:
+; X86-AVX: # %bb.0:
+; X86-AVX-NEXT: pushl %eax
+; X86-AVX-NEXT: movl %esp, %eax
+; X86-AVX-NEXT: vstmxcsr (%eax)
+; X86-AVX-NEXT: movl (%esp), %eax
+; X86-AVX-NEXT: andl $8064, %eax # imm = 0x1F80
+; X86-AVX-NEXT: popl %ecx
+; X86-AVX-NEXT: retl
+;
+; X64-SSE-LABEL: test_MM_GET_EXCEPTION_MASK:
+; X64-SSE: # %bb.0:
+; X64-SSE-NEXT: leaq -{{[0-9]+}}(%rsp), %rax
+; X64-SSE-NEXT: stmxcsr (%rax)
+; X64-SSE-NEXT: movl -{{[0-9]+}}(%rsp), %eax
+; X64-SSE-NEXT: andl $8064, %eax # imm = 0x1F80
+; X64-SSE-NEXT: retq
+;
+; X64-AVX-LABEL: test_MM_GET_EXCEPTION_MASK:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: leaq -{{[0-9]+}}(%rsp), %rax
+; X64-AVX-NEXT: vstmxcsr (%rax)
+; X64-AVX-NEXT: movl -{{[0-9]+}}(%rsp), %eax
+; X64-AVX-NEXT: andl $8064, %eax # imm = 0x1F80
+; X64-AVX-NEXT: retq
 %1 = alloca i32, align 4
 %2 = bitcast i32* %1 to i8*
 call void @llvm.x86.sse.stmxcsr(i8* %2)
@@ -721,23 +859,41 @@ define i32 @test_MM_GET_EXCEPTION_MASK() nounwind {
 }
 declare void @llvm.x86.sse.stmxcsr(i8*) nounwind readnone

 define i32 @test_MM_GET_EXCEPTION_STATE() nounwind {
-; X32-LABEL: test_MM_GET_EXCEPTION_STATE:
-; X32: # %bb.0:
-; X32-NEXT: pushl %eax
-; X32-NEXT: movl %esp, %eax
-; X32-NEXT: stmxcsr (%eax)
-; X32-NEXT: movl (%esp), %eax
-; X32-NEXT: andl $63, %eax
-; X32-NEXT: popl %ecx
-; X32-NEXT: retl
-;
-; X64-LABEL: test_MM_GET_EXCEPTION_STATE:
-; X64: # %bb.0:
-; X64-NEXT: leaq -{{[0-9]+}}(%rsp), %rax
-; X64-NEXT: stmxcsr (%rax)
-; X64-NEXT: movl -{{[0-9]+}}(%rsp), %eax
-; X64-NEXT: andl $63, %eax
-; X64-NEXT: retq
+; X86-SSE-LABEL: test_MM_GET_EXCEPTION_STATE:
+; X86-SSE: # %bb.0:
+; X86-SSE-NEXT: pushl %eax
+; X86-SSE-NEXT: movl %esp, %eax
+; X86-SSE-NEXT: stmxcsr (%eax)
+; X86-SSE-NEXT: movl (%esp), %eax
+; X86-SSE-NEXT: andl $63, %eax
+; X86-SSE-NEXT: popl %ecx
+; X86-SSE-NEXT: retl
+;
+; X86-AVX-LABEL: test_MM_GET_EXCEPTION_STATE:
+; X86-AVX: # %bb.0:
+; X86-AVX-NEXT: pushl %eax
+; X86-AVX-NEXT: movl %esp, %eax
+; X86-AVX-NEXT: vstmxcsr (%eax)
+; X86-AVX-NEXT: movl (%esp), %eax
+; X86-AVX-NEXT: andl $63, %eax
+; X86-AVX-NEXT: popl %ecx
+; X86-AVX-NEXT: retl
+;
+; X64-SSE-LABEL: test_MM_GET_EXCEPTION_STATE:
+; X64-SSE: # %bb.0:
+; X64-SSE-NEXT: leaq -{{[0-9]+}}(%rsp), %rax
+; X64-SSE-NEXT: stmxcsr (%rax)
+; X64-SSE-NEXT: movl -{{[0-9]+}}(%rsp), %eax
+; X64-SSE-NEXT: andl $63, %eax
+; X64-SSE-NEXT: retq
+;
+; X64-AVX-LABEL: test_MM_GET_EXCEPTION_STATE:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: leaq -{{[0-9]+}}(%rsp), %rax
+; X64-AVX-NEXT: vstmxcsr (%rax)
+; X64-AVX-NEXT: movl -{{[0-9]+}}(%rsp), %eax
+; X64-AVX-NEXT: andl $63, %eax
+; X64-AVX-NEXT: retq
 %1 = alloca i32, align 4
 %2 = bitcast i32* %1 to i8*
 call void @llvm.x86.sse.stmxcsr(i8* %2)
@@ -747,23 +903,41 @@ define i32 @test_MM_GET_EXCEPTION_STATE() nounwind {
 }

 define i32 @test_MM_GET_FLUSH_ZERO_MODE() nounwind {
-; X32-LABEL: test_MM_GET_FLUSH_ZERO_MODE:
-; X32: # %bb.0:
-; X32-NEXT: pushl %eax
-; X32-NEXT: movl %esp, %eax
-; X32-NEXT: stmxcsr (%eax)
-; X32-NEXT: movl (%esp), %eax
-; X32-NEXT: andl $32768, %eax # imm = 0x8000
-; X32-NEXT: popl %ecx
-; X32-NEXT: retl
-;
-; X64-LABEL: test_MM_GET_FLUSH_ZERO_MODE:
-; X64: # %bb.0:
-; X64-NEXT: leaq -{{[0-9]+}}(%rsp), %rax
-; X64-NEXT: stmxcsr (%rax)
-; X64-NEXT: movl -{{[0-9]+}}(%rsp), %eax
-; X64-NEXT: andl $32768, %eax # imm = 0x8000
-; X64-NEXT: retq
+; X86-SSE-LABEL: test_MM_GET_FLUSH_ZERO_MODE:
+; X86-SSE: # %bb.0:
+; X86-SSE-NEXT: pushl %eax
+; X86-SSE-NEXT: movl %esp, %eax
+; X86-SSE-NEXT: stmxcsr (%eax)
+; X86-SSE-NEXT: movl (%esp), %eax
+; X86-SSE-NEXT: andl $32768, %eax # imm = 0x8000
+; X86-SSE-NEXT: popl %ecx
+; X86-SSE-NEXT: retl
+;
+; X86-AVX-LABEL: test_MM_GET_FLUSH_ZERO_MODE:
+; X86-AVX: # %bb.0:
+; X86-AVX-NEXT: pushl %eax
+; X86-AVX-NEXT: movl %esp, %eax
+; X86-AVX-NEXT: vstmxcsr (%eax)
+; X86-AVX-NEXT: movl (%esp), %eax
+; X86-AVX-NEXT: andl $32768, %eax # imm = 0x8000
+; X86-AVX-NEXT: popl %ecx
+; X86-AVX-NEXT: retl
+;
+; X64-SSE-LABEL: test_MM_GET_FLUSH_ZERO_MODE:
+; X64-SSE: # %bb.0:
+; X64-SSE-NEXT: leaq -{{[0-9]+}}(%rsp), %rax
+; X64-SSE-NEXT: stmxcsr (%rax)
+; X64-SSE-NEXT: movl -{{[0-9]+}}(%rsp), %eax
+; X64-SSE-NEXT: andl $32768, %eax # imm = 0x8000
+; X64-SSE-NEXT: retq
+;
+; X64-AVX-LABEL: test_MM_GET_FLUSH_ZERO_MODE:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: leaq -{{[0-9]+}}(%rsp), %rax
+; X64-AVX-NEXT: vstmxcsr (%rax)
+; X64-AVX-NEXT: movl -{{[0-9]+}}(%rsp), %eax
+; X64-AVX-NEXT: andl $32768, %eax # imm = 0x8000
+; X64-AVX-NEXT: retq
 %1 = alloca i32, align 4
 %2 = bitcast i32* %1 to i8*
 call void @llvm.x86.sse.stmxcsr(i8* %2)
@@ -773,23 +947,41 @@ define i32 @test_MM_GET_FLUSH_ZERO_MODE() nounwind {
 }

 define i32 @test_MM_GET_ROUNDING_MODE() nounwind {
-; X32-LABEL: test_MM_GET_ROUNDING_MODE:
-; X32: # %bb.0:
-; X32-NEXT: pushl %eax
-; X32-NEXT: movl %esp, %eax
-; X32-NEXT: stmxcsr (%eax)
-; X32-NEXT: movl (%esp), %eax
-; X32-NEXT: andl $24576, %eax # imm = 0x6000
-; X32-NEXT: popl %ecx
-; X32-NEXT: retl
-;
-; X64-LABEL: test_MM_GET_ROUNDING_MODE:
-; X64: # %bb.0:
-; X64-NEXT: leaq -{{[0-9]+}}(%rsp), %rax
-; X64-NEXT: stmxcsr (%rax)
-; X64-NEXT: movl -{{[0-9]+}}(%rsp), %eax
-; X64-NEXT: andl $24576, %eax # imm = 0x6000
-; X64-NEXT: retq
+; X86-SSE-LABEL: test_MM_GET_ROUNDING_MODE:
+; X86-SSE: # %bb.0:
+; X86-SSE-NEXT: pushl %eax
+; X86-SSE-NEXT: movl %esp, %eax
+; X86-SSE-NEXT: stmxcsr (%eax)
+; X86-SSE-NEXT: movl (%esp), %eax
+; X86-SSE-NEXT: andl $24576, %eax # imm = 0x6000
+; X86-SSE-NEXT: popl %ecx
+; X86-SSE-NEXT: retl
+;
+; X86-AVX-LABEL: test_MM_GET_ROUNDING_MODE:
+; X86-AVX: # %bb.0:
+; X86-AVX-NEXT: pushl %eax
+; X86-AVX-NEXT: movl %esp, %eax
+; X86-AVX-NEXT: vstmxcsr (%eax)
+; X86-AVX-NEXT: movl (%esp), %eax
+; X86-AVX-NEXT: andl $24576, %eax # imm = 0x6000
+; X86-AVX-NEXT: popl %ecx
+; X86-AVX-NEXT: retl
+;
+; X64-SSE-LABEL: test_MM_GET_ROUNDING_MODE:
+; X64-SSE: # %bb.0:
+; X64-SSE-NEXT: leaq -{{[0-9]+}}(%rsp), %rax
+; X64-SSE-NEXT: stmxcsr (%rax)
+; X64-SSE-NEXT: movl -{{[0-9]+}}(%rsp), %eax
+; X64-SSE-NEXT: andl $24576, %eax # imm = 0x6000
+; X64-SSE-NEXT: retq
+;
+; X64-AVX-LABEL: test_MM_GET_ROUNDING_MODE:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: leaq -{{[0-9]+}}(%rsp), %rax
+; X64-AVX-NEXT: vstmxcsr (%rax)
+; X64-AVX-NEXT: movl -{{[0-9]+}}(%rsp), %eax
+; X64-AVX-NEXT: andl $24576, %eax # imm = 0x6000
+; X64-AVX-NEXT: retq
 %1 = alloca i32, align 4
 %2 = bitcast i32* %1 to i8*
 call void @llvm.x86.sse.stmxcsr(i8* %2)
@@ -799,21 +991,37 @@ define i32 @test_MM_GET_ROUNDING_MODE() nounwind {
 }

 define i32 @test_mm_getcsr() nounwind {
-; X32-LABEL: test_mm_getcsr:
-; X32: # %bb.0:
-; X32-NEXT: pushl %eax
-; X32-NEXT: movl %esp, %eax
-; X32-NEXT: stmxcsr (%eax)
-; X32-NEXT: movl (%esp), %eax
-; X32-NEXT: popl %ecx
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm_getcsr:
-; X64: # %bb.0:
-; X64-NEXT: leaq -{{[0-9]+}}(%rsp), %rax
-; X64-NEXT: stmxcsr (%rax)
-; X64-NEXT: movl -{{[0-9]+}}(%rsp), %eax
-; X64-NEXT: retq
+; X86-SSE-LABEL: test_mm_getcsr:
+; X86-SSE: # %bb.0:
+; X86-SSE-NEXT: pushl %eax
+; X86-SSE-NEXT: movl %esp, %eax
+; X86-SSE-NEXT: stmxcsr (%eax)
+; X86-SSE-NEXT: movl (%esp), %eax
+; X86-SSE-NEXT: popl %ecx
+; X86-SSE-NEXT: retl
+;
+; X86-AVX-LABEL: test_mm_getcsr:
+; X86-AVX: # %bb.0:
+; X86-AVX-NEXT: pushl %eax
+; X86-AVX-NEXT: movl %esp, %eax
+; X86-AVX-NEXT: vstmxcsr (%eax)
+; X86-AVX-NEXT: movl (%esp), %eax
+; X86-AVX-NEXT: popl %ecx
+; X86-AVX-NEXT: retl
+;
+; X64-SSE-LABEL: test_mm_getcsr:
+; X64-SSE: # %bb.0:
+; X64-SSE-NEXT: leaq -{{[0-9]+}}(%rsp), %rax
+; X64-SSE-NEXT: stmxcsr (%rax)
+; X64-SSE-NEXT: movl -{{[0-9]+}}(%rsp), %eax
+; X64-SSE-NEXT: retq
+;
+; X64-AVX-LABEL: test_mm_getcsr:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: leaq -{{[0-9]+}}(%rsp), %rax
+; X64-AVX-NEXT: vstmxcsr (%rax)
+; X64-AVX-NEXT: movl -{{[0-9]+}}(%rsp), %eax
+; X64-AVX-NEXT: retq
 %1 = alloca i32, align 4
 %2 = bitcast i32* %1 to i8*
 call void @llvm.x86.sse.stmxcsr(i8* %2)
@@ -822,34 +1030,56 @@ define i32 @test_mm_getcsr() nounwind {
 }

 define <4 x float> @test_mm_load_ps(float* %a0) nounwind {
-; X32-LABEL: test_mm_load_ps:
-; X32: # %bb.0:
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movaps (%eax), %xmm0
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm_load_ps:
-; X64: # %bb.0:
-; X64-NEXT: movaps (%rdi), %xmm0
-; X64-NEXT: retq
+; X86-SSE-LABEL: test_mm_load_ps:
+; X86-SSE: # %bb.0:
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SSE-NEXT: movaps (%eax), %xmm0
+; X86-SSE-NEXT: retl
+;
+; X86-AVX-LABEL: test_mm_load_ps:
+; X86-AVX: # %bb.0:
+; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX-NEXT: vmovaps (%eax), %xmm0
+; X86-AVX-NEXT: retl
+;
+; X64-SSE-LABEL: test_mm_load_ps:
+; X64-SSE: # %bb.0:
+; X64-SSE-NEXT: movaps (%rdi), %xmm0
+; X64-SSE-NEXT: retq
+;
+; X64-AVX-LABEL: test_mm_load_ps:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: vmovaps (%rdi), %xmm0
+; X64-AVX-NEXT: retq
 %arg0 = bitcast float* %a0 to <4 x float>*
 %res = load <4 x float>, <4 x float>* %arg0, align 16
 ret <4 x float> %res
 }

 define <4 x float> @test_mm_load_ps1(float* %a0) nounwind {
-; X32-LABEL: test_mm_load_ps1:
-; X32: # %bb.0:
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm_load_ps1:
-; X64: # %bb.0:
-; X64-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
-; X64-NEXT: retq
+; X86-SSE-LABEL: test_mm_load_ps1:
+; X86-SSE: # %bb.0:
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; X86-SSE-NEXT: retl
+;
+; X86-AVX-LABEL: test_mm_load_ps1:
+; X86-AVX: # %bb.0:
+; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX-NEXT: vbroadcastss (%eax), %xmm0
+; X86-AVX-NEXT: retl
+;
+; X64-SSE-LABEL: test_mm_load_ps1:
+; X64-SSE: # %bb.0:
+; X64-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X64-SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; X64-SSE-NEXT: retq
+;
+; X64-AVX-LABEL: test_mm_load_ps1:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: vbroadcastss (%rdi), %xmm0
+; X64-AVX-NEXT: retq
 %ld = load float, float* %a0, align 4
 %res0 = insertelement <4 x float> undef, float %ld, i32 0
 %res1 = insertelement <4 x float> %res0, float %ld, i32 1
@@ -859,16 +1089,27 @@ define <4 x float> @test_mm_load_ps1(float* %a0) nounwind {
 }

 define <4 x float> @test_mm_load_ss(float* %a0) nounwind {
-; X32-LABEL: test_mm_load_ss:
-; X32: # %bb.0:
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm_load_ss:
-; X64: # %bb.0:
-; X64-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X64-NEXT: retq
+; X86-SSE-LABEL: test_mm_load_ss:
+; X86-SSE: # %bb.0:
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-SSE-NEXT: retl
+;
+; X86-AVX-LABEL: test_mm_load_ss:
+; X86-AVX: # %bb.0:
+; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-AVX-NEXT: retl
+;
+; X64-SSE-LABEL: test_mm_load_ss:
+; X64-SSE: # %bb.0:
+; X64-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X64-SSE-NEXT: retq
+;
+; X64-AVX-LABEL: test_mm_load_ss:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X64-AVX-NEXT: retq
 %ld = load float, float* %a0, align 1
 %res0 = insertelement <4 x float> undef, float %ld, i32 0
 %res1 = insertelement <4 x float> %res0, float 0.0, i32 1
@@ -878,18 +1119,29 @@ define <4 x float> @test_mm_load_ss(float* %a0) nounwind {
 }

 define <4 x float> @test_mm_load1_ps(float* %a0) nounwind {
-; X32-LABEL: test_mm_load1_ps:
-; X32: # %bb.0:
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm_load1_ps:
-; X64: # %bb.0:
-; X64-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
-; X64-NEXT: retq
+; X86-SSE-LABEL: test_mm_load1_ps:
+; X86-SSE: # %bb.0:
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; X86-SSE-NEXT: retl
+;
+; X86-AVX-LABEL: test_mm_load1_ps:
+; X86-AVX: # %bb.0:
+; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX-NEXT: vbroadcastss (%eax), %xmm0
+; X86-AVX-NEXT: retl
+;
+; X64-SSE-LABEL: test_mm_load1_ps:
+; X64-SSE: # %bb.0:
+; X64-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X64-SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; X64-SSE-NEXT: retq
+;
+; X64-AVX-LABEL: test_mm_load1_ps:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: vbroadcastss (%rdi), %xmm0
+; X64-AVX-NEXT: retq
 %ld = load float, float* %a0, align 4
 %res0 = insertelement <4 x float> undef, float %ld, i32 0
 %res1 = insertelement <4 x float> %res0, float %ld, i32 1
@@ -899,26 +1151,38 @@ define <4 x float> @test_mm_load1_ps(float* %a0) nounwind {
 }

 define <4 x float> @test_mm_loadh_pi(<4 x float> %a0, x86_mmx* %a1) {
-; X32-LABEL: test_mm_loadh_pi:
-; X32: # %bb.0:
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X32-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; X32-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; X32-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm_loadh_pi:
-; X64: # %bb.0:
-; X64-NEXT: movq (%rdi), %rax
-; X64-NEXT: movl %eax, -{{[0-9]+}}(%rsp)
-; X64-NEXT: shrq $32, %rax
-; X64-NEXT: movl %eax, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X64-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; X64-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; X64-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; X64-NEXT: retq
+; X86-SSE-LABEL: test_mm_loadh_pi:
+; X86-SSE: # %bb.0:
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X86-SSE-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; X86-SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; X86-SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X86-SSE-NEXT: retl
+;
+; X86-AVX-LABEL: test_mm_loadh_pi:
+; X86-AVX: # %bb.0:
+; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; X86-AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X86-AVX-NEXT: retl
+;
+; X64-SSE-LABEL: test_mm_loadh_pi:
+; X64-SSE: # %bb.0:
+; X64-SSE-NEXT: movq (%rdi), %rax
+; X64-SSE-NEXT: movl %eax, -{{[0-9]+}}(%rsp)
+; X64-SSE-NEXT: shrq $32, %rax
+; X64-SSE-NEXT: movl %eax, -{{[0-9]+}}(%rsp)
+; X64-SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X64-SSE-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; X64-SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; X64-SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X64-SSE-NEXT: retq
+;
+; X64-AVX-LABEL: test_mm_loadh_pi:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
+; X64-AVX-NEXT: retq
 %ptr = bitcast x86_mmx* %a1 to <2 x float>*
 %ld = load <2 x float>, <2 x float>* %ptr
 %ext = shufflevector <2 x float> %ld, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
@@ -927,28 +1191,47 @@ define <4 x float> @test_mm_loadh_pi(<4 x float> %a0, x86_mmx* %a1) {
 }

 define <4 x float> @test_mm_loadl_pi(<4 x float> %a0, x86_mmx* %a1) {
-; X32-LABEL: test_mm_loadl_pi:
-; X32: # %bb.0:
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X32-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; X32-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; X32-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
-; X32-NEXT: movaps %xmm1, %xmm0
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm_loadl_pi:
-; X64: # %bb.0:
-; X64-NEXT: movq (%rdi), %rax
-; X64-NEXT: movl %eax, -{{[0-9]+}}(%rsp)
-; X64-NEXT: shrq $32, %rax
-; X64-NEXT: movl %eax, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X64-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; X64-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; X64-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
-; X64-NEXT: movaps %xmm1, %xmm0
-; X64-NEXT: retq
+; X86-SSE-LABEL: test_mm_loadl_pi:
+; X86-SSE: # %bb.0:
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X86-SSE-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; X86-SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; X86-SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
+; X86-SSE-NEXT: movaps %xmm1, %xmm0
+; X86-SSE-NEXT: retl
+;
+; X86-AVX1-LABEL: test_mm_loadl_pi:
+; X86-AVX1: # %bb.0:
+; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX1-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; X86-AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; X86-AVX1-NEXT: retl
+;
+; X86-AVX512-LABEL: test_mm_loadl_pi:
+; X86-AVX512: # %bb.0:
+; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX512-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; X86-AVX512-NEXT: vmovsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; X86-AVX512-NEXT: retl
+;
+; X64-SSE-LABEL: test_mm_loadl_pi:
+; X64-SSE: # %bb.0:
+; X64-SSE-NEXT: movq (%rdi), %rax
+; X64-SSE-NEXT: movl %eax, -{{[0-9]+}}(%rsp)
+; X64-SSE-NEXT: shrq $32, %rax
+; X64-SSE-NEXT: movl %eax, -{{[0-9]+}}(%rsp)
+; X64-SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X64-SSE-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; X64-SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; X64-SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
+; X64-SSE-NEXT: movaps %xmm1, %xmm0
+; X64-SSE-NEXT: retq
+;
+; X64-AVX-LABEL: test_mm_loadl_pi:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: vmovlpd {{.*#+}} xmm0 = mem[0],xmm0[1]
+; X64-AVX-NEXT: retq
 %ptr = bitcast x86_mmx* %a1 to <2 x float>*
 %ld = load <2 x float>, <2 x float>* %ptr
 %ext = shufflevector <2 x float> %ld, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
@@ -957,18 +1240,29 @@ define <4 x float> @test_mm_loadl_pi(<4 x float> %a0, x86_mmx* %a1) {
 }

 define <4 x float> @test_mm_loadr_ps(float* %a0) nounwind {
-; X32-LABEL: test_mm_loadr_ps:
-; X32: # %bb.0:
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movaps (%eax), %xmm0
-; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,2,1,0]
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm_loadr_ps:
-; X64: # %bb.0:
-; X64-NEXT: movaps (%rdi), %xmm0
-; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,2,1,0]
-; X64-NEXT: retq
+; X86-SSE-LABEL: test_mm_loadr_ps:
+; X86-SSE: # %bb.0:
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SSE-NEXT: movaps (%eax), %xmm0
+; X86-SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; X86-SSE-NEXT: retl
+;
+; X86-AVX-LABEL: test_mm_loadr_ps:
+; X86-AVX: # %bb.0:
+; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX-NEXT: vpermilps {{.*#+}} xmm0 = mem[3,2,1,0]
+; X86-AVX-NEXT: retl
+;
+; X64-SSE-LABEL: test_mm_loadr_ps:
+; X64-SSE: # %bb.0:
+; X64-SSE-NEXT: movaps (%rdi), %xmm0
+; X64-SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; X64-SSE-NEXT: retq
+;
+; X64-AVX-LABEL: test_mm_loadr_ps:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: vpermilps {{.*#+}} xmm0 = mem[3,2,1,0]
+; X64-AVX-NEXT: retq
 %arg0 = bitcast float* %a0 to <4 x float>*
 %ld = load <4 x float>, <4 x float>* %arg0, align 16
 %res = shufflevector <4 x float> %ld, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
@@ -976,162 +1270,178 @@ define <4 x float> @test_mm_loadr_ps(float* %a0) nounwind {
 }

 define <4 x float> @test_mm_loadu_ps(float* %a0) nounwind {
-; X32-LABEL: test_mm_loadu_ps:
-; X32: # %bb.0:
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movups (%eax), %xmm0
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm_loadu_ps:
-; X64: #
%bb.0: -; X64-NEXT: movups (%rdi), %xmm0 -; X64-NEXT: retq +; X86-SSE-LABEL: test_mm_loadu_ps: +; X86-SSE: # %bb.0: +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: movups (%eax), %xmm0 +; X86-SSE-NEXT: retl +; +; X86-AVX-LABEL: test_mm_loadu_ps: +; X86-AVX: # %bb.0: +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX-NEXT: vmovups (%eax), %xmm0 +; X86-AVX-NEXT: retl +; +; X64-SSE-LABEL: test_mm_loadu_ps: +; X64-SSE: # %bb.0: +; X64-SSE-NEXT: movups (%rdi), %xmm0 +; X64-SSE-NEXT: retq +; +; X64-AVX-LABEL: test_mm_loadu_ps: +; X64-AVX: # %bb.0: +; X64-AVX-NEXT: vmovups (%rdi), %xmm0 +; X64-AVX-NEXT: retq %arg0 = bitcast float* %a0 to <4 x float>* %res = load <4 x float>, <4 x float>* %arg0, align 1 ret <4 x float> %res } define <4 x float> @test_mm_max_ps(<4 x float> %a0, <4 x float> %a1) { -; X32-LABEL: test_mm_max_ps: -; X32: # %bb.0: -; X32-NEXT: maxps %xmm1, %xmm0 -; X32-NEXT: retl -; -; X64-LABEL: test_mm_max_ps: -; X64: # %bb.0: -; X64-NEXT: maxps %xmm1, %xmm0 -; X64-NEXT: retq +; SSE-LABEL: test_mm_max_ps: +; SSE: # %bb.0: +; SSE-NEXT: maxps %xmm1, %xmm0 +; SSE-NEXT: ret{{[l|q]}} +; +; AVX-LABEL: test_mm_max_ps: +; AVX: # %bb.0: +; AVX-NEXT: vmaxps %xmm1, %xmm0, %xmm0 +; AVX-NEXT: ret{{[l|q]}} %res = call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %a0, <4 x float> %a1) ret <4 x float> %res } declare <4 x float> @llvm.x86.sse.max.ps(<4 x float>, <4 x float>) nounwind readnone define <4 x float> @test_mm_max_ss(<4 x float> %a0, <4 x float> %a1) { -; X32-LABEL: test_mm_max_ss: -; X32: # %bb.0: -; X32-NEXT: maxss %xmm1, %xmm0 -; X32-NEXT: retl -; -; X64-LABEL: test_mm_max_ss: -; X64: # %bb.0: -; X64-NEXT: maxss %xmm1, %xmm0 -; X64-NEXT: retq +; SSE-LABEL: test_mm_max_ss: +; SSE: # %bb.0: +; SSE-NEXT: maxss %xmm1, %xmm0 +; SSE-NEXT: ret{{[l|q]}} +; +; AVX-LABEL: test_mm_max_ss: +; AVX: # %bb.0: +; AVX-NEXT: vmaxss %xmm1, %xmm0, %xmm0 +; AVX-NEXT: ret{{[l|q]}} %res = call <4 x float> @llvm.x86.sse.max.ss(<4 x float> %a0, <4 x float> %a1) ret <4 x float> %res } declare <4 x float> @llvm.x86.sse.max.ss(<4 x float>, <4 x float>) nounwind readnone define <4 x float> @test_mm_min_ps(<4 x float> %a0, <4 x float> %a1) { -; X32-LABEL: test_mm_min_ps: -; X32: # %bb.0: -; X32-NEXT: minps %xmm1, %xmm0 -; X32-NEXT: retl -; -; X64-LABEL: test_mm_min_ps: -; X64: # %bb.0: -; X64-NEXT: minps %xmm1, %xmm0 -; X64-NEXT: retq +; SSE-LABEL: test_mm_min_ps: +; SSE: # %bb.0: +; SSE-NEXT: minps %xmm1, %xmm0 +; SSE-NEXT: ret{{[l|q]}} +; +; AVX-LABEL: test_mm_min_ps: +; AVX: # %bb.0: +; AVX-NEXT: vminps %xmm1, %xmm0, %xmm0 +; AVX-NEXT: ret{{[l|q]}} %res = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %a0, <4 x float> %a1) ret <4 x float> %res } declare <4 x float> @llvm.x86.sse.min.ps(<4 x float>, <4 x float>) nounwind readnone define <4 x float> @test_mm_min_ss(<4 x float> %a0, <4 x float> %a1) { -; X32-LABEL: test_mm_min_ss: -; X32: # %bb.0: -; X32-NEXT: minss %xmm1, %xmm0 -; X32-NEXT: retl -; -; X64-LABEL: test_mm_min_ss: -; X64: # %bb.0: -; X64-NEXT: minss %xmm1, %xmm0 -; X64-NEXT: retq +; SSE-LABEL: test_mm_min_ss: +; SSE: # %bb.0: +; SSE-NEXT: minss %xmm1, %xmm0 +; SSE-NEXT: ret{{[l|q]}} +; +; AVX-LABEL: test_mm_min_ss: +; AVX: # %bb.0: +; AVX-NEXT: vminss %xmm1, %xmm0, %xmm0 +; AVX-NEXT: ret{{[l|q]}} %res = call <4 x float> @llvm.x86.sse.min.ss(<4 x float> %a0, <4 x float> %a1) ret <4 x float> %res } declare <4 x float> @llvm.x86.sse.min.ss(<4 x float>, <4 x float>) nounwind readnone define <4 x float> @test_mm_move_ss(<4 x float> %a0, <4 x float> %a1) { -; X32-LABEL: test_mm_move_ss: 
-; X32: # %bb.0: -; X32-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] -; X32-NEXT: retl -; -; X64-LABEL: test_mm_move_ss: -; X64: # %bb.0: -; X64-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] -; X64-NEXT: retq +; SSE-LABEL: test_mm_move_ss: +; SSE: # %bb.0: +; SSE-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] +; SSE-NEXT: ret{{[l|q]}} +; +; AVX1-LABEL: test_mm_move_ss: +; AVX1: # %bb.0: +; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] +; AVX1-NEXT: ret{{[l|q]}} +; +; AVX512-LABEL: test_mm_move_ss: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] +; AVX512-NEXT: ret{{[l|q]}} %res = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 4, i32 1, i32 2, i32 3> ret <4 x float> %res } define <4 x float> @test_mm_movehl_ps(<4 x float> %a0, <4 x float> %a1) { -; X32-LABEL: test_mm_movehl_ps: -; X32: # %bb.0: -; X32-NEXT: movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1] -; X32-NEXT: retl -; -; X64-LABEL: test_mm_movehl_ps: -; X64: # %bb.0: -; X64-NEXT: movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1] -; X64-NEXT: retq +; SSE-LABEL: test_mm_movehl_ps: +; SSE: # %bb.0: +; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; SSE-NEXT: ret{{[l|q]}} +; +; AVX-LABEL: test_mm_movehl_ps: +; AVX: # %bb.0: +; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; AVX-NEXT: ret{{[l|q]}} %res = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 6, i32 7, i32 2, i32 3> ret <4 x float> %res } define <4 x float> @test_mm_movelh_ps(<4 x float> %a0, <4 x float> %a1) { -; X32-LABEL: test_mm_movelh_ps: -; X32: # %bb.0: -; X32-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; X32-NEXT: retl -; -; X64-LABEL: test_mm_movelh_ps: -; X64: # %bb.0: -; X64-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; X64-NEXT: retq +; SSE-LABEL: test_mm_movelh_ps: +; SSE: # %bb.0: +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE-NEXT: ret{{[l|q]}} +; +; AVX-LABEL: test_mm_movelh_ps: +; AVX: # %bb.0: +; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX-NEXT: ret{{[l|q]}} %res = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 0, i32 1, i32 4, i32 5> ret <4 x float> %res } define i32 @test_mm_movemask_ps(<4 x float> %a0) nounwind { -; X32-LABEL: test_mm_movemask_ps: -; X32: # %bb.0: -; X32-NEXT: movmskps %xmm0, %eax -; X32-NEXT: retl -; -; X64-LABEL: test_mm_movemask_ps: -; X64: # %bb.0: -; X64-NEXT: movmskps %xmm0, %eax -; X64-NEXT: retq +; SSE-LABEL: test_mm_movemask_ps: +; SSE: # %bb.0: +; SSE-NEXT: movmskps %xmm0, %eax +; SSE-NEXT: ret{{[l|q]}} +; +; AVX-LABEL: test_mm_movemask_ps: +; AVX: # %bb.0: +; AVX-NEXT: vmovmskps %xmm0, %eax +; AVX-NEXT: ret{{[l|q]}} %res = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %a0) ret i32 %res } declare i32 @llvm.x86.sse.movmsk.ps(<4 x float>) nounwind readnone define <4 x float> @test_mm_mul_ps(<4 x float> %a0, <4 x float> %a1) nounwind { -; X32-LABEL: test_mm_mul_ps: -; X32: # %bb.0: -; X32-NEXT: mulps %xmm1, %xmm0 -; X32-NEXT: retl -; -; X64-LABEL: test_mm_mul_ps: -; X64: # %bb.0: -; X64-NEXT: mulps %xmm1, %xmm0 -; X64-NEXT: retq +; SSE-LABEL: test_mm_mul_ps: +; SSE: # %bb.0: +; SSE-NEXT: mulps %xmm1, %xmm0 +; SSE-NEXT: ret{{[l|q]}} +; +; AVX-LABEL: test_mm_mul_ps: +; AVX: # %bb.0: +; AVX-NEXT: vmulps %xmm1, %xmm0, %xmm0 +; AVX-NEXT: ret{{[l|q]}} %res = fmul <4 x float> %a0, %a1 ret <4 x float> %res } define <4 x float> @test_mm_mul_ss(<4 x float> %a0, <4 x float> %a1) nounwind { -; X32-LABEL: test_mm_mul_ss: -; X32: # %bb.0: -; X32-NEXT: mulss %xmm1, %xmm0 -; X32-NEXT: retl -; -; X64-LABEL: test_mm_mul_ss: -; X64: # 
%bb.0: -; X64-NEXT: mulss %xmm1, %xmm0 -; X64-NEXT: retq +; SSE-LABEL: test_mm_mul_ss: +; SSE: # %bb.0: +; SSE-NEXT: mulss %xmm1, %xmm0 +; SSE-NEXT: ret{{[l|q]}} +; +; AVX-LABEL: test_mm_mul_ss: +; AVX: # %bb.0: +; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0 +; AVX-NEXT: ret{{[l|q]}} %ext0 = extractelement <4 x float> %a0, i32 0 %ext1 = extractelement <4 x float> %a1, i32 0 %fmul = fmul float %ext0, %ext1 @@ -1140,15 +1450,15 @@ define <4 x float> @test_mm_mul_ss(<4 x float> %a0, <4 x float> %a1) nounwind { } define <4 x float> @test_mm_or_ps(<4 x float> %a0, <4 x float> %a1) nounwind { -; X32-LABEL: test_mm_or_ps: -; X32: # %bb.0: -; X32-NEXT: orps %xmm1, %xmm0 -; X32-NEXT: retl -; -; X64-LABEL: test_mm_or_ps: -; X64: # %bb.0: -; X64-NEXT: orps %xmm1, %xmm0 -; X64-NEXT: retq +; SSE-LABEL: test_mm_or_ps: +; SSE: # %bb.0: +; SSE-NEXT: orps %xmm1, %xmm0 +; SSE-NEXT: ret{{[l|q]}} +; +; AVX-LABEL: test_mm_or_ps: +; AVX: # %bb.0: +; AVX-NEXT: vorps %xmm1, %xmm0, %xmm0 +; AVX-NEXT: ret{{[l|q]}} %arg0 = bitcast <4 x float> %a0 to <4 x i32> %arg1 = bitcast <4 x float> %a1 to <4 x i32> %res = or <4 x i32> %arg0, %arg1 @@ -1157,11 +1467,11 @@ define <4 x float> @test_mm_or_ps(<4 x float> %a0, <4 x float> %a1) nounwind { } define void @test_mm_prefetch(i8* %a0) { -; X32-LABEL: test_mm_prefetch: -; X32: # %bb.0: -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: prefetchnta (%eax) -; X32-NEXT: retl +; X86-LABEL: test_mm_prefetch: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: prefetchnta (%eax) +; X86-NEXT: retl ; ; X64-LABEL: test_mm_prefetch: ; X64: # %bb.0: @@ -1173,90 +1483,115 @@ define void @test_mm_prefetch(i8* %a0) { declare void @llvm.prefetch(i8* nocapture, i32, i32, i32) nounwind readnone define <4 x float> @test_mm_rcp_ps(<4 x float> %a0) { -; X32-LABEL: test_mm_rcp_ps: -; X32: # %bb.0: -; X32-NEXT: rcpps %xmm0, %xmm0 -; X32-NEXT: retl -; -; X64-LABEL: test_mm_rcp_ps: -; X64: # %bb.0: -; X64-NEXT: rcpps %xmm0, %xmm0 -; X64-NEXT: retq +; SSE-LABEL: test_mm_rcp_ps: +; SSE: # %bb.0: +; SSE-NEXT: rcpps %xmm0, %xmm0 +; SSE-NEXT: ret{{[l|q]}} +; +; AVX-LABEL: test_mm_rcp_ps: +; AVX: # %bb.0: +; AVX-NEXT: vrcpps %xmm0, %xmm0 +; AVX-NEXT: ret{{[l|q]}} %res = call <4 x float> @llvm.x86.sse.rcp.ps(<4 x float> %a0) ret <4 x float> %res } declare <4 x float> @llvm.x86.sse.rcp.ps(<4 x float>) nounwind readnone define <4 x float> @test_mm_rcp_ss(<4 x float> %a0) { -; X32-LABEL: test_mm_rcp_ss: -; X32: # %bb.0: -; X32-NEXT: rcpss %xmm0, %xmm0 -; X32-NEXT: retl -; -; X64-LABEL: test_mm_rcp_ss: -; X64: # %bb.0: -; X64-NEXT: rcpss %xmm0, %xmm0 -; X64-NEXT: retq +; SSE-LABEL: test_mm_rcp_ss: +; SSE: # %bb.0: +; SSE-NEXT: rcpss %xmm0, %xmm0 +; SSE-NEXT: ret{{[l|q]}} +; +; AVX-LABEL: test_mm_rcp_ss: +; AVX: # %bb.0: +; AVX-NEXT: vrcpss %xmm0, %xmm0, %xmm0 +; AVX-NEXT: ret{{[l|q]}} %rcp = call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> %a0) ret <4 x float> %rcp } declare <4 x float> @llvm.x86.sse.rcp.ss(<4 x float>) nounwind readnone define <4 x float> @test_mm_rsqrt_ps(<4 x float> %a0) { -; X32-LABEL: test_mm_rsqrt_ps: -; X32: # %bb.0: -; X32-NEXT: rsqrtps %xmm0, %xmm0 -; X32-NEXT: retl -; -; X64-LABEL: test_mm_rsqrt_ps: -; X64: # %bb.0: -; X64-NEXT: rsqrtps %xmm0, %xmm0 -; X64-NEXT: retq +; SSE-LABEL: test_mm_rsqrt_ps: +; SSE: # %bb.0: +; SSE-NEXT: rsqrtps %xmm0, %xmm0 +; SSE-NEXT: ret{{[l|q]}} +; +; AVX-LABEL: test_mm_rsqrt_ps: +; AVX: # %bb.0: +; AVX-NEXT: vrsqrtps %xmm0, %xmm0 +; AVX-NEXT: ret{{[l|q]}} %res = call <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float> %a0) ret <4 x float> 
%res } declare <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float>) nounwind readnone define <4 x float> @test_mm_rsqrt_ss(<4 x float> %a0) { -; X32-LABEL: test_mm_rsqrt_ss: -; X32: # %bb.0: -; X32-NEXT: rsqrtss %xmm0, %xmm0 -; X32-NEXT: retl -; -; X64-LABEL: test_mm_rsqrt_ss: -; X64: # %bb.0: -; X64-NEXT: rsqrtss %xmm0, %xmm0 -; X64-NEXT: retq +; SSE-LABEL: test_mm_rsqrt_ss: +; SSE: # %bb.0: +; SSE-NEXT: rsqrtss %xmm0, %xmm0 +; SSE-NEXT: ret{{[l|q]}} +; +; AVX-LABEL: test_mm_rsqrt_ss: +; AVX: # %bb.0: +; AVX-NEXT: vrsqrtss %xmm0, %xmm0, %xmm0 +; AVX-NEXT: ret{{[l|q]}} %rsqrt = call <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float> %a0) ret <4 x float> %rsqrt } declare <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float>) nounwind readnone define void @test_MM_SET_EXCEPTION_MASK(i32 %a0) nounwind { -; X32-LABEL: test_MM_SET_EXCEPTION_MASK: -; X32: # %bb.0: -; X32-NEXT: pushl %eax -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movl %esp, %ecx -; X32-NEXT: stmxcsr (%ecx) -; X32-NEXT: movl (%esp), %edx -; X32-NEXT: andl $-8065, %edx # imm = 0xE07F -; X32-NEXT: orl %eax, %edx -; X32-NEXT: movl %edx, (%esp) -; X32-NEXT: ldmxcsr (%ecx) -; X32-NEXT: popl %eax -; X32-NEXT: retl -; -; X64-LABEL: test_MM_SET_EXCEPTION_MASK: -; X64: # %bb.0: -; X64-NEXT: leaq -{{[0-9]+}}(%rsp), %rax -; X64-NEXT: stmxcsr (%rax) -; X64-NEXT: movl -{{[0-9]+}}(%rsp), %ecx -; X64-NEXT: andl $-8065, %ecx # imm = 0xE07F -; X64-NEXT: orl %edi, %ecx -; X64-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) -; X64-NEXT: ldmxcsr (%rax) -; X64-NEXT: retq +; X86-SSE-LABEL: test_MM_SET_EXCEPTION_MASK: +; X86-SSE: # %bb.0: +; X86-SSE-NEXT: pushl %eax +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: movl %esp, %ecx +; X86-SSE-NEXT: stmxcsr (%ecx) +; X86-SSE-NEXT: movl (%esp), %edx +; X86-SSE-NEXT: andl $-8065, %edx # imm = 0xE07F +; X86-SSE-NEXT: orl %eax, %edx +; X86-SSE-NEXT: movl %edx, (%esp) +; X86-SSE-NEXT: ldmxcsr (%ecx) +; X86-SSE-NEXT: popl %eax +; X86-SSE-NEXT: retl +; +; X86-AVX-LABEL: test_MM_SET_EXCEPTION_MASK: +; X86-AVX: # %bb.0: +; X86-AVX-NEXT: pushl %eax +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX-NEXT: movl %esp, %ecx +; X86-AVX-NEXT: vstmxcsr (%ecx) +; X86-AVX-NEXT: movl (%esp), %edx +; X86-AVX-NEXT: andl $-8065, %edx # imm = 0xE07F +; X86-AVX-NEXT: orl %eax, %edx +; X86-AVX-NEXT: movl %edx, (%esp) +; X86-AVX-NEXT: vldmxcsr (%ecx) +; X86-AVX-NEXT: popl %eax +; X86-AVX-NEXT: retl +; +; X64-SSE-LABEL: test_MM_SET_EXCEPTION_MASK: +; X64-SSE: # %bb.0: +; X64-SSE-NEXT: leaq -{{[0-9]+}}(%rsp), %rax +; X64-SSE-NEXT: stmxcsr (%rax) +; X64-SSE-NEXT: movl -{{[0-9]+}}(%rsp), %ecx +; X64-SSE-NEXT: andl $-8065, %ecx # imm = 0xE07F +; X64-SSE-NEXT: orl %edi, %ecx +; X64-SSE-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) +; X64-SSE-NEXT: ldmxcsr (%rax) +; X64-SSE-NEXT: retq +; +; X64-AVX-LABEL: test_MM_SET_EXCEPTION_MASK: +; X64-AVX: # %bb.0: +; X64-AVX-NEXT: leaq -{{[0-9]+}}(%rsp), %rax +; X64-AVX-NEXT: vstmxcsr (%rax) +; X64-AVX-NEXT: movl -{{[0-9]+}}(%rsp), %ecx +; X64-AVX-NEXT: andl $-8065, %ecx # imm = 0xE07F +; X64-AVX-NEXT: orl %edi, %ecx +; X64-AVX-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) +; X64-AVX-NEXT: vldmxcsr (%rax) +; X64-AVX-NEXT: retq %1 = alloca i32, align 4 %2 = bitcast i32* %1 to i8* call void @llvm.x86.sse.stmxcsr(i8* %2) @@ -1270,30 +1605,55 @@ define void @test_MM_SET_EXCEPTION_MASK(i32 %a0) nounwind { declare void @llvm.x86.sse.ldmxcsr(i8*) nounwind readnone define void @test_MM_SET_EXCEPTION_STATE(i32 %a0) nounwind { -; X32-LABEL: test_MM_SET_EXCEPTION_STATE: -; X32: # %bb.0: -; X32-NEXT: pushl %eax -; X32-NEXT: movl 
{{[0-9]+}}(%esp), %eax -; X32-NEXT: movl %esp, %ecx -; X32-NEXT: stmxcsr (%ecx) -; X32-NEXT: movl (%esp), %edx -; X32-NEXT: andl $-64, %edx -; X32-NEXT: orl %eax, %edx -; X32-NEXT: movl %edx, (%esp) -; X32-NEXT: ldmxcsr (%ecx) -; X32-NEXT: popl %eax -; X32-NEXT: retl -; -; X64-LABEL: test_MM_SET_EXCEPTION_STATE: -; X64: # %bb.0: -; X64-NEXT: leaq -{{[0-9]+}}(%rsp), %rax -; X64-NEXT: stmxcsr (%rax) -; X64-NEXT: movl -{{[0-9]+}}(%rsp), %ecx -; X64-NEXT: andl $-64, %ecx -; X64-NEXT: orl %edi, %ecx -; X64-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) -; X64-NEXT: ldmxcsr (%rax) -; X64-NEXT: retq +; X86-SSE-LABEL: test_MM_SET_EXCEPTION_STATE: +; X86-SSE: # %bb.0: +; X86-SSE-NEXT: pushl %eax +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: movl %esp, %ecx +; X86-SSE-NEXT: stmxcsr (%ecx) +; X86-SSE-NEXT: movl (%esp), %edx +; X86-SSE-NEXT: andl $-64, %edx +; X86-SSE-NEXT: orl %eax, %edx +; X86-SSE-NEXT: movl %edx, (%esp) +; X86-SSE-NEXT: ldmxcsr (%ecx) +; X86-SSE-NEXT: popl %eax +; X86-SSE-NEXT: retl +; +; X86-AVX-LABEL: test_MM_SET_EXCEPTION_STATE: +; X86-AVX: # %bb.0: +; X86-AVX-NEXT: pushl %eax +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX-NEXT: movl %esp, %ecx +; X86-AVX-NEXT: vstmxcsr (%ecx) +; X86-AVX-NEXT: movl (%esp), %edx +; X86-AVX-NEXT: andl $-64, %edx +; X86-AVX-NEXT: orl %eax, %edx +; X86-AVX-NEXT: movl %edx, (%esp) +; X86-AVX-NEXT: vldmxcsr (%ecx) +; X86-AVX-NEXT: popl %eax +; X86-AVX-NEXT: retl +; +; X64-SSE-LABEL: test_MM_SET_EXCEPTION_STATE: +; X64-SSE: # %bb.0: +; X64-SSE-NEXT: leaq -{{[0-9]+}}(%rsp), %rax +; X64-SSE-NEXT: stmxcsr (%rax) +; X64-SSE-NEXT: movl -{{[0-9]+}}(%rsp), %ecx +; X64-SSE-NEXT: andl $-64, %ecx +; X64-SSE-NEXT: orl %edi, %ecx +; X64-SSE-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) +; X64-SSE-NEXT: ldmxcsr (%rax) +; X64-SSE-NEXT: retq +; +; X64-AVX-LABEL: test_MM_SET_EXCEPTION_STATE: +; X64-AVX: # %bb.0: +; X64-AVX-NEXT: leaq -{{[0-9]+}}(%rsp), %rax +; X64-AVX-NEXT: vstmxcsr (%rax) +; X64-AVX-NEXT: movl -{{[0-9]+}}(%rsp), %ecx +; X64-AVX-NEXT: andl $-64, %ecx +; X64-AVX-NEXT: orl %edi, %ecx +; X64-AVX-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) +; X64-AVX-NEXT: vldmxcsr (%rax) +; X64-AVX-NEXT: retq %1 = alloca i32, align 4 %2 = bitcast i32* %1 to i8* call void @llvm.x86.sse.stmxcsr(i8* %2) @@ -1306,30 +1666,55 @@ define void @test_MM_SET_EXCEPTION_STATE(i32 %a0) nounwind { } define void @test_MM_SET_FLUSH_ZERO_MODE(i32 %a0) nounwind { -; X32-LABEL: test_MM_SET_FLUSH_ZERO_MODE: -; X32: # %bb.0: -; X32-NEXT: pushl %eax -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movl %esp, %ecx -; X32-NEXT: stmxcsr (%ecx) -; X32-NEXT: movl (%esp), %edx -; X32-NEXT: andl $-32769, %edx # imm = 0xFFFF7FFF -; X32-NEXT: orl %eax, %edx -; X32-NEXT: movl %edx, (%esp) -; X32-NEXT: ldmxcsr (%ecx) -; X32-NEXT: popl %eax -; X32-NEXT: retl -; -; X64-LABEL: test_MM_SET_FLUSH_ZERO_MODE: -; X64: # %bb.0: -; X64-NEXT: leaq -{{[0-9]+}}(%rsp), %rax -; X64-NEXT: stmxcsr (%rax) -; X64-NEXT: movl -{{[0-9]+}}(%rsp), %ecx -; X64-NEXT: andl $-32769, %ecx # imm = 0xFFFF7FFF -; X64-NEXT: orl %edi, %ecx -; X64-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) -; X64-NEXT: ldmxcsr (%rax) -; X64-NEXT: retq +; X86-SSE-LABEL: test_MM_SET_FLUSH_ZERO_MODE: +; X86-SSE: # %bb.0: +; X86-SSE-NEXT: pushl %eax +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: movl %esp, %ecx +; X86-SSE-NEXT: stmxcsr (%ecx) +; X86-SSE-NEXT: movl (%esp), %edx +; X86-SSE-NEXT: andl $-32769, %edx # imm = 0xFFFF7FFF +; X86-SSE-NEXT: orl %eax, %edx +; X86-SSE-NEXT: movl %edx, (%esp) +; X86-SSE-NEXT: ldmxcsr (%ecx) +; X86-SSE-NEXT: popl 
%eax +; X86-SSE-NEXT: retl +; +; X86-AVX-LABEL: test_MM_SET_FLUSH_ZERO_MODE: +; X86-AVX: # %bb.0: +; X86-AVX-NEXT: pushl %eax +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX-NEXT: movl %esp, %ecx +; X86-AVX-NEXT: vstmxcsr (%ecx) +; X86-AVX-NEXT: movl (%esp), %edx +; X86-AVX-NEXT: andl $-32769, %edx # imm = 0xFFFF7FFF +; X86-AVX-NEXT: orl %eax, %edx +; X86-AVX-NEXT: movl %edx, (%esp) +; X86-AVX-NEXT: vldmxcsr (%ecx) +; X86-AVX-NEXT: popl %eax +; X86-AVX-NEXT: retl +; +; X64-SSE-LABEL: test_MM_SET_FLUSH_ZERO_MODE: +; X64-SSE: # %bb.0: +; X64-SSE-NEXT: leaq -{{[0-9]+}}(%rsp), %rax +; X64-SSE-NEXT: stmxcsr (%rax) +; X64-SSE-NEXT: movl -{{[0-9]+}}(%rsp), %ecx +; X64-SSE-NEXT: andl $-32769, %ecx # imm = 0xFFFF7FFF +; X64-SSE-NEXT: orl %edi, %ecx +; X64-SSE-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) +; X64-SSE-NEXT: ldmxcsr (%rax) +; X64-SSE-NEXT: retq +; +; X64-AVX-LABEL: test_MM_SET_FLUSH_ZERO_MODE: +; X64-AVX: # %bb.0: +; X64-AVX-NEXT: leaq -{{[0-9]+}}(%rsp), %rax +; X64-AVX-NEXT: vstmxcsr (%rax) +; X64-AVX-NEXT: movl -{{[0-9]+}}(%rsp), %ecx +; X64-AVX-NEXT: andl $-32769, %ecx # imm = 0xFFFF7FFF +; X64-AVX-NEXT: orl %edi, %ecx +; X64-AVX-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) +; X64-AVX-NEXT: vldmxcsr (%rax) +; X64-AVX-NEXT: retq %1 = alloca i32, align 4 %2 = bitcast i32* %1 to i8* call void @llvm.x86.sse.stmxcsr(i8* %2) @@ -1342,24 +1727,42 @@ define void @test_MM_SET_FLUSH_ZERO_MODE(i32 %a0) nounwind { } define <4 x float> @test_mm_set_ps(float %a0, float %a1, float %a2, float %a3) nounwind { -; X32-LABEL: test_mm_set_ps: -; X32: # %bb.0: -; X32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; X32-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; X32-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero -; X32-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; X32-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; X32-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; X32-NEXT: retl -; -; X64-LABEL: test_mm_set_ps: -; X64: # %bb.0: -; X64-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; X64-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; X64-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm1[0] -; X64-NEXT: movaps %xmm3, %xmm0 -; X64-NEXT: retq +; X86-SSE-LABEL: test_mm_set_ps: +; X86-SSE: # %bb.0: +; X86-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X86-SSE-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; X86-SSE-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero +; X86-SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; X86-SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; X86-SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; X86-SSE-NEXT: retl +; +; X86-AVX-LABEL: test_mm_set_ps: +; X86-AVX: # %bb.0: +; X86-AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X86-AVX-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; X86-AVX-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero +; X86-AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] +; X86-AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3] +; X86-AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[0] +; X86-AVX-NEXT: retl +; +; X64-SSE-LABEL: test_mm_set_ps: +; X64-SSE: # %bb.0: +; X64-SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; X64-SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] 
+; X64-SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm1[0] +; X64-SSE-NEXT: movaps %xmm3, %xmm0 +; X64-SSE-NEXT: retq +; +; X64-AVX-LABEL: test_mm_set_ps: +; X64-AVX: # %bb.0: +; X64-AVX-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3] +; X64-AVX-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3] +; X64-AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; X64-AVX-NEXT: retq %res0 = insertelement <4 x float> undef, float %a3, i32 0 %res1 = insertelement <4 x float> %res0, float %a2, i32 1 %res2 = insertelement <4 x float> %res1, float %a1, i32 2 @@ -1368,16 +1771,38 @@ define <4 x float> @test_mm_set_ps(float %a0, float %a1, float %a2, float %a3) n } define <4 x float> @test_mm_set_ps1(float %a0) nounwind { -; X32-LABEL: test_mm_set_ps1: -; X32: # %bb.0: -; X32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,0] -; X32-NEXT: retl -; -; X64-LABEL: test_mm_set_ps1: -; X64: # %bb.0: -; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,0] -; X64-NEXT: retq +; X86-SSE-LABEL: test_mm_set_ps1: +; X86-SSE: # %bb.0: +; X86-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,0] +; X86-SSE-NEXT: retl +; +; X86-AVX1-LABEL: test_mm_set_ps1: +; X86-AVX1: # %bb.0: +; X86-AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0] +; X86-AVX1-NEXT: retl +; +; X86-AVX512-LABEL: test_mm_set_ps1: +; X86-AVX512: # %bb.0: +; X86-AVX512-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-AVX512-NEXT: vbroadcastss %xmm0, %xmm0 +; X86-AVX512-NEXT: retl +; +; X64-SSE-LABEL: test_mm_set_ps1: +; X64-SSE: # %bb.0: +; X64-SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,0] +; X64-SSE-NEXT: retq +; +; X64-AVX1-LABEL: test_mm_set_ps1: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0] +; X64-AVX1-NEXT: retq +; +; X64-AVX512-LABEL: test_mm_set_ps1: +; X64-AVX512: # %bb.0: +; X64-AVX512-NEXT: vbroadcastss %xmm0, %xmm0 +; X64-AVX512-NEXT: retq %res0 = insertelement <4 x float> undef, float %a0, i32 0 %res1 = insertelement <4 x float> %res0, float %a0, i32 1 %res2 = insertelement <4 x float> %res1, float %a0, i32 2 @@ -1386,30 +1811,55 @@ define <4 x float> @test_mm_set_ps1(float %a0) nounwind { } define void @test_MM_SET_ROUNDING_MODE(i32 %a0) nounwind { -; X32-LABEL: test_MM_SET_ROUNDING_MODE: -; X32: # %bb.0: -; X32-NEXT: pushl %eax -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movl %esp, %ecx -; X32-NEXT: stmxcsr (%ecx) -; X32-NEXT: movl (%esp), %edx -; X32-NEXT: andl $-24577, %edx # imm = 0x9FFF -; X32-NEXT: orl %eax, %edx -; X32-NEXT: movl %edx, (%esp) -; X32-NEXT: ldmxcsr (%ecx) -; X32-NEXT: popl %eax -; X32-NEXT: retl -; -; X64-LABEL: test_MM_SET_ROUNDING_MODE: -; X64: # %bb.0: -; X64-NEXT: leaq -{{[0-9]+}}(%rsp), %rax -; X64-NEXT: stmxcsr (%rax) -; X64-NEXT: movl -{{[0-9]+}}(%rsp), %ecx -; X64-NEXT: andl $-24577, %ecx # imm = 0x9FFF -; X64-NEXT: orl %edi, %ecx -; X64-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) -; X64-NEXT: ldmxcsr (%rax) -; X64-NEXT: retq +; X86-SSE-LABEL: test_MM_SET_ROUNDING_MODE: +; X86-SSE: # %bb.0: +; X86-SSE-NEXT: pushl %eax +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: movl %esp, %ecx +; X86-SSE-NEXT: stmxcsr (%ecx) +; X86-SSE-NEXT: movl (%esp), %edx +; X86-SSE-NEXT: andl $-24577, %edx # imm = 0x9FFF +; X86-SSE-NEXT: orl %eax, %edx +; X86-SSE-NEXT: movl %edx, (%esp) +; X86-SSE-NEXT: ldmxcsr (%ecx) +; X86-SSE-NEXT: popl %eax +; X86-SSE-NEXT: retl +; +; X86-AVX-LABEL: 
test_MM_SET_ROUNDING_MODE: +; X86-AVX: # %bb.0: +; X86-AVX-NEXT: pushl %eax +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX-NEXT: movl %esp, %ecx +; X86-AVX-NEXT: vstmxcsr (%ecx) +; X86-AVX-NEXT: movl (%esp), %edx +; X86-AVX-NEXT: andl $-24577, %edx # imm = 0x9FFF +; X86-AVX-NEXT: orl %eax, %edx +; X86-AVX-NEXT: movl %edx, (%esp) +; X86-AVX-NEXT: vldmxcsr (%ecx) +; X86-AVX-NEXT: popl %eax +; X86-AVX-NEXT: retl +; +; X64-SSE-LABEL: test_MM_SET_ROUNDING_MODE: +; X64-SSE: # %bb.0: +; X64-SSE-NEXT: leaq -{{[0-9]+}}(%rsp), %rax +; X64-SSE-NEXT: stmxcsr (%rax) +; X64-SSE-NEXT: movl -{{[0-9]+}}(%rsp), %ecx +; X64-SSE-NEXT: andl $-24577, %ecx # imm = 0x9FFF +; X64-SSE-NEXT: orl %edi, %ecx +; X64-SSE-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) +; X64-SSE-NEXT: ldmxcsr (%rax) +; X64-SSE-NEXT: retq +; +; X64-AVX-LABEL: test_MM_SET_ROUNDING_MODE: +; X64-AVX: # %bb.0: +; X64-AVX-NEXT: leaq -{{[0-9]+}}(%rsp), %rax +; X64-AVX-NEXT: vstmxcsr (%rax) +; X64-AVX-NEXT: movl -{{[0-9]+}}(%rsp), %ecx +; X64-AVX-NEXT: andl $-24577, %ecx # imm = 0x9FFF +; X64-AVX-NEXT: orl %edi, %ecx +; X64-AVX-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) +; X64-AVX-NEXT: vldmxcsr (%rax) +; X64-AVX-NEXT: retq %1 = alloca i32, align 4 %2 = bitcast i32* %1 to i8* call void @llvm.x86.sse.stmxcsr(i8* %2) @@ -1422,19 +1872,45 @@ define void @test_MM_SET_ROUNDING_MODE(i32 %a0) nounwind { } define <4 x float> @test_mm_set_ss(float %a0) nounwind { -; X32-LABEL: test_mm_set_ss: -; X32: # %bb.0: -; X32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; X32-NEXT: xorps %xmm0, %xmm0 -; X32-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] -; X32-NEXT: retl -; -; X64-LABEL: test_mm_set_ss: -; X64: # %bb.0: -; X64-NEXT: xorps %xmm1, %xmm1 -; X64-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] -; X64-NEXT: movaps %xmm1, %xmm0 -; X64-NEXT: retq +; X86-SSE-LABEL: test_mm_set_ss: +; X86-SSE: # %bb.0: +; X86-SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X86-SSE-NEXT: xorps %xmm0, %xmm0 +; X86-SSE-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] +; X86-SSE-NEXT: retl +; +; X86-AVX1-LABEL: test_mm_set_ss: +; X86-AVX1: # %bb.0: +; X86-AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; X86-AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] +; X86-AVX1-NEXT: retl +; +; X86-AVX512-LABEL: test_mm_set_ss: +; X86-AVX512: # %bb.0: +; X86-AVX512-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; X86-AVX512-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] +; X86-AVX512-NEXT: retl +; +; X64-SSE-LABEL: test_mm_set_ss: +; X64-SSE: # %bb.0: +; X64-SSE-NEXT: xorps %xmm1, %xmm1 +; X64-SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] +; X64-SSE-NEXT: movaps %xmm1, %xmm0 +; X64-SSE-NEXT: retq +; +; X64-AVX1-LABEL: test_mm_set_ss: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; X64-AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] +; X64-AVX1-NEXT: retq +; +; X64-AVX512-LABEL: test_mm_set_ss: +; X64-AVX512: # %bb.0: +; X64-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; X64-AVX512-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] +; X64-AVX512-NEXT: retq %res0 = insertelement <4 x float> undef, float %a0, i32 0 %res1 = insertelement <4 x float> %res0, float 0.0, i32 1 %res2 = insertelement <4 x float> %res1, float 0.0, i32 2 @@ -1443,16 +1919,38 @@ define <4 x float> @test_mm_set_ss(float %a0) nounwind { } define <4 x float> @test_mm_set1_ps(float %a0) nounwind { -; X32-LABEL: test_mm_set1_ps: -; X32: # %bb.0: -; X32-NEXT: movss {{.*#+}} 
xmm0 = mem[0],zero,zero,zero -; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,0] -; X32-NEXT: retl -; -; X64-LABEL: test_mm_set1_ps: -; X64: # %bb.0: -; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,0] -; X64-NEXT: retq +; X86-SSE-LABEL: test_mm_set1_ps: +; X86-SSE: # %bb.0: +; X86-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,0] +; X86-SSE-NEXT: retl +; +; X86-AVX1-LABEL: test_mm_set1_ps: +; X86-AVX1: # %bb.0: +; X86-AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0] +; X86-AVX1-NEXT: retl +; +; X86-AVX512-LABEL: test_mm_set1_ps: +; X86-AVX512: # %bb.0: +; X86-AVX512-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-AVX512-NEXT: vbroadcastss %xmm0, %xmm0 +; X86-AVX512-NEXT: retl +; +; X64-SSE-LABEL: test_mm_set1_ps: +; X64-SSE: # %bb.0: +; X64-SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,0] +; X64-SSE-NEXT: retq +; +; X64-AVX1-LABEL: test_mm_set1_ps: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0] +; X64-AVX1-NEXT: retq +; +; X64-AVX512-LABEL: test_mm_set1_ps: +; X64-AVX512: # %bb.0: +; X64-AVX512-NEXT: vbroadcastss %xmm0, %xmm0 +; X64-AVX512-NEXT: retq %res0 = insertelement <4 x float> undef, float %a0, i32 0 %res1 = insertelement <4 x float> %res0, float %a0, i32 1 %res2 = insertelement <4 x float> %res1, float %a0, i32 2 @@ -1461,18 +1959,31 @@ define <4 x float> @test_mm_set1_ps(float %a0) nounwind { } define void @test_mm_setcsr(i32 %a0) nounwind { -; X32-LABEL: test_mm_setcsr: -; X32: # %bb.0: -; X32-NEXT: leal {{[0-9]+}}(%esp), %eax -; X32-NEXT: ldmxcsr (%eax) -; X32-NEXT: retl -; -; X64-LABEL: test_mm_setcsr: -; X64: # %bb.0: -; X64-NEXT: movl %edi, -{{[0-9]+}}(%rsp) -; X64-NEXT: leaq -{{[0-9]+}}(%rsp), %rax -; X64-NEXT: ldmxcsr (%rax) -; X64-NEXT: retq +; X86-SSE-LABEL: test_mm_setcsr: +; X86-SSE: # %bb.0: +; X86-SSE-NEXT: leal {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: ldmxcsr (%eax) +; X86-SSE-NEXT: retl +; +; X86-AVX-LABEL: test_mm_setcsr: +; X86-AVX: # %bb.0: +; X86-AVX-NEXT: leal {{[0-9]+}}(%esp), %eax +; X86-AVX-NEXT: vldmxcsr (%eax) +; X86-AVX-NEXT: retl +; +; X64-SSE-LABEL: test_mm_setcsr: +; X64-SSE: # %bb.0: +; X64-SSE-NEXT: movl %edi, -{{[0-9]+}}(%rsp) +; X64-SSE-NEXT: leaq -{{[0-9]+}}(%rsp), %rax +; X64-SSE-NEXT: ldmxcsr (%rax) +; X64-SSE-NEXT: retq +; +; X64-AVX-LABEL: test_mm_setcsr: +; X64-AVX: # %bb.0: +; X64-AVX-NEXT: movl %edi, -{{[0-9]+}}(%rsp) +; X64-AVX-NEXT: leaq -{{[0-9]+}}(%rsp), %rax +; X64-AVX-NEXT: vldmxcsr (%rax) +; X64-AVX-NEXT: retq %st = alloca i32, align 4 store i32 %a0, i32* %st, align 4 %bc = bitcast i32* %st to i8* @@ -1481,23 +1992,41 @@ define void @test_mm_setcsr(i32 %a0) nounwind { } define <4 x float> @test_mm_setr_ps(float %a0, float %a1, float %a2, float %a3) nounwind { -; X32-LABEL: test_mm_setr_ps: -; X32: # %bb.0: -; X32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; X32-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; X32-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero -; X32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X32-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; X32-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; X32-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; X32-NEXT: retl -; -; X64-LABEL: test_mm_setr_ps: -; X64: # %bb.0: -; X64-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; X64-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; X64-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] 
-; X64-NEXT: retq +; X86-SSE-LABEL: test_mm_setr_ps: +; X86-SSE: # %bb.0: +; X86-SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X86-SSE-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; X86-SSE-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero +; X86-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; X86-SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; X86-SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; X86-SSE-NEXT: retl +; +; X86-AVX-LABEL: test_mm_setr_ps: +; X86-AVX: # %bb.0: +; X86-AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X86-AVX-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; X86-AVX-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero +; X86-AVX-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3] +; X86-AVX-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3] +; X86-AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; X86-AVX-NEXT: retl +; +; X64-SSE-LABEL: test_mm_setr_ps: +; X64-SSE: # %bb.0: +; X64-SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; X64-SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; X64-SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; X64-SSE-NEXT: retq +; +; X64-AVX-LABEL: test_mm_setr_ps: +; X64-AVX: # %bb.0: +; X64-AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] +; X64-AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3] +; X64-AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[0] +; X64-AVX-NEXT: retq %res0 = insertelement <4 x float> undef, float %a0, i32 0 %res1 = insertelement <4 x float> %res0, float %a1, i32 1 %res2 = insertelement <4 x float> %res1, float %a2, i32 2 @@ -1506,106 +2035,138 @@ define <4 x float> @test_mm_setr_ps(float %a0, float %a1, float %a2, float %a3) } define <4 x float> @test_mm_setzero_ps() { -; X32-LABEL: test_mm_setzero_ps: -; X32: # %bb.0: -; X32-NEXT: xorps %xmm0, %xmm0 -; X32-NEXT: retl -; -; X64-LABEL: test_mm_setzero_ps: -; X64: # %bb.0: -; X64-NEXT: xorps %xmm0, %xmm0 -; X64-NEXT: retq +; SSE-LABEL: test_mm_setzero_ps: +; SSE: # %bb.0: +; SSE-NEXT: xorps %xmm0, %xmm0 +; SSE-NEXT: ret{{[l|q]}} +; +; AVX-LABEL: test_mm_setzero_ps: +; AVX: # %bb.0: +; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX-NEXT: ret{{[l|q]}} ret <4 x float> zeroinitializer } define void @test_mm_sfence() nounwind { -; X32-LABEL: test_mm_sfence: -; X32: # %bb.0: -; X32-NEXT: sfence -; X32-NEXT: retl -; -; X64-LABEL: test_mm_sfence: -; X64: # %bb.0: -; X64-NEXT: sfence -; X64-NEXT: retq +; CHECK-LABEL: test_mm_sfence: +; CHECK: # %bb.0: +; CHECK-NEXT: sfence +; CHECK-NEXT: ret{{[l|q]}} call void @llvm.x86.sse.sfence() ret void } declare void @llvm.x86.sse.sfence() nounwind readnone define <4 x float> @test_mm_shuffle_ps(<4 x float> %a0, <4 x float> %a1) nounwind { -; X32-LABEL: test_mm_shuffle_ps: -; X32: # %bb.0: -; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0] -; X32-NEXT: retl -; -; X64-LABEL: test_mm_shuffle_ps: -; X64: # %bb.0: -; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0] -; X64-NEXT: retq +; SSE-LABEL: test_mm_shuffle_ps: +; SSE: # %bb.0: +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0] +; SSE-NEXT: ret{{[l|q]}} +; +; AVX-LABEL: test_mm_shuffle_ps: +; AVX: # %bb.0: +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0] +; AVX-NEXT: ret{{[l|q]}} %res = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 0, i32 0, i32 4, i32 4> ret 
<4 x float> %res } define <4 x float> @test_mm_sqrt_ps(<4 x float> %a0) { -; X32-LABEL: test_mm_sqrt_ps: -; X32: # %bb.0: -; X32-NEXT: sqrtps %xmm0, %xmm0 -; X32-NEXT: retl -; -; X64-LABEL: test_mm_sqrt_ps: -; X64: # %bb.0: -; X64-NEXT: sqrtps %xmm0, %xmm0 -; X64-NEXT: retq +; SSE-LABEL: test_mm_sqrt_ps: +; SSE: # %bb.0: +; SSE-NEXT: sqrtps %xmm0, %xmm0 +; SSE-NEXT: ret{{[l|q]}} +; +; AVX-LABEL: test_mm_sqrt_ps: +; AVX: # %bb.0: +; AVX-NEXT: vsqrtps %xmm0, %xmm0 +; AVX-NEXT: ret{{[l|q]}} %res = call <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float> %a0) ret <4 x float> %res } declare <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float>) nounwind readnone define <4 x float> @test_mm_sqrt_ss(<4 x float> %a0) { -; X32-LABEL: test_mm_sqrt_ss: -; X32: # %bb.0: -; X32-NEXT: sqrtss %xmm0, %xmm0 -; X32-NEXT: retl -; -; X64-LABEL: test_mm_sqrt_ss: -; X64: # %bb.0: -; X64-NEXT: sqrtss %xmm0, %xmm0 -; X64-NEXT: retq +; SSE-LABEL: test_mm_sqrt_ss: +; SSE: # %bb.0: +; SSE-NEXT: sqrtss %xmm0, %xmm0 +; SSE-NEXT: ret{{[l|q]}} +; +; AVX-LABEL: test_mm_sqrt_ss: +; AVX: # %bb.0: +; AVX-NEXT: vsqrtss %xmm0, %xmm0, %xmm0 +; AVX-NEXT: ret{{[l|q]}} %sqrt = call <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float> %a0) ret <4 x float> %sqrt } declare <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float>) nounwind readnone define void @test_mm_store_ps(float *%a0, <4 x float> %a1) { -; X32-LABEL: test_mm_store_ps: -; X32: # %bb.0: -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movaps %xmm0, (%eax) -; X32-NEXT: retl -; -; X64-LABEL: test_mm_store_ps: -; X64: # %bb.0: -; X64-NEXT: movaps %xmm0, (%rdi) -; X64-NEXT: retq +; X86-SSE-LABEL: test_mm_store_ps: +; X86-SSE: # %bb.0: +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: movaps %xmm0, (%eax) +; X86-SSE-NEXT: retl +; +; X86-AVX-LABEL: test_mm_store_ps: +; X86-AVX: # %bb.0: +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX-NEXT: vmovaps %xmm0, (%eax) +; X86-AVX-NEXT: retl +; +; X64-SSE-LABEL: test_mm_store_ps: +; X64-SSE: # %bb.0: +; X64-SSE-NEXT: movaps %xmm0, (%rdi) +; X64-SSE-NEXT: retq +; +; X64-AVX-LABEL: test_mm_store_ps: +; X64-AVX: # %bb.0: +; X64-AVX-NEXT: vmovaps %xmm0, (%rdi) +; X64-AVX-NEXT: retq %arg0 = bitcast float* %a0 to <4 x float>* store <4 x float> %a1, <4 x float>* %arg0, align 16 ret void } define void @test_mm_store_ps1(float *%a0, <4 x float> %a1) { -; X32-LABEL: test_mm_store_ps1: -; X32: # %bb.0: -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,0] -; X32-NEXT: movaps %xmm0, (%eax) -; X32-NEXT: retl -; -; X64-LABEL: test_mm_store_ps1: -; X64: # %bb.0: -; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,0] -; X64-NEXT: movaps %xmm0, (%rdi) -; X64-NEXT: retq +; X86-SSE-LABEL: test_mm_store_ps1: +; X86-SSE: # %bb.0: +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,0] +; X86-SSE-NEXT: movaps %xmm0, (%eax) +; X86-SSE-NEXT: retl +; +; X86-AVX1-LABEL: test_mm_store_ps1: +; X86-AVX1: # %bb.0: +; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0] +; X86-AVX1-NEXT: vmovaps %xmm0, (%eax) +; X86-AVX1-NEXT: retl +; +; X86-AVX512-LABEL: test_mm_store_ps1: +; X86-AVX512: # %bb.0: +; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX512-NEXT: vbroadcastss %xmm0, %xmm0 +; X86-AVX512-NEXT: vmovaps %xmm0, (%eax) +; X86-AVX512-NEXT: retl +; +; X64-SSE-LABEL: test_mm_store_ps1: +; X64-SSE: # %bb.0: +; X64-SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,0] +; X64-SSE-NEXT: movaps %xmm0, (%rdi) +; X64-SSE-NEXT: retq +; +; 
X64-AVX1-LABEL: test_mm_store_ps1: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0] +; X64-AVX1-NEXT: vmovaps %xmm0, (%rdi) +; X64-AVX1-NEXT: retq +; +; X64-AVX512-LABEL: test_mm_store_ps1: +; X64-AVX512: # %bb.0: +; X64-AVX512-NEXT: vbroadcastss %xmm0, %xmm0 +; X64-AVX512-NEXT: vmovaps %xmm0, (%rdi) +; X64-AVX512-NEXT: retq %arg0 = bitcast float* %a0 to <4 x float>* %shuf = shufflevector <4 x float> %a1, <4 x float> undef, <4 x i32> zeroinitializer store <4 x float> %shuf, <4 x float>* %arg0, align 16 @@ -1613,34 +2174,71 @@ define void @test_mm_store_ps1(float *%a0, <4 x float> %a1) { } define void @test_mm_store_ss(float *%a0, <4 x float> %a1) { -; X32-LABEL: test_mm_store_ss: -; X32: # %bb.0: -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movss %xmm0, (%eax) -; X32-NEXT: retl -; -; X64-LABEL: test_mm_store_ss: -; X64: # %bb.0: -; X64-NEXT: movss %xmm0, (%rdi) -; X64-NEXT: retq +; X86-SSE-LABEL: test_mm_store_ss: +; X86-SSE: # %bb.0: +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: movss %xmm0, (%eax) +; X86-SSE-NEXT: retl +; +; X86-AVX-LABEL: test_mm_store_ss: +; X86-AVX: # %bb.0: +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX-NEXT: vmovss %xmm0, (%eax) +; X86-AVX-NEXT: retl +; +; X64-SSE-LABEL: test_mm_store_ss: +; X64-SSE: # %bb.0: +; X64-SSE-NEXT: movss %xmm0, (%rdi) +; X64-SSE-NEXT: retq +; +; X64-AVX-LABEL: test_mm_store_ss: +; X64-AVX: # %bb.0: +; X64-AVX-NEXT: vmovss %xmm0, (%rdi) +; X64-AVX-NEXT: retq %ext = extractelement <4 x float> %a1, i32 0 store float %ext, float* %a0, align 1 ret void } define void @test_mm_store1_ps(float *%a0, <4 x float> %a1) { -; X32-LABEL: test_mm_store1_ps: -; X32: # %bb.0: -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,0] -; X32-NEXT: movaps %xmm0, (%eax) -; X32-NEXT: retl -; -; X64-LABEL: test_mm_store1_ps: -; X64: # %bb.0: -; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,0] -; X64-NEXT: movaps %xmm0, (%rdi) -; X64-NEXT: retq +; X86-SSE-LABEL: test_mm_store1_ps: +; X86-SSE: # %bb.0: +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,0] +; X86-SSE-NEXT: movaps %xmm0, (%eax) +; X86-SSE-NEXT: retl +; +; X86-AVX1-LABEL: test_mm_store1_ps: +; X86-AVX1: # %bb.0: +; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0] +; X86-AVX1-NEXT: vmovaps %xmm0, (%eax) +; X86-AVX1-NEXT: retl +; +; X86-AVX512-LABEL: test_mm_store1_ps: +; X86-AVX512: # %bb.0: +; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX512-NEXT: vbroadcastss %xmm0, %xmm0 +; X86-AVX512-NEXT: vmovaps %xmm0, (%eax) +; X86-AVX512-NEXT: retl +; +; X64-SSE-LABEL: test_mm_store1_ps: +; X64-SSE: # %bb.0: +; X64-SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,0] +; X64-SSE-NEXT: movaps %xmm0, (%rdi) +; X64-SSE-NEXT: retq +; +; X64-AVX1-LABEL: test_mm_store1_ps: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0] +; X64-AVX1-NEXT: vmovaps %xmm0, (%rdi) +; X64-AVX1-NEXT: retq +; +; X64-AVX512-LABEL: test_mm_store1_ps: +; X64-AVX512: # %bb.0: +; X64-AVX512-NEXT: vbroadcastss %xmm0, %xmm0 +; X64-AVX512-NEXT: vmovaps %xmm0, (%rdi) +; X64-AVX512-NEXT: retq %arg0 = bitcast float* %a0 to <4 x float>* %shuf = shufflevector <4 x float> %a1, <4 x float> undef, <4 x i32> zeroinitializer store <4 x float> %shuf, <4 x float>* %arg0, align 16 @@ -1648,28 +2246,40 @@ define void @test_mm_store1_ps(float *%a0, <4 x float> %a1) { } define void @test_mm_storeh_ps(x86_mmx *%a0, <4 x float> %a1) nounwind 
{ -; X32-LABEL: test_mm_storeh_ps: -; X32: # %bb.0: -; X32-NEXT: pushl %ebp -; X32-NEXT: movl %esp, %ebp -; X32-NEXT: andl $-16, %esp -; X32-NEXT: subl $32, %esp -; X32-NEXT: movl 8(%ebp), %eax -; X32-NEXT: movaps %xmm0, (%esp) -; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: movl {{[0-9]+}}(%esp), %edx -; X32-NEXT: movl %edx, 4(%eax) -; X32-NEXT: movl %ecx, (%eax) -; X32-NEXT: movl %ebp, %esp -; X32-NEXT: popl %ebp -; X32-NEXT: retl -; -; X64-LABEL: test_mm_storeh_ps: -; X64: # %bb.0: -; X64-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; X64-NEXT: movq %rax, (%rdi) -; X64-NEXT: retq +; X86-SSE-LABEL: test_mm_storeh_ps: +; X86-SSE: # %bb.0: +; X86-SSE-NEXT: pushl %ebp +; X86-SSE-NEXT: movl %esp, %ebp +; X86-SSE-NEXT: andl $-16, %esp +; X86-SSE-NEXT: subl $32, %esp +; X86-SSE-NEXT: movl 8(%ebp), %eax +; X86-SSE-NEXT: movaps %xmm0, (%esp) +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-SSE-NEXT: movl %edx, 4(%eax) +; X86-SSE-NEXT: movl %ecx, (%eax) +; X86-SSE-NEXT: movl %ebp, %esp +; X86-SSE-NEXT: popl %ebp +; X86-SSE-NEXT: retl +; +; X86-AVX-LABEL: test_mm_storeh_ps: +; X86-AVX: # %bb.0: +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX-NEXT: vmovhpd %xmm0, (%eax) +; X86-AVX-NEXT: retl +; +; X64-SSE-LABEL: test_mm_storeh_ps: +; X64-SSE: # %bb.0: +; X64-SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax +; X64-SSE-NEXT: movq %rax, (%rdi) +; X64-SSE-NEXT: retq +; +; X64-AVX-LABEL: test_mm_storeh_ps: +; X64-AVX: # %bb.0: +; X64-AVX-NEXT: vpextrq $1, %xmm0, %rax +; X64-AVX-NEXT: movq %rax, (%rdi) +; X64-AVX-NEXT: retq %ptr = bitcast x86_mmx* %a0 to i64* %bc = bitcast <4 x float> %a1 to <2 x i64> %ext = extractelement <2 x i64> %bc, i32 1 @@ -1678,28 +2288,40 @@ define void @test_mm_storeh_ps(x86_mmx *%a0, <4 x float> %a1) nounwind { } define void @test_mm_storel_ps(x86_mmx *%a0, <4 x float> %a1) nounwind { -; X32-LABEL: test_mm_storel_ps: -; X32: # %bb.0: -; X32-NEXT: pushl %ebp -; X32-NEXT: movl %esp, %ebp -; X32-NEXT: andl $-16, %esp -; X32-NEXT: subl $32, %esp -; X32-NEXT: movl 8(%ebp), %eax -; X32-NEXT: movaps %xmm0, (%esp) -; X32-NEXT: movl (%esp), %ecx -; X32-NEXT: movl {{[0-9]+}}(%esp), %edx -; X32-NEXT: movl %edx, 4(%eax) -; X32-NEXT: movl %ecx, (%eax) -; X32-NEXT: movl %ebp, %esp -; X32-NEXT: popl %ebp -; X32-NEXT: retl -; -; X64-LABEL: test_mm_storel_ps: -; X64: # %bb.0: -; X64-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; X64-NEXT: movq %rax, (%rdi) -; X64-NEXT: retq +; X86-SSE-LABEL: test_mm_storel_ps: +; X86-SSE: # %bb.0: +; X86-SSE-NEXT: pushl %ebp +; X86-SSE-NEXT: movl %esp, %ebp +; X86-SSE-NEXT: andl $-16, %esp +; X86-SSE-NEXT: subl $32, %esp +; X86-SSE-NEXT: movl 8(%ebp), %eax +; X86-SSE-NEXT: movaps %xmm0, (%esp) +; X86-SSE-NEXT: movl (%esp), %ecx +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-SSE-NEXT: movl %edx, 4(%eax) +; X86-SSE-NEXT: movl %ecx, (%eax) +; X86-SSE-NEXT: movl %ebp, %esp +; X86-SSE-NEXT: popl %ebp +; X86-SSE-NEXT: retl +; +; X86-AVX-LABEL: test_mm_storel_ps: +; X86-AVX: # %bb.0: +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX-NEXT: vmovlps %xmm0, (%eax) +; X86-AVX-NEXT: retl +; +; X64-SSE-LABEL: test_mm_storel_ps: +; X64-SSE: # %bb.0: +; X64-SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax +; X64-SSE-NEXT: movq %rax, (%rdi) +; X64-SSE-NEXT: retq +; +; X64-AVX-LABEL: test_mm_storel_ps: +; X64-AVX: # %bb.0: +; X64-AVX-NEXT: vmovq %xmm0, %rax 
+; X64-AVX-NEXT: movq %rax, (%rdi) +; X64-AVX-NEXT: retq %ptr = bitcast x86_mmx* %a0 to i64* %bc = bitcast <4 x float> %a1 to <2 x i64> %ext = extractelement <2 x i64> %bc, i32 0 @@ -1708,18 +2330,31 @@ define void @test_mm_storel_ps(x86_mmx *%a0, <4 x float> %a1) nounwind { } define void @test_mm_storer_ps(float *%a0, <4 x float> %a1) { -; X32-LABEL: test_mm_storer_ps: -; X32: # %bb.0: -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,2,1,0] -; X32-NEXT: movaps %xmm0, (%eax) -; X32-NEXT: retl -; -; X64-LABEL: test_mm_storer_ps: -; X64: # %bb.0: -; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,2,1,0] -; X64-NEXT: movaps %xmm0, (%rdi) -; X64-NEXT: retq +; X86-SSE-LABEL: test_mm_storer_ps: +; X86-SSE: # %bb.0: +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,2,1,0] +; X86-SSE-NEXT: movaps %xmm0, (%eax) +; X86-SSE-NEXT: retl +; +; X86-AVX-LABEL: test_mm_storer_ps: +; X86-AVX: # %bb.0: +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0] +; X86-AVX-NEXT: vmovaps %xmm0, (%eax) +; X86-AVX-NEXT: retl +; +; X64-SSE-LABEL: test_mm_storer_ps: +; X64-SSE: # %bb.0: +; X64-SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,2,1,0] +; X64-SSE-NEXT: movaps %xmm0, (%rdi) +; X64-SSE-NEXT: retq +; +; X64-AVX-LABEL: test_mm_storer_ps: +; X64-AVX: # %bb.0: +; X64-AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0] +; X64-AVX-NEXT: vmovaps %xmm0, (%rdi) +; X64-AVX-NEXT: retq %arg0 = bitcast float* %a0 to <4 x float>* %shuf = shufflevector <4 x float> %a1, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0> store <4 x float> %shuf, <4 x float>* %arg0, align 16 @@ -1727,61 +2362,83 @@ define void @test_mm_storer_ps(float *%a0, <4 x float> %a1) { } define void @test_mm_storeu_ps(float *%a0, <4 x float> %a1) { -; X32-LABEL: test_mm_storeu_ps: -; X32: # %bb.0: -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movups %xmm0, (%eax) -; X32-NEXT: retl -; -; X64-LABEL: test_mm_storeu_ps: -; X64: # %bb.0: -; X64-NEXT: movups %xmm0, (%rdi) -; X64-NEXT: retq +; X86-SSE-LABEL: test_mm_storeu_ps: +; X86-SSE: # %bb.0: +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: movups %xmm0, (%eax) +; X86-SSE-NEXT: retl +; +; X86-AVX-LABEL: test_mm_storeu_ps: +; X86-AVX: # %bb.0: +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX-NEXT: vmovups %xmm0, (%eax) +; X86-AVX-NEXT: retl +; +; X64-SSE-LABEL: test_mm_storeu_ps: +; X64-SSE: # %bb.0: +; X64-SSE-NEXT: movups %xmm0, (%rdi) +; X64-SSE-NEXT: retq +; +; X64-AVX-LABEL: test_mm_storeu_ps: +; X64-AVX: # %bb.0: +; X64-AVX-NEXT: vmovups %xmm0, (%rdi) +; X64-AVX-NEXT: retq %arg0 = bitcast float* %a0 to <4 x float>* store <4 x float> %a1, <4 x float>* %arg0, align 1 ret void } define void @test_mm_stream_ps(float *%a0, <4 x float> %a1) { -; X32-LABEL: test_mm_stream_ps: -; X32: # %bb.0: -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movntps %xmm0, (%eax) -; X32-NEXT: retl -; -; X64-LABEL: test_mm_stream_ps: -; X64: # %bb.0: -; X64-NEXT: movntps %xmm0, (%rdi) -; X64-NEXT: retq +; X86-SSE-LABEL: test_mm_stream_ps: +; X86-SSE: # %bb.0: +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: movntps %xmm0, (%eax) +; X86-SSE-NEXT: retl +; +; X86-AVX-LABEL: test_mm_stream_ps: +; X86-AVX: # %bb.0: +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX-NEXT: vmovntps %xmm0, (%eax) +; X86-AVX-NEXT: retl +; +; X64-SSE-LABEL: test_mm_stream_ps: +; X64-SSE: # %bb.0: +; X64-SSE-NEXT: movntps %xmm0, (%rdi) +; X64-SSE-NEXT: retq +; +; X64-AVX-LABEL: 
test_mm_stream_ps: +; X64-AVX: # %bb.0: +; X64-AVX-NEXT: vmovntps %xmm0, (%rdi) +; X64-AVX-NEXT: retq %arg0 = bitcast float* %a0 to <4 x float>* store <4 x float> %a1, <4 x float>* %arg0, align 16, !nontemporal !0 ret void } define <4 x float> @test_mm_sub_ps(<4 x float> %a0, <4 x float> %a1) nounwind { -; X32-LABEL: test_mm_sub_ps: -; X32: # %bb.0: -; X32-NEXT: subps %xmm1, %xmm0 -; X32-NEXT: retl -; -; X64-LABEL: test_mm_sub_ps: -; X64: # %bb.0: -; X64-NEXT: subps %xmm1, %xmm0 -; X64-NEXT: retq +; SSE-LABEL: test_mm_sub_ps: +; SSE: # %bb.0: +; SSE-NEXT: subps %xmm1, %xmm0 +; SSE-NEXT: ret{{[l|q]}} +; +; AVX-LABEL: test_mm_sub_ps: +; AVX: # %bb.0: +; AVX-NEXT: vsubps %xmm1, %xmm0, %xmm0 +; AVX-NEXT: ret{{[l|q]}} %res = fsub <4 x float> %a0, %a1 ret <4 x float> %res } define <4 x float> @test_mm_sub_ss(<4 x float> %a0, <4 x float> %a1) nounwind { -; X32-LABEL: test_mm_sub_ss: -; X32: # %bb.0: -; X32-NEXT: subss %xmm1, %xmm0 -; X32-NEXT: retl -; -; X64-LABEL: test_mm_sub_ss: -; X64: # %bb.0: -; X64-NEXT: subss %xmm1, %xmm0 -; X64-NEXT: retq +; SSE-LABEL: test_mm_sub_ss: +; SSE: # %bb.0: +; SSE-NEXT: subss %xmm1, %xmm0 +; SSE-NEXT: ret{{[l|q]}} +; +; AVX-LABEL: test_mm_sub_ss: +; AVX: # %bb.0: +; AVX-NEXT: vsubss %xmm1, %xmm0, %xmm0 +; AVX-NEXT: ret{{[l|q]}} %ext0 = extractelement <4 x float> %a0, i32 0 %ext1 = extractelement <4 x float> %a1, i32 0 %fsub = fsub float %ext0, %ext1 @@ -1790,59 +2447,105 @@ define <4 x float> @test_mm_sub_ss(<4 x float> %a0, <4 x float> %a1) nounwind { } define void @test_MM_TRANSPOSE4_PS(<4 x float>* %a0, <4 x float>* %a1, <4 x float>* %a2, <4 x float>* %a3) nounwind { -; X32-LABEL: test_MM_TRANSPOSE4_PS: -; X32: # %bb.0: -; X32-NEXT: pushl %esi -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: movl {{[0-9]+}}(%esp), %edx -; X32-NEXT: movl {{[0-9]+}}(%esp), %esi -; X32-NEXT: movaps (%esi), %xmm0 -; X32-NEXT: movaps (%edx), %xmm1 -; X32-NEXT: movaps (%ecx), %xmm2 -; X32-NEXT: movaps (%eax), %xmm3 -; X32-NEXT: movaps %xmm0, %xmm4 -; X32-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; X32-NEXT: movaps %xmm2, %xmm5 -; X32-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] -; X32-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; X32-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; X32-NEXT: movaps %xmm4, %xmm1 -; X32-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm5[0] -; X32-NEXT: movhlps {{.*#+}} xmm5 = xmm4[1],xmm5[1] -; X32-NEXT: movaps %xmm0, %xmm3 -; X32-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm2[0] -; X32-NEXT: movhlps {{.*#+}} xmm2 = xmm0[1],xmm2[1] -; X32-NEXT: movaps %xmm1, (%esi) -; X32-NEXT: movaps %xmm5, (%edx) -; X32-NEXT: movaps %xmm3, (%ecx) -; X32-NEXT: movaps %xmm2, (%eax) -; X32-NEXT: popl %esi -; X32-NEXT: retl -; -; X64-LABEL: test_MM_TRANSPOSE4_PS: -; X64: # %bb.0: -; X64-NEXT: movaps (%rdi), %xmm0 -; X64-NEXT: movaps (%rsi), %xmm1 -; X64-NEXT: movaps (%rdx), %xmm2 -; X64-NEXT: movaps (%rcx), %xmm3 -; X64-NEXT: movaps %xmm0, %xmm4 -; X64-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; X64-NEXT: movaps %xmm2, %xmm5 -; X64-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] -; X64-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; X64-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; X64-NEXT: movaps %xmm4, %xmm1 -; X64-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm5[0] -; X64-NEXT: movhlps {{.*#+}} xmm5 = xmm4[1],xmm5[1] -; X64-NEXT: movaps %xmm0, %xmm3 -; X64-NEXT: movlhps {{.*#+}} xmm3 = 
xmm3[0],xmm2[0] -; X64-NEXT: movhlps {{.*#+}} xmm2 = xmm0[1],xmm2[1] -; X64-NEXT: movaps %xmm1, (%rdi) -; X64-NEXT: movaps %xmm5, (%rsi) -; X64-NEXT: movaps %xmm3, (%rdx) -; X64-NEXT: movaps %xmm2, (%rcx) -; X64-NEXT: retq +; X86-SSE-LABEL: test_MM_TRANSPOSE4_PS: +; X86-SSE: # %bb.0: +; X86-SSE-NEXT: pushl %esi +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-SSE-NEXT: movaps (%esi), %xmm0 +; X86-SSE-NEXT: movaps (%edx), %xmm1 +; X86-SSE-NEXT: movaps (%ecx), %xmm2 +; X86-SSE-NEXT: movaps (%eax), %xmm3 +; X86-SSE-NEXT: movaps %xmm0, %xmm4 +; X86-SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; X86-SSE-NEXT: movaps %xmm2, %xmm5 +; X86-SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] +; X86-SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; X86-SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; X86-SSE-NEXT: movaps %xmm4, %xmm1 +; X86-SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm5[0] +; X86-SSE-NEXT: movhlps {{.*#+}} xmm5 = xmm4[1],xmm5[1] +; X86-SSE-NEXT: movaps %xmm0, %xmm3 +; X86-SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm2[0] +; X86-SSE-NEXT: movhlps {{.*#+}} xmm2 = xmm0[1],xmm2[1] +; X86-SSE-NEXT: movaps %xmm1, (%esi) +; X86-SSE-NEXT: movaps %xmm5, (%edx) +; X86-SSE-NEXT: movaps %xmm3, (%ecx) +; X86-SSE-NEXT: movaps %xmm2, (%eax) +; X86-SSE-NEXT: popl %esi +; X86-SSE-NEXT: retl +; +; X86-AVX-LABEL: test_MM_TRANSPOSE4_PS: +; X86-AVX: # %bb.0: +; X86-AVX-NEXT: pushl %esi +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-AVX-NEXT: vmovaps (%esi), %xmm0 +; X86-AVX-NEXT: vmovaps (%edx), %xmm1 +; X86-AVX-NEXT: vmovaps (%ecx), %xmm2 +; X86-AVX-NEXT: vmovaps (%eax), %xmm3 +; X86-AVX-NEXT: vunpcklps {{.*#+}} xmm4 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; X86-AVX-NEXT: vunpcklps {{.*#+}} xmm5 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; X86-AVX-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; X86-AVX-NEXT: vunpckhps {{.*#+}} xmm1 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; X86-AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm4[0],xmm5[0] +; X86-AVX-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm4[1],xmm5[1] +; X86-AVX-NEXT: vmovlhps {{.*#+}} xmm4 = xmm0[0],xmm1[0] +; X86-AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; X86-AVX-NEXT: vmovaps %xmm2, (%esi) +; X86-AVX-NEXT: vmovaps %xmm3, (%edx) +; X86-AVX-NEXT: vmovaps %xmm4, (%ecx) +; X86-AVX-NEXT: vmovaps %xmm0, (%eax) +; X86-AVX-NEXT: popl %esi +; X86-AVX-NEXT: retl +; +; X64-SSE-LABEL: test_MM_TRANSPOSE4_PS: +; X64-SSE: # %bb.0: +; X64-SSE-NEXT: movaps (%rdi), %xmm0 +; X64-SSE-NEXT: movaps (%rsi), %xmm1 +; X64-SSE-NEXT: movaps (%rdx), %xmm2 +; X64-SSE-NEXT: movaps (%rcx), %xmm3 +; X64-SSE-NEXT: movaps %xmm0, %xmm4 +; X64-SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; X64-SSE-NEXT: movaps %xmm2, %xmm5 +; X64-SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] +; X64-SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; X64-SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; X64-SSE-NEXT: movaps %xmm4, %xmm1 +; X64-SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm5[0] +; X64-SSE-NEXT: movhlps {{.*#+}} xmm5 = xmm4[1],xmm5[1] +; X64-SSE-NEXT: movaps %xmm0, %xmm3 +; X64-SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm2[0] +; X64-SSE-NEXT: movhlps {{.*#+}} 
xmm2 = xmm0[1],xmm2[1] +; X64-SSE-NEXT: movaps %xmm1, (%rdi) +; X64-SSE-NEXT: movaps %xmm5, (%rsi) +; X64-SSE-NEXT: movaps %xmm3, (%rdx) +; X64-SSE-NEXT: movaps %xmm2, (%rcx) +; X64-SSE-NEXT: retq +; +; X64-AVX-LABEL: test_MM_TRANSPOSE4_PS: +; X64-AVX: # %bb.0: +; X64-AVX-NEXT: vmovaps (%rdi), %xmm0 +; X64-AVX-NEXT: vmovaps (%rsi), %xmm1 +; X64-AVX-NEXT: vmovaps (%rdx), %xmm2 +; X64-AVX-NEXT: vmovaps (%rcx), %xmm3 +; X64-AVX-NEXT: vunpcklps {{.*#+}} xmm4 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; X64-AVX-NEXT: vunpcklps {{.*#+}} xmm5 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; X64-AVX-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; X64-AVX-NEXT: vunpckhps {{.*#+}} xmm1 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; X64-AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm4[0],xmm5[0] +; X64-AVX-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm4[1],xmm5[1] +; X64-AVX-NEXT: vmovlhps {{.*#+}} xmm4 = xmm0[0],xmm1[0] +; X64-AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; X64-AVX-NEXT: vmovaps %xmm2, (%rdi) +; X64-AVX-NEXT: vmovaps %xmm3, (%rsi) +; X64-AVX-NEXT: vmovaps %xmm4, (%rdx) +; X64-AVX-NEXT: vmovaps %xmm0, (%rcx) +; X64-AVX-NEXT: retq %row0 = load <4 x float>, <4 x float>* %a0, align 16 %row1 = load <4 x float>, <4 x float>* %a1, align 16 %row2 = load <4 x float>, <4 x float>* %a2, align 16 @@ -1863,176 +2566,172 @@ define void @test_MM_TRANSPOSE4_PS(<4 x float>* %a0, <4 x float>* %a1, <4 x floa } define i32 @test_mm_ucomieq_ss(<4 x float> %a0, <4 x float> %a1) nounwind { -; X32-LABEL: test_mm_ucomieq_ss: -; X32: # %bb.0: -; X32-NEXT: ucomiss %xmm1, %xmm0 -; X32-NEXT: setnp %al -; X32-NEXT: sete %cl -; X32-NEXT: andb %al, %cl -; X32-NEXT: movzbl %cl, %eax -; X32-NEXT: retl -; -; X64-LABEL: test_mm_ucomieq_ss: -; X64: # %bb.0: -; X64-NEXT: ucomiss %xmm1, %xmm0 -; X64-NEXT: setnp %al -; X64-NEXT: sete %cl -; X64-NEXT: andb %al, %cl -; X64-NEXT: movzbl %cl, %eax -; X64-NEXT: retq +; SSE-LABEL: test_mm_ucomieq_ss: +; SSE: # %bb.0: +; SSE-NEXT: ucomiss %xmm1, %xmm0 +; SSE-NEXT: setnp %al +; SSE-NEXT: sete %cl +; SSE-NEXT: andb %al, %cl +; SSE-NEXT: movzbl %cl, %eax +; SSE-NEXT: ret{{[l|q]}} +; +; AVX-LABEL: test_mm_ucomieq_ss: +; AVX: # %bb.0: +; AVX-NEXT: vucomiss %xmm1, %xmm0 +; AVX-NEXT: setnp %al +; AVX-NEXT: sete %cl +; AVX-NEXT: andb %al, %cl +; AVX-NEXT: movzbl %cl, %eax +; AVX-NEXT: ret{{[l|q]}} %res = call i32 @llvm.x86.sse.ucomieq.ss(<4 x float> %a0, <4 x float> %a1) ret i32 %res } declare i32 @llvm.x86.sse.ucomieq.ss(<4 x float>, <4 x float>) nounwind readnone define i32 @test_mm_ucomige_ss(<4 x float> %a0, <4 x float> %a1) nounwind { -; X32-LABEL: test_mm_ucomige_ss: -; X32: # %bb.0: -; X32-NEXT: xorl %eax, %eax -; X32-NEXT: ucomiss %xmm1, %xmm0 -; X32-NEXT: setae %al -; X32-NEXT: retl -; -; X64-LABEL: test_mm_ucomige_ss: -; X64: # %bb.0: -; X64-NEXT: xorl %eax, %eax -; X64-NEXT: ucomiss %xmm1, %xmm0 -; X64-NEXT: setae %al -; X64-NEXT: retq +; SSE-LABEL: test_mm_ucomige_ss: +; SSE: # %bb.0: +; SSE-NEXT: xorl %eax, %eax +; SSE-NEXT: ucomiss %xmm1, %xmm0 +; SSE-NEXT: setae %al +; SSE-NEXT: ret{{[l|q]}} +; +; AVX-LABEL: test_mm_ucomige_ss: +; AVX: # %bb.0: +; AVX-NEXT: xorl %eax, %eax +; AVX-NEXT: vucomiss %xmm1, %xmm0 +; AVX-NEXT: setae %al +; AVX-NEXT: ret{{[l|q]}} %res = call i32 @llvm.x86.sse.ucomige.ss(<4 x float> %a0, <4 x float> %a1) ret i32 %res } declare i32 @llvm.x86.sse.ucomige.ss(<4 x float>, <4 x float>) nounwind readnone define i32 @test_mm_ucomigt_ss(<4 x float> %a0, <4 x float> %a1) nounwind { -; X32-LABEL: test_mm_ucomigt_ss: -; X32: # %bb.0: -; X32-NEXT: xorl %eax, %eax -; X32-NEXT: ucomiss 
%xmm1, %xmm0 -; X32-NEXT: seta %al -; X32-NEXT: retl -; -; X64-LABEL: test_mm_ucomigt_ss: -; X64: # %bb.0: -; X64-NEXT: xorl %eax, %eax -; X64-NEXT: ucomiss %xmm1, %xmm0 -; X64-NEXT: seta %al -; X64-NEXT: retq +; SSE-LABEL: test_mm_ucomigt_ss: +; SSE: # %bb.0: +; SSE-NEXT: xorl %eax, %eax +; SSE-NEXT: ucomiss %xmm1, %xmm0 +; SSE-NEXT: seta %al +; SSE-NEXT: ret{{[l|q]}} +; +; AVX-LABEL: test_mm_ucomigt_ss: +; AVX: # %bb.0: +; AVX-NEXT: xorl %eax, %eax +; AVX-NEXT: vucomiss %xmm1, %xmm0 +; AVX-NEXT: seta %al +; AVX-NEXT: ret{{[l|q]}} %res = call i32 @llvm.x86.sse.ucomigt.ss(<4 x float> %a0, <4 x float> %a1) ret i32 %res } declare i32 @llvm.x86.sse.ucomigt.ss(<4 x float>, <4 x float>) nounwind readnone define i32 @test_mm_ucomile_ss(<4 x float> %a0, <4 x float> %a1) nounwind { -; X32-LABEL: test_mm_ucomile_ss: -; X32: # %bb.0: -; X32-NEXT: xorl %eax, %eax -; X32-NEXT: ucomiss %xmm0, %xmm1 -; X32-NEXT: setae %al -; X32-NEXT: retl -; -; X64-LABEL: test_mm_ucomile_ss: -; X64: # %bb.0: -; X64-NEXT: xorl %eax, %eax -; X64-NEXT: ucomiss %xmm0, %xmm1 -; X64-NEXT: setae %al -; X64-NEXT: retq +; SSE-LABEL: test_mm_ucomile_ss: +; SSE: # %bb.0: +; SSE-NEXT: xorl %eax, %eax +; SSE-NEXT: ucomiss %xmm0, %xmm1 +; SSE-NEXT: setae %al +; SSE-NEXT: ret{{[l|q]}} +; +; AVX-LABEL: test_mm_ucomile_ss: +; AVX: # %bb.0: +; AVX-NEXT: xorl %eax, %eax +; AVX-NEXT: vucomiss %xmm0, %xmm1 +; AVX-NEXT: setae %al +; AVX-NEXT: ret{{[l|q]}} %res = call i32 @llvm.x86.sse.ucomile.ss(<4 x float> %a0, <4 x float> %a1) ret i32 %res } declare i32 @llvm.x86.sse.ucomile.ss(<4 x float>, <4 x float>) nounwind readnone define i32 @test_mm_ucomilt_ss(<4 x float> %a0, <4 x float> %a1) nounwind { -; X32-LABEL: test_mm_ucomilt_ss: -; X32: # %bb.0: -; X32-NEXT: xorl %eax, %eax -; X32-NEXT: ucomiss %xmm0, %xmm1 -; X32-NEXT: seta %al -; X32-NEXT: retl -; -; X64-LABEL: test_mm_ucomilt_ss: -; X64: # %bb.0: -; X64-NEXT: xorl %eax, %eax -; X64-NEXT: ucomiss %xmm0, %xmm1 -; X64-NEXT: seta %al -; X64-NEXT: retq +; SSE-LABEL: test_mm_ucomilt_ss: +; SSE: # %bb.0: +; SSE-NEXT: xorl %eax, %eax +; SSE-NEXT: ucomiss %xmm0, %xmm1 +; SSE-NEXT: seta %al +; SSE-NEXT: ret{{[l|q]}} +; +; AVX-LABEL: test_mm_ucomilt_ss: +; AVX: # %bb.0: +; AVX-NEXT: xorl %eax, %eax +; AVX-NEXT: vucomiss %xmm0, %xmm1 +; AVX-NEXT: seta %al +; AVX-NEXT: ret{{[l|q]}} %res = call i32 @llvm.x86.sse.ucomilt.ss(<4 x float> %a0, <4 x float> %a1) ret i32 %res } declare i32 @llvm.x86.sse.ucomilt.ss(<4 x float>, <4 x float>) nounwind readnone define i32 @test_mm_ucomineq_ss(<4 x float> %a0, <4 x float> %a1) nounwind { -; X32-LABEL: test_mm_ucomineq_ss: -; X32: # %bb.0: -; X32-NEXT: ucomiss %xmm1, %xmm0 -; X32-NEXT: setp %al -; X32-NEXT: setne %cl -; X32-NEXT: orb %al, %cl -; X32-NEXT: movzbl %cl, %eax -; X32-NEXT: retl -; -; X64-LABEL: test_mm_ucomineq_ss: -; X64: # %bb.0: -; X64-NEXT: ucomiss %xmm1, %xmm0 -; X64-NEXT: setp %al -; X64-NEXT: setne %cl -; X64-NEXT: orb %al, %cl -; X64-NEXT: movzbl %cl, %eax -; X64-NEXT: retq +; SSE-LABEL: test_mm_ucomineq_ss: +; SSE: # %bb.0: +; SSE-NEXT: ucomiss %xmm1, %xmm0 +; SSE-NEXT: setp %al +; SSE-NEXT: setne %cl +; SSE-NEXT: orb %al, %cl +; SSE-NEXT: movzbl %cl, %eax +; SSE-NEXT: ret{{[l|q]}} +; +; AVX-LABEL: test_mm_ucomineq_ss: +; AVX: # %bb.0: +; AVX-NEXT: vucomiss %xmm1, %xmm0 +; AVX-NEXT: setp %al +; AVX-NEXT: setne %cl +; AVX-NEXT: orb %al, %cl +; AVX-NEXT: movzbl %cl, %eax +; AVX-NEXT: ret{{[l|q]}} %res = call i32 @llvm.x86.sse.ucomineq.ss(<4 x float> %a0, <4 x float> %a1) ret i32 %res } declare i32 @llvm.x86.sse.ucomineq.ss(<4 x float>, <4 x 
float>) nounwind readnone define <4 x float> @test_mm_undefined_ps() { -; X32-LABEL: test_mm_undefined_ps: -; X32: # %bb.0: -; X32-NEXT: retl -; -; X64-LABEL: test_mm_undefined_ps: -; X64: # %bb.0: -; X64-NEXT: retq +; CHECK-LABEL: test_mm_undefined_ps: +; CHECK: # %bb.0: +; CHECK-NEXT: ret{{[l|q]}} ret <4 x float> undef } define <4 x float> @test_mm_unpackhi_ps(<4 x float> %a0, <4 x float> %a1) nounwind { -; X32-LABEL: test_mm_unpackhi_ps: -; X32: # %bb.0: -; X32-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; X32-NEXT: retl -; -; X64-LABEL: test_mm_unpackhi_ps: -; X64: # %bb.0: -; X64-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; X64-NEXT: retq +; SSE-LABEL: test_mm_unpackhi_ps: +; SSE: # %bb.0: +; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE-NEXT: ret{{[l|q]}} +; +; AVX-LABEL: test_mm_unpackhi_ps: +; AVX: # %bb.0: +; AVX-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX-NEXT: ret{{[l|q]}} %res = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 2, i32 6, i32 3, i32 7> ret <4 x float> %res } define <4 x float> @test_mm_unpacklo_ps(<4 x float> %a0, <4 x float> %a1) nounwind { -; X32-LABEL: test_mm_unpacklo_ps: -; X32: # %bb.0: -; X32-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; X32-NEXT: retl -; -; X64-LABEL: test_mm_unpacklo_ps: -; X64: # %bb.0: -; X64-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; X64-NEXT: retq +; SSE-LABEL: test_mm_unpacklo_ps: +; SSE: # %bb.0: +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: ret{{[l|q]}} +; +; AVX-LABEL: test_mm_unpacklo_ps: +; AVX: # %bb.0: +; AVX-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX-NEXT: ret{{[l|q]}} %res = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 0, i32 4, i32 1, i32 5> ret <4 x float> %res } define <4 x float> @test_mm_xor_ps(<4 x float> %a0, <4 x float> %a1) nounwind { -; X32-LABEL: test_mm_xor_ps: -; X32: # %bb.0: -; X32-NEXT: xorps %xmm1, %xmm0 -; X32-NEXT: retl -; -; X64-LABEL: test_mm_xor_ps: -; X64: # %bb.0: -; X64-NEXT: xorps %xmm1, %xmm0 -; X64-NEXT: retq +; SSE-LABEL: test_mm_xor_ps: +; SSE: # %bb.0: +; SSE-NEXT: xorps %xmm1, %xmm0 +; SSE-NEXT: ret{{[l|q]}} +; +; AVX-LABEL: test_mm_xor_ps: +; AVX: # %bb.0: +; AVX-NEXT: vxorps %xmm1, %xmm0, %xmm0 +; AVX-NEXT: ret{{[l|q]}} %arg0 = bitcast <4 x float> %a0 to <4 x i32> %arg1 = bitcast <4 x float> %a1 to <4 x i32> %res = xor <4 x i32> %arg0, %arg1 diff --git a/llvm/test/CodeGen/X86/sse-intrinsics-x86-upgrade.ll b/llvm/test/CodeGen/X86/sse-intrinsics-x86-upgrade.ll index 47c3c0b2261..60a455ae148 100644 --- a/llvm/test/CodeGen/X86/sse-intrinsics-x86-upgrade.ll +++ b/llvm/test/CodeGen/X86/sse-intrinsics-x86-upgrade.ll @@ -1,12 +1,44 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+sse2 | FileCheck %s +; RUN: llc < %s -mtriple=i386-apple-darwin -mattr=+sse -show-mc-encoding | FileCheck %s --check-prefixes=CHECK,X86,SSE,X86-SSE +; RUN: llc < %s -mtriple=i386-apple-darwin -mattr=+avx -show-mc-encoding | FileCheck %s --check-prefixes=CHECK,X86,AVX,X86-AVX,AVX1,X86-AVX1 +; RUN: llc < %s -mtriple=i386-apple-darwin -mattr=+avx512f,+avx512bw,+avx512dq,+avx512vl -show-mc-encoding | FileCheck %s --check-prefixes=CHECK,X86,AVX,X86-AVX,AVX512,X86-AVX512 +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=-sse2 -show-mc-encoding | FileCheck %s --check-prefixes=CHECK,X64,SSE,X64-SSE +; RUN: llc < 
%s -mtriple=x86_64-apple-darwin -mattr=+avx -show-mc-encoding | FileCheck %s --check-prefixes=CHECK,X64,AVX,X64-AVX,AVX1,X64-AVX1 +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f,+avx512bw,+avx512dq,+avx512vl -show-mc-encoding | FileCheck %s --check-prefixes=CHECK,X64,AVX,X64-AVX,AVX512,X64-AVX512 define void @test_x86_sse_storeu_ps(i8* %a0, <4 x float> %a1) { -; CHECK-LABEL: test_x86_sse_storeu_ps: -; CHECK: ## %bb.0: -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax -; CHECK-NEXT: movups %xmm0, (%eax) -; CHECK-NEXT: retl +; X86-SSE-LABEL: test_x86_sse_storeu_ps: +; X86-SSE: ## %bb.0: +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] +; X86-SSE-NEXT: movups %xmm0, (%eax) ## encoding: [0x0f,0x11,0x00] +; X86-SSE-NEXT: retl ## encoding: [0xc3] +; +; X86-AVX1-LABEL: test_x86_sse_storeu_ps: +; X86-AVX1: ## %bb.0: +; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] +; X86-AVX1-NEXT: vmovups %xmm0, (%eax) ## encoding: [0xc5,0xf8,0x11,0x00] +; X86-AVX1-NEXT: retl ## encoding: [0xc3] +; +; X86-AVX512-LABEL: test_x86_sse_storeu_ps: +; X86-AVX512: ## %bb.0: +; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] +; X86-AVX512-NEXT: vmovups %xmm0, (%eax) ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x11,0x00] +; X86-AVX512-NEXT: retl ## encoding: [0xc3] +; +; X64-SSE-LABEL: test_x86_sse_storeu_ps: +; X64-SSE: ## %bb.0: +; X64-SSE-NEXT: movups %xmm0, (%rdi) ## encoding: [0x0f,0x11,0x07] +; X64-SSE-NEXT: retq ## encoding: [0xc3] +; +; X64-AVX1-LABEL: test_x86_sse_storeu_ps: +; X64-AVX1: ## %bb.0: +; X64-AVX1-NEXT: vmovups %xmm0, (%rdi) ## encoding: [0xc5,0xf8,0x11,0x07] +; X64-AVX1-NEXT: retq ## encoding: [0xc3] +; +; X64-AVX512-LABEL: test_x86_sse_storeu_ps: +; X64-AVX512: ## %bb.0: +; X64-AVX512-NEXT: vmovups %xmm0, (%rdi) ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x11,0x07] +; X64-AVX512-NEXT: retq ## encoding: [0xc3] call void @llvm.x86.sse.storeu.ps(i8* %a0, <4 x float> %a1) ret void } @@ -14,10 +46,20 @@ declare void @llvm.x86.sse.storeu.ps(i8*, <4 x float>) nounwind define <4 x float> @test_x86_sse_add_ss(<4 x float> %a0, <4 x float> %a1) { -; CHECK-LABEL: test_x86_sse_add_ss: -; CHECK: ## %bb.0: -; CHECK-NEXT: addss %xmm1, %xmm0 -; CHECK-NEXT: retl +; SSE-LABEL: test_x86_sse_add_ss: +; SSE: ## %bb.0: +; SSE-NEXT: addss %xmm1, %xmm0 ## encoding: [0xf3,0x0f,0x58,0xc1] +; SSE-NEXT: ret{{[l|q]}} ## encoding: [0xc3] +; +; AVX1-LABEL: test_x86_sse_add_ss: +; AVX1: ## %bb.0: +; AVX1-NEXT: vaddss %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xfa,0x58,0xc1] +; AVX1-NEXT: ret{{[l|q]}} ## encoding: [0xc3] +; +; AVX512-LABEL: test_x86_sse_add_ss: +; AVX512: ## %bb.0: +; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x58,0xc1] +; AVX512-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call <4 x float> @llvm.x86.sse.add.ss(<4 x float> %a0, <4 x float> %a1) ; <<4 x float>> [#uses=1] ret <4 x float> %res } @@ -25,10 +67,20 @@ declare <4 x float> @llvm.x86.sse.add.ss(<4 x float>, <4 x float>) nounwind read define <4 x float> @test_x86_sse_sub_ss(<4 x float> %a0, <4 x float> %a1) { -; CHECK-LABEL: test_x86_sse_sub_ss: -; CHECK: ## %bb.0: -; CHECK-NEXT: subss %xmm1, %xmm0 -; CHECK-NEXT: retl +; SSE-LABEL: test_x86_sse_sub_ss: +; SSE: ## %bb.0: +; SSE-NEXT: subss %xmm1, %xmm0 ## encoding: [0xf3,0x0f,0x5c,0xc1] +; SSE-NEXT: ret{{[l|q]}} ## encoding: [0xc3] +; +; AVX1-LABEL: test_x86_sse_sub_ss: +; AVX1: ## %bb.0: +; AVX1-NEXT: vsubss %xmm1, %xmm0, %xmm0 ## encoding: 
[0xc5,0xfa,0x5c,0xc1] +; AVX1-NEXT: ret{{[l|q]}} ## encoding: [0xc3] +; +; AVX512-LABEL: test_x86_sse_sub_ss: +; AVX512: ## %bb.0: +; AVX512-NEXT: vsubss %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x5c,0xc1] +; AVX512-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call <4 x float> @llvm.x86.sse.sub.ss(<4 x float> %a0, <4 x float> %a1) ; <<4 x float>> [#uses=1] ret <4 x float> %res } @@ -36,10 +88,20 @@ declare <4 x float> @llvm.x86.sse.sub.ss(<4 x float>, <4 x float>) nounwind read define <4 x float> @test_x86_sse_mul_ss(<4 x float> %a0, <4 x float> %a1) { -; CHECK-LABEL: test_x86_sse_mul_ss: -; CHECK: ## %bb.0: -; CHECK-NEXT: mulss %xmm1, %xmm0 -; CHECK-NEXT: retl +; SSE-LABEL: test_x86_sse_mul_ss: +; SSE: ## %bb.0: +; SSE-NEXT: mulss %xmm1, %xmm0 ## encoding: [0xf3,0x0f,0x59,0xc1] +; SSE-NEXT: ret{{[l|q]}} ## encoding: [0xc3] +; +; AVX1-LABEL: test_x86_sse_mul_ss: +; AVX1: ## %bb.0: +; AVX1-NEXT: vmulss %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xfa,0x59,0xc1] +; AVX1-NEXT: ret{{[l|q]}} ## encoding: [0xc3] +; +; AVX512-LABEL: test_x86_sse_mul_ss: +; AVX512: ## %bb.0: +; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x59,0xc1] +; AVX512-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call <4 x float> @llvm.x86.sse.mul.ss(<4 x float> %a0, <4 x float> %a1) ; <<4 x float>> [#uses=1] ret <4 x float> %res } @@ -47,10 +109,20 @@ declare <4 x float> @llvm.x86.sse.mul.ss(<4 x float>, <4 x float>) nounwind read define <4 x float> @test_x86_sse_div_ss(<4 x float> %a0, <4 x float> %a1) { -; CHECK-LABEL: test_x86_sse_div_ss: -; CHECK: ## %bb.0: -; CHECK-NEXT: divss %xmm1, %xmm0 -; CHECK-NEXT: retl +; SSE-LABEL: test_x86_sse_div_ss: +; SSE: ## %bb.0: +; SSE-NEXT: divss %xmm1, %xmm0 ## encoding: [0xf3,0x0f,0x5e,0xc1] +; SSE-NEXT: ret{{[l|q]}} ## encoding: [0xc3] +; +; AVX1-LABEL: test_x86_sse_div_ss: +; AVX1: ## %bb.0: +; AVX1-NEXT: vdivss %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xfa,0x5e,0xc1] +; AVX1-NEXT: ret{{[l|q]}} ## encoding: [0xc3] +; +; AVX512-LABEL: test_x86_sse_div_ss: +; AVX512: ## %bb.0: +; AVX512-NEXT: vdivss %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x5e,0xc1] +; AVX512-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call <4 x float> @llvm.x86.sse.div.ss(<4 x float> %a0, <4 x float> %a1) ; <<4 x float>> [#uses=1] ret <4 x float> %res } @@ -58,10 +130,35 @@ declare <4 x float> @llvm.x86.sse.div.ss(<4 x float>, <4 x float>) nounwind read define <4 x float> @test_x86_sse_cvtsi2ss(<4 x float> %a0, i32 %a1) { -; CHECK-LABEL: test_x86_sse_cvtsi2ss: -; CHECK: ## %bb.0: -; CHECK-NEXT: cvtsi2ssl {{[0-9]+}}(%esp), %xmm0 -; CHECK-NEXT: retl +; X86-SSE-LABEL: test_x86_sse_cvtsi2ss: +; X86-SSE: ## %bb.0: +; X86-SSE-NEXT: cvtsi2ssl {{[0-9]+}}(%esp), %xmm0 ## encoding: [0xf3,0x0f,0x2a,0x44,0x24,0x04] +; X86-SSE-NEXT: retl ## encoding: [0xc3] +; +; X86-AVX1-LABEL: test_x86_sse_cvtsi2ss: +; X86-AVX1: ## %bb.0: +; X86-AVX1-NEXT: vcvtsi2ssl {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc5,0xfa,0x2a,0x44,0x24,0x04] +; X86-AVX1-NEXT: retl ## encoding: [0xc3] +; +; X86-AVX512-LABEL: test_x86_sse_cvtsi2ss: +; X86-AVX512: ## %bb.0: +; X86-AVX512-NEXT: vcvtsi2ssl {{[0-9]+}}(%esp), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x2a,0x44,0x24,0x04] +; X86-AVX512-NEXT: retl ## encoding: [0xc3] +; +; X64-SSE-LABEL: test_x86_sse_cvtsi2ss: +; X64-SSE: ## %bb.0: +; X64-SSE-NEXT: cvtsi2ssl %edi, %xmm0 ## encoding: [0xf3,0x0f,0x2a,0xc7] +; X64-SSE-NEXT: retq ## encoding: [0xc3] +; +; X64-AVX1-LABEL: test_x86_sse_cvtsi2ss: 
+; X64-AVX1: ## %bb.0: +; X64-AVX1-NEXT: vcvtsi2ssl %edi, %xmm0, %xmm0 ## encoding: [0xc5,0xfa,0x2a,0xc7] +; X64-AVX1-NEXT: retq ## encoding: [0xc3] +; +; X64-AVX512-LABEL: test_x86_sse_cvtsi2ss: +; X64-AVX512: ## %bb.0: +; X64-AVX512-NEXT: vcvtsi2ssl %edi, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x2a,0xc7] +; X64-AVX512-NEXT: retq ## encoding: [0xc3] %res = call <4 x float> @llvm.x86.sse.cvtsi2ss(<4 x float> %a0, i32 %a1) ; <<4 x float>> [#uses=1] ret <4 x float> %res } diff --git a/llvm/test/CodeGen/X86/sse-intrinsics-x86.ll b/llvm/test/CodeGen/X86/sse-intrinsics-x86.ll index 3eb64698905..0014da6b2ec 100644 --- a/llvm/test/CodeGen/X86/sse-intrinsics-x86.ll +++ b/llvm/test/CodeGen/X86/sse-intrinsics-x86.ll @@ -1,18 +1,21 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=i386-apple-darwin -mattr=-avx,+sse -show-mc-encoding | FileCheck %s --check-prefix=SSE -; RUN: llc < %s -mtriple=i386-apple-darwin -mattr=+avx2 -show-mc-encoding | FileCheck %s --check-prefix=VCHECK --check-prefix=AVX2 -; RUN: llc < %s -mtriple=i386-apple-darwin -mcpu=skx -show-mc-encoding | FileCheck %s --check-prefix=VCHECK --check-prefix=SKX +; RUN: llc < %s -mtriple=i386-apple-darwin -mattr=+sse -show-mc-encoding | FileCheck %s --check-prefixes=CHECK,X86,SSE,X86-SSE +; RUN: llc < %s -mtriple=i386-apple-darwin -mattr=+avx -show-mc-encoding | FileCheck %s --check-prefixes=CHECK,X86,AVX,X86-AVX,AVX1,X86-AVX1 +; RUN: llc < %s -mtriple=i386-apple-darwin -mattr=+avx512f,+avx512bw,+avx512dq,+avx512vl -show-mc-encoding | FileCheck %s --check-prefixes=CHECK,X86,AVX,X86-AVX,AVX512,X86-AVX512 +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=-sse2 -show-mc-encoding | FileCheck %s --check-prefixes=CHECK,X64,SSE,X64-SSE +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx -show-mc-encoding | FileCheck %s --check-prefixes=CHECK,X64,AVX,X64-AVX,AVX1,X64-AVX1 +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f,+avx512bw,+avx512dq,+avx512vl -show-mc-encoding | FileCheck %s --check-prefixes=CHECK,X64,AVX,X64-AVX,AVX512,X64-AVX512 define <4 x float> @test_x86_sse_cmp_ps(<4 x float> %a0, <4 x float> %a1) { ; SSE-LABEL: test_x86_sse_cmp_ps: ; SSE: ## %bb.0: ; SSE-NEXT: cmpordps %xmm1, %xmm0 ## encoding: [0x0f,0xc2,0xc1,0x07] -; SSE-NEXT: retl ## encoding: [0xc3] +; SSE-NEXT: ret{{[l|q]}} ## encoding: [0xc3] ; -; VCHECK-LABEL: test_x86_sse_cmp_ps: -; VCHECK: ## %bb.0: -; VCHECK-NEXT: vcmpordps %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0xc2,0xc1,0x07] -; VCHECK-NEXT: retl ## encoding: [0xc3] +; AVX-LABEL: test_x86_sse_cmp_ps: +; AVX: ## %bb.0: +; AVX-NEXT: vcmpordps %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0xc2,0xc1,0x07] +; AVX-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call <4 x float> @llvm.x86.sse.cmp.ps(<4 x float> %a0, <4 x float> %a1, i8 7) ; <<4 x float>> [#uses=1] ret <4 x float> %res } @@ -23,12 +26,12 @@ define <4 x float> @test_x86_sse_cmp_ss(<4 x float> %a0, <4 x float> %a1) { ; SSE-LABEL: test_x86_sse_cmp_ss: ; SSE: ## %bb.0: ; SSE-NEXT: cmpordss %xmm1, %xmm0 ## encoding: [0xf3,0x0f,0xc2,0xc1,0x07] -; SSE-NEXT: retl ## encoding: [0xc3] +; SSE-NEXT: ret{{[l|q]}} ## encoding: [0xc3] ; -; VCHECK-LABEL: test_x86_sse_cmp_ss: -; VCHECK: ## %bb.0: -; VCHECK-NEXT: vcmpordss %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xfa,0xc2,0xc1,0x07] -; VCHECK-NEXT: retl ## encoding: [0xc3] +; AVX-LABEL: test_x86_sse_cmp_ss: +; AVX: ## %bb.0: +; AVX-NEXT: vcmpordss %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xfa,0xc2,0xc1,0x07] +; AVX-NEXT: ret{{[l|q]}} 
## encoding: [0xc3] %res = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a0, <4 x float> %a1, i8 7) ; <<4 x float>> [#uses=1] ret <4 x float> %res } @@ -43,25 +46,25 @@ define i32 @test_x86_sse_comieq_ss(<4 x float> %a0, <4 x float> %a1) { ; SSE-NEXT: sete %cl ## encoding: [0x0f,0x94,0xc1] ; SSE-NEXT: andb %al, %cl ## encoding: [0x20,0xc1] ; SSE-NEXT: movzbl %cl, %eax ## encoding: [0x0f,0xb6,0xc1] -; SSE-NEXT: retl ## encoding: [0xc3] -; -; AVX2-LABEL: test_x86_sse_comieq_ss: -; AVX2: ## %bb.0: -; AVX2-NEXT: vcomiss %xmm1, %xmm0 ## encoding: [0xc5,0xf8,0x2f,0xc1] -; AVX2-NEXT: setnp %al ## encoding: [0x0f,0x9b,0xc0] -; AVX2-NEXT: sete %cl ## encoding: [0x0f,0x94,0xc1] -; AVX2-NEXT: andb %al, %cl ## encoding: [0x20,0xc1] -; AVX2-NEXT: movzbl %cl, %eax ## encoding: [0x0f,0xb6,0xc1] -; AVX2-NEXT: retl ## encoding: [0xc3] -; -; SKX-LABEL: test_x86_sse_comieq_ss: -; SKX: ## %bb.0: -; SKX-NEXT: vcomiss %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2f,0xc1] -; SKX-NEXT: setnp %al ## encoding: [0x0f,0x9b,0xc0] -; SKX-NEXT: sete %cl ## encoding: [0x0f,0x94,0xc1] -; SKX-NEXT: andb %al, %cl ## encoding: [0x20,0xc1] -; SKX-NEXT: movzbl %cl, %eax ## encoding: [0x0f,0xb6,0xc1] -; SKX-NEXT: retl ## encoding: [0xc3] +; SSE-NEXT: ret{{[l|q]}} ## encoding: [0xc3] +; +; AVX1-LABEL: test_x86_sse_comieq_ss: +; AVX1: ## %bb.0: +; AVX1-NEXT: vcomiss %xmm1, %xmm0 ## encoding: [0xc5,0xf8,0x2f,0xc1] +; AVX1-NEXT: setnp %al ## encoding: [0x0f,0x9b,0xc0] +; AVX1-NEXT: sete %cl ## encoding: [0x0f,0x94,0xc1] +; AVX1-NEXT: andb %al, %cl ## encoding: [0x20,0xc1] +; AVX1-NEXT: movzbl %cl, %eax ## encoding: [0x0f,0xb6,0xc1] +; AVX1-NEXT: ret{{[l|q]}} ## encoding: [0xc3] +; +; AVX512-LABEL: test_x86_sse_comieq_ss: +; AVX512: ## %bb.0: +; AVX512-NEXT: vcomiss %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2f,0xc1] +; AVX512-NEXT: setnp %al ## encoding: [0x0f,0x9b,0xc0] +; AVX512-NEXT: sete %cl ## encoding: [0x0f,0x94,0xc1] +; AVX512-NEXT: andb %al, %cl ## encoding: [0x20,0xc1] +; AVX512-NEXT: movzbl %cl, %eax ## encoding: [0x0f,0xb6,0xc1] +; AVX512-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call i32 @llvm.x86.sse.comieq.ss(<4 x float> %a0, <4 x float> %a1) ; <i32> [#uses=1] ret i32 %res } @@ -74,21 +77,21 @@ define i32 @test_x86_sse_comige_ss(<4 x float> %a0, <4 x float> %a1) { ; SSE-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0] ; SSE-NEXT: comiss %xmm1, %xmm0 ## encoding: [0x0f,0x2f,0xc1] ; SSE-NEXT: setae %al ## encoding: [0x0f,0x93,0xc0] -; SSE-NEXT: retl ## encoding: [0xc3] -; -; AVX2-LABEL: test_x86_sse_comige_ss: -; AVX2: ## %bb.0: -; AVX2-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0] -; AVX2-NEXT: vcomiss %xmm1, %xmm0 ## encoding: [0xc5,0xf8,0x2f,0xc1] -; AVX2-NEXT: setae %al ## encoding: [0x0f,0x93,0xc0] -; AVX2-NEXT: retl ## encoding: [0xc3] -; -; SKX-LABEL: test_x86_sse_comige_ss: -; SKX: ## %bb.0: -; SKX-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0] -; SKX-NEXT: vcomiss %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2f,0xc1] -; SKX-NEXT: setae %al ## encoding: [0x0f,0x93,0xc0] -; SKX-NEXT: retl ## encoding: [0xc3] +; SSE-NEXT: ret{{[l|q]}} ## encoding: [0xc3] +; +; AVX1-LABEL: test_x86_sse_comige_ss: +; AVX1: ## %bb.0: +; AVX1-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0] +; AVX1-NEXT: vcomiss %xmm1, %xmm0 ## encoding: [0xc5,0xf8,0x2f,0xc1] +; AVX1-NEXT: setae %al ## encoding: [0x0f,0x93,0xc0] +; AVX1-NEXT: ret{{[l|q]}} ## encoding: [0xc3] +; +; AVX512-LABEL: test_x86_sse_comige_ss: +; AVX512: ## %bb.0: +; AVX512-NEXT: xorl %eax, %eax ## encoding: 
[0x31,0xc0] +; AVX512-NEXT: vcomiss %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2f,0xc1] +; AVX512-NEXT: setae %al ## encoding: [0x0f,0x93,0xc0] +; AVX512-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call i32 @llvm.x86.sse.comige.ss(<4 x float> %a0, <4 x float> %a1) ; <i32> [#uses=1] ret i32 %res } @@ -101,21 +104,21 @@ define i32 @test_x86_sse_comigt_ss(<4 x float> %a0, <4 x float> %a1) { ; SSE-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0] ; SSE-NEXT: comiss %xmm1, %xmm0 ## encoding: [0x0f,0x2f,0xc1] ; SSE-NEXT: seta %al ## encoding: [0x0f,0x97,0xc0] -; SSE-NEXT: retl ## encoding: [0xc3] -; -; AVX2-LABEL: test_x86_sse_comigt_ss: -; AVX2: ## %bb.0: -; AVX2-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0] -; AVX2-NEXT: vcomiss %xmm1, %xmm0 ## encoding: [0xc5,0xf8,0x2f,0xc1] -; AVX2-NEXT: seta %al ## encoding: [0x0f,0x97,0xc0] -; AVX2-NEXT: retl ## encoding: [0xc3] -; -; SKX-LABEL: test_x86_sse_comigt_ss: -; SKX: ## %bb.0: -; SKX-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0] -; SKX-NEXT: vcomiss %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2f,0xc1] -; SKX-NEXT: seta %al ## encoding: [0x0f,0x97,0xc0] -; SKX-NEXT: retl ## encoding: [0xc3] +; SSE-NEXT: ret{{[l|q]}} ## encoding: [0xc3] +; +; AVX1-LABEL: test_x86_sse_comigt_ss: +; AVX1: ## %bb.0: +; AVX1-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0] +; AVX1-NEXT: vcomiss %xmm1, %xmm0 ## encoding: [0xc5,0xf8,0x2f,0xc1] +; AVX1-NEXT: seta %al ## encoding: [0x0f,0x97,0xc0] +; AVX1-NEXT: ret{{[l|q]}} ## encoding: [0xc3] +; +; AVX512-LABEL: test_x86_sse_comigt_ss: +; AVX512: ## %bb.0: +; AVX512-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0] +; AVX512-NEXT: vcomiss %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2f,0xc1] +; AVX512-NEXT: seta %al ## encoding: [0x0f,0x97,0xc0] +; AVX512-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call i32 @llvm.x86.sse.comigt.ss(<4 x float> %a0, <4 x float> %a1) ; <i32> [#uses=1] ret i32 %res } @@ -128,21 +131,21 @@ define i32 @test_x86_sse_comile_ss(<4 x float> %a0, <4 x float> %a1) { ; SSE-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0] ; SSE-NEXT: comiss %xmm0, %xmm1 ## encoding: [0x0f,0x2f,0xc8] ; SSE-NEXT: setae %al ## encoding: [0x0f,0x93,0xc0] -; SSE-NEXT: retl ## encoding: [0xc3] -; -; AVX2-LABEL: test_x86_sse_comile_ss: -; AVX2: ## %bb.0: -; AVX2-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0] -; AVX2-NEXT: vcomiss %xmm0, %xmm1 ## encoding: [0xc5,0xf8,0x2f,0xc8] -; AVX2-NEXT: setae %al ## encoding: [0x0f,0x93,0xc0] -; AVX2-NEXT: retl ## encoding: [0xc3] -; -; SKX-LABEL: test_x86_sse_comile_ss: -; SKX: ## %bb.0: -; SKX-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0] -; SKX-NEXT: vcomiss %xmm0, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2f,0xc8] -; SKX-NEXT: setae %al ## encoding: [0x0f,0x93,0xc0] -; SKX-NEXT: retl ## encoding: [0xc3] +; SSE-NEXT: ret{{[l|q]}} ## encoding: [0xc3] +; +; AVX1-LABEL: test_x86_sse_comile_ss: +; AVX1: ## %bb.0: +; AVX1-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0] +; AVX1-NEXT: vcomiss %xmm0, %xmm1 ## encoding: [0xc5,0xf8,0x2f,0xc8] +; AVX1-NEXT: setae %al ## encoding: [0x0f,0x93,0xc0] +; AVX1-NEXT: ret{{[l|q]}} ## encoding: [0xc3] +; +; AVX512-LABEL: test_x86_sse_comile_ss: +; AVX512: ## %bb.0: +; AVX512-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0] +; AVX512-NEXT: vcomiss %xmm0, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2f,0xc8] +; AVX512-NEXT: setae %al ## encoding: [0x0f,0x93,0xc0] +; AVX512-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call i32 @llvm.x86.sse.comile.ss(<4 x float> %a0, <4 x float> 
%a1) ; <i32> [#uses=1] ret i32 %res } @@ -155,21 +158,21 @@ define i32 @test_x86_sse_comilt_ss(<4 x float> %a0, <4 x float> %a1) { ; SSE-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0] ; SSE-NEXT: comiss %xmm0, %xmm1 ## encoding: [0x0f,0x2f,0xc8] ; SSE-NEXT: seta %al ## encoding: [0x0f,0x97,0xc0] -; SSE-NEXT: retl ## encoding: [0xc3] -; -; AVX2-LABEL: test_x86_sse_comilt_ss: -; AVX2: ## %bb.0: -; AVX2-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0] -; AVX2-NEXT: vcomiss %xmm0, %xmm1 ## encoding: [0xc5,0xf8,0x2f,0xc8] -; AVX2-NEXT: seta %al ## encoding: [0x0f,0x97,0xc0] -; AVX2-NEXT: retl ## encoding: [0xc3] -; -; SKX-LABEL: test_x86_sse_comilt_ss: -; SKX: ## %bb.0: -; SKX-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0] -; SKX-NEXT: vcomiss %xmm0, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2f,0xc8] -; SKX-NEXT: seta %al ## encoding: [0x0f,0x97,0xc0] -; SKX-NEXT: retl ## encoding: [0xc3] +; SSE-NEXT: ret{{[l|q]}} ## encoding: [0xc3] +; +; AVX1-LABEL: test_x86_sse_comilt_ss: +; AVX1: ## %bb.0: +; AVX1-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0] +; AVX1-NEXT: vcomiss %xmm0, %xmm1 ## encoding: [0xc5,0xf8,0x2f,0xc8] +; AVX1-NEXT: seta %al ## encoding: [0x0f,0x97,0xc0] +; AVX1-NEXT: ret{{[l|q]}} ## encoding: [0xc3] +; +; AVX512-LABEL: test_x86_sse_comilt_ss: +; AVX512: ## %bb.0: +; AVX512-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0] +; AVX512-NEXT: vcomiss %xmm0, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2f,0xc8] +; AVX512-NEXT: seta %al ## encoding: [0x0f,0x97,0xc0] +; AVX512-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call i32 @llvm.x86.sse.comilt.ss(<4 x float> %a0, <4 x float> %a1) ; <i32> [#uses=1] ret i32 %res } @@ -184,25 +187,25 @@ define i32 @test_x86_sse_comineq_ss(<4 x float> %a0, <4 x float> %a1) { ; SSE-NEXT: setne %cl ## encoding: [0x0f,0x95,0xc1] ; SSE-NEXT: orb %al, %cl ## encoding: [0x08,0xc1] ; SSE-NEXT: movzbl %cl, %eax ## encoding: [0x0f,0xb6,0xc1] -; SSE-NEXT: retl ## encoding: [0xc3] -; -; AVX2-LABEL: test_x86_sse_comineq_ss: -; AVX2: ## %bb.0: -; AVX2-NEXT: vcomiss %xmm1, %xmm0 ## encoding: [0xc5,0xf8,0x2f,0xc1] -; AVX2-NEXT: setp %al ## encoding: [0x0f,0x9a,0xc0] -; AVX2-NEXT: setne %cl ## encoding: [0x0f,0x95,0xc1] -; AVX2-NEXT: orb %al, %cl ## encoding: [0x08,0xc1] -; AVX2-NEXT: movzbl %cl, %eax ## encoding: [0x0f,0xb6,0xc1] -; AVX2-NEXT: retl ## encoding: [0xc3] -; -; SKX-LABEL: test_x86_sse_comineq_ss: -; SKX: ## %bb.0: -; SKX-NEXT: vcomiss %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2f,0xc1] -; SKX-NEXT: setp %al ## encoding: [0x0f,0x9a,0xc0] -; SKX-NEXT: setne %cl ## encoding: [0x0f,0x95,0xc1] -; SKX-NEXT: orb %al, %cl ## encoding: [0x08,0xc1] -; SKX-NEXT: movzbl %cl, %eax ## encoding: [0x0f,0xb6,0xc1] -; SKX-NEXT: retl ## encoding: [0xc3] +; SSE-NEXT: ret{{[l|q]}} ## encoding: [0xc3] +; +; AVX1-LABEL: test_x86_sse_comineq_ss: +; AVX1: ## %bb.0: +; AVX1-NEXT: vcomiss %xmm1, %xmm0 ## encoding: [0xc5,0xf8,0x2f,0xc1] +; AVX1-NEXT: setp %al ## encoding: [0x0f,0x9a,0xc0] +; AVX1-NEXT: setne %cl ## encoding: [0x0f,0x95,0xc1] +; AVX1-NEXT: orb %al, %cl ## encoding: [0x08,0xc1] +; AVX1-NEXT: movzbl %cl, %eax ## encoding: [0x0f,0xb6,0xc1] +; AVX1-NEXT: ret{{[l|q]}} ## encoding: [0xc3] +; +; AVX512-LABEL: test_x86_sse_comineq_ss: +; AVX512: ## %bb.0: +; AVX512-NEXT: vcomiss %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2f,0xc1] +; AVX512-NEXT: setp %al ## encoding: [0x0f,0x9a,0xc0] +; AVX512-NEXT: setne %cl ## encoding: [0x0f,0x95,0xc1] +; AVX512-NEXT: orb %al, %cl ## encoding: [0x08,0xc1] +; AVX512-NEXT: 
movzbl %cl, %eax ## encoding: [0x0f,0xb6,0xc1] +; AVX512-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call i32 @llvm.x86.sse.comineq.ss(<4 x float> %a0, <4 x float> %a1) ; <i32> [#uses=1] ret i32 %res } @@ -213,17 +216,17 @@ define i32 @test_x86_sse_cvtss2si(<4 x float> %a0) { ; SSE-LABEL: test_x86_sse_cvtss2si: ; SSE: ## %bb.0: ; SSE-NEXT: cvtss2si %xmm0, %eax ## encoding: [0xf3,0x0f,0x2d,0xc0] -; SSE-NEXT: retl ## encoding: [0xc3] +; SSE-NEXT: ret{{[l|q]}} ## encoding: [0xc3] ; -; AVX2-LABEL: test_x86_sse_cvtss2si: -; AVX2: ## %bb.0: -; AVX2-NEXT: vcvtss2si %xmm0, %eax ## encoding: [0xc5,0xfa,0x2d,0xc0] -; AVX2-NEXT: retl ## encoding: [0xc3] +; AVX1-LABEL: test_x86_sse_cvtss2si: +; AVX1: ## %bb.0: +; AVX1-NEXT: vcvtss2si %xmm0, %eax ## encoding: [0xc5,0xfa,0x2d,0xc0] +; AVX1-NEXT: ret{{[l|q]}} ## encoding: [0xc3] ; -; SKX-LABEL: test_x86_sse_cvtss2si: -; SKX: ## %bb.0: -; SKX-NEXT: vcvtss2si %xmm0, %eax ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x2d,0xc0] -; SKX-NEXT: retl ## encoding: [0xc3] +; AVX512-LABEL: test_x86_sse_cvtss2si: +; AVX512: ## %bb.0: +; AVX512-NEXT: vcvtss2si %xmm0, %eax ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x2d,0xc0] +; AVX512-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call i32 @llvm.x86.sse.cvtss2si(<4 x float> %a0) ; <i32> [#uses=1] ret i32 %res } @@ -234,17 +237,17 @@ define i32 @test_x86_sse_cvttss2si(<4 x float> %a0) { ; SSE-LABEL: test_x86_sse_cvttss2si: ; SSE: ## %bb.0: ; SSE-NEXT: cvttss2si %xmm0, %eax ## encoding: [0xf3,0x0f,0x2c,0xc0] -; SSE-NEXT: retl ## encoding: [0xc3] +; SSE-NEXT: ret{{[l|q]}} ## encoding: [0xc3] ; -; AVX2-LABEL: test_x86_sse_cvttss2si: -; AVX2: ## %bb.0: -; AVX2-NEXT: vcvttss2si %xmm0, %eax ## encoding: [0xc5,0xfa,0x2c,0xc0] -; AVX2-NEXT: retl ## encoding: [0xc3] +; AVX1-LABEL: test_x86_sse_cvttss2si: +; AVX1: ## %bb.0: +; AVX1-NEXT: vcvttss2si %xmm0, %eax ## encoding: [0xc5,0xfa,0x2c,0xc0] +; AVX1-NEXT: ret{{[l|q]}} ## encoding: [0xc3] ; -; SKX-LABEL: test_x86_sse_cvttss2si: -; SKX: ## %bb.0: -; SKX-NEXT: vcvttss2si %xmm0, %eax ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x2c,0xc0] -; SKX-NEXT: retl ## encoding: [0xc3] +; AVX512-LABEL: test_x86_sse_cvttss2si: +; AVX512: ## %bb.0: +; AVX512-NEXT: vcvttss2si %xmm0, %eax ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x2c,0xc0] +; AVX512-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call i32 @llvm.x86.sse.cvttss2si(<4 x float> %a0) ; <i32> [#uses=1] ret i32 %res } @@ -252,17 +255,27 @@ declare i32 @llvm.x86.sse.cvttss2si(<4 x float>) nounwind readnone define void @test_x86_sse_ldmxcsr(i8* %a0) { -; SSE-LABEL: test_x86_sse_ldmxcsr: -; SSE: ## %bb.0: -; SSE-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] -; SSE-NEXT: ldmxcsr (%eax) ## encoding: [0x0f,0xae,0x10] -; SSE-NEXT: retl ## encoding: [0xc3] -; -; VCHECK-LABEL: test_x86_sse_ldmxcsr: -; VCHECK: ## %bb.0: -; VCHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] -; VCHECK-NEXT: vldmxcsr (%eax) ## encoding: [0xc5,0xf8,0xae,0x10] -; VCHECK-NEXT: retl ## encoding: [0xc3] +; X86-SSE-LABEL: test_x86_sse_ldmxcsr: +; X86-SSE: ## %bb.0: +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] +; X86-SSE-NEXT: ldmxcsr (%eax) ## encoding: [0x0f,0xae,0x10] +; X86-SSE-NEXT: retl ## encoding: [0xc3] +; +; X86-AVX-LABEL: test_x86_sse_ldmxcsr: +; X86-AVX: ## %bb.0: +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] +; X86-AVX-NEXT: vldmxcsr (%eax) ## encoding: [0xc5,0xf8,0xae,0x10] +; X86-AVX-NEXT: retl ## encoding: [0xc3] +; +; 
X64-SSE-LABEL: test_x86_sse_ldmxcsr: +; X64-SSE: ## %bb.0: +; X64-SSE-NEXT: ldmxcsr (%rdi) ## encoding: [0x0f,0xae,0x17] +; X64-SSE-NEXT: retq ## encoding: [0xc3] +; +; X64-AVX-LABEL: test_x86_sse_ldmxcsr: +; X64-AVX: ## %bb.0: +; X64-AVX-NEXT: vldmxcsr (%rdi) ## encoding: [0xc5,0xf8,0xae,0x17] +; X64-AVX-NEXT: retq ## encoding: [0xc3] call void @llvm.x86.sse.ldmxcsr(i8* %a0) ret void } @@ -274,17 +287,17 @@ define <4 x float> @test_x86_sse_max_ps(<4 x float> %a0, <4 x float> %a1) { ; SSE-LABEL: test_x86_sse_max_ps: ; SSE: ## %bb.0: ; SSE-NEXT: maxps %xmm1, %xmm0 ## encoding: [0x0f,0x5f,0xc1] -; SSE-NEXT: retl ## encoding: [0xc3] +; SSE-NEXT: ret{{[l|q]}} ## encoding: [0xc3] ; -; AVX2-LABEL: test_x86_sse_max_ps: -; AVX2: ## %bb.0: -; AVX2-NEXT: vmaxps %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x5f,0xc1] -; AVX2-NEXT: retl ## encoding: [0xc3] +; AVX1-LABEL: test_x86_sse_max_ps: +; AVX1: ## %bb.0: +; AVX1-NEXT: vmaxps %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x5f,0xc1] +; AVX1-NEXT: ret{{[l|q]}} ## encoding: [0xc3] ; -; SKX-LABEL: test_x86_sse_max_ps: -; SKX: ## %bb.0: -; SKX-NEXT: vmaxps %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x5f,0xc1] -; SKX-NEXT: retl ## encoding: [0xc3] +; AVX512-LABEL: test_x86_sse_max_ps: +; AVX512: ## %bb.0: +; AVX512-NEXT: vmaxps %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x5f,0xc1] +; AVX512-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %a0, <4 x float> %a1) ; <<4 x float>> [#uses=1] ret <4 x float> %res } @@ -295,17 +308,17 @@ define <4 x float> @test_x86_sse_max_ss(<4 x float> %a0, <4 x float> %a1) { ; SSE-LABEL: test_x86_sse_max_ss: ; SSE: ## %bb.0: ; SSE-NEXT: maxss %xmm1, %xmm0 ## encoding: [0xf3,0x0f,0x5f,0xc1] -; SSE-NEXT: retl ## encoding: [0xc3] +; SSE-NEXT: ret{{[l|q]}} ## encoding: [0xc3] ; -; AVX2-LABEL: test_x86_sse_max_ss: -; AVX2: ## %bb.0: -; AVX2-NEXT: vmaxss %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xfa,0x5f,0xc1] -; AVX2-NEXT: retl ## encoding: [0xc3] +; AVX1-LABEL: test_x86_sse_max_ss: +; AVX1: ## %bb.0: +; AVX1-NEXT: vmaxss %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xfa,0x5f,0xc1] +; AVX1-NEXT: ret{{[l|q]}} ## encoding: [0xc3] ; -; SKX-LABEL: test_x86_sse_max_ss: -; SKX: ## %bb.0: -; SKX-NEXT: vmaxss %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x5f,0xc1] -; SKX-NEXT: retl ## encoding: [0xc3] +; AVX512-LABEL: test_x86_sse_max_ss: +; AVX512: ## %bb.0: +; AVX512-NEXT: vmaxss %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x5f,0xc1] +; AVX512-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call <4 x float> @llvm.x86.sse.max.ss(<4 x float> %a0, <4 x float> %a1) ; <<4 x float>> [#uses=1] ret <4 x float> %res } @@ -316,17 +329,17 @@ define <4 x float> @test_x86_sse_min_ps(<4 x float> %a0, <4 x float> %a1) { ; SSE-LABEL: test_x86_sse_min_ps: ; SSE: ## %bb.0: ; SSE-NEXT: minps %xmm1, %xmm0 ## encoding: [0x0f,0x5d,0xc1] -; SSE-NEXT: retl ## encoding: [0xc3] +; SSE-NEXT: ret{{[l|q]}} ## encoding: [0xc3] ; -; AVX2-LABEL: test_x86_sse_min_ps: -; AVX2: ## %bb.0: -; AVX2-NEXT: vminps %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x5d,0xc1] -; AVX2-NEXT: retl ## encoding: [0xc3] +; AVX1-LABEL: test_x86_sse_min_ps: +; AVX1: ## %bb.0: +; AVX1-NEXT: vminps %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x5d,0xc1] +; AVX1-NEXT: ret{{[l|q]}} ## encoding: [0xc3] ; -; SKX-LABEL: test_x86_sse_min_ps: -; SKX: ## %bb.0: -; SKX-NEXT: vminps %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x5d,0xc1] -; SKX-NEXT: 
retl ## encoding: [0xc3] +; AVX512-LABEL: test_x86_sse_min_ps: +; AVX512: ## %bb.0: +; AVX512-NEXT: vminps %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x5d,0xc1] +; AVX512-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %a0, <4 x float> %a1) ; <<4 x float>> [#uses=1] ret <4 x float> %res } @@ -337,17 +350,17 @@ define <4 x float> @test_x86_sse_min_ss(<4 x float> %a0, <4 x float> %a1) { ; SSE-LABEL: test_x86_sse_min_ss: ; SSE: ## %bb.0: ; SSE-NEXT: minss %xmm1, %xmm0 ## encoding: [0xf3,0x0f,0x5d,0xc1] -; SSE-NEXT: retl ## encoding: [0xc3] +; SSE-NEXT: ret{{[l|q]}} ## encoding: [0xc3] ; -; AVX2-LABEL: test_x86_sse_min_ss: -; AVX2: ## %bb.0: -; AVX2-NEXT: vminss %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xfa,0x5d,0xc1] -; AVX2-NEXT: retl ## encoding: [0xc3] +; AVX1-LABEL: test_x86_sse_min_ss: +; AVX1: ## %bb.0: +; AVX1-NEXT: vminss %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xfa,0x5d,0xc1] +; AVX1-NEXT: ret{{[l|q]}} ## encoding: [0xc3] ; -; SKX-LABEL: test_x86_sse_min_ss: -; SKX: ## %bb.0: -; SKX-NEXT: vminss %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x5d,0xc1] -; SKX-NEXT: retl ## encoding: [0xc3] +; AVX512-LABEL: test_x86_sse_min_ss: +; AVX512: ## %bb.0: +; AVX512-NEXT: vminss %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x5d,0xc1] +; AVX512-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call <4 x float> @llvm.x86.sse.min.ss(<4 x float> %a0, <4 x float> %a1) ; <<4 x float>> [#uses=1] ret <4 x float> %res } @@ -358,12 +371,12 @@ define i32 @test_x86_sse_movmsk_ps(<4 x float> %a0) { ; SSE-LABEL: test_x86_sse_movmsk_ps: ; SSE: ## %bb.0: ; SSE-NEXT: movmskps %xmm0, %eax ## encoding: [0x0f,0x50,0xc0] -; SSE-NEXT: retl ## encoding: [0xc3] +; SSE-NEXT: ret{{[l|q]}} ## encoding: [0xc3] ; -; VCHECK-LABEL: test_x86_sse_movmsk_ps: -; VCHECK: ## %bb.0: -; VCHECK-NEXT: vmovmskps %xmm0, %eax ## encoding: [0xc5,0xf8,0x50,0xc0] -; VCHECK-NEXT: retl ## encoding: [0xc3] +; AVX-LABEL: test_x86_sse_movmsk_ps: +; AVX: ## %bb.0: +; AVX-NEXT: vmovmskps %xmm0, %eax ## encoding: [0xc5,0xf8,0x50,0xc0] +; AVX-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %a0) ; <i32> [#uses=1] ret i32 %res } @@ -375,12 +388,12 @@ define <4 x float> @test_x86_sse_rcp_ps(<4 x float> %a0) { ; SSE-LABEL: test_x86_sse_rcp_ps: ; SSE: ## %bb.0: ; SSE-NEXT: rcpps %xmm0, %xmm0 ## encoding: [0x0f,0x53,0xc0] -; SSE-NEXT: retl ## encoding: [0xc3] +; SSE-NEXT: ret{{[l|q]}} ## encoding: [0xc3] ; -; VCHECK-LABEL: test_x86_sse_rcp_ps: -; VCHECK: ## %bb.0: -; VCHECK-NEXT: vrcpps %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x53,0xc0] -; VCHECK-NEXT: retl ## encoding: [0xc3] +; AVX-LABEL: test_x86_sse_rcp_ps: +; AVX: ## %bb.0: +; AVX-NEXT: vrcpps %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x53,0xc0] +; AVX-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call <4 x float> @llvm.x86.sse.rcp.ps(<4 x float> %a0) ; <<4 x float>> [#uses=1] ret <4 x float> %res } @@ -391,12 +404,12 @@ define <4 x float> @test_x86_sse_rcp_ss(<4 x float> %a0) { ; SSE-LABEL: test_x86_sse_rcp_ss: ; SSE: ## %bb.0: ; SSE-NEXT: rcpss %xmm0, %xmm0 ## encoding: [0xf3,0x0f,0x53,0xc0] -; SSE-NEXT: retl ## encoding: [0xc3] +; SSE-NEXT: ret{{[l|q]}} ## encoding: [0xc3] ; -; VCHECK-LABEL: test_x86_sse_rcp_ss: -; VCHECK: ## %bb.0: -; VCHECK-NEXT: vrcpss %xmm0, %xmm0, %xmm0 ## encoding: [0xc5,0xfa,0x53,0xc0] -; VCHECK-NEXT: retl ## encoding: [0xc3] +; AVX-LABEL: test_x86_sse_rcp_ss: +; AVX: ## %bb.0: +; AVX-NEXT: vrcpss %xmm0, %xmm0, %xmm0 ## encoding: 
[0xc5,0xfa,0x53,0xc0] +; AVX-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> %a0) ; <<4 x float>> [#uses=1] ret <4 x float> %res } @@ -407,12 +420,12 @@ define <4 x float> @test_x86_sse_rsqrt_ps(<4 x float> %a0) { ; SSE-LABEL: test_x86_sse_rsqrt_ps: ; SSE: ## %bb.0: ; SSE-NEXT: rsqrtps %xmm0, %xmm0 ## encoding: [0x0f,0x52,0xc0] -; SSE-NEXT: retl ## encoding: [0xc3] +; SSE-NEXT: ret{{[l|q]}} ## encoding: [0xc3] ; -; VCHECK-LABEL: test_x86_sse_rsqrt_ps: -; VCHECK: ## %bb.0: -; VCHECK-NEXT: vrsqrtps %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x52,0xc0] -; VCHECK-NEXT: retl ## encoding: [0xc3] +; AVX-LABEL: test_x86_sse_rsqrt_ps: +; AVX: ## %bb.0: +; AVX-NEXT: vrsqrtps %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x52,0xc0] +; AVX-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float> %a0) ; <<4 x float>> [#uses=1] ret <4 x float> %res } @@ -423,12 +436,12 @@ define <4 x float> @test_x86_sse_rsqrt_ss(<4 x float> %a0) { ; SSE-LABEL: test_x86_sse_rsqrt_ss: ; SSE: ## %bb.0: ; SSE-NEXT: rsqrtss %xmm0, %xmm0 ## encoding: [0xf3,0x0f,0x52,0xc0] -; SSE-NEXT: retl ## encoding: [0xc3] +; SSE-NEXT: ret{{[l|q]}} ## encoding: [0xc3] ; -; VCHECK-LABEL: test_x86_sse_rsqrt_ss: -; VCHECK: ## %bb.0: -; VCHECK-NEXT: vrsqrtss %xmm0, %xmm0, %xmm0 ## encoding: [0xc5,0xfa,0x52,0xc0] -; VCHECK-NEXT: retl ## encoding: [0xc3] +; AVX-LABEL: test_x86_sse_rsqrt_ss: +; AVX: ## %bb.0: +; AVX-NEXT: vrsqrtss %xmm0, %xmm0, %xmm0 ## encoding: [0xc5,0xfa,0x52,0xc0] +; AVX-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float> %a0) ; <<4 x float>> [#uses=1] ret <4 x float> %res } @@ -439,17 +452,17 @@ define <4 x float> @test_x86_sse_sqrt_ps(<4 x float> %a0) { ; SSE-LABEL: test_x86_sse_sqrt_ps: ; SSE: ## %bb.0: ; SSE-NEXT: sqrtps %xmm0, %xmm0 ## encoding: [0x0f,0x51,0xc0] -; SSE-NEXT: retl ## encoding: [0xc3] +; SSE-NEXT: ret{{[l|q]}} ## encoding: [0xc3] ; -; AVX2-LABEL: test_x86_sse_sqrt_ps: -; AVX2: ## %bb.0: -; AVX2-NEXT: vsqrtps %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x51,0xc0] -; AVX2-NEXT: retl ## encoding: [0xc3] +; AVX1-LABEL: test_x86_sse_sqrt_ps: +; AVX1: ## %bb.0: +; AVX1-NEXT: vsqrtps %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x51,0xc0] +; AVX1-NEXT: ret{{[l|q]}} ## encoding: [0xc3] ; -; SKX-LABEL: test_x86_sse_sqrt_ps: -; SKX: ## %bb.0: -; SKX-NEXT: vsqrtps %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x51,0xc0] -; SKX-NEXT: retl ## encoding: [0xc3] +; AVX512-LABEL: test_x86_sse_sqrt_ps: +; AVX512: ## %bb.0: +; AVX512-NEXT: vsqrtps %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x51,0xc0] +; AVX512-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float> %a0) ; <<4 x float>> [#uses=1] ret <4 x float> %res } @@ -460,17 +473,17 @@ define <4 x float> @test_x86_sse_sqrt_ss(<4 x float> %a0) { ; SSE-LABEL: test_x86_sse_sqrt_ss: ; SSE: ## %bb.0: ; SSE-NEXT: sqrtss %xmm0, %xmm0 ## encoding: [0xf3,0x0f,0x51,0xc0] -; SSE-NEXT: retl ## encoding: [0xc3] +; SSE-NEXT: ret{{[l|q]}} ## encoding: [0xc3] ; -; AVX2-LABEL: test_x86_sse_sqrt_ss: -; AVX2: ## %bb.0: -; AVX2-NEXT: vsqrtss %xmm0, %xmm0, %xmm0 ## encoding: [0xc5,0xfa,0x51,0xc0] -; AVX2-NEXT: retl ## encoding: [0xc3] +; AVX1-LABEL: test_x86_sse_sqrt_ss: +; AVX1: ## %bb.0: +; AVX1-NEXT: vsqrtss %xmm0, %xmm0, %xmm0 ## encoding: [0xc5,0xfa,0x51,0xc0] +; AVX1-NEXT: ret{{[l|q]}} ## encoding: [0xc3] ; -; SKX-LABEL: test_x86_sse_sqrt_ss: -; SKX: ## %bb.0: -; SKX-NEXT: vsqrtss %xmm0, %xmm0, %xmm0 ## EVEX 
TO VEX Compression encoding: [0xc5,0xfa,0x51,0xc0] -; SKX-NEXT: retl ## encoding: [0xc3] +; AVX512-LABEL: test_x86_sse_sqrt_ss: +; AVX512: ## %bb.0: +; AVX512-NEXT: vsqrtss %xmm0, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x51,0xc0] +; AVX512-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float> %a0) ; <<4 x float>> [#uses=1] ret <4 x float> %res } @@ -478,17 +491,27 @@ declare <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float>) nounwind readnone define void @test_x86_sse_stmxcsr(i8* %a0) { -; SSE-LABEL: test_x86_sse_stmxcsr: -; SSE: ## %bb.0: -; SSE-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] -; SSE-NEXT: stmxcsr (%eax) ## encoding: [0x0f,0xae,0x18] -; SSE-NEXT: retl ## encoding: [0xc3] -; -; VCHECK-LABEL: test_x86_sse_stmxcsr: -; VCHECK: ## %bb.0: -; VCHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] -; VCHECK-NEXT: vstmxcsr (%eax) ## encoding: [0xc5,0xf8,0xae,0x18] -; VCHECK-NEXT: retl ## encoding: [0xc3] +; X86-SSE-LABEL: test_x86_sse_stmxcsr: +; X86-SSE: ## %bb.0: +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] +; X86-SSE-NEXT: stmxcsr (%eax) ## encoding: [0x0f,0xae,0x18] +; X86-SSE-NEXT: retl ## encoding: [0xc3] +; +; X86-AVX-LABEL: test_x86_sse_stmxcsr: +; X86-AVX: ## %bb.0: +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] +; X86-AVX-NEXT: vstmxcsr (%eax) ## encoding: [0xc5,0xf8,0xae,0x18] +; X86-AVX-NEXT: retl ## encoding: [0xc3] +; +; X64-SSE-LABEL: test_x86_sse_stmxcsr: +; X64-SSE: ## %bb.0: +; X64-SSE-NEXT: stmxcsr (%rdi) ## encoding: [0x0f,0xae,0x1f] +; X64-SSE-NEXT: retq ## encoding: [0xc3] +; +; X64-AVX-LABEL: test_x86_sse_stmxcsr: +; X64-AVX: ## %bb.0: +; X64-AVX-NEXT: vstmxcsr (%rdi) ## encoding: [0xc5,0xf8,0xae,0x1f] +; X64-AVX-NEXT: retq ## encoding: [0xc3] call void @llvm.x86.sse.stmxcsr(i8* %a0) ret void } @@ -503,25 +526,25 @@ define i32 @test_x86_sse_ucomieq_ss(<4 x float> %a0, <4 x float> %a1) { ; SSE-NEXT: sete %cl ## encoding: [0x0f,0x94,0xc1] ; SSE-NEXT: andb %al, %cl ## encoding: [0x20,0xc1] ; SSE-NEXT: movzbl %cl, %eax ## encoding: [0x0f,0xb6,0xc1] -; SSE-NEXT: retl ## encoding: [0xc3] -; -; AVX2-LABEL: test_x86_sse_ucomieq_ss: -; AVX2: ## %bb.0: -; AVX2-NEXT: vucomiss %xmm1, %xmm0 ## encoding: [0xc5,0xf8,0x2e,0xc1] -; AVX2-NEXT: setnp %al ## encoding: [0x0f,0x9b,0xc0] -; AVX2-NEXT: sete %cl ## encoding: [0x0f,0x94,0xc1] -; AVX2-NEXT: andb %al, %cl ## encoding: [0x20,0xc1] -; AVX2-NEXT: movzbl %cl, %eax ## encoding: [0x0f,0xb6,0xc1] -; AVX2-NEXT: retl ## encoding: [0xc3] -; -; SKX-LABEL: test_x86_sse_ucomieq_ss: -; SKX: ## %bb.0: -; SKX-NEXT: vucomiss %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2e,0xc1] -; SKX-NEXT: setnp %al ## encoding: [0x0f,0x9b,0xc0] -; SKX-NEXT: sete %cl ## encoding: [0x0f,0x94,0xc1] -; SKX-NEXT: andb %al, %cl ## encoding: [0x20,0xc1] -; SKX-NEXT: movzbl %cl, %eax ## encoding: [0x0f,0xb6,0xc1] -; SKX-NEXT: retl ## encoding: [0xc3] +; SSE-NEXT: ret{{[l|q]}} ## encoding: [0xc3] +; +; AVX1-LABEL: test_x86_sse_ucomieq_ss: +; AVX1: ## %bb.0: +; AVX1-NEXT: vucomiss %xmm1, %xmm0 ## encoding: [0xc5,0xf8,0x2e,0xc1] +; AVX1-NEXT: setnp %al ## encoding: [0x0f,0x9b,0xc0] +; AVX1-NEXT: sete %cl ## encoding: [0x0f,0x94,0xc1] +; AVX1-NEXT: andb %al, %cl ## encoding: [0x20,0xc1] +; AVX1-NEXT: movzbl %cl, %eax ## encoding: [0x0f,0xb6,0xc1] +; AVX1-NEXT: ret{{[l|q]}} ## encoding: [0xc3] +; +; AVX512-LABEL: test_x86_sse_ucomieq_ss: +; AVX512: ## %bb.0: +; AVX512-NEXT: 
vucomiss %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2e,0xc1] +; AVX512-NEXT: setnp %al ## encoding: [0x0f,0x9b,0xc0] +; AVX512-NEXT: sete %cl ## encoding: [0x0f,0x94,0xc1] +; AVX512-NEXT: andb %al, %cl ## encoding: [0x20,0xc1] +; AVX512-NEXT: movzbl %cl, %eax ## encoding: [0x0f,0xb6,0xc1] +; AVX512-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call i32 @llvm.x86.sse.ucomieq.ss(<4 x float> %a0, <4 x float> %a1) ; <i32> [#uses=1] ret i32 %res } @@ -534,21 +557,21 @@ define i32 @test_x86_sse_ucomige_ss(<4 x float> %a0, <4 x float> %a1) { ; SSE-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0] ; SSE-NEXT: ucomiss %xmm1, %xmm0 ## encoding: [0x0f,0x2e,0xc1] ; SSE-NEXT: setae %al ## encoding: [0x0f,0x93,0xc0] -; SSE-NEXT: retl ## encoding: [0xc3] -; -; AVX2-LABEL: test_x86_sse_ucomige_ss: -; AVX2: ## %bb.0: -; AVX2-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0] -; AVX2-NEXT: vucomiss %xmm1, %xmm0 ## encoding: [0xc5,0xf8,0x2e,0xc1] -; AVX2-NEXT: setae %al ## encoding: [0x0f,0x93,0xc0] -; AVX2-NEXT: retl ## encoding: [0xc3] -; -; SKX-LABEL: test_x86_sse_ucomige_ss: -; SKX: ## %bb.0: -; SKX-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0] -; SKX-NEXT: vucomiss %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2e,0xc1] -; SKX-NEXT: setae %al ## encoding: [0x0f,0x93,0xc0] -; SKX-NEXT: retl ## encoding: [0xc3] +; SSE-NEXT: ret{{[l|q]}} ## encoding: [0xc3] +; +; AVX1-LABEL: test_x86_sse_ucomige_ss: +; AVX1: ## %bb.0: +; AVX1-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0] +; AVX1-NEXT: vucomiss %xmm1, %xmm0 ## encoding: [0xc5,0xf8,0x2e,0xc1] +; AVX1-NEXT: setae %al ## encoding: [0x0f,0x93,0xc0] +; AVX1-NEXT: ret{{[l|q]}} ## encoding: [0xc3] +; +; AVX512-LABEL: test_x86_sse_ucomige_ss: +; AVX512: ## %bb.0: +; AVX512-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0] +; AVX512-NEXT: vucomiss %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2e,0xc1] +; AVX512-NEXT: setae %al ## encoding: [0x0f,0x93,0xc0] +; AVX512-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call i32 @llvm.x86.sse.ucomige.ss(<4 x float> %a0, <4 x float> %a1) ; <i32> [#uses=1] ret i32 %res } @@ -561,21 +584,21 @@ define i32 @test_x86_sse_ucomigt_ss(<4 x float> %a0, <4 x float> %a1) { ; SSE-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0] ; SSE-NEXT: ucomiss %xmm1, %xmm0 ## encoding: [0x0f,0x2e,0xc1] ; SSE-NEXT: seta %al ## encoding: [0x0f,0x97,0xc0] -; SSE-NEXT: retl ## encoding: [0xc3] -; -; AVX2-LABEL: test_x86_sse_ucomigt_ss: -; AVX2: ## %bb.0: -; AVX2-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0] -; AVX2-NEXT: vucomiss %xmm1, %xmm0 ## encoding: [0xc5,0xf8,0x2e,0xc1] -; AVX2-NEXT: seta %al ## encoding: [0x0f,0x97,0xc0] -; AVX2-NEXT: retl ## encoding: [0xc3] -; -; SKX-LABEL: test_x86_sse_ucomigt_ss: -; SKX: ## %bb.0: -; SKX-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0] -; SKX-NEXT: vucomiss %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2e,0xc1] -; SKX-NEXT: seta %al ## encoding: [0x0f,0x97,0xc0] -; SKX-NEXT: retl ## encoding: [0xc3] +; SSE-NEXT: ret{{[l|q]}} ## encoding: [0xc3] +; +; AVX1-LABEL: test_x86_sse_ucomigt_ss: +; AVX1: ## %bb.0: +; AVX1-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0] +; AVX1-NEXT: vucomiss %xmm1, %xmm0 ## encoding: [0xc5,0xf8,0x2e,0xc1] +; AVX1-NEXT: seta %al ## encoding: [0x0f,0x97,0xc0] +; AVX1-NEXT: ret{{[l|q]}} ## encoding: [0xc3] +; +; AVX512-LABEL: test_x86_sse_ucomigt_ss: +; AVX512: ## %bb.0: +; AVX512-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0] +; AVX512-NEXT: vucomiss %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2e,0xc1] +; 
AVX512-NEXT: seta %al ## encoding: [0x0f,0x97,0xc0] +; AVX512-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call i32 @llvm.x86.sse.ucomigt.ss(<4 x float> %a0, <4 x float> %a1) ; <i32> [#uses=1] ret i32 %res } @@ -588,21 +611,21 @@ define i32 @test_x86_sse_ucomile_ss(<4 x float> %a0, <4 x float> %a1) { ; SSE-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0] ; SSE-NEXT: ucomiss %xmm0, %xmm1 ## encoding: [0x0f,0x2e,0xc8] ; SSE-NEXT: setae %al ## encoding: [0x0f,0x93,0xc0] -; SSE-NEXT: retl ## encoding: [0xc3] -; -; AVX2-LABEL: test_x86_sse_ucomile_ss: -; AVX2: ## %bb.0: -; AVX2-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0] -; AVX2-NEXT: vucomiss %xmm0, %xmm1 ## encoding: [0xc5,0xf8,0x2e,0xc8] -; AVX2-NEXT: setae %al ## encoding: [0x0f,0x93,0xc0] -; AVX2-NEXT: retl ## encoding: [0xc3] -; -; SKX-LABEL: test_x86_sse_ucomile_ss: -; SKX: ## %bb.0: -; SKX-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0] -; SKX-NEXT: vucomiss %xmm0, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2e,0xc8] -; SKX-NEXT: setae %al ## encoding: [0x0f,0x93,0xc0] -; SKX-NEXT: retl ## encoding: [0xc3] +; SSE-NEXT: ret{{[l|q]}} ## encoding: [0xc3] +; +; AVX1-LABEL: test_x86_sse_ucomile_ss: +; AVX1: ## %bb.0: +; AVX1-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0] +; AVX1-NEXT: vucomiss %xmm0, %xmm1 ## encoding: [0xc5,0xf8,0x2e,0xc8] +; AVX1-NEXT: setae %al ## encoding: [0x0f,0x93,0xc0] +; AVX1-NEXT: ret{{[l|q]}} ## encoding: [0xc3] +; +; AVX512-LABEL: test_x86_sse_ucomile_ss: +; AVX512: ## %bb.0: +; AVX512-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0] +; AVX512-NEXT: vucomiss %xmm0, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2e,0xc8] +; AVX512-NEXT: setae %al ## encoding: [0x0f,0x93,0xc0] +; AVX512-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call i32 @llvm.x86.sse.ucomile.ss(<4 x float> %a0, <4 x float> %a1) ; <i32> [#uses=1] ret i32 %res } @@ -615,21 +638,21 @@ define i32 @test_x86_sse_ucomilt_ss(<4 x float> %a0, <4 x float> %a1) { ; SSE-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0] ; SSE-NEXT: ucomiss %xmm0, %xmm1 ## encoding: [0x0f,0x2e,0xc8] ; SSE-NEXT: seta %al ## encoding: [0x0f,0x97,0xc0] -; SSE-NEXT: retl ## encoding: [0xc3] -; -; AVX2-LABEL: test_x86_sse_ucomilt_ss: -; AVX2: ## %bb.0: -; AVX2-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0] -; AVX2-NEXT: vucomiss %xmm0, %xmm1 ## encoding: [0xc5,0xf8,0x2e,0xc8] -; AVX2-NEXT: seta %al ## encoding: [0x0f,0x97,0xc0] -; AVX2-NEXT: retl ## encoding: [0xc3] -; -; SKX-LABEL: test_x86_sse_ucomilt_ss: -; SKX: ## %bb.0: -; SKX-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0] -; SKX-NEXT: vucomiss %xmm0, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2e,0xc8] -; SKX-NEXT: seta %al ## encoding: [0x0f,0x97,0xc0] -; SKX-NEXT: retl ## encoding: [0xc3] +; SSE-NEXT: ret{{[l|q]}} ## encoding: [0xc3] +; +; AVX1-LABEL: test_x86_sse_ucomilt_ss: +; AVX1: ## %bb.0: +; AVX1-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0] +; AVX1-NEXT: vucomiss %xmm0, %xmm1 ## encoding: [0xc5,0xf8,0x2e,0xc8] +; AVX1-NEXT: seta %al ## encoding: [0x0f,0x97,0xc0] +; AVX1-NEXT: ret{{[l|q]}} ## encoding: [0xc3] +; +; AVX512-LABEL: test_x86_sse_ucomilt_ss: +; AVX512: ## %bb.0: +; AVX512-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0] +; AVX512-NEXT: vucomiss %xmm0, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2e,0xc8] +; AVX512-NEXT: seta %al ## encoding: [0x0f,0x97,0xc0] +; AVX512-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call i32 @llvm.x86.sse.ucomilt.ss(<4 x float> %a0, <4 x float> %a1) ; <i32> [#uses=1] ret i32 %res } @@ -644,25 +667,25 @@ define i32 
@test_x86_sse_ucomineq_ss(<4 x float> %a0, <4 x float> %a1) { ; SSE-NEXT: setne %cl ## encoding: [0x0f,0x95,0xc1] ; SSE-NEXT: orb %al, %cl ## encoding: [0x08,0xc1] ; SSE-NEXT: movzbl %cl, %eax ## encoding: [0x0f,0xb6,0xc1] -; SSE-NEXT: retl ## encoding: [0xc3] -; -; AVX2-LABEL: test_x86_sse_ucomineq_ss: -; AVX2: ## %bb.0: -; AVX2-NEXT: vucomiss %xmm1, %xmm0 ## encoding: [0xc5,0xf8,0x2e,0xc1] -; AVX2-NEXT: setp %al ## encoding: [0x0f,0x9a,0xc0] -; AVX2-NEXT: setne %cl ## encoding: [0x0f,0x95,0xc1] -; AVX2-NEXT: orb %al, %cl ## encoding: [0x08,0xc1] -; AVX2-NEXT: movzbl %cl, %eax ## encoding: [0x0f,0xb6,0xc1] -; AVX2-NEXT: retl ## encoding: [0xc3] -; -; SKX-LABEL: test_x86_sse_ucomineq_ss: -; SKX: ## %bb.0: -; SKX-NEXT: vucomiss %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2e,0xc1] -; SKX-NEXT: setp %al ## encoding: [0x0f,0x9a,0xc0] -; SKX-NEXT: setne %cl ## encoding: [0x0f,0x95,0xc1] -; SKX-NEXT: orb %al, %cl ## encoding: [0x08,0xc1] -; SKX-NEXT: movzbl %cl, %eax ## encoding: [0x0f,0xb6,0xc1] -; SKX-NEXT: retl ## encoding: [0xc3] +; SSE-NEXT: ret{{[l|q]}} ## encoding: [0xc3] +; +; AVX1-LABEL: test_x86_sse_ucomineq_ss: +; AVX1: ## %bb.0: +; AVX1-NEXT: vucomiss %xmm1, %xmm0 ## encoding: [0xc5,0xf8,0x2e,0xc1] +; AVX1-NEXT: setp %al ## encoding: [0x0f,0x9a,0xc0] +; AVX1-NEXT: setne %cl ## encoding: [0x0f,0x95,0xc1] +; AVX1-NEXT: orb %al, %cl ## encoding: [0x08,0xc1] +; AVX1-NEXT: movzbl %cl, %eax ## encoding: [0x0f,0xb6,0xc1] +; AVX1-NEXT: ret{{[l|q]}} ## encoding: [0xc3] +; +; AVX512-LABEL: test_x86_sse_ucomineq_ss: +; AVX512: ## %bb.0: +; AVX512-NEXT: vucomiss %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2e,0xc1] +; AVX512-NEXT: setp %al ## encoding: [0x0f,0x9a,0xc0] +; AVX512-NEXT: setne %cl ## encoding: [0x0f,0x95,0xc1] +; AVX512-NEXT: orb %al, %cl ## encoding: [0x08,0xc1] +; AVX512-NEXT: movzbl %cl, %eax ## encoding: [0x0f,0xb6,0xc1] +; AVX512-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call i32 @llvm.x86.sse.ucomineq.ss(<4 x float> %a0, <4 x float> %a1) ; <i32> [#uses=1] ret i32 %res } @@ -670,15 +693,10 @@ declare i32 @llvm.x86.sse.ucomineq.ss(<4 x float>, <4 x float>) nounwind readnon define void @sfence() nounwind { -; SSE-LABEL: sfence: -; SSE: ## %bb.0: -; SSE-NEXT: sfence ## encoding: [0x0f,0xae,0xf8] -; SSE-NEXT: retl ## encoding: [0xc3] -; -; VCHECK-LABEL: sfence: -; VCHECK: ## %bb.0: -; VCHECK-NEXT: sfence ## encoding: [0x0f,0xae,0xf8] -; VCHECK-NEXT: retl ## encoding: [0xc3] +; CHECK-LABEL: sfence: +; CHECK: ## %bb.0: +; CHECK-NEXT: sfence ## encoding: [0x0f,0xae,0xf8] +; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] tail call void @llvm.x86.sse.sfence() ret void } diff --git a/llvm/test/CodeGen/X86/sse-intrinsics-x86_64-upgrade.ll b/llvm/test/CodeGen/X86/sse-intrinsics-x86_64-upgrade.ll index 1ce5bbf94dd..3de61c5e55d 100644 --- a/llvm/test/CodeGen/X86/sse-intrinsics-x86_64-upgrade.ll +++ b/llvm/test/CodeGen/X86/sse-intrinsics-x86_64-upgrade.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=-avx,+sse -show-mc-encoding | FileCheck %s --check-prefix=SSE -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx2 -show-mc-encoding | FileCheck %s --check-prefix=VCHECK --check-prefix=AVX2 -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=skx -show-mc-encoding | FileCheck %s --check-prefix=VCHECK --check-prefix=SKX +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=-sse2 -show-mc-encoding | FileCheck %s --check-prefixes=CHECK,SSE +; RUN: llc < %s 
@@ -9,15 +9,15 @@ define <4 x float> @test_x86_sse_cvtsi642ss(<4 x float> %a0, i64 %a1) {
; SSE-NEXT: cvtsi2ssq %rdi, %xmm0 ## encoding: [0xf3,0x48,0x0f,0x2a,0xc7]
; SSE-NEXT: retq ## encoding: [0xc3]
;
-; AVX2-LABEL: test_x86_sse_cvtsi642ss:
-; AVX2: ## %bb.0:
-; AVX2-NEXT: vcvtsi2ssq %rdi, %xmm0, %xmm0 ## encoding: [0xc4,0xe1,0xfa,0x2a,0xc7]
-; AVX2-NEXT: retq ## encoding: [0xc3]
+; AVX1-LABEL: test_x86_sse_cvtsi642ss:
+; AVX1: ## %bb.0:
+; AVX1-NEXT: vcvtsi2ssq %rdi, %xmm0, %xmm0 ## encoding: [0xc4,0xe1,0xfa,0x2a,0xc7]
+; AVX1-NEXT: retq ## encoding: [0xc3]
;
-; SKX-LABEL: test_x86_sse_cvtsi642ss:
-; SKX: ## %bb.0:
-; SKX-NEXT: vcvtsi2ssq %rdi, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe1,0xfa,0x2a,0xc7]
-; SKX-NEXT: retq ## encoding: [0xc3]
+; AVX512-LABEL: test_x86_sse_cvtsi642ss:
+; AVX512: ## %bb.0:
+; AVX512-NEXT: vcvtsi2ssq %rdi, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe1,0xfa,0x2a,0xc7]
+; AVX512-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.sse.cvtsi642ss(<4 x float> %a0, i64 %a1) ; <<4 x float>> [#uses=1]
ret <4 x float> %res
}
diff --git a/llvm/test/CodeGen/X86/sse-intrinsics-x86_64.ll b/llvm/test/CodeGen/X86/sse-intrinsics-x86_64.ll
index 161047ccfe9..6851abc286a 100644
--- a/llvm/test/CodeGen/X86/sse-intrinsics-x86_64.ll
+++ b/llvm/test/CodeGen/X86/sse-intrinsics-x86_64.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=-avx,+sse -show-mc-encoding | FileCheck %s --check-prefix=SSE
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx2 -show-mc-encoding | FileCheck %s --check-prefix=VCHECK --check-prefix=AVX2
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=skx -show-mc-encoding | FileCheck %s --check-prefix=VCHECK --check-prefix=SKX
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=-sse2 -show-mc-encoding | FileCheck %s --check-prefixes=CHECK,SSE
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx -show-mc-encoding | FileCheck %s --check-prefixes=CHECK,AVX,AVX1
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f,+avx512bw,+avx512dq,+avx512vl -show-mc-encoding | FileCheck %s --check-prefixes=CHECK,AVX,AVX512
define i64 @test_x86_sse_cvtss2si64(<4 x float> %a0) {
; SSE-LABEL: test_x86_sse_cvtss2si64:
@@ -9,15 +9,15 @@ define i64 @test_x86_sse_cvtss2si64(<4 x float> %a0) {
; SSE-NEXT: cvtss2si %xmm0, %rax ## encoding: [0xf3,0x48,0x0f,0x2d,0xc0]
; SSE-NEXT: retq ## encoding: [0xc3]
;
-; AVX2-LABEL: test_x86_sse_cvtss2si64:
-; AVX2: ## %bb.0:
-; AVX2-NEXT: vcvtss2si %xmm0, %rax ## encoding: [0xc4,0xe1,0xfa,0x2d,0xc0]
-; AVX2-NEXT: retq ## encoding: [0xc3]
+; AVX1-LABEL: test_x86_sse_cvtss2si64:
+; AVX1: ## %bb.0:
+; AVX1-NEXT: vcvtss2si %xmm0, %rax ## encoding: [0xc4,0xe1,0xfa,0x2d,0xc0]
+; AVX1-NEXT: retq ## encoding: [0xc3]
;
-; SKX-LABEL: test_x86_sse_cvtss2si64:
-; SKX: ## %bb.0:
-; SKX-NEXT: vcvtss2si %xmm0, %rax ## EVEX TO VEX Compression encoding: [0xc4,0xe1,0xfa,0x2d,0xc0]
-; SKX-NEXT: retq ## encoding: [0xc3]
+; AVX512-LABEL: test_x86_sse_cvtss2si64:
+; AVX512: ## %bb.0:
+; AVX512-NEXT: vcvtss2si %xmm0, %rax ## EVEX TO VEX Compression encoding: [0xc4,0xe1,0xfa,0x2d,0xc0]
+; AVX512-NEXT: retq ## encoding: [0xc3]
%res = call i64 @llvm.x86.sse.cvtss2si64(<4 x float> %a0) ; <i64> [#uses=1]
ret i64 %res
}
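The two conversions exercised by this file differ only in rounding semantics: cvtss2si converts using the current MXCSR rounding mode (round-to-nearest-even by default), while cvttss2si, with the extra t, always truncates toward zero. A hypothetical sketch, reusing the intrinsic declarations from these tests; the function name is illustrative, and for an input lane of 2.7 the rounded result is 3 while the truncated result is 2:

declare i64 @llvm.x86.sse.cvtss2si64(<4 x float>) nounwind readnone
declare i64 @llvm.x86.sse.cvttss2si64(<4 x float>) nounwind readnone

define i64 @round_minus_truncate(<4 x float> %v) {
  ; cvtss2si: lane 0 of %v, rounded per MXCSR (2.7 -> 3)
  %rounded = call i64 @llvm.x86.sse.cvtss2si64(<4 x float> %v)
  ; cvttss2si: lane 0 of %v, truncated toward zero (2.7 -> 2)
  %truncated = call i64 @llvm.x86.sse.cvttss2si64(<4 x float> %v)
  %diff = sub i64 %rounded, %truncated
  ret i64 %diff
}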
@@ -30,15 +30,15 @@ define i64 @test_x86_sse_cvttss2si64(<4 x float> %a0) {
; SSE-NEXT: cvttss2si %xmm0, %rax ## encoding: [0xf3,0x48,0x0f,0x2c,0xc0]
; SSE-NEXT: retq ## encoding: [0xc3]
;
-; AVX2-LABEL: test_x86_sse_cvttss2si64:
-; AVX2: ## %bb.0:
-; AVX2-NEXT: vcvttss2si %xmm0, %rax ## encoding: [0xc4,0xe1,0xfa,0x2c,0xc0]
-; AVX2-NEXT: retq ## encoding: [0xc3]
+; AVX1-LABEL: test_x86_sse_cvttss2si64:
+; AVX1: ## %bb.0:
+; AVX1-NEXT: vcvttss2si %xmm0, %rax ## encoding: [0xc4,0xe1,0xfa,0x2c,0xc0]
+; AVX1-NEXT: retq ## encoding: [0xc3]
;
-; SKX-LABEL: test_x86_sse_cvttss2si64:
-; SKX: ## %bb.0:
-; SKX-NEXT: vcvttss2si %xmm0, %rax ## EVEX TO VEX Compression encoding: [0xc4,0xe1,0xfa,0x2c,0xc0]
-; SKX-NEXT: retq ## encoding: [0xc3]
+; AVX512-LABEL: test_x86_sse_cvttss2si64:
+; AVX512: ## %bb.0:
+; AVX512-NEXT: vcvttss2si %xmm0, %rax ## EVEX TO VEX Compression encoding: [0xc4,0xe1,0xfa,0x2c,0xc0]
+; AVX512-NEXT: retq ## encoding: [0xc3]
%res = call i64 @llvm.x86.sse.cvttss2si64(<4 x float> %a0) ; <i64> [#uses=1]
ret i64 %res
}
diff --git a/llvm/test/CodeGen/X86/sse-scalar-fp-arith-unary.ll b/llvm/test/CodeGen/X86/sse-scalar-fp-arith-unary.ll
index 1ed4d3401ca..629e0aabbe1 100644
--- a/llvm/test/CodeGen/X86/sse-scalar-fp-arith-unary.ll
+++ b/llvm/test/CodeGen/X86/sse-scalar-fp-arith-unary.ll
@@ -1,7 +1,12 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=sse2 < %s | FileCheck --check-prefix=SSE %s
-; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=sse4.1 < %s | FileCheck --check-prefix=SSE %s
-; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=avx < %s | FileCheck --check-prefix=AVX %s
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,X86,SSE,X86-SSE,SSE2,X86-SSE2
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=CHECK,X86,SSE,X86-SSE,SSE41,X86-SSE41
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=CHECK,X86,AVX,X86-AVX,AVX1,X86-AVX1
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,X86,AVX,X86-AVX,AVX512,X86-AVX512
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,X64,SSE,X64-SSE,SSE2,X64-SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=CHECK,X64,SSE,X64-SSE,SSE41,X64-SSE41
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=CHECK,X64,AVX,X64-AVX,AVX1,X64-AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,X64,AVX,X64-AVX,AVX512,X64-AVX512
; PR21507 - https://llvm.org/bugs/show_bug.cgi?id=21507
; Each function should be a single math op; no extra moves.
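The ret{{[l|q]}} rewrites that follow are what let the new 32-bit and 64-bit RUN lines share one set of checks: the autogeneration script emits the token as a FileCheck regex, so it matches retl when targeting i686 and retq when targeting x86_64. A minimal, hypothetical illustration (not part of this patch):

; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse | FileCheck %s
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse | FileCheck %s
define <4 x float> @example_rcp(<4 x float> %x) {
; One body serves both RUN lines; only the return opcode differs.
; CHECK-LABEL: example_rcp:
; CHECK: rcpps %xmm0, %xmm0
; CHECK-NEXT: ret{{[l|q]}}
  %y = call <4 x float> @llvm.x86.sse.rcp.ps(<4 x float> %x)
  ret <4 x float> %y
}
declare <4 x float> @llvm.x86.sse.rcp.ps(<4 x float>) nounwind readnone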
@@ -11,12 +16,12 @@ define <4 x float> @recip(<4 x float> %x) { ; SSE-LABEL: recip: ; SSE: # %bb.0: ; SSE-NEXT: rcpss %xmm0, %xmm0 -; SSE-NEXT: retq +; SSE-NEXT: ret{{[l|q]}} ; ; AVX-LABEL: recip: ; AVX: # %bb.0: ; AVX-NEXT: vrcpss %xmm0, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX-NEXT: ret{{[l|q]}} %y = tail call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> %x) %shuf = shufflevector <4 x float> %y, <4 x float> %x, <4 x i32> <i32 0, i32 5, i32 6, i32 7> ret <4 x float> %shuf @@ -26,12 +31,12 @@ define <4 x float> @recip_square_root(<4 x float> %x) { ; SSE-LABEL: recip_square_root: ; SSE: # %bb.0: ; SSE-NEXT: rsqrtss %xmm0, %xmm0 -; SSE-NEXT: retq +; SSE-NEXT: ret{{[l|q]}} ; ; AVX-LABEL: recip_square_root: ; AVX: # %bb.0: ; AVX-NEXT: vrsqrtss %xmm0, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX-NEXT: ret{{[l|q]}} %y = tail call <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float> %x) %shuf = shufflevector <4 x float> %y, <4 x float> %x, <4 x i32> <i32 0, i32 5, i32 6, i32 7> ret <4 x float> %shuf @@ -41,12 +46,12 @@ define <4 x float> @square_root(<4 x float> %x) { ; SSE-LABEL: square_root: ; SSE: # %bb.0: ; SSE-NEXT: sqrtss %xmm0, %xmm0 -; SSE-NEXT: retq +; SSE-NEXT: ret{{[l|q]}} ; ; AVX-LABEL: square_root: ; AVX: # %bb.0: ; AVX-NEXT: vsqrtss %xmm0, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX-NEXT: ret{{[l|q]}} %y = tail call <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float> %x) %shuf = shufflevector <4 x float> %y, <4 x float> %x, <4 x i32> <i32 0, i32 5, i32 6, i32 7> ret <4 x float> %shuf @@ -56,12 +61,12 @@ define <2 x double> @square_root_double(<2 x double> %x) { ; SSE-LABEL: square_root_double: ; SSE: # %bb.0: ; SSE-NEXT: sqrtsd %xmm0, %xmm0 -; SSE-NEXT: retq +; SSE-NEXT: ret{{[l|q]}} ; ; AVX-LABEL: square_root_double: ; AVX: # %bb.0: ; AVX-NEXT: vsqrtsd %xmm0, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX-NEXT: ret{{[l|q]}} %y = tail call <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double> %x) %shuf = shufflevector <2 x double> %y, <2 x double> %x, <2 x i32> <i32 0, i32 3> ret <2 x double> %shuf diff --git a/llvm/test/CodeGen/X86/sse-scalar-fp-arith.ll b/llvm/test/CodeGen/X86/sse-scalar-fp-arith.ll index 476d1befe1d..1a294daf1ea 100644 --- a/llvm/test/CodeGen/X86/sse-scalar-fp-arith.ll +++ b/llvm/test/CodeGen/X86/sse-scalar-fp-arith.ll @@ -1,8 +1,12 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE2 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE --check-prefix=SSE41 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,X86,SSE,X86-SSE,SSE2,X86-SSE2 +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=CHECK,X86,SSE,X86-SSE,SSE41,X86-SSE41 +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=CHECK,X86,AVX,X86-AVX,AVX1,X86-AVX1 +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,X86,AVX,X86-AVX,AVX512,X86-AVX512 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,X64,SSE,X64-SSE,SSE2,X64-SSE2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s 
--check-prefixes=CHECK,X64,SSE,X64-SSE,SSE41,X64-SSE41 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=CHECK,X64,AVX,X64-AVX,AVX1,X64-AVX1 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,X64,AVX,X64-AVX,AVX512,X64-AVX512 ; Ensure that the backend no longer emits unnecessary vector insert ; instructions immediately after SSE scalar fp instructions @@ -12,12 +16,12 @@ define <4 x float> @test_add_ss(<4 x float> %a, <4 x float> %b) { ; SSE-LABEL: test_add_ss: ; SSE: # %bb.0: ; SSE-NEXT: addss %xmm1, %xmm0 -; SSE-NEXT: retq +; SSE-NEXT: ret{{[l|q]}} ; ; AVX-LABEL: test_add_ss: ; AVX: # %bb.0: ; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX-NEXT: ret{{[l|q]}} %1 = extractelement <4 x float> %b, i32 0 %2 = extractelement <4 x float> %a, i32 0 %add = fadd float %2, %1 @@ -29,12 +33,12 @@ define <4 x float> @test_sub_ss(<4 x float> %a, <4 x float> %b) { ; SSE-LABEL: test_sub_ss: ; SSE: # %bb.0: ; SSE-NEXT: subss %xmm1, %xmm0 -; SSE-NEXT: retq +; SSE-NEXT: ret{{[l|q]}} ; ; AVX-LABEL: test_sub_ss: ; AVX: # %bb.0: ; AVX-NEXT: vsubss %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX-NEXT: ret{{[l|q]}} %1 = extractelement <4 x float> %b, i32 0 %2 = extractelement <4 x float> %a, i32 0 %sub = fsub float %2, %1 @@ -46,12 +50,12 @@ define <4 x float> @test_mul_ss(<4 x float> %a, <4 x float> %b) { ; SSE-LABEL: test_mul_ss: ; SSE: # %bb.0: ; SSE-NEXT: mulss %xmm1, %xmm0 -; SSE-NEXT: retq +; SSE-NEXT: ret{{[l|q]}} ; ; AVX-LABEL: test_mul_ss: ; AVX: # %bb.0: ; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX-NEXT: ret{{[l|q]}} %1 = extractelement <4 x float> %b, i32 0 %2 = extractelement <4 x float> %a, i32 0 %mul = fmul float %2, %1 @@ -63,12 +67,12 @@ define <4 x float> @test_div_ss(<4 x float> %a, <4 x float> %b) { ; SSE-LABEL: test_div_ss: ; SSE: # %bb.0: ; SSE-NEXT: divss %xmm1, %xmm0 -; SSE-NEXT: retq +; SSE-NEXT: ret{{[l|q]}} ; ; AVX-LABEL: test_div_ss: ; AVX: # %bb.0: ; AVX-NEXT: vdivss %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX-NEXT: ret{{[l|q]}} %1 = extractelement <4 x float> %b, i32 0 %2 = extractelement <4 x float> %a, i32 0 %div = fdiv float %2, %1 @@ -81,25 +85,25 @@ define <4 x float> @test_sqrt_ss(<4 x float> %a) { ; SSE2: # %bb.0: ; SSE2-NEXT: sqrtss %xmm0, %xmm1 ; SSE2-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] -; SSE2-NEXT: retq +; SSE2-NEXT: ret{{[l|q]}} ; ; SSE41-LABEL: test_sqrt_ss: ; SSE41: # %bb.0: ; SSE41-NEXT: sqrtss %xmm0, %xmm1 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] -; SSE41-NEXT: retq +; SSE41-NEXT: ret{{[l|q]}} ; ; AVX1-LABEL: test_sqrt_ss: ; AVX1: # %bb.0: ; AVX1-NEXT: vsqrtss %xmm0, %xmm0, %xmm1 ; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] -; AVX1-NEXT: retq +; AVX1-NEXT: ret{{[l|q]}} ; ; AVX512-LABEL: test_sqrt_ss: ; AVX512: # %bb.0: ; AVX512-NEXT: vsqrtss %xmm0, %xmm0, %xmm1 ; AVX512-NEXT: vmovss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] -; AVX512-NEXT: retq +; AVX512-NEXT: ret{{[l|q]}} %1 = extractelement <4 x float> %a, i32 0 %2 = call float @llvm.sqrt.f32(float %1) %3 = insertelement <4 x float> %a, float %2, i32 0 @@ -111,12 +115,12 @@ define <2 x double> @test_add_sd(<2 x double> %a, <2 x double> %b) { ; SSE-LABEL: test_add_sd: ; SSE: # %bb.0: ; SSE-NEXT: addsd %xmm1, %xmm0 -; SSE-NEXT: retq +; SSE-NEXT: ret{{[l|q]}} ; ; AVX-LABEL: test_add_sd: ; AVX: # %bb.0: ; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX-NEXT: ret{{[l|q]}} %1 = extractelement <2 x double> %b, i32 0 %2 = extractelement <2 x double> 
%a, i32 0 %add = fadd double %2, %1 @@ -128,12 +132,12 @@ define <2 x double> @test_sub_sd(<2 x double> %a, <2 x double> %b) { ; SSE-LABEL: test_sub_sd: ; SSE: # %bb.0: ; SSE-NEXT: subsd %xmm1, %xmm0 -; SSE-NEXT: retq +; SSE-NEXT: ret{{[l|q]}} ; ; AVX-LABEL: test_sub_sd: ; AVX: # %bb.0: ; AVX-NEXT: vsubsd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX-NEXT: ret{{[l|q]}} %1 = extractelement <2 x double> %b, i32 0 %2 = extractelement <2 x double> %a, i32 0 %sub = fsub double %2, %1 @@ -145,12 +149,12 @@ define <2 x double> @test_mul_sd(<2 x double> %a, <2 x double> %b) { ; SSE-LABEL: test_mul_sd: ; SSE: # %bb.0: ; SSE-NEXT: mulsd %xmm1, %xmm0 -; SSE-NEXT: retq +; SSE-NEXT: ret{{[l|q]}} ; ; AVX-LABEL: test_mul_sd: ; AVX: # %bb.0: ; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX-NEXT: ret{{[l|q]}} %1 = extractelement <2 x double> %b, i32 0 %2 = extractelement <2 x double> %a, i32 0 %mul = fmul double %2, %1 @@ -162,12 +166,12 @@ define <2 x double> @test_div_sd(<2 x double> %a, <2 x double> %b) { ; SSE-LABEL: test_div_sd: ; SSE: # %bb.0: ; SSE-NEXT: divsd %xmm1, %xmm0 -; SSE-NEXT: retq +; SSE-NEXT: ret{{[l|q]}} ; ; AVX-LABEL: test_div_sd: ; AVX: # %bb.0: ; AVX-NEXT: vdivsd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX-NEXT: ret{{[l|q]}} %1 = extractelement <2 x double> %b, i32 0 %2 = extractelement <2 x double> %a, i32 0 %div = fdiv double %2, %1 @@ -180,25 +184,25 @@ define <2 x double> @test_sqrt_sd(<2 x double> %a) { ; SSE2: # %bb.0: ; SSE2-NEXT: sqrtsd %xmm0, %xmm1 ; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] -; SSE2-NEXT: retq +; SSE2-NEXT: ret{{[l|q]}} ; ; SSE41-LABEL: test_sqrt_sd: ; SSE41: # %bb.0: ; SSE41-NEXT: sqrtsd %xmm0, %xmm1 ; SSE41-NEXT: blendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1] -; SSE41-NEXT: retq +; SSE41-NEXT: ret{{[l|q]}} ; ; AVX1-LABEL: test_sqrt_sd: ; AVX1: # %bb.0: ; AVX1-NEXT: vsqrtsd %xmm0, %xmm0, %xmm1 ; AVX1-NEXT: vblendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1] -; AVX1-NEXT: retq +; AVX1-NEXT: ret{{[l|q]}} ; ; AVX512-LABEL: test_sqrt_sd: ; AVX512: # %bb.0: ; AVX512-NEXT: vsqrtsd %xmm0, %xmm0, %xmm1 ; AVX512-NEXT: vmovsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] -; AVX512-NEXT: retq +; AVX512-NEXT: ret{{[l|q]}} %1 = extractelement <2 x double> %a, i32 0 %2 = call double @llvm.sqrt.f64(double %1) %3 = insertelement <2 x double> %a, double %2, i32 0 @@ -211,12 +215,12 @@ define <4 x float> @test2_add_ss(<4 x float> %a, <4 x float> %b) { ; SSE: # %bb.0: ; SSE-NEXT: addss %xmm0, %xmm1 ; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: retq +; SSE-NEXT: ret{{[l|q]}} ; ; AVX-LABEL: test2_add_ss: ; AVX: # %bb.0: ; AVX-NEXT: vaddss %xmm0, %xmm1, %xmm0 -; AVX-NEXT: retq +; AVX-NEXT: ret{{[l|q]}} %1 = extractelement <4 x float> %a, i32 0 %2 = extractelement <4 x float> %b, i32 0 %add = fadd float %1, %2 @@ -229,12 +233,12 @@ define <4 x float> @test2_sub_ss(<4 x float> %a, <4 x float> %b) { ; SSE: # %bb.0: ; SSE-NEXT: subss %xmm0, %xmm1 ; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: retq +; SSE-NEXT: ret{{[l|q]}} ; ; AVX-LABEL: test2_sub_ss: ; AVX: # %bb.0: ; AVX-NEXT: vsubss %xmm0, %xmm1, %xmm0 -; AVX-NEXT: retq +; AVX-NEXT: ret{{[l|q]}} %1 = extractelement <4 x float> %a, i32 0 %2 = extractelement <4 x float> %b, i32 0 %sub = fsub float %2, %1 @@ -247,12 +251,12 @@ define <4 x float> @test2_mul_ss(<4 x float> %a, <4 x float> %b) { ; SSE: # %bb.0: ; SSE-NEXT: mulss %xmm0, %xmm1 ; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: retq +; SSE-NEXT: ret{{[l|q]}} ; ; AVX-LABEL: test2_mul_ss: ; AVX: # %bb.0: ; AVX-NEXT: vmulss %xmm0, %xmm1, %xmm0 -; AVX-NEXT: retq +; AVX-NEXT: ret{{[l|q]}} %1 = 
extractelement <4 x float> %a, i32 0 %2 = extractelement <4 x float> %b, i32 0 %mul = fmul float %1, %2 @@ -265,12 +269,12 @@ define <4 x float> @test2_div_ss(<4 x float> %a, <4 x float> %b) { ; SSE: # %bb.0: ; SSE-NEXT: divss %xmm0, %xmm1 ; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: retq +; SSE-NEXT: ret{{[l|q]}} ; ; AVX-LABEL: test2_div_ss: ; AVX: # %bb.0: ; AVX-NEXT: vdivss %xmm0, %xmm1, %xmm0 -; AVX-NEXT: retq +; AVX-NEXT: ret{{[l|q]}} %1 = extractelement <4 x float> %a, i32 0 %2 = extractelement <4 x float> %b, i32 0 %div = fdiv float %2, %1 @@ -283,12 +287,12 @@ define <2 x double> @test2_add_sd(<2 x double> %a, <2 x double> %b) { ; SSE: # %bb.0: ; SSE-NEXT: addsd %xmm0, %xmm1 ; SSE-NEXT: movapd %xmm1, %xmm0 -; SSE-NEXT: retq +; SSE-NEXT: ret{{[l|q]}} ; ; AVX-LABEL: test2_add_sd: ; AVX: # %bb.0: ; AVX-NEXT: vaddsd %xmm0, %xmm1, %xmm0 -; AVX-NEXT: retq +; AVX-NEXT: ret{{[l|q]}} %1 = extractelement <2 x double> %a, i32 0 %2 = extractelement <2 x double> %b, i32 0 %add = fadd double %1, %2 @@ -301,12 +305,12 @@ define <2 x double> @test2_sub_sd(<2 x double> %a, <2 x double> %b) { ; SSE: # %bb.0: ; SSE-NEXT: subsd %xmm0, %xmm1 ; SSE-NEXT: movapd %xmm1, %xmm0 -; SSE-NEXT: retq +; SSE-NEXT: ret{{[l|q]}} ; ; AVX-LABEL: test2_sub_sd: ; AVX: # %bb.0: ; AVX-NEXT: vsubsd %xmm0, %xmm1, %xmm0 -; AVX-NEXT: retq +; AVX-NEXT: ret{{[l|q]}} %1 = extractelement <2 x double> %a, i32 0 %2 = extractelement <2 x double> %b, i32 0 %sub = fsub double %2, %1 @@ -319,12 +323,12 @@ define <2 x double> @test2_mul_sd(<2 x double> %a, <2 x double> %b) { ; SSE: # %bb.0: ; SSE-NEXT: mulsd %xmm0, %xmm1 ; SSE-NEXT: movapd %xmm1, %xmm0 -; SSE-NEXT: retq +; SSE-NEXT: ret{{[l|q]}} ; ; AVX-LABEL: test2_mul_sd: ; AVX: # %bb.0: ; AVX-NEXT: vmulsd %xmm0, %xmm1, %xmm0 -; AVX-NEXT: retq +; AVX-NEXT: ret{{[l|q]}} %1 = extractelement <2 x double> %a, i32 0 %2 = extractelement <2 x double> %b, i32 0 %mul = fmul double %1, %2 @@ -337,12 +341,12 @@ define <2 x double> @test2_div_sd(<2 x double> %a, <2 x double> %b) { ; SSE: # %bb.0: ; SSE-NEXT: divsd %xmm0, %xmm1 ; SSE-NEXT: movapd %xmm1, %xmm0 -; SSE-NEXT: retq +; SSE-NEXT: ret{{[l|q]}} ; ; AVX-LABEL: test2_div_sd: ; AVX: # %bb.0: ; AVX-NEXT: vdivsd %xmm0, %xmm1, %xmm0 -; AVX-NEXT: retq +; AVX-NEXT: ret{{[l|q]}} %1 = extractelement <2 x double> %a, i32 0 %2 = extractelement <2 x double> %b, i32 0 %div = fdiv double %2, %1 @@ -355,13 +359,13 @@ define <4 x float> @test_multiple_add_ss(<4 x float> %a, <4 x float> %b) { ; SSE: # %bb.0: ; SSE-NEXT: addss %xmm0, %xmm1 ; SSE-NEXT: addss %xmm1, %xmm0 -; SSE-NEXT: retq +; SSE-NEXT: ret{{[l|q]}} ; ; AVX-LABEL: test_multiple_add_ss: ; AVX: # %bb.0: ; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm1 ; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX-NEXT: ret{{[l|q]}} %1 = extractelement <4 x float> %b, i32 0 %2 = extractelement <4 x float> %a, i32 0 %add = fadd float %2, %1 @@ -376,13 +380,13 @@ define <4 x float> @test_multiple_sub_ss(<4 x float> %a, <4 x float> %b) { ; SSE-NEXT: movaps %xmm0, %xmm2 ; SSE-NEXT: subss %xmm1, %xmm2 ; SSE-NEXT: subss %xmm2, %xmm0 -; SSE-NEXT: retq +; SSE-NEXT: ret{{[l|q]}} ; ; AVX-LABEL: test_multiple_sub_ss: ; AVX: # %bb.0: ; AVX-NEXT: vsubss %xmm1, %xmm0, %xmm1 ; AVX-NEXT: vsubss %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX-NEXT: ret{{[l|q]}} %1 = extractelement <4 x float> %b, i32 0 %2 = extractelement <4 x float> %a, i32 0 %sub = fsub float %2, %1 @@ -396,13 +400,13 @@ define <4 x float> @test_multiple_mul_ss(<4 x float> %a, <4 x float> %b) { ; SSE: # %bb.0: ; SSE-NEXT: mulss %xmm0, %xmm1 ; SSE-NEXT: 
mulss %xmm1, %xmm0 -; SSE-NEXT: retq +; SSE-NEXT: ret{{[l|q]}} ; ; AVX-LABEL: test_multiple_mul_ss: ; AVX: # %bb.0: ; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm1 ; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX-NEXT: ret{{[l|q]}} %1 = extractelement <4 x float> %b, i32 0 %2 = extractelement <4 x float> %a, i32 0 %mul = fmul float %2, %1 @@ -417,13 +421,13 @@ define <4 x float> @test_multiple_div_ss(<4 x float> %a, <4 x float> %b) { ; SSE-NEXT: movaps %xmm0, %xmm2 ; SSE-NEXT: divss %xmm1, %xmm2 ; SSE-NEXT: divss %xmm2, %xmm0 -; SSE-NEXT: retq +; SSE-NEXT: ret{{[l|q]}} ; ; AVX-LABEL: test_multiple_div_ss: ; AVX: # %bb.0: ; AVX-NEXT: vdivss %xmm1, %xmm0, %xmm1 ; AVX-NEXT: vdivss %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX-NEXT: ret{{[l|q]}} %1 = extractelement <4 x float> %b, i32 0 %2 = extractelement <4 x float> %a, i32 0 %div = fdiv float %2, %1 @@ -436,15 +440,27 @@ define <4 x float> @test_multiple_div_ss(<4 x float> %a, <4 x float> %b) { ; be lowered to X86Blendi nodes. define <4 x float> @blend_add_ss(<4 x float> %a, float %b) { -; SSE-LABEL: blend_add_ss: -; SSE: # %bb.0: -; SSE-NEXT: addss %xmm1, %xmm0 -; SSE-NEXT: retq -; -; AVX-LABEL: blend_add_ss: -; AVX: # %bb.0: -; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq +; X86-SSE-LABEL: blend_add_ss: +; X86-SSE: # %bb.0: +; X86-SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X86-SSE-NEXT: addss %xmm1, %xmm0 +; X86-SSE-NEXT: retl +; +; X86-AVX-LABEL: blend_add_ss: +; X86-AVX: # %bb.0: +; X86-AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X86-AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 +; X86-AVX-NEXT: retl +; +; X64-SSE-LABEL: blend_add_ss: +; X64-SSE: # %bb.0: +; X64-SSE-NEXT: addss %xmm1, %xmm0 +; X64-SSE-NEXT: retq +; +; X64-AVX-LABEL: blend_add_ss: +; X64-AVX: # %bb.0: +; X64-AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 +; X64-AVX-NEXT: retq %ext = extractelement <4 x float> %a, i32 0 %op = fadd float %b, %ext @@ -454,15 +470,27 @@ define <4 x float> @blend_add_ss(<4 x float> %a, float %b) { } define <4 x float> @blend_sub_ss(<4 x float> %a, float %b) { -; SSE-LABEL: blend_sub_ss: -; SSE: # %bb.0: -; SSE-NEXT: subss %xmm1, %xmm0 -; SSE-NEXT: retq -; -; AVX-LABEL: blend_sub_ss: -; AVX: # %bb.0: -; AVX-NEXT: vsubss %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq +; X86-SSE-LABEL: blend_sub_ss: +; X86-SSE: # %bb.0: +; X86-SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X86-SSE-NEXT: subss %xmm1, %xmm0 +; X86-SSE-NEXT: retl +; +; X86-AVX-LABEL: blend_sub_ss: +; X86-AVX: # %bb.0: +; X86-AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X86-AVX-NEXT: vsubss %xmm1, %xmm0, %xmm0 +; X86-AVX-NEXT: retl +; +; X64-SSE-LABEL: blend_sub_ss: +; X64-SSE: # %bb.0: +; X64-SSE-NEXT: subss %xmm1, %xmm0 +; X64-SSE-NEXT: retq +; +; X64-AVX-LABEL: blend_sub_ss: +; X64-AVX: # %bb.0: +; X64-AVX-NEXT: vsubss %xmm1, %xmm0, %xmm0 +; X64-AVX-NEXT: retq %ext = extractelement <4 x float> %a, i32 0 %op = fsub float %ext, %b @@ -472,15 +500,27 @@ define <4 x float> @blend_sub_ss(<4 x float> %a, float %b) { } define <4 x float> @blend_mul_ss(<4 x float> %a, float %b) { -; SSE-LABEL: blend_mul_ss: -; SSE: # %bb.0: -; SSE-NEXT: mulss %xmm1, %xmm0 -; SSE-NEXT: retq -; -; AVX-LABEL: blend_mul_ss: -; AVX: # %bb.0: -; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq +; X86-SSE-LABEL: blend_mul_ss: +; X86-SSE: # %bb.0: +; X86-SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X86-SSE-NEXT: mulss %xmm1, %xmm0 +; X86-SSE-NEXT: retl +; +; X86-AVX-LABEL: blend_mul_ss: +; X86-AVX: # %bb.0: +; X86-AVX-NEXT: vmovss {{.*#+}} xmm1 = 
mem[0],zero,zero,zero +; X86-AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0 +; X86-AVX-NEXT: retl +; +; X64-SSE-LABEL: blend_mul_ss: +; X64-SSE: # %bb.0: +; X64-SSE-NEXT: mulss %xmm1, %xmm0 +; X64-SSE-NEXT: retq +; +; X64-AVX-LABEL: blend_mul_ss: +; X64-AVX: # %bb.0: +; X64-AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0 +; X64-AVX-NEXT: retq %ext = extractelement <4 x float> %a, i32 0 %op = fmul float %b, %ext @@ -490,15 +530,27 @@ define <4 x float> @blend_mul_ss(<4 x float> %a, float %b) { } define <4 x float> @blend_div_ss(<4 x float> %a, float %b) { -; SSE-LABEL: blend_div_ss: -; SSE: # %bb.0: -; SSE-NEXT: divss %xmm1, %xmm0 -; SSE-NEXT: retq -; -; AVX-LABEL: blend_div_ss: -; AVX: # %bb.0: -; AVX-NEXT: vdivss %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq +; X86-SSE-LABEL: blend_div_ss: +; X86-SSE: # %bb.0: +; X86-SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X86-SSE-NEXT: divss %xmm1, %xmm0 +; X86-SSE-NEXT: retl +; +; X86-AVX-LABEL: blend_div_ss: +; X86-AVX: # %bb.0: +; X86-AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X86-AVX-NEXT: vdivss %xmm1, %xmm0, %xmm0 +; X86-AVX-NEXT: retl +; +; X64-SSE-LABEL: blend_div_ss: +; X64-SSE: # %bb.0: +; X64-SSE-NEXT: divss %xmm1, %xmm0 +; X64-SSE-NEXT: retq +; +; X64-AVX-LABEL: blend_div_ss: +; X64-AVX: # %bb.0: +; X64-AVX-NEXT: vdivss %xmm1, %xmm0, %xmm0 +; X64-AVX-NEXT: retq %ext = extractelement <4 x float> %a, i32 0 %op = fdiv float %ext, %b @@ -508,15 +560,27 @@ define <4 x float> @blend_div_ss(<4 x float> %a, float %b) { } define <2 x double> @blend_add_sd(<2 x double> %a, double %b) { -; SSE-LABEL: blend_add_sd: -; SSE: # %bb.0: -; SSE-NEXT: addsd %xmm1, %xmm0 -; SSE-NEXT: retq -; -; AVX-LABEL: blend_add_sd: -; AVX: # %bb.0: -; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq +; X86-SSE-LABEL: blend_add_sd: +; X86-SSE: # %bb.0: +; X86-SSE-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero +; X86-SSE-NEXT: addsd %xmm1, %xmm0 +; X86-SSE-NEXT: retl +; +; X86-AVX-LABEL: blend_add_sd: +; X86-AVX: # %bb.0: +; X86-AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; X86-AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 +; X86-AVX-NEXT: retl +; +; X64-SSE-LABEL: blend_add_sd: +; X64-SSE: # %bb.0: +; X64-SSE-NEXT: addsd %xmm1, %xmm0 +; X64-SSE-NEXT: retq +; +; X64-AVX-LABEL: blend_add_sd: +; X64-AVX: # %bb.0: +; X64-AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 +; X64-AVX-NEXT: retq %ext = extractelement <2 x double> %a, i32 0 %op = fadd double %b, %ext @@ -526,15 +590,27 @@ define <2 x double> @blend_add_sd(<2 x double> %a, double %b) { } define <2 x double> @blend_sub_sd(<2 x double> %a, double %b) { -; SSE-LABEL: blend_sub_sd: -; SSE: # %bb.0: -; SSE-NEXT: subsd %xmm1, %xmm0 -; SSE-NEXT: retq -; -; AVX-LABEL: blend_sub_sd: -; AVX: # %bb.0: -; AVX-NEXT: vsubsd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq +; X86-SSE-LABEL: blend_sub_sd: +; X86-SSE: # %bb.0: +; X86-SSE-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero +; X86-SSE-NEXT: subsd %xmm1, %xmm0 +; X86-SSE-NEXT: retl +; +; X86-AVX-LABEL: blend_sub_sd: +; X86-AVX: # %bb.0: +; X86-AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; X86-AVX-NEXT: vsubsd %xmm1, %xmm0, %xmm0 +; X86-AVX-NEXT: retl +; +; X64-SSE-LABEL: blend_sub_sd: +; X64-SSE: # %bb.0: +; X64-SSE-NEXT: subsd %xmm1, %xmm0 +; X64-SSE-NEXT: retq +; +; X64-AVX-LABEL: blend_sub_sd: +; X64-AVX: # %bb.0: +; X64-AVX-NEXT: vsubsd %xmm1, %xmm0, %xmm0 +; X64-AVX-NEXT: retq %ext = extractelement <2 x double> %a, i32 0 %op = fsub double %ext, %b @@ -544,15 +620,27 @@ define <2 x double> @blend_sub_sd(<2 x double> %a, double %b) { } define <2 x double> @blend_mul_sd(<2 x double> %a, double %b) { -; 
SSE-LABEL: blend_mul_sd: -; SSE: # %bb.0: -; SSE-NEXT: mulsd %xmm1, %xmm0 -; SSE-NEXT: retq -; -; AVX-LABEL: blend_mul_sd: -; AVX: # %bb.0: -; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq +; X86-SSE-LABEL: blend_mul_sd: +; X86-SSE: # %bb.0: +; X86-SSE-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero +; X86-SSE-NEXT: mulsd %xmm1, %xmm0 +; X86-SSE-NEXT: retl +; +; X86-AVX-LABEL: blend_mul_sd: +; X86-AVX: # %bb.0: +; X86-AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; X86-AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0 +; X86-AVX-NEXT: retl +; +; X64-SSE-LABEL: blend_mul_sd: +; X64-SSE: # %bb.0: +; X64-SSE-NEXT: mulsd %xmm1, %xmm0 +; X64-SSE-NEXT: retq +; +; X64-AVX-LABEL: blend_mul_sd: +; X64-AVX: # %bb.0: +; X64-AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0 +; X64-AVX-NEXT: retq %ext = extractelement <2 x double> %a, i32 0 %op = fmul double %b, %ext @@ -562,15 +650,27 @@ define <2 x double> @blend_mul_sd(<2 x double> %a, double %b) { } define <2 x double> @blend_div_sd(<2 x double> %a, double %b) { -; SSE-LABEL: blend_div_sd: -; SSE: # %bb.0: -; SSE-NEXT: divsd %xmm1, %xmm0 -; SSE-NEXT: retq -; -; AVX-LABEL: blend_div_sd: -; AVX: # %bb.0: -; AVX-NEXT: vdivsd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq +; X86-SSE-LABEL: blend_div_sd: +; X86-SSE: # %bb.0: +; X86-SSE-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero +; X86-SSE-NEXT: divsd %xmm1, %xmm0 +; X86-SSE-NEXT: retl +; +; X86-AVX-LABEL: blend_div_sd: +; X86-AVX: # %bb.0: +; X86-AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; X86-AVX-NEXT: vdivsd %xmm1, %xmm0, %xmm0 +; X86-AVX-NEXT: retl +; +; X64-SSE-LABEL: blend_div_sd: +; X64-SSE: # %bb.0: +; X64-SSE-NEXT: divsd %xmm1, %xmm0 +; X64-SSE-NEXT: retq +; +; X64-AVX-LABEL: blend_div_sd: +; X64-AVX: # %bb.0: +; X64-AVX-NEXT: vdivsd %xmm1, %xmm0, %xmm0 +; X64-AVX-NEXT: retq %ext = extractelement <2 x double> %a, i32 0 %op = fdiv double %ext, %b @@ -586,12 +686,12 @@ define <4 x float> @insert_test_add_ss(<4 x float> %a, <4 x float> %b) { ; SSE-LABEL: insert_test_add_ss: ; SSE: # %bb.0: ; SSE-NEXT: addss %xmm1, %xmm0 -; SSE-NEXT: retq +; SSE-NEXT: ret{{[l|q]}} ; ; AVX-LABEL: insert_test_add_ss: ; AVX: # %bb.0: ; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX-NEXT: ret{{[l|q]}} %1 = fadd <4 x float> %a, %b %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 6, i32 7> ret <4 x float> %2 @@ -601,12 +701,12 @@ define <4 x float> @insert_test_sub_ss(<4 x float> %a, <4 x float> %b) { ; SSE-LABEL: insert_test_sub_ss: ; SSE: # %bb.0: ; SSE-NEXT: subss %xmm1, %xmm0 -; SSE-NEXT: retq +; SSE-NEXT: ret{{[l|q]}} ; ; AVX-LABEL: insert_test_sub_ss: ; AVX: # %bb.0: ; AVX-NEXT: vsubss %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX-NEXT: ret{{[l|q]}} %1 = fsub <4 x float> %a, %b %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 6, i32 7> ret <4 x float> %2 @@ -616,12 +716,12 @@ define <4 x float> @insert_test_mul_ss(<4 x float> %a, <4 x float> %b) { ; SSE-LABEL: insert_test_mul_ss: ; SSE: # %bb.0: ; SSE-NEXT: mulss %xmm1, %xmm0 -; SSE-NEXT: retq +; SSE-NEXT: ret{{[l|q]}} ; ; AVX-LABEL: insert_test_mul_ss: ; AVX: # %bb.0: ; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX-NEXT: ret{{[l|q]}} %1 = fmul <4 x float> %a, %b %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 6, i32 7> ret <4 x float> %2 @@ -631,12 +731,12 @@ define <4 x float> @insert_test_div_ss(<4 x float> %a, <4 x float> %b) { ; SSE-LABEL: insert_test_div_ss: ; SSE: # %bb.0: ; SSE-NEXT: divss %xmm1, %xmm0 -; SSE-NEXT: retq +; SSE-NEXT: ret{{[l|q]}} ; ; AVX-LABEL: 
insert_test_div_ss: ; AVX: # %bb.0: ; AVX-NEXT: vdivss %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX-NEXT: ret{{[l|q]}} %1 = fdiv <4 x float> %a, %b %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 6, i32 7> ret <4 x float> %2 @@ -646,12 +746,12 @@ define <2 x double> @insert_test_add_sd(<2 x double> %a, <2 x double> %b) { ; SSE-LABEL: insert_test_add_sd: ; SSE: # %bb.0: ; SSE-NEXT: addsd %xmm1, %xmm0 -; SSE-NEXT: retq +; SSE-NEXT: ret{{[l|q]}} ; ; AVX-LABEL: insert_test_add_sd: ; AVX: # %bb.0: ; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX-NEXT: ret{{[l|q]}} %1 = fadd <2 x double> %a, %b %2 = shufflevector <2 x double> %1, <2 x double> %a, <2 x i32> <i32 0, i32 3> ret <2 x double> %2 @@ -661,12 +761,12 @@ define <2 x double> @insert_test_sub_sd(<2 x double> %a, <2 x double> %b) { ; SSE-LABEL: insert_test_sub_sd: ; SSE: # %bb.0: ; SSE-NEXT: subsd %xmm1, %xmm0 -; SSE-NEXT: retq +; SSE-NEXT: ret{{[l|q]}} ; ; AVX-LABEL: insert_test_sub_sd: ; AVX: # %bb.0: ; AVX-NEXT: vsubsd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX-NEXT: ret{{[l|q]}} %1 = fsub <2 x double> %a, %b %2 = shufflevector <2 x double> %1, <2 x double> %a, <2 x i32> <i32 0, i32 3> ret <2 x double> %2 @@ -676,12 +776,12 @@ define <2 x double> @insert_test_mul_sd(<2 x double> %a, <2 x double> %b) { ; SSE-LABEL: insert_test_mul_sd: ; SSE: # %bb.0: ; SSE-NEXT: mulsd %xmm1, %xmm0 -; SSE-NEXT: retq +; SSE-NEXT: ret{{[l|q]}} ; ; AVX-LABEL: insert_test_mul_sd: ; AVX: # %bb.0: ; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX-NEXT: ret{{[l|q]}} %1 = fmul <2 x double> %a, %b %2 = shufflevector <2 x double> %1, <2 x double> %a, <2 x i32> <i32 0, i32 3> ret <2 x double> %2 @@ -691,12 +791,12 @@ define <2 x double> @insert_test_div_sd(<2 x double> %a, <2 x double> %b) { ; SSE-LABEL: insert_test_div_sd: ; SSE: # %bb.0: ; SSE-NEXT: divsd %xmm1, %xmm0 -; SSE-NEXT: retq +; SSE-NEXT: ret{{[l|q]}} ; ; AVX-LABEL: insert_test_div_sd: ; AVX: # %bb.0: ; AVX-NEXT: vdivsd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX-NEXT: ret{{[l|q]}} %1 = fdiv <2 x double> %a, %b %2 = shufflevector <2 x double> %1, <2 x double> %a, <2 x i32> <i32 0, i32 3> ret <2 x double> %2 @@ -707,12 +807,12 @@ define <4 x float> @insert_test2_add_ss(<4 x float> %a, <4 x float> %b) { ; SSE: # %bb.0: ; SSE-NEXT: addss %xmm0, %xmm1 ; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: retq +; SSE-NEXT: ret{{[l|q]}} ; ; AVX-LABEL: insert_test2_add_ss: ; AVX: # %bb.0: ; AVX-NEXT: vaddss %xmm0, %xmm1, %xmm0 -; AVX-NEXT: retq +; AVX-NEXT: ret{{[l|q]}} %1 = fadd <4 x float> %b, %a %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 6, i32 7> ret <4 x float> %2 @@ -723,12 +823,12 @@ define <4 x float> @insert_test2_sub_ss(<4 x float> %a, <4 x float> %b) { ; SSE: # %bb.0: ; SSE-NEXT: subss %xmm0, %xmm1 ; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: retq +; SSE-NEXT: ret{{[l|q]}} ; ; AVX-LABEL: insert_test2_sub_ss: ; AVX: # %bb.0: ; AVX-NEXT: vsubss %xmm0, %xmm1, %xmm0 -; AVX-NEXT: retq +; AVX-NEXT: ret{{[l|q]}} %1 = fsub <4 x float> %b, %a %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 6, i32 7> ret <4 x float> %2 @@ -739,12 +839,12 @@ define <4 x float> @insert_test2_mul_ss(<4 x float> %a, <4 x float> %b) { ; SSE: # %bb.0: ; SSE-NEXT: mulss %xmm0, %xmm1 ; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: retq +; SSE-NEXT: ret{{[l|q]}} ; ; AVX-LABEL: insert_test2_mul_ss: ; AVX: # %bb.0: ; AVX-NEXT: vmulss %xmm0, %xmm1, %xmm0 -; AVX-NEXT: retq +; AVX-NEXT: ret{{[l|q]}} %1 = fmul <4 x float> 
%b, %a %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 6, i32 7> ret <4 x float> %2 @@ -755,12 +855,12 @@ define <4 x float> @insert_test2_div_ss(<4 x float> %a, <4 x float> %b) { ; SSE: # %bb.0: ; SSE-NEXT: divss %xmm0, %xmm1 ; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: retq +; SSE-NEXT: ret{{[l|q]}} ; ; AVX-LABEL: insert_test2_div_ss: ; AVX: # %bb.0: ; AVX-NEXT: vdivss %xmm0, %xmm1, %xmm0 -; AVX-NEXT: retq +; AVX-NEXT: ret{{[l|q]}} %1 = fdiv <4 x float> %b, %a %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 6, i32 7> ret <4 x float> %2 @@ -771,12 +871,12 @@ define <2 x double> @insert_test2_add_sd(<2 x double> %a, <2 x double> %b) { ; SSE: # %bb.0: ; SSE-NEXT: addsd %xmm0, %xmm1 ; SSE-NEXT: movapd %xmm1, %xmm0 -; SSE-NEXT: retq +; SSE-NEXT: ret{{[l|q]}} ; ; AVX-LABEL: insert_test2_add_sd: ; AVX: # %bb.0: ; AVX-NEXT: vaddsd %xmm0, %xmm1, %xmm0 -; AVX-NEXT: retq +; AVX-NEXT: ret{{[l|q]}} %1 = fadd <2 x double> %b, %a %2 = shufflevector <2 x double> %1, <2 x double> %b, <2 x i32> <i32 0, i32 3> ret <2 x double> %2 @@ -787,12 +887,12 @@ define <2 x double> @insert_test2_sub_sd(<2 x double> %a, <2 x double> %b) { ; SSE: # %bb.0: ; SSE-NEXT: subsd %xmm0, %xmm1 ; SSE-NEXT: movapd %xmm1, %xmm0 -; SSE-NEXT: retq +; SSE-NEXT: ret{{[l|q]}} ; ; AVX-LABEL: insert_test2_sub_sd: ; AVX: # %bb.0: ; AVX-NEXT: vsubsd %xmm0, %xmm1, %xmm0 -; AVX-NEXT: retq +; AVX-NEXT: ret{{[l|q]}} %1 = fsub <2 x double> %b, %a %2 = shufflevector <2 x double> %1, <2 x double> %b, <2 x i32> <i32 0, i32 3> ret <2 x double> %2 @@ -803,12 +903,12 @@ define <2 x double> @insert_test2_mul_sd(<2 x double> %a, <2 x double> %b) { ; SSE: # %bb.0: ; SSE-NEXT: mulsd %xmm0, %xmm1 ; SSE-NEXT: movapd %xmm1, %xmm0 -; SSE-NEXT: retq +; SSE-NEXT: ret{{[l|q]}} ; ; AVX-LABEL: insert_test2_mul_sd: ; AVX: # %bb.0: ; AVX-NEXT: vmulsd %xmm0, %xmm1, %xmm0 -; AVX-NEXT: retq +; AVX-NEXT: ret{{[l|q]}} %1 = fmul <2 x double> %b, %a %2 = shufflevector <2 x double> %1, <2 x double> %b, <2 x i32> <i32 0, i32 3> ret <2 x double> %2 @@ -819,12 +919,12 @@ define <2 x double> @insert_test2_div_sd(<2 x double> %a, <2 x double> %b) { ; SSE: # %bb.0: ; SSE-NEXT: divsd %xmm0, %xmm1 ; SSE-NEXT: movapd %xmm1, %xmm0 -; SSE-NEXT: retq +; SSE-NEXT: ret{{[l|q]}} ; ; AVX-LABEL: insert_test2_div_sd: ; AVX: # %bb.0: ; AVX-NEXT: vdivsd %xmm0, %xmm1, %xmm0 -; AVX-NEXT: retq +; AVX-NEXT: ret{{[l|q]}} %1 = fdiv <2 x double> %b, %a %2 = shufflevector <2 x double> %1, <2 x double> %b, <2 x i32> <i32 0, i32 3> ret <2 x double> %2 @@ -834,12 +934,12 @@ define <4 x float> @insert_test3_add_ss(<4 x float> %a, <4 x float> %b) { ; SSE-LABEL: insert_test3_add_ss: ; SSE: # %bb.0: ; SSE-NEXT: addss %xmm1, %xmm0 -; SSE-NEXT: retq +; SSE-NEXT: ret{{[l|q]}} ; ; AVX-LABEL: insert_test3_add_ss: ; AVX: # %bb.0: ; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX-NEXT: ret{{[l|q]}} %1 = fadd <4 x float> %a, %b %2 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %a, <4 x float> %1 ret <4 x float> %2 @@ -849,12 +949,12 @@ define <4 x float> @insert_test3_sub_ss(<4 x float> %a, <4 x float> %b) { ; SSE-LABEL: insert_test3_sub_ss: ; SSE: # %bb.0: ; SSE-NEXT: subss %xmm1, %xmm0 -; SSE-NEXT: retq +; SSE-NEXT: ret{{[l|q]}} ; ; AVX-LABEL: insert_test3_sub_ss: ; AVX: # %bb.0: ; AVX-NEXT: vsubss %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX-NEXT: ret{{[l|q]}} %1 = fsub <4 x float> %a, %b %2 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %a, <4 x float> %1 ret <4 x float> %2 @@ -864,12 
+964,12 @@ define <4 x float> @insert_test3_mul_ss(<4 x float> %a, <4 x float> %b) { ; SSE-LABEL: insert_test3_mul_ss: ; SSE: # %bb.0: ; SSE-NEXT: mulss %xmm1, %xmm0 -; SSE-NEXT: retq +; SSE-NEXT: ret{{[l|q]}} ; ; AVX-LABEL: insert_test3_mul_ss: ; AVX: # %bb.0: ; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX-NEXT: ret{{[l|q]}} %1 = fmul <4 x float> %a, %b %2 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %a, <4 x float> %1 ret <4 x float> %2 @@ -879,12 +979,12 @@ define <4 x float> @insert_test3_div_ss(<4 x float> %a, <4 x float> %b) { ; SSE-LABEL: insert_test3_div_ss: ; SSE: # %bb.0: ; SSE-NEXT: divss %xmm1, %xmm0 -; SSE-NEXT: retq +; SSE-NEXT: ret{{[l|q]}} ; ; AVX-LABEL: insert_test3_div_ss: ; AVX: # %bb.0: ; AVX-NEXT: vdivss %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX-NEXT: ret{{[l|q]}} %1 = fdiv <4 x float> %a, %b %2 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %a, <4 x float> %1 ret <4 x float> %2 @@ -894,12 +994,12 @@ define <2 x double> @insert_test3_add_sd(<2 x double> %a, <2 x double> %b) { ; SSE-LABEL: insert_test3_add_sd: ; SSE: # %bb.0: ; SSE-NEXT: addsd %xmm1, %xmm0 -; SSE-NEXT: retq +; SSE-NEXT: ret{{[l|q]}} ; ; AVX-LABEL: insert_test3_add_sd: ; AVX: # %bb.0: ; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX-NEXT: ret{{[l|q]}} %1 = fadd <2 x double> %a, %b %2 = select <2 x i1> <i1 false, i1 true>, <2 x double> %a, <2 x double> %1 ret <2 x double> %2 @@ -909,12 +1009,12 @@ define <2 x double> @insert_test3_sub_sd(<2 x double> %a, <2 x double> %b) { ; SSE-LABEL: insert_test3_sub_sd: ; SSE: # %bb.0: ; SSE-NEXT: subsd %xmm1, %xmm0 -; SSE-NEXT: retq +; SSE-NEXT: ret{{[l|q]}} ; ; AVX-LABEL: insert_test3_sub_sd: ; AVX: # %bb.0: ; AVX-NEXT: vsubsd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX-NEXT: ret{{[l|q]}} %1 = fsub <2 x double> %a, %b %2 = select <2 x i1> <i1 false, i1 true>, <2 x double> %a, <2 x double> %1 ret <2 x double> %2 @@ -924,12 +1024,12 @@ define <2 x double> @insert_test3_mul_sd(<2 x double> %a, <2 x double> %b) { ; SSE-LABEL: insert_test3_mul_sd: ; SSE: # %bb.0: ; SSE-NEXT: mulsd %xmm1, %xmm0 -; SSE-NEXT: retq +; SSE-NEXT: ret{{[l|q]}} ; ; AVX-LABEL: insert_test3_mul_sd: ; AVX: # %bb.0: ; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX-NEXT: ret{{[l|q]}} %1 = fmul <2 x double> %a, %b %2 = select <2 x i1> <i1 false, i1 true>, <2 x double> %a, <2 x double> %1 ret <2 x double> %2 @@ -939,12 +1039,12 @@ define <2 x double> @insert_test3_div_sd(<2 x double> %a, <2 x double> %b) { ; SSE-LABEL: insert_test3_div_sd: ; SSE: # %bb.0: ; SSE-NEXT: divsd %xmm1, %xmm0 -; SSE-NEXT: retq +; SSE-NEXT: ret{{[l|q]}} ; ; AVX-LABEL: insert_test3_div_sd: ; AVX: # %bb.0: ; AVX-NEXT: vdivsd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX-NEXT: ret{{[l|q]}} %1 = fdiv <2 x double> %a, %b %2 = select <2 x i1> <i1 false, i1 true>, <2 x double> %a, <2 x double> %1 ret <2 x double> %2 @@ -955,12 +1055,12 @@ define <4 x float> @insert_test4_add_ss(<4 x float> %a, <4 x float> %b) { ; SSE: # %bb.0: ; SSE-NEXT: addss %xmm0, %xmm1 ; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: retq +; SSE-NEXT: ret{{[l|q]}} ; ; AVX-LABEL: insert_test4_add_ss: ; AVX: # %bb.0: ; AVX-NEXT: vaddss %xmm0, %xmm1, %xmm0 -; AVX-NEXT: retq +; AVX-NEXT: ret{{[l|q]}} %1 = fadd <4 x float> %b, %a %2 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %b, <4 x float> %1 ret <4 x float> %2 @@ -971,12 +1071,12 @@ define <4 x float> @insert_test4_sub_ss(<4 x float> %a, <4 x float> %b) { ; SSE: # %bb.0: ; SSE-NEXT: subss %xmm0, 
%xmm1 ; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: retq +; SSE-NEXT: ret{{[l|q]}} ; ; AVX-LABEL: insert_test4_sub_ss: ; AVX: # %bb.0: ; AVX-NEXT: vsubss %xmm0, %xmm1, %xmm0 -; AVX-NEXT: retq +; AVX-NEXT: ret{{[l|q]}} %1 = fsub <4 x float> %b, %a %2 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %b, <4 x float> %1 ret <4 x float> %2 @@ -987,12 +1087,12 @@ define <4 x float> @insert_test4_mul_ss(<4 x float> %a, <4 x float> %b) { ; SSE: # %bb.0: ; SSE-NEXT: mulss %xmm0, %xmm1 ; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: retq +; SSE-NEXT: ret{{[l|q]}} ; ; AVX-LABEL: insert_test4_mul_ss: ; AVX: # %bb.0: ; AVX-NEXT: vmulss %xmm0, %xmm1, %xmm0 -; AVX-NEXT: retq +; AVX-NEXT: ret{{[l|q]}} %1 = fmul <4 x float> %b, %a %2 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %b, <4 x float> %1 ret <4 x float> %2 @@ -1003,12 +1103,12 @@ define <4 x float> @insert_test4_div_ss(<4 x float> %a, <4 x float> %b) { ; SSE: # %bb.0: ; SSE-NEXT: divss %xmm0, %xmm1 ; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: retq +; SSE-NEXT: ret{{[l|q]}} ; ; AVX-LABEL: insert_test4_div_ss: ; AVX: # %bb.0: ; AVX-NEXT: vdivss %xmm0, %xmm1, %xmm0 -; AVX-NEXT: retq +; AVX-NEXT: ret{{[l|q]}} %1 = fdiv <4 x float> %b, %a %2 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %b, <4 x float> %1 ret <4 x float> %2 @@ -1019,12 +1119,12 @@ define <2 x double> @insert_test4_add_sd(<2 x double> %a, <2 x double> %b) { ; SSE: # %bb.0: ; SSE-NEXT: addsd %xmm0, %xmm1 ; SSE-NEXT: movapd %xmm1, %xmm0 -; SSE-NEXT: retq +; SSE-NEXT: ret{{[l|q]}} ; ; AVX-LABEL: insert_test4_add_sd: ; AVX: # %bb.0: ; AVX-NEXT: vaddsd %xmm0, %xmm1, %xmm0 -; AVX-NEXT: retq +; AVX-NEXT: ret{{[l|q]}} %1 = fadd <2 x double> %b, %a %2 = select <2 x i1> <i1 false, i1 true>, <2 x double> %b, <2 x double> %1 ret <2 x double> %2 @@ -1035,12 +1135,12 @@ define <2 x double> @insert_test4_sub_sd(<2 x double> %a, <2 x double> %b) { ; SSE: # %bb.0: ; SSE-NEXT: subsd %xmm0, %xmm1 ; SSE-NEXT: movapd %xmm1, %xmm0 -; SSE-NEXT: retq +; SSE-NEXT: ret{{[l|q]}} ; ; AVX-LABEL: insert_test4_sub_sd: ; AVX: # %bb.0: ; AVX-NEXT: vsubsd %xmm0, %xmm1, %xmm0 -; AVX-NEXT: retq +; AVX-NEXT: ret{{[l|q]}} %1 = fsub <2 x double> %b, %a %2 = select <2 x i1> <i1 false, i1 true>, <2 x double> %b, <2 x double> %1 ret <2 x double> %2 @@ -1051,12 +1151,12 @@ define <2 x double> @insert_test4_mul_sd(<2 x double> %a, <2 x double> %b) { ; SSE: # %bb.0: ; SSE-NEXT: mulsd %xmm0, %xmm1 ; SSE-NEXT: movapd %xmm1, %xmm0 -; SSE-NEXT: retq +; SSE-NEXT: ret{{[l|q]}} ; ; AVX-LABEL: insert_test4_mul_sd: ; AVX: # %bb.0: ; AVX-NEXT: vmulsd %xmm0, %xmm1, %xmm0 -; AVX-NEXT: retq +; AVX-NEXT: ret{{[l|q]}} %1 = fmul <2 x double> %b, %a %2 = select <2 x i1> <i1 false, i1 true>, <2 x double> %b, <2 x double> %1 ret <2 x double> %2 @@ -1067,58 +1167,100 @@ define <2 x double> @insert_test4_div_sd(<2 x double> %a, <2 x double> %b) { ; SSE: # %bb.0: ; SSE-NEXT: divsd %xmm0, %xmm1 ; SSE-NEXT: movapd %xmm1, %xmm0 -; SSE-NEXT: retq +; SSE-NEXT: ret{{[l|q]}} ; ; AVX-LABEL: insert_test4_div_sd: ; AVX: # %bb.0: ; AVX-NEXT: vdivsd %xmm0, %xmm1, %xmm0 -; AVX-NEXT: retq +; AVX-NEXT: ret{{[l|q]}} %1 = fdiv <2 x double> %b, %a %2 = select <2 x i1> <i1 false, i1 true>, <2 x double> %b, <2 x double> %1 ret <2 x double> %2 } define <4 x float> @add_ss_mask(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) { -; SSE2-LABEL: add_ss_mask: -; SSE2: # %bb.0: -; SSE2-NEXT: testb $1, %dil -; SSE2-NEXT: jne .LBB62_1 -; SSE2-NEXT: # %bb.2: -; SSE2-NEXT: movss {{.*#+}} xmm0 = 
xmm2[0],xmm0[1,2,3] -; SSE2-NEXT: retq -; SSE2-NEXT: .LBB62_1: -; SSE2-NEXT: addss %xmm0, %xmm1 -; SSE2-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] -; SSE2-NEXT: retq -; -; SSE41-LABEL: add_ss_mask: -; SSE41: # %bb.0: -; SSE41-NEXT: testb $1, %dil -; SSE41-NEXT: jne .LBB62_1 -; SSE41-NEXT: # %bb.2: -; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3] -; SSE41-NEXT: retq -; SSE41-NEXT: .LBB62_1: -; SSE41-NEXT: addss %xmm0, %xmm1 -; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] -; SSE41-NEXT: retq -; -; AVX1-LABEL: add_ss_mask: -; AVX1: # %bb.0: -; AVX1-NEXT: testb $1, %dil -; AVX1-NEXT: je .LBB62_2 -; AVX1-NEXT: # %bb.1: -; AVX1-NEXT: vaddss %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: .LBB62_2: -; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3] -; AVX1-NEXT: retq -; -; AVX512-LABEL: add_ss_mask: -; AVX512: # %bb.0: -; AVX512-NEXT: kmovw %edi, %k1 -; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm2 {%k1} -; AVX512-NEXT: vmovaps %xmm2, %xmm0 -; AVX512-NEXT: retq +; X86-SSE2-LABEL: add_ss_mask: +; X86-SSE2: # %bb.0: +; X86-SSE2-NEXT: testb $1, {{[0-9]+}}(%esp) +; X86-SSE2-NEXT: jne .LBB62_1 +; X86-SSE2-NEXT: # %bb.2: +; X86-SSE2-NEXT: movss {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3] +; X86-SSE2-NEXT: retl +; X86-SSE2-NEXT: .LBB62_1: +; X86-SSE2-NEXT: addss %xmm0, %xmm1 +; X86-SSE2-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] +; X86-SSE2-NEXT: retl +; +; X86-SSE41-LABEL: add_ss_mask: +; X86-SSE41: # %bb.0: +; X86-SSE41-NEXT: testb $1, {{[0-9]+}}(%esp) +; X86-SSE41-NEXT: jne .LBB62_1 +; X86-SSE41-NEXT: # %bb.2: +; X86-SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3] +; X86-SSE41-NEXT: retl +; X86-SSE41-NEXT: .LBB62_1: +; X86-SSE41-NEXT: addss %xmm0, %xmm1 +; X86-SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] +; X86-SSE41-NEXT: retl +; +; X86-AVX1-LABEL: add_ss_mask: +; X86-AVX1: # %bb.0: +; X86-AVX1-NEXT: testb $1, {{[0-9]+}}(%esp) +; X86-AVX1-NEXT: je .LBB62_2 +; X86-AVX1-NEXT: # %bb.1: +; X86-AVX1-NEXT: vaddss %xmm1, %xmm0, %xmm2 +; X86-AVX1-NEXT: .LBB62_2: +; X86-AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3] +; X86-AVX1-NEXT: retl +; +; X86-AVX512-LABEL: add_ss_mask: +; X86-AVX512: # %bb.0: +; X86-AVX512-NEXT: movb {{[0-9]+}}(%esp), %al +; X86-AVX512-NEXT: kmovw %eax, %k1 +; X86-AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm2 {%k1} +; X86-AVX512-NEXT: vmovaps %xmm2, %xmm0 +; X86-AVX512-NEXT: retl +; +; X64-SSE2-LABEL: add_ss_mask: +; X64-SSE2: # %bb.0: +; X64-SSE2-NEXT: testb $1, %dil +; X64-SSE2-NEXT: jne .LBB62_1 +; X64-SSE2-NEXT: # %bb.2: +; X64-SSE2-NEXT: movss {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3] +; X64-SSE2-NEXT: retq +; X64-SSE2-NEXT: .LBB62_1: +; X64-SSE2-NEXT: addss %xmm0, %xmm1 +; X64-SSE2-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] +; X64-SSE2-NEXT: retq +; +; X64-SSE41-LABEL: add_ss_mask: +; X64-SSE41: # %bb.0: +; X64-SSE41-NEXT: testb $1, %dil +; X64-SSE41-NEXT: jne .LBB62_1 +; X64-SSE41-NEXT: # %bb.2: +; X64-SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3] +; X64-SSE41-NEXT: retq +; X64-SSE41-NEXT: .LBB62_1: +; X64-SSE41-NEXT: addss %xmm0, %xmm1 +; X64-SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] +; X64-SSE41-NEXT: retq +; +; X64-AVX1-LABEL: add_ss_mask: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: testb $1, %dil +; X64-AVX1-NEXT: je .LBB62_2 +; X64-AVX1-NEXT: # %bb.1: +; X64-AVX1-NEXT: vaddss %xmm1, %xmm0, %xmm2 +; X64-AVX1-NEXT: .LBB62_2: +; X64-AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3] +; X64-AVX1-NEXT: retq +; +; X64-AVX512-LABEL: add_ss_mask: +; X64-AVX512: # %bb.0: +; X64-AVX512-NEXT: kmovw %edi, %k1 +; X64-AVX512-NEXT: 
vaddss %xmm1, %xmm0, %xmm2 {%k1} +; X64-AVX512-NEXT: vmovaps %xmm2, %xmm0 +; X64-AVX512-NEXT: retq %1 = extractelement <4 x float> %a, i64 0 %2 = extractelement <4 x float> %b, i64 0 %3 = fadd float %1, %2 @@ -1131,46 +1273,88 @@ define <4 x float> @add_ss_mask(<4 x float> %a, <4 x float> %b, <4 x float> %c, } define <2 x double> @add_sd_mask(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) { -; SSE2-LABEL: add_sd_mask: -; SSE2: # %bb.0: -; SSE2-NEXT: testb $1, %dil -; SSE2-NEXT: jne .LBB63_1 -; SSE2-NEXT: # %bb.2: -; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] -; SSE2-NEXT: retq -; SSE2-NEXT: .LBB63_1: -; SSE2-NEXT: addsd %xmm0, %xmm1 -; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] -; SSE2-NEXT: retq -; -; SSE41-LABEL: add_sd_mask: -; SSE41: # %bb.0: -; SSE41-NEXT: testb $1, %dil -; SSE41-NEXT: jne .LBB63_1 -; SSE41-NEXT: # %bb.2: -; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] -; SSE41-NEXT: retq -; SSE41-NEXT: .LBB63_1: -; SSE41-NEXT: addsd %xmm0, %xmm1 -; SSE41-NEXT: blendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1] -; SSE41-NEXT: retq -; -; AVX1-LABEL: add_sd_mask: -; AVX1: # %bb.0: -; AVX1-NEXT: testb $1, %dil -; AVX1-NEXT: je .LBB63_2 -; AVX1-NEXT: # %bb.1: -; AVX1-NEXT: vaddsd %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: .LBB63_2: -; AVX1-NEXT: vblendpd {{.*#+}} xmm0 = xmm2[0],xmm0[1] -; AVX1-NEXT: retq -; -; AVX512-LABEL: add_sd_mask: -; AVX512: # %bb.0: -; AVX512-NEXT: kmovw %edi, %k1 -; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm2 {%k1} -; AVX512-NEXT: vmovapd %xmm2, %xmm0 -; AVX512-NEXT: retq +; X86-SSE2-LABEL: add_sd_mask: +; X86-SSE2: # %bb.0: +; X86-SSE2-NEXT: testb $1, {{[0-9]+}}(%esp) +; X86-SSE2-NEXT: jne .LBB63_1 +; X86-SSE2-NEXT: # %bb.2: +; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] +; X86-SSE2-NEXT: retl +; X86-SSE2-NEXT: .LBB63_1: +; X86-SSE2-NEXT: addsd %xmm0, %xmm1 +; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] +; X86-SSE2-NEXT: retl +; +; X86-SSE41-LABEL: add_sd_mask: +; X86-SSE41: # %bb.0: +; X86-SSE41-NEXT: testb $1, {{[0-9]+}}(%esp) +; X86-SSE41-NEXT: jne .LBB63_1 +; X86-SSE41-NEXT: # %bb.2: +; X86-SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] +; X86-SSE41-NEXT: retl +; X86-SSE41-NEXT: .LBB63_1: +; X86-SSE41-NEXT: addsd %xmm0, %xmm1 +; X86-SSE41-NEXT: blendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1] +; X86-SSE41-NEXT: retl +; +; X86-AVX1-LABEL: add_sd_mask: +; X86-AVX1: # %bb.0: +; X86-AVX1-NEXT: testb $1, {{[0-9]+}}(%esp) +; X86-AVX1-NEXT: je .LBB63_2 +; X86-AVX1-NEXT: # %bb.1: +; X86-AVX1-NEXT: vaddsd %xmm1, %xmm0, %xmm2 +; X86-AVX1-NEXT: .LBB63_2: +; X86-AVX1-NEXT: vblendpd {{.*#+}} xmm0 = xmm2[0],xmm0[1] +; X86-AVX1-NEXT: retl +; +; X86-AVX512-LABEL: add_sd_mask: +; X86-AVX512: # %bb.0: +; X86-AVX512-NEXT: movb {{[0-9]+}}(%esp), %al +; X86-AVX512-NEXT: kmovw %eax, %k1 +; X86-AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm2 {%k1} +; X86-AVX512-NEXT: vmovapd %xmm2, %xmm0 +; X86-AVX512-NEXT: retl +; +; X64-SSE2-LABEL: add_sd_mask: +; X64-SSE2: # %bb.0: +; X64-SSE2-NEXT: testb $1, %dil +; X64-SSE2-NEXT: jne .LBB63_1 +; X64-SSE2-NEXT: # %bb.2: +; X64-SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] +; X64-SSE2-NEXT: retq +; X64-SSE2-NEXT: .LBB63_1: +; X64-SSE2-NEXT: addsd %xmm0, %xmm1 +; X64-SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] +; X64-SSE2-NEXT: retq +; +; X64-SSE41-LABEL: add_sd_mask: +; X64-SSE41: # %bb.0: +; X64-SSE41-NEXT: testb $1, %dil +; X64-SSE41-NEXT: jne .LBB63_1 +; X64-SSE41-NEXT: # %bb.2: +; X64-SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] +; X64-SSE41-NEXT: retq +; X64-SSE41-NEXT: .LBB63_1: +; X64-SSE41-NEXT: 
addsd %xmm0, %xmm1 +; X64-SSE41-NEXT: blendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1] +; X64-SSE41-NEXT: retq +; +; X64-AVX1-LABEL: add_sd_mask: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: testb $1, %dil +; X64-AVX1-NEXT: je .LBB63_2 +; X64-AVX1-NEXT: # %bb.1: +; X64-AVX1-NEXT: vaddsd %xmm1, %xmm0, %xmm2 +; X64-AVX1-NEXT: .LBB63_2: +; X64-AVX1-NEXT: vblendpd {{.*#+}} xmm0 = xmm2[0],xmm0[1] +; X64-AVX1-NEXT: retq +; +; X64-AVX512-LABEL: add_sd_mask: +; X64-AVX512: # %bb.0: +; X64-AVX512-NEXT: kmovw %edi, %k1 +; X64-AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm2 {%k1} +; X64-AVX512-NEXT: vmovapd %xmm2, %xmm0 +; X64-AVX512-NEXT: retq %1 = extractelement <2 x double> %a, i64 0 %2 = extractelement <2 x double> %b, i64 0 %3 = fadd double %1, %2 diff --git a/llvm/test/CodeGen/X86/sse1.ll b/llvm/test/CodeGen/X86/sse1.ll index b405b8aa2f5..2859387de04 100644 --- a/llvm/test/CodeGen/X86/sse1.ll +++ b/llvm/test/CodeGen/X86/sse1.ll @@ -1,7 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=i386-unknown-unknown -mattr=+sse -O3 | FileCheck %s --check-prefixes=CHECK,X86 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=-sse2,+sse -O3 | FileCheck %s --check-prefixes=CHECK,X64 + ; Tests for SSE1 and below, without SSE2+. -; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=pentium3 -O3 | FileCheck %s --check-prefix=X32 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=-sse2,+sse -O3 | FileCheck %s --check-prefix=X64 ; PR7993 ;define <4 x i32> @test3(<4 x i16> %a) nounwind { @@ -13,25 +14,15 @@ ; vector that this ends up returning. ; rdar://8368414 define <2 x float> @test4(<2 x float> %A, <2 x float> %B) nounwind { -; X32-LABEL: test4: -; X32: # %bb.0: # %entry -; X32-NEXT: movaps %xmm0, %xmm2 -; X32-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm0[2,3] -; X32-NEXT: addss %xmm1, %xmm0 -; X32-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,2,3] -; X32-NEXT: subss %xmm1, %xmm2 -; X32-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; X32-NEXT: retl -; -; X64-LABEL: test4: -; X64: # %bb.0: # %entry -; X64-NEXT: movaps %xmm0, %xmm2 -; X64-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm0[2,3] -; X64-NEXT: addss %xmm1, %xmm0 -; X64-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,2,3] -; X64-NEXT: subss %xmm1, %xmm2 -; X64-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; X64-NEXT: retq +; CHECK-LABEL: test4: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movaps %xmm0, %xmm2 +; CHECK-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm0[2,3] +; CHECK-NEXT: addss %xmm1, %xmm0 +; CHECK-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,2,3] +; CHECK-NEXT: subss %xmm1, %xmm2 +; CHECK-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; CHECK-NEXT: ret{{[l|q]}} entry: %tmp7 = extractelement <2 x float> %A, i32 0 %tmp5 = extractelement <2 x float> %A, i32 1 @@ -51,44 +42,44 @@ entry: ; PR18036 define <4 x float> @vselect(<4 x float>*%p, <4 x i32> %q) { -; X32-LABEL: vselect: -; X32: # %bb.0: # %entry -; X32-NEXT: cmpl $0, {{[0-9]+}}(%esp) -; X32-NEXT: xorps %xmm0, %xmm0 -; X32-NEXT: je .LBB1_1 -; X32-NEXT: # %bb.2: # %entry -; X32-NEXT: xorps %xmm1, %xmm1 -; X32-NEXT: cmpl $0, {{[0-9]+}}(%esp) -; X32-NEXT: jne .LBB1_5 -; X32-NEXT: .LBB1_4: -; X32-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; X32-NEXT: cmpl $0, {{[0-9]+}}(%esp) -; X32-NEXT: jne .LBB1_8 -; X32-NEXT: .LBB1_7: -; X32-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero -; X32-NEXT: cmpl $0, {{[0-9]+}}(%esp) -; X32-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; X32-NEXT: je .LBB1_10 -; 
X32-NEXT: jmp .LBB1_11 -; X32-NEXT: .LBB1_1: -; X32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; X32-NEXT: cmpl $0, {{[0-9]+}}(%esp) -; X32-NEXT: je .LBB1_4 -; X32-NEXT: .LBB1_5: # %entry -; X32-NEXT: xorps %xmm2, %xmm2 -; X32-NEXT: cmpl $0, {{[0-9]+}}(%esp) -; X32-NEXT: je .LBB1_7 -; X32-NEXT: .LBB1_8: # %entry -; X32-NEXT: xorps %xmm3, %xmm3 -; X32-NEXT: cmpl $0, {{[0-9]+}}(%esp) -; X32-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; X32-NEXT: jne .LBB1_11 -; X32-NEXT: .LBB1_10: -; X32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X32-NEXT: .LBB1_11: # %entry -; X32-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; X32-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; X32-NEXT: retl +; X86-LABEL: vselect: +; X86: # %bb.0: # %entry +; X86-NEXT: cmpl $0, {{[0-9]+}}(%esp) +; X86-NEXT: xorps %xmm0, %xmm0 +; X86-NEXT: je .LBB1_1 +; X86-NEXT: # %bb.2: # %entry +; X86-NEXT: xorps %xmm1, %xmm1 +; X86-NEXT: cmpl $0, {{[0-9]+}}(%esp) +; X86-NEXT: jne .LBB1_5 +; X86-NEXT: .LBB1_4: +; X86-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; X86-NEXT: cmpl $0, {{[0-9]+}}(%esp) +; X86-NEXT: jne .LBB1_8 +; X86-NEXT: .LBB1_7: +; X86-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero +; X86-NEXT: cmpl $0, {{[0-9]+}}(%esp) +; X86-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; X86-NEXT: je .LBB1_10 +; X86-NEXT: jmp .LBB1_11 +; X86-NEXT: .LBB1_1: +; X86-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X86-NEXT: cmpl $0, {{[0-9]+}}(%esp) +; X86-NEXT: je .LBB1_4 +; X86-NEXT: .LBB1_5: # %entry +; X86-NEXT: xorps %xmm2, %xmm2 +; X86-NEXT: cmpl $0, {{[0-9]+}}(%esp) +; X86-NEXT: je .LBB1_7 +; X86-NEXT: .LBB1_8: # %entry +; X86-NEXT: xorps %xmm3, %xmm3 +; X86-NEXT: cmpl $0, {{[0-9]+}}(%esp) +; X86-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; X86-NEXT: jne .LBB1_11 +; X86-NEXT: .LBB1_10: +; X86-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-NEXT: .LBB1_11: # %entry +; X86-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; X86-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; X86-NEXT: retl ; ; X64-LABEL: vselect: ; X64: # %bb.0: # %entry @@ -137,15 +128,10 @@ entry: ; v4i32 isn't legal for SSE1, but this should be cmpps. 
 define <4 x float> @PR28044(<4 x float> %a0, <4 x float> %a1) nounwind {
-; X32-LABEL: PR28044:
-; X32: # %bb.0:
-; X32-NEXT: cmpeqps %xmm1, %xmm0
-; X32-NEXT: retl
-;
-; X64-LABEL: PR28044:
-; X64: # %bb.0:
-; X64-NEXT: cmpeqps %xmm1, %xmm0
-; X64-NEXT: retq
+; CHECK-LABEL: PR28044:
+; CHECK: # %bb.0:
+; CHECK-NEXT: cmpeqps %xmm1, %xmm0
+; CHECK-NEXT: ret{{[l|q]}}
  %cmp = fcmp oeq <4 x float> %a0, %a1
  %sext = sext <4 x i1> %cmp to <4 x i32>
  %res = bitcast <4 x i32> %sext to <4 x float>
@@ -156,51 +142,51 @@ define <4 x float> @PR28044(<4 x float> %a0, <4 x float> %a1) nounwind {
 ; https://llvm.org/bugs/show_bug.cgi?id=30512
 define <4 x i32> @PR30512(<4 x i32> %x, <4 x i32> %y) nounwind {
-; X32-LABEL: PR30512:
-; X32: # %bb.0:
-; X32-NEXT: pushl %ebx
-; X32-NEXT: pushl %edi
-; X32-NEXT: pushl %esi
-; X32-NEXT: subl $16, %esp
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X32-NEXT: xorl %ebx, %ebx
-; X32-NEXT: cmpl {{[0-9]+}}(%esp), %edi
-; X32-NEXT: sete %bl
-; X32-NEXT: negl %ebx
-; X32-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; X32-NEXT: xorl %ebx, %ebx
-; X32-NEXT: cmpl {{[0-9]+}}(%esp), %esi
-; X32-NEXT: sete %bl
-; X32-NEXT: negl %ebx
-; X32-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; X32-NEXT: xorl %ebx, %ebx
-; X32-NEXT: cmpl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: sete %bl
-; X32-NEXT: negl %ebx
-; X32-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; X32-NEXT: xorl %edx, %edx
-; X32-NEXT: cmpl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: sete %dl
-; X32-NEXT: negl %edx
-; X32-NEXT: movl %edx, (%esp)
-; X32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X32-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; X32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X32-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; X32-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
-; X32-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0]
-; X32-NEXT: andps {{\.LCPI.*}}, %xmm2
-; X32-NEXT: movaps %xmm2, (%eax)
-; X32-NEXT: addl $16, %esp
-; X32-NEXT: popl %esi
-; X32-NEXT: popl %edi
-; X32-NEXT: popl %ebx
-; X32-NEXT: retl $4
+; X86-LABEL: PR30512:
+; X86: # %bb.0:
+; X86-NEXT: pushl %ebx
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl %esi
+; X86-NEXT: subl $16, %esp
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: xorl %ebx, %ebx
+; X86-NEXT: cmpl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: sete %bl
+; X86-NEXT: negl %ebx
+; X86-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; X86-NEXT: xorl %ebx, %ebx
+; X86-NEXT: cmpl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: sete %bl
+; X86-NEXT: negl %ebx
+; X86-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; X86-NEXT: xorl %ebx, %ebx
+; X86-NEXT: cmpl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: sete %bl
+; X86-NEXT: negl %ebx
+; X86-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; X86-NEXT: xorl %edx, %edx
+; X86-NEXT: cmpl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: sete %dl
+; X86-NEXT: negl %edx
+; X86-NEXT: movl %edx, (%esp)
+; X86-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X86-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; X86-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; X86-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; X86-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0]
+; X86-NEXT: andps {{\.LCPI.*}}, %xmm2
+; X86-NEXT: movaps %xmm2, (%eax)
+; X86-NEXT: addl $16, %esp
+; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
+; X86-NEXT: popl %ebx
+; X86-NEXT: retl $4
 ;
 ; X64-LABEL: PR30512:
 ; X64: # %bb.0:
@@ -250,10 +236,10 @@ define <4 x i32> @PR30512(<4 x i32> %x, <4 x i32> %y) nounwind {
 ; scalarizing it anyway.
 
 define <2 x float> @PR31672() #0 {
-; X32-LABEL: PR31672:
-; X32: # %bb.0:
-; X32-NEXT: sqrtps {{\.LCPI.*}}, %xmm0
-; X32-NEXT: retl
+; X86-LABEL: PR31672:
+; X86: # %bb.0:
+; X86-NEXT: sqrtps {{\.LCPI.*}}, %xmm0
+; X86-NEXT: retl
 ;
 ; X64-LABEL: PR31672:
 ; X64: # %bb.0:
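
The add_sd_mask hunk near the top of this change shows only the first three IR lines of the test body; the mask/select tail is cut off by the diff context. For reference, IR of roughly the following shape produces the checked assembly: bit 0 of the i8 mask selects between the scalar sum and the low element of a passthrough operand, and the result is reinserted into lane 0 of the first source vector. This is a reconstruction from the CHECK lines, not a quote of the test; treat %c, %mask, and the %4..%8 tail as assumptions.

define <2 x double> @add_sd_mask(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) {
  ; Scalar add of the low elements (these three lines are visible in the hunk).
  %1 = extractelement <2 x double> %a, i64 0
  %2 = extractelement <2 x double> %b, i64 0
  %3 = fadd double %1, %2
  ; Reconstructed tail (assumption): use mask bit 0 to pick either the sum or
  ; the passthrough element from %c, then put the result back into lane 0 of %a.
  %4 = bitcast i8 %mask to <8 x i1>
  %5 = extractelement <8 x i1> %4, i64 0
  %6 = extractelement <2 x double> %c, i64 0
  %7 = select i1 %5, double %3, double %6
  %8 = insertelement <2 x double> %a, double %7, i64 0
  ret <2 x double> %8
}

Under that reading the three check blocks line up: AVX512 folds the whole pattern into a single merge-masked vaddsd with %c preloaded into the destination register, while SSE4.1 and AVX1 have no masked scalar ops and fall back on a testb branch plus a blendpd.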
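Similarly, the PR30512 hunks show only the generated assembly, not the IR. Assembly of this shape (a scalar cmpl/sete/negl per lane, the lanes repacked with movss/unpcklps/movlhps, then andps against a constant-pool vector) is what an equality compare of two <4 x i32> values lowers to when only SSE1 is available; a sketch of IR consistent with it, again an assumption rather than a quote:

define <4 x i32> @PR30512(<4 x i32> %x, <4 x i32> %y) nounwind {
  ; Without SSE2 there is no pcmpeqd, so each lane is compared with scalar
  ; code: sete/negl yields 0 or -1 per lane, and the final andps with a
  ; {1,1,1,1} constant narrows that to the 0/1 values a zext requires.
  %cmp = icmp eq <4 x i32> %x, %y
  %zext = zext <4 x i1> %cmp to <4 x i32>
  ret <4 x i32> %zext
}

The retl $4 and the store through (%eax) in the X86 version come from the i386 ABI: a <4 x i32> return value is passed through a hidden sret pointer, which the callee pops on return.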

