Diffstat (limited to 'llvm')
 -rw-r--r--   llvm/test/CodeGen/X86/sse3-intrinsics-fast-isel.ll    | 210
 -rw-r--r--   llvm/test/CodeGen/X86/sse3-intrinsics-x86.ll          | 141
 -rw-r--r--   llvm/test/CodeGen/X86/ssse3-intrinsics-fast-isel.ll   | 292
 -rw-r--r--   llvm/test/CodeGen/X86/ssse3-intrinsics-x86.ll         | 264
4 files changed, 492 insertions, 415 deletions
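The patch mechanically regenerates these four test files: the old X32/X64-only FileCheck prefixes are replaced by a shared scheme (CHECK, X86/X64, SSE/AVX, AVX1/AVX512, plus per-target variants such as X86-SSE and X64-AVX512), AVX and AVX512 RUN lines are added, and the retl/retq assertions are collapsed into the ret{{[l|q]}} regex so one block of checks can serve both 32-bit and 64-bit runs. As a condensed illustration of that convention (a sketch distilled from the patch, with an arbitrary function name, not a file the commit adds), a test using the shared prefixes looks like this:

; RUN: llc < %s -mtriple=i386-unknown-unknown -mattr=+sse3 | FileCheck %s --check-prefixes=CHECK,SSE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX
define <2 x double> @addsub(<2 x double> %a0, <2 x double> %a1) {
; SSE-LABEL: addsub:
; SSE:       addsubpd %xmm1, %xmm0
; SSE-NEXT:  ret{{[l|q]}}
;
; AVX-LABEL: addsub:
; AVX:       vaddsubpd %xmm1, %xmm0, %xmm0
; AVX-NEXT:  ret{{[l|q]}}
  %res = call <2 x double> @llvm.x86.sse3.addsub.pd(<2 x double> %a0, <2 x double> %a1)
  ret <2 x double> %res
}
declare <2 x double> @llvm.x86.sse3.addsub.pd(<2 x double>, <2 x double>) nounwind readnone

The SSE and AVX check blocks are shared by the 32-bit and 64-bit RUN lines because the generated code differs only in the return instruction, which the ret{{[l|q]}} regex absorbs; checks that genuinely differ per target (for example loads through %esp vs. %rdi) fall back to the X86-SSE/X64-SSE style prefixes seen in the diff below.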
diff --git a/llvm/test/CodeGen/X86/sse3-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/sse3-intrinsics-fast-isel.ll index 5bf36a51c76..653a3a31d04 100644 --- a/llvm/test/CodeGen/X86/sse3-intrinsics-fast-isel.ll +++ b/llvm/test/CodeGen/X86/sse3-intrinsics-fast-isel.ll @@ -1,110 +1,125 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -fast-isel -mtriple=i386-unknown-unknown -mattr=+sse3 | FileCheck %s --check-prefix=ALL --check-prefix=X32 -; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+sse3 | FileCheck %s --check-prefix=ALL --check-prefix=X64 +; RUN: llc < %s -fast-isel -mtriple=i386-unknown-unknown -mattr=+sse3 | FileCheck %s --check-prefixes=CHECK,X86,SSE,X86-SSE +; RUN: llc < %s -fast-isel -mtriple=i386-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=CHECK,X86,AVX,X86-AVX,AVX1,X86-AVX1 +; RUN: llc < %s -fast-isel -mtriple=i386-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512dq,+avx512vl | FileCheck %s --check-prefixes=CHECK,X86,AVX,X86-AVX,AVX512,X86-AVX512 +; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+sse3 | FileCheck %s --check-prefixes=CHECK,X64,SSE,X64-SSE +; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=CHECK,X64,AVX,X64-AVX,AVX1,X64-AVX1 +; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512dq,+avx512vl | FileCheck %s --check-prefixes=CHECK,X64,AVX,X64-AVX,AVX512,X64-AVX512 ; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/sse3-builtins.c define <2 x double> @test_mm_addsub_pd(<2 x double> %a0, <2 x double> %a1) { -; X32-LABEL: test_mm_addsub_pd: -; X32: # %bb.0: -; X32-NEXT: addsubpd %xmm1, %xmm0 -; X32-NEXT: retl +; SSE-LABEL: test_mm_addsub_pd: +; SSE: # %bb.0: +; SSE-NEXT: addsubpd %xmm1, %xmm0 +; SSE-NEXT: ret{{[l|q]}} ; -; X64-LABEL: test_mm_addsub_pd: -; X64: # %bb.0: -; X64-NEXT: addsubpd %xmm1, %xmm0 -; X64-NEXT: retq +; AVX-LABEL: test_mm_addsub_pd: +; AVX: # %bb.0: +; AVX-NEXT: vaddsubpd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: ret{{[l|q]}} %res = call <2 x double> @llvm.x86.sse3.addsub.pd(<2 x double> %a0, <2 x double> %a1) ret <2 x double> %res } declare <2 x double> @llvm.x86.sse3.addsub.pd(<2 x double>, <2 x double>) nounwind readnone define <4 x float> @test_mm_addsub_ps(<4 x float> %a0, <4 x float> %a1) { -; X32-LABEL: test_mm_addsub_ps: -; X32: # %bb.0: -; X32-NEXT: addsubps %xmm1, %xmm0 -; X32-NEXT: retl +; SSE-LABEL: test_mm_addsub_ps: +; SSE: # %bb.0: +; SSE-NEXT: addsubps %xmm1, %xmm0 +; SSE-NEXT: ret{{[l|q]}} ; -; X64-LABEL: test_mm_addsub_ps: -; X64: # %bb.0: -; X64-NEXT: addsubps %xmm1, %xmm0 -; X64-NEXT: retq +; AVX-LABEL: test_mm_addsub_ps: +; AVX: # %bb.0: +; AVX-NEXT: vaddsubps %xmm1, %xmm0, %xmm0 +; AVX-NEXT: ret{{[l|q]}} %res = call <4 x float> @llvm.x86.sse3.addsub.ps(<4 x float> %a0, <4 x float> %a1) ret <4 x float> %res } declare <4 x float> @llvm.x86.sse3.addsub.ps(<4 x float>, <4 x float>) nounwind readnone define <2 x double> @test_mm_hadd_pd(<2 x double> %a0, <2 x double> %a1) { -; X32-LABEL: test_mm_hadd_pd: -; X32: # %bb.0: -; X32-NEXT: haddpd %xmm1, %xmm0 -; X32-NEXT: retl +; SSE-LABEL: test_mm_hadd_pd: +; SSE: # %bb.0: +; SSE-NEXT: haddpd %xmm1, %xmm0 +; SSE-NEXT: ret{{[l|q]}} ; -; X64-LABEL: test_mm_hadd_pd: -; X64: # %bb.0: -; X64-NEXT: haddpd %xmm1, %xmm0 -; X64-NEXT: retq +; AVX-LABEL: test_mm_hadd_pd: +; AVX: # %bb.0: +; AVX-NEXT: vhaddpd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: ret{{[l|q]}} %res = call <2 x double> 
@llvm.x86.sse3.hadd.pd(<2 x double> %a0, <2 x double> %a1) ret <2 x double> %res } declare <2 x double> @llvm.x86.sse3.hadd.pd(<2 x double>, <2 x double>) nounwind readnone define <4 x float> @test_mm_hadd_ps(<4 x float> %a0, <4 x float> %a1) { -; X32-LABEL: test_mm_hadd_ps: -; X32: # %bb.0: -; X32-NEXT: haddps %xmm1, %xmm0 -; X32-NEXT: retl +; SSE-LABEL: test_mm_hadd_ps: +; SSE: # %bb.0: +; SSE-NEXT: haddps %xmm1, %xmm0 +; SSE-NEXT: ret{{[l|q]}} ; -; X64-LABEL: test_mm_hadd_ps: -; X64: # %bb.0: -; X64-NEXT: haddps %xmm1, %xmm0 -; X64-NEXT: retq +; AVX-LABEL: test_mm_hadd_ps: +; AVX: # %bb.0: +; AVX-NEXT: vhaddps %xmm1, %xmm0, %xmm0 +; AVX-NEXT: ret{{[l|q]}} %res = call <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float> %a0, <4 x float> %a1) ret <4 x float> %res } declare <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float>, <4 x float>) nounwind readnone define <2 x double> @test_mm_hsub_pd(<2 x double> %a0, <2 x double> %a1) { -; X32-LABEL: test_mm_hsub_pd: -; X32: # %bb.0: -; X32-NEXT: hsubpd %xmm1, %xmm0 -; X32-NEXT: retl +; SSE-LABEL: test_mm_hsub_pd: +; SSE: # %bb.0: +; SSE-NEXT: hsubpd %xmm1, %xmm0 +; SSE-NEXT: ret{{[l|q]}} ; -; X64-LABEL: test_mm_hsub_pd: -; X64: # %bb.0: -; X64-NEXT: hsubpd %xmm1, %xmm0 -; X64-NEXT: retq +; AVX-LABEL: test_mm_hsub_pd: +; AVX: # %bb.0: +; AVX-NEXT: vhsubpd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: ret{{[l|q]}} %res = call <2 x double> @llvm.x86.sse3.hsub.pd(<2 x double> %a0, <2 x double> %a1) ret <2 x double> %res } declare <2 x double> @llvm.x86.sse3.hsub.pd(<2 x double>, <2 x double>) nounwind readnone define <4 x float> @test_mm_hsub_ps(<4 x float> %a0, <4 x float> %a1) { -; X32-LABEL: test_mm_hsub_ps: -; X32: # %bb.0: -; X32-NEXT: hsubps %xmm1, %xmm0 -; X32-NEXT: retl +; SSE-LABEL: test_mm_hsub_ps: +; SSE: # %bb.0: +; SSE-NEXT: hsubps %xmm1, %xmm0 +; SSE-NEXT: ret{{[l|q]}} ; -; X64-LABEL: test_mm_hsub_ps: -; X64: # %bb.0: -; X64-NEXT: hsubps %xmm1, %xmm0 -; X64-NEXT: retq +; AVX-LABEL: test_mm_hsub_ps: +; AVX: # %bb.0: +; AVX-NEXT: vhsubps %xmm1, %xmm0, %xmm0 +; AVX-NEXT: ret{{[l|q]}} %res = call <4 x float> @llvm.x86.sse3.hsub.ps(<4 x float> %a0, <4 x float> %a1) ret <4 x float> %res } declare <4 x float> @llvm.x86.sse3.hsub.ps(<4 x float>, <4 x float>) nounwind readnone define <2 x i64> @test_mm_lddqu_si128(<2 x i64>* %a0) { -; X32-LABEL: test_mm_lddqu_si128: -; X32: # %bb.0: -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: lddqu (%eax), %xmm0 -; X32-NEXT: retl +; X86-SSE-LABEL: test_mm_lddqu_si128: +; X86-SSE: # %bb.0: +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: lddqu (%eax), %xmm0 +; X86-SSE-NEXT: retl ; -; X64-LABEL: test_mm_lddqu_si128: -; X64: # %bb.0: -; X64-NEXT: lddqu (%rdi), %xmm0 -; X64-NEXT: retq +; X86-AVX-LABEL: test_mm_lddqu_si128: +; X86-AVX: # %bb.0: +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX-NEXT: vlddqu (%eax), %xmm0 +; X86-AVX-NEXT: retl +; +; X64-SSE-LABEL: test_mm_lddqu_si128: +; X64-SSE: # %bb.0: +; X64-SSE-NEXT: lddqu (%rdi), %xmm0 +; X64-SSE-NEXT: retq +; +; X64-AVX-LABEL: test_mm_lddqu_si128: +; X64-AVX: # %bb.0: +; X64-AVX-NEXT: vlddqu (%rdi), %xmm0 +; X64-AVX-NEXT: retq %bc = bitcast <2 x i64>* %a0 to i8* %call = call <16 x i8> @llvm.x86.sse3.ldu.dq(i8* %bc) %res = bitcast <16 x i8> %call to <2 x i64> @@ -113,16 +128,27 @@ define <2 x i64> @test_mm_lddqu_si128(<2 x i64>* %a0) { declare <16 x i8> @llvm.x86.sse3.ldu.dq(i8*) nounwind readonly define <2 x double> @test_mm_loaddup_pd(double* %a0) { -; X32-LABEL: test_mm_loaddup_pd: -; X32: # %bb.0: -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: 
movddup {{.*#+}} xmm0 = mem[0,0] -; X32-NEXT: retl +; X86-SSE-LABEL: test_mm_loaddup_pd: +; X86-SSE: # %bb.0: +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: movddup {{.*#+}} xmm0 = mem[0,0] +; X86-SSE-NEXT: retl +; +; X86-AVX-LABEL: test_mm_loaddup_pd: +; X86-AVX: # %bb.0: +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] +; X86-AVX-NEXT: retl +; +; X64-SSE-LABEL: test_mm_loaddup_pd: +; X64-SSE: # %bb.0: +; X64-SSE-NEXT: movddup {{.*#+}} xmm0 = mem[0,0] +; X64-SSE-NEXT: retq ; -; X64-LABEL: test_mm_loaddup_pd: -; X64: # %bb.0: -; X64-NEXT: movddup {{.*#+}} xmm0 = mem[0,0] -; X64-NEXT: retq +; X64-AVX-LABEL: test_mm_loaddup_pd: +; X64-AVX: # %bb.0: +; X64-AVX-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] +; X64-AVX-NEXT: retq %ld = load double, double* %a0 %res0 = insertelement <2 x double> undef, double %ld, i32 0 %res1 = insertelement <2 x double> %res0, double %ld, i32 1 @@ -130,43 +156,43 @@ define <2 x double> @test_mm_loaddup_pd(double* %a0) { } define <2 x double> @test_mm_movedup_pd(<2 x double> %a0) { -; X32-LABEL: test_mm_movedup_pd: -; X32: # %bb.0: -; X32-NEXT: movddup {{.*#+}} xmm0 = xmm0[0,0] -; X32-NEXT: retl +; SSE-LABEL: test_mm_movedup_pd: +; SSE: # %bb.0: +; SSE-NEXT: movddup {{.*#+}} xmm0 = xmm0[0,0] +; SSE-NEXT: ret{{[l|q]}} ; -; X64-LABEL: test_mm_movedup_pd: -; X64: # %bb.0: -; X64-NEXT: movddup {{.*#+}} xmm0 = xmm0[0,0] -; X64-NEXT: retq +; AVX-LABEL: test_mm_movedup_pd: +; AVX: # %bb.0: +; AVX-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] +; AVX-NEXT: ret{{[l|q]}} %res = shufflevector <2 x double> %a0, <2 x double> %a0, <2 x i32> zeroinitializer ret <2 x double> %res } define <4 x float> @test_mm_movehdup_ps(<4 x float> %a0) { -; X32-LABEL: test_mm_movehdup_ps: -; X32: # %bb.0: -; X32-NEXT: movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] -; X32-NEXT: retl +; SSE-LABEL: test_mm_movehdup_ps: +; SSE: # %bb.0: +; SSE-NEXT: movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE-NEXT: ret{{[l|q]}} ; -; X64-LABEL: test_mm_movehdup_ps: -; X64: # %bb.0: -; X64-NEXT: movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] -; X64-NEXT: retq +; AVX-LABEL: test_mm_movehdup_ps: +; AVX: # %bb.0: +; AVX-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] +; AVX-NEXT: ret{{[l|q]}} %res = shufflevector <4 x float> %a0, <4 x float> %a0, <4 x i32> <i32 1, i32 1, i32 3, i32 3> ret <4 x float> %res } define <4 x float> @test_mm_moveldup_ps(<4 x float> %a0) { -; X32-LABEL: test_mm_moveldup_ps: -; X32: # %bb.0: -; X32-NEXT: movsldup {{.*#+}} xmm0 = xmm0[0,0,2,2] -; X32-NEXT: retl +; SSE-LABEL: test_mm_moveldup_ps: +; SSE: # %bb.0: +; SSE-NEXT: movsldup {{.*#+}} xmm0 = xmm0[0,0,2,2] +; SSE-NEXT: ret{{[l|q]}} ; -; X64-LABEL: test_mm_moveldup_ps: -; X64: # %bb.0: -; X64-NEXT: movsldup {{.*#+}} xmm0 = xmm0[0,0,2,2] -; X64-NEXT: retq +; AVX-LABEL: test_mm_moveldup_ps: +; AVX: # %bb.0: +; AVX-NEXT: vmovsldup {{.*#+}} xmm0 = xmm0[0,0,2,2] +; AVX-NEXT: ret{{[l|q]}} %res = shufflevector <4 x float> %a0, <4 x float> %a0, <4 x i32> <i32 0, i32 0, i32 2, i32 2> ret <4 x float> %res } diff --git a/llvm/test/CodeGen/X86/sse3-intrinsics-x86.ll b/llvm/test/CodeGen/X86/sse3-intrinsics-x86.ll index 18bd2195cb9..f97bf08101f 100644 --- a/llvm/test/CodeGen/X86/sse3-intrinsics-x86.ll +++ b/llvm/test/CodeGen/X86/sse3-intrinsics-x86.ll @@ -1,18 +1,21 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=i386-apple-darwin -mattr=-avx,+sse3 -show-mc-encoding | FileCheck %s --check-prefix=CHECK --check-prefix=SSE -; RUN: llc < %s 
-mtriple=i386-apple-darwin -mattr=+avx2 -show-mc-encoding | FileCheck %s --check-prefix=CHECK --check-prefix=VCHECK --check-prefix=AVX2 -; RUN: llc < %s -mtriple=i386-apple-darwin -mcpu=skx -show-mc-encoding | FileCheck %s --check-prefix=CHECK --check-prefix=VCHECK --check-prefix=SKX +; RUN: llc < %s -mtriple=i386-apple-darwin -mattr=+sse3 -show-mc-encoding | FileCheck %s --check-prefixes=CHECK,X86,SSE,X86-SSE +; RUN: llc < %s -mtriple=i386-apple-darwin -mattr=+avx -show-mc-encoding | FileCheck %s --check-prefixes=CHECK,X86,AVX,X86-AVX,AVX1,X86-AVX1 +; RUN: llc < %s -mtriple=i386-apple-darwin -mattr=+avx512f,+avx512bw,+avx512dq,+avx512vl -show-mc-encoding | FileCheck %s --check-prefixes=CHECK,X86,AVX,X86-AVX,AVX512,X86-AVX512 +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+sse3 -show-mc-encoding | FileCheck %s --check-prefixes=CHECK,X64,SSE,X64-SSE +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx -show-mc-encoding | FileCheck %s --check-prefixes=CHECK,X64,AVX,X64-AVX,AVX1,X64-AVX1 +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f,+avx512bw,+avx512dq,+avx512vl -show-mc-encoding | FileCheck %s --check-prefixes=CHECK,X64,AVX,X64-AVX,AVX512,X64-AVX512 define <2 x double> @test_x86_sse3_addsub_pd(<2 x double> %a0, <2 x double> %a1) { ; SSE-LABEL: test_x86_sse3_addsub_pd: ; SSE: ## %bb.0: ; SSE-NEXT: addsubpd %xmm1, %xmm0 ## encoding: [0x66,0x0f,0xd0,0xc1] -; SSE-NEXT: retl ## encoding: [0xc3] +; SSE-NEXT: ret{{[l|q]}} ## encoding: [0xc3] ; -; VCHECK-LABEL: test_x86_sse3_addsub_pd: -; VCHECK: ## %bb.0: -; VCHECK-NEXT: vaddsubpd %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xd0,0xc1] -; VCHECK-NEXT: retl ## encoding: [0xc3] +; AVX-LABEL: test_x86_sse3_addsub_pd: +; AVX: ## %bb.0: +; AVX-NEXT: vaddsubpd %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xd0,0xc1] +; AVX-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call <2 x double> @llvm.x86.sse3.addsub.pd(<2 x double> %a0, <2 x double> %a1) ; <<2 x double>> [#uses=1] ret <2 x double> %res } @@ -23,12 +26,12 @@ define <4 x float> @test_x86_sse3_addsub_ps(<4 x float> %a0, <4 x float> %a1) { ; SSE-LABEL: test_x86_sse3_addsub_ps: ; SSE: ## %bb.0: ; SSE-NEXT: addsubps %xmm1, %xmm0 ## encoding: [0xf2,0x0f,0xd0,0xc1] -; SSE-NEXT: retl ## encoding: [0xc3] +; SSE-NEXT: ret{{[l|q]}} ## encoding: [0xc3] ; -; VCHECK-LABEL: test_x86_sse3_addsub_ps: -; VCHECK: ## %bb.0: -; VCHECK-NEXT: vaddsubps %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xfb,0xd0,0xc1] -; VCHECK-NEXT: retl ## encoding: [0xc3] +; AVX-LABEL: test_x86_sse3_addsub_ps: +; AVX: ## %bb.0: +; AVX-NEXT: vaddsubps %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xfb,0xd0,0xc1] +; AVX-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call <4 x float> @llvm.x86.sse3.addsub.ps(<4 x float> %a0, <4 x float> %a1) ; <<4 x float>> [#uses=1] ret <4 x float> %res } @@ -39,12 +42,12 @@ define <2 x double> @test_x86_sse3_hadd_pd(<2 x double> %a0, <2 x double> %a1) { ; SSE-LABEL: test_x86_sse3_hadd_pd: ; SSE: ## %bb.0: ; SSE-NEXT: haddpd %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x7c,0xc1] -; SSE-NEXT: retl ## encoding: [0xc3] +; SSE-NEXT: ret{{[l|q]}} ## encoding: [0xc3] ; -; VCHECK-LABEL: test_x86_sse3_hadd_pd: -; VCHECK: ## %bb.0: -; VCHECK-NEXT: vhaddpd %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x7c,0xc1] -; VCHECK-NEXT: retl ## encoding: [0xc3] +; AVX-LABEL: test_x86_sse3_hadd_pd: +; AVX: ## %bb.0: +; AVX-NEXT: vhaddpd %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x7c,0xc1] +; AVX-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call <2 x double> @llvm.x86.sse3.hadd.pd(<2 x double> %a0, <2 x double> %a1) ; <<2 x 
double>> [#uses=1] ret <2 x double> %res } @@ -55,12 +58,12 @@ define <4 x float> @test_x86_sse3_hadd_ps(<4 x float> %a0, <4 x float> %a1) { ; SSE-LABEL: test_x86_sse3_hadd_ps: ; SSE: ## %bb.0: ; SSE-NEXT: haddps %xmm1, %xmm0 ## encoding: [0xf2,0x0f,0x7c,0xc1] -; SSE-NEXT: retl ## encoding: [0xc3] +; SSE-NEXT: ret{{[l|q]}} ## encoding: [0xc3] ; -; VCHECK-LABEL: test_x86_sse3_hadd_ps: -; VCHECK: ## %bb.0: -; VCHECK-NEXT: vhaddps %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xfb,0x7c,0xc1] -; VCHECK-NEXT: retl ## encoding: [0xc3] +; AVX-LABEL: test_x86_sse3_hadd_ps: +; AVX: ## %bb.0: +; AVX-NEXT: vhaddps %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xfb,0x7c,0xc1] +; AVX-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float> %a0, <4 x float> %a1) ; <<4 x float>> [#uses=1] ret <4 x float> %res } @@ -71,12 +74,12 @@ define <2 x double> @test_x86_sse3_hsub_pd(<2 x double> %a0, <2 x double> %a1) { ; SSE-LABEL: test_x86_sse3_hsub_pd: ; SSE: ## %bb.0: ; SSE-NEXT: hsubpd %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x7d,0xc1] -; SSE-NEXT: retl ## encoding: [0xc3] +; SSE-NEXT: ret{{[l|q]}} ## encoding: [0xc3] ; -; VCHECK-LABEL: test_x86_sse3_hsub_pd: -; VCHECK: ## %bb.0: -; VCHECK-NEXT: vhsubpd %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x7d,0xc1] -; VCHECK-NEXT: retl ## encoding: [0xc3] +; AVX-LABEL: test_x86_sse3_hsub_pd: +; AVX: ## %bb.0: +; AVX-NEXT: vhsubpd %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x7d,0xc1] +; AVX-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call <2 x double> @llvm.x86.sse3.hsub.pd(<2 x double> %a0, <2 x double> %a1) ; <<2 x double>> [#uses=1] ret <2 x double> %res } @@ -87,12 +90,12 @@ define <4 x float> @test_x86_sse3_hsub_ps(<4 x float> %a0, <4 x float> %a1) { ; SSE-LABEL: test_x86_sse3_hsub_ps: ; SSE: ## %bb.0: ; SSE-NEXT: hsubps %xmm1, %xmm0 ## encoding: [0xf2,0x0f,0x7d,0xc1] -; SSE-NEXT: retl ## encoding: [0xc3] +; SSE-NEXT: ret{{[l|q]}} ## encoding: [0xc3] ; -; VCHECK-LABEL: test_x86_sse3_hsub_ps: -; VCHECK: ## %bb.0: -; VCHECK-NEXT: vhsubps %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xfb,0x7d,0xc1] -; VCHECK-NEXT: retl ## encoding: [0xc3] +; AVX-LABEL: test_x86_sse3_hsub_ps: +; AVX: ## %bb.0: +; AVX-NEXT: vhsubps %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xfb,0x7d,0xc1] +; AVX-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call <4 x float> @llvm.x86.sse3.hsub.ps(<4 x float> %a0, <4 x float> %a1) ; <<4 x float>> [#uses=1] ret <4 x float> %res } @@ -100,17 +103,27 @@ declare <4 x float> @llvm.x86.sse3.hsub.ps(<4 x float>, <4 x float>) nounwind re define <16 x i8> @test_x86_sse3_ldu_dq(i8* %a0) { -; SSE-LABEL: test_x86_sse3_ldu_dq: -; SSE: ## %bb.0: -; SSE-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] -; SSE-NEXT: lddqu (%eax), %xmm0 ## encoding: [0xf2,0x0f,0xf0,0x00] -; SSE-NEXT: retl ## encoding: [0xc3] +; X86-SSE-LABEL: test_x86_sse3_ldu_dq: +; X86-SSE: ## %bb.0: +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] +; X86-SSE-NEXT: lddqu (%eax), %xmm0 ## encoding: [0xf2,0x0f,0xf0,0x00] +; X86-SSE-NEXT: retl ## encoding: [0xc3] +; +; X86-AVX-LABEL: test_x86_sse3_ldu_dq: +; X86-AVX: ## %bb.0: +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] +; X86-AVX-NEXT: vlddqu (%eax), %xmm0 ## encoding: [0xc5,0xfb,0xf0,0x00] +; X86-AVX-NEXT: retl ## encoding: [0xc3] +; +; X64-SSE-LABEL: test_x86_sse3_ldu_dq: +; X64-SSE: ## %bb.0: +; X64-SSE-NEXT: lddqu (%rdi), %xmm0 ## encoding: [0xf2,0x0f,0xf0,0x07] +; X64-SSE-NEXT: retq ## encoding: [0xc3] ; -; VCHECK-LABEL: test_x86_sse3_ldu_dq: -; 
VCHECK: ## %bb.0: -; VCHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] -; VCHECK-NEXT: vlddqu (%eax), %xmm0 ## encoding: [0xc5,0xfb,0xf0,0x00] -; VCHECK-NEXT: retl ## encoding: [0xc3] +; X64-AVX-LABEL: test_x86_sse3_ldu_dq: +; X64-AVX: ## %bb.0: +; X64-AVX-NEXT: vlddqu (%rdi), %xmm0 ## encoding: [0xc5,0xfb,0xf0,0x07] +; X64-AVX-NEXT: retq ## encoding: [0xc3] %res = call <16 x i8> @llvm.x86.sse3.ldu.dq(i8* %a0) ; <<16 x i8>> [#uses=1] ret <16 x i8> %res } @@ -119,26 +132,40 @@ declare <16 x i8> @llvm.x86.sse3.ldu.dq(i8*) nounwind readonly ; Make sure instructions with no AVX equivalents, but are associated with SSEX feature flags still work define void @monitor(i8* %P, i32 %E, i32 %H) nounwind { -; CHECK-LABEL: monitor: -; CHECK: ## %bb.0: -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx ## encoding: [0x8b,0x54,0x24,0x0c] -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx ## encoding: [0x8b,0x4c,0x24,0x08] -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] -; CHECK-NEXT: leal (%eax), %eax ## encoding: [0x8d,0x00] -; CHECK-NEXT: monitor ## encoding: [0x0f,0x01,0xc8] -; CHECK-NEXT: retl ## encoding: [0xc3] +; X86-LABEL: monitor: +; X86: ## %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ## encoding: [0x8b,0x54,0x24,0x0c] +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ## encoding: [0x8b,0x4c,0x24,0x08] +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] +; X86-NEXT: leal (%eax), %eax ## encoding: [0x8d,0x00] +; X86-NEXT: monitor ## encoding: [0x0f,0x01,0xc8] +; X86-NEXT: retl ## encoding: [0xc3] +; +; X64-LABEL: monitor: +; X64: ## %bb.0: +; X64-NEXT: leaq (%rdi), %rax ## encoding: [0x48,0x8d,0x07] +; X64-NEXT: movl %esi, %ecx ## encoding: [0x89,0xf1] +; X64-NEXT: monitor ## encoding: [0x0f,0x01,0xc8] +; X64-NEXT: retq ## encoding: [0xc3] tail call void @llvm.x86.sse3.monitor(i8* %P, i32 %E, i32 %H) ret void } declare void @llvm.x86.sse3.monitor(i8*, i32, i32) nounwind define void @mwait(i32 %E, i32 %H) nounwind { -; CHECK-LABEL: mwait: -; CHECK: ## %bb.0: -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx ## encoding: [0x8b,0x4c,0x24,0x04] -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x08] -; CHECK-NEXT: mwait ## encoding: [0x0f,0x01,0xc9] -; CHECK-NEXT: retl ## encoding: [0xc3] +; X86-LABEL: mwait: +; X86: ## %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ## encoding: [0x8b,0x4c,0x24,0x04] +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x08] +; X86-NEXT: mwait ## encoding: [0x0f,0x01,0xc9] +; X86-NEXT: retl ## encoding: [0xc3] +; +; X64-LABEL: mwait: +; X64: ## %bb.0: +; X64-NEXT: movl %edi, %ecx ## encoding: [0x89,0xf9] +; X64-NEXT: movl %esi, %eax ## encoding: [0x89,0xf0] +; X64-NEXT: mwait ## encoding: [0x0f,0x01,0xc9] +; X64-NEXT: retq ## encoding: [0xc3] tail call void @llvm.x86.sse3.mwait(i32 %E, i32 %H) ret void } diff --git a/llvm/test/CodeGen/X86/ssse3-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/ssse3-intrinsics-fast-isel.ll index 74c5924b600..b0529640eb1 100644 --- a/llvm/test/CodeGen/X86/ssse3-intrinsics-fast-isel.ll +++ b/llvm/test/CodeGen/X86/ssse3-intrinsics-fast-isel.ll @@ -1,22 +1,26 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -fast-isel -mtriple=i386-unknown-unknown -mattr=ssse3 | FileCheck %s --check-prefix=ALL --check-prefix=X32 -; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=ssse3 | FileCheck %s --check-prefix=ALL --check-prefix=X64 +; RUN: llc < %s -fast-isel -mtriple=i386-unknown-unknown 
-mattr=+ssse3 | FileCheck %s --check-prefixes=CHECK,X86,SSE,X86-SSE +; RUN: llc < %s -fast-isel -mtriple=i386-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=CHECK,X86,AVX,X86-AVX,AVX1,X86-AVX1 +; RUN: llc < %s -fast-isel -mtriple=i386-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512dq,+avx512vl | FileCheck %s --check-prefixes=CHECK,X86,AVX,X86-AVX,AVX512,X86-AVX512 +; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=CHECK,X64,SSE,X64-SSE +; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=CHECK,X64,AVX,X64-AVX,AVX1,X64-AVX1 +; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512dq,+avx512vl | FileCheck %s --check-prefixes=CHECK,X64,AVX,X64-AVX,AVX512,X64-AVX512 ; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/ssse3-builtins.c define <2 x i64> @test_mm_abs_epi8(<2 x i64> %a0) { -; X32-LABEL: test_mm_abs_epi8: -; X32: # %bb.0: -; X32-NEXT: pabsb %xmm0, %xmm0 -; X32-NEXT: retl +; SSE-LABEL: test_mm_abs_epi8: +; SSE: # %bb.0: +; SSE-NEXT: pabsb %xmm0, %xmm0 +; SSE-NEXT: ret{{[l|q]}} ; -; X64-LABEL: test_mm_abs_epi8: -; X64: # %bb.0: -; X64-NEXT: pabsb %xmm0, %xmm0 -; X64-NEXT: retq +; AVX-LABEL: test_mm_abs_epi8: +; AVX: # %bb.0: +; AVX-NEXT: vpabsb %xmm0, %xmm0 +; AVX-NEXT: ret{{[l|q]}} %arg = bitcast <2 x i64> %a0 to <16 x i8> %sub = sub <16 x i8> zeroinitializer, %arg - %cmp = icmp sgt <16 x i8> %arg, zeroinitializer + %cmp = icmp sgt <16 x i8> %arg, zeroinitializer %sel = select <16 x i1> %cmp, <16 x i8> %arg, <16 x i8> %sub %res = bitcast <16 x i8> %sel to <2 x i64> ret <2 x i64> %res @@ -24,18 +28,18 @@ define <2 x i64> @test_mm_abs_epi8(<2 x i64> %a0) { declare <16 x i8> @llvm.x86.ssse3.pabs.b.128(<16 x i8>) nounwind readnone define <2 x i64> @test_mm_abs_epi16(<2 x i64> %a0) { -; X32-LABEL: test_mm_abs_epi16: -; X32: # %bb.0: -; X32-NEXT: pabsw %xmm0, %xmm0 -; X32-NEXT: retl +; SSE-LABEL: test_mm_abs_epi16: +; SSE: # %bb.0: +; SSE-NEXT: pabsw %xmm0, %xmm0 +; SSE-NEXT: ret{{[l|q]}} ; -; X64-LABEL: test_mm_abs_epi16: -; X64: # %bb.0: -; X64-NEXT: pabsw %xmm0, %xmm0 -; X64-NEXT: retq +; AVX-LABEL: test_mm_abs_epi16: +; AVX: # %bb.0: +; AVX-NEXT: vpabsw %xmm0, %xmm0 +; AVX-NEXT: ret{{[l|q]}} %arg = bitcast <2 x i64> %a0 to <8 x i16> %sub = sub <8 x i16> zeroinitializer, %arg - %cmp = icmp sgt <8 x i16> %arg, zeroinitializer + %cmp = icmp sgt <8 x i16> %arg, zeroinitializer %sel = select <8 x i1> %cmp, <8 x i16> %arg, <8 x i16> %sub %res = bitcast <8 x i16> %sel to <2 x i64> ret <2 x i64> %res @@ -43,18 +47,18 @@ define <2 x i64> @test_mm_abs_epi16(<2 x i64> %a0) { declare <8 x i16> @llvm.x86.ssse3.pabs.w.128(<8 x i16>) nounwind readnone define <2 x i64> @test_mm_abs_epi32(<2 x i64> %a0) { -; X32-LABEL: test_mm_abs_epi32: -; X32: # %bb.0: -; X32-NEXT: pabsd %xmm0, %xmm0 -; X32-NEXT: retl +; SSE-LABEL: test_mm_abs_epi32: +; SSE: # %bb.0: +; SSE-NEXT: pabsd %xmm0, %xmm0 +; SSE-NEXT: ret{{[l|q]}} ; -; X64-LABEL: test_mm_abs_epi32: -; X64: # %bb.0: -; X64-NEXT: pabsd %xmm0, %xmm0 -; X64-NEXT: retq +; AVX-LABEL: test_mm_abs_epi32: +; AVX: # %bb.0: +; AVX-NEXT: vpabsd %xmm0, %xmm0 +; AVX-NEXT: ret{{[l|q]}} %arg = bitcast <2 x i64> %a0 to <4 x i32> %sub = sub <4 x i32> zeroinitializer, %arg - %cmp = icmp sgt <4 x i32> %arg, zeroinitializer + %cmp = icmp sgt <4 x i32> %arg, zeroinitializer %sel = select <4 x i1> %cmp, <4 x i32> %arg, <4 x i32> %sub %res = bitcast <4 x i32> %sel to <2 x i64> ret <2 x i64> %res @@ 
-62,17 +66,16 @@ define <2 x i64> @test_mm_abs_epi32(<2 x i64> %a0) { declare <4 x i32> @llvm.x86.ssse3.pabs.d.128(<4 x i32>) nounwind readnone define <2 x i64> @test_mm_alignr_epi8(<2 x i64> %a0, <2 x i64> %a1) { -; X32-LABEL: test_mm_alignr_epi8: -; X32: # %bb.0: -; X32-NEXT: palignr {{.*#+}} xmm1 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1] -; X32-NEXT: movdqa %xmm1, %xmm0 -; X32-NEXT: retl +; SSE-LABEL: test_mm_alignr_epi8: +; SSE: # %bb.0: +; SSE-NEXT: palignr {{.*#+}} xmm1 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1] +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: ret{{[l|q]}} ; -; X64-LABEL: test_mm_alignr_epi8: -; X64: # %bb.0: -; X64-NEXT: palignr {{.*#+}} xmm1 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1] -; X64-NEXT: movdqa %xmm1, %xmm0 -; X64-NEXT: retq +; AVX-LABEL: test_mm_alignr_epi8: +; AVX: # %bb.0: +; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1] +; AVX-NEXT: ret{{[l|q]}} %arg0 = bitcast <2 x i64> %a0 to <16 x i8> %arg1 = bitcast <2 x i64> %a1 to <16 x i8> %shuf = shufflevector <16 x i8> %arg0, <16 x i8> %arg1, <16 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17> @@ -81,17 +84,16 @@ define <2 x i64> @test_mm_alignr_epi8(<2 x i64> %a0, <2 x i64> %a1) { } define <2 x i64> @test2_mm_alignr_epi8(<2 x i64> %a0, <2 x i64> %a1) { -; X32-LABEL: test2_mm_alignr_epi8: -; X32: # %bb.0: -; X32-NEXT: palignr {{.*#+}} xmm1 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0] -; X32-NEXT: movdqa %xmm1, %xmm0 -; X32-NEXT: retl +; SSE-LABEL: test2_mm_alignr_epi8: +; SSE: # %bb.0: +; SSE-NEXT: palignr {{.*#+}} xmm1 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0] +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: ret{{[l|q]}} ; -; X64-LABEL: test2_mm_alignr_epi8: -; X64: # %bb.0: -; X64-NEXT: palignr {{.*#+}} xmm1 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0] -; X64-NEXT: movdqa %xmm1, %xmm0 -; X64-NEXT: retq +; AVX-LABEL: test2_mm_alignr_epi8: +; AVX: # %bb.0: +; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0] +; AVX-NEXT: ret{{[l|q]}} %arg0 = bitcast <2 x i64> %a0 to <16 x i8> %arg1 = bitcast <2 x i64> %a1 to <16 x i8> %shuf = shufflevector <16 x i8> %arg0, <16 x i8> %arg1, <16 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16> @@ -100,15 +102,15 @@ define <2 x i64> @test2_mm_alignr_epi8(<2 x i64> %a0, <2 x i64> %a1) { } define <2 x i64> @test_mm_hadd_epi16(<2 x i64> %a0, <2 x i64> %a1) { -; X32-LABEL: test_mm_hadd_epi16: -; X32: # %bb.0: -; X32-NEXT: phaddw %xmm1, %xmm0 -; X32-NEXT: retl +; SSE-LABEL: test_mm_hadd_epi16: +; SSE: # %bb.0: +; SSE-NEXT: phaddw %xmm1, %xmm0 +; SSE-NEXT: ret{{[l|q]}} ; -; X64-LABEL: test_mm_hadd_epi16: -; X64: # %bb.0: -; X64-NEXT: phaddw %xmm1, %xmm0 -; X64-NEXT: retq +; AVX-LABEL: test_mm_hadd_epi16: +; AVX: # %bb.0: +; AVX-NEXT: vphaddw %xmm1, %xmm0, %xmm0 +; AVX-NEXT: ret{{[l|q]}} %arg0 = bitcast <2 x i64> %a0 to <8 x i16> %arg1 = bitcast <2 x i64> %a1 to <8 x i16> %call = call <8 x i16> @llvm.x86.ssse3.phadd.w.128(<8 x i16> %arg0, <8 x i16> %arg1) @@ -118,15 +120,15 @@ define <2 x i64> @test_mm_hadd_epi16(<2 x i64> %a0, <2 x i64> %a1) { declare <8 x i16> @llvm.x86.ssse3.phadd.w.128(<8 x i16>, <8 x i16>) nounwind readnone define <2 x i64> @test_mm_hadd_epi32(<2 x i64> %a0, <2 x i64> %a1) { -; X32-LABEL: test_mm_hadd_epi32: -; X32: # %bb.0: -; X32-NEXT: phaddd %xmm1, %xmm0 -; X32-NEXT: retl +; SSE-LABEL: 
test_mm_hadd_epi32: +; SSE: # %bb.0: +; SSE-NEXT: phaddd %xmm1, %xmm0 +; SSE-NEXT: ret{{[l|q]}} ; -; X64-LABEL: test_mm_hadd_epi32: -; X64: # %bb.0: -; X64-NEXT: phaddd %xmm1, %xmm0 -; X64-NEXT: retq +; AVX-LABEL: test_mm_hadd_epi32: +; AVX: # %bb.0: +; AVX-NEXT: vphaddd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: ret{{[l|q]}} %arg0 = bitcast <2 x i64> %a0 to <4 x i32> %arg1 = bitcast <2 x i64> %a1 to <4 x i32> %call = call <4 x i32> @llvm.x86.ssse3.phadd.d.128(<4 x i32> %arg0, <4 x i32> %arg1) @@ -136,15 +138,15 @@ define <2 x i64> @test_mm_hadd_epi32(<2 x i64> %a0, <2 x i64> %a1) { declare <4 x i32> @llvm.x86.ssse3.phadd.d.128(<4 x i32>, <4 x i32>) nounwind readnone define <2 x i64> @test_mm_hadds_epi16(<2 x i64> %a0, <2 x i64> %a1) { -; X32-LABEL: test_mm_hadds_epi16: -; X32: # %bb.0: -; X32-NEXT: phaddsw %xmm1, %xmm0 -; X32-NEXT: retl +; SSE-LABEL: test_mm_hadds_epi16: +; SSE: # %bb.0: +; SSE-NEXT: phaddsw %xmm1, %xmm0 +; SSE-NEXT: ret{{[l|q]}} ; -; X64-LABEL: test_mm_hadds_epi16: -; X64: # %bb.0: -; X64-NEXT: phaddsw %xmm1, %xmm0 -; X64-NEXT: retq +; AVX-LABEL: test_mm_hadds_epi16: +; AVX: # %bb.0: +; AVX-NEXT: vphaddsw %xmm1, %xmm0, %xmm0 +; AVX-NEXT: ret{{[l|q]}} %arg0 = bitcast <2 x i64> %a0 to <8 x i16> %arg1 = bitcast <2 x i64> %a1 to <8 x i16> %call = call <8 x i16> @llvm.x86.ssse3.phadd.sw.128(<8 x i16> %arg0, <8 x i16> %arg1) @@ -154,15 +156,15 @@ define <2 x i64> @test_mm_hadds_epi16(<2 x i64> %a0, <2 x i64> %a1) { declare <8 x i16> @llvm.x86.ssse3.phadd.sw.128(<8 x i16>, <8 x i16>) nounwind readnone define <2 x i64> @test_mm_hsub_epi16(<2 x i64> %a0, <2 x i64> %a1) { -; X32-LABEL: test_mm_hsub_epi16: -; X32: # %bb.0: -; X32-NEXT: phsubw %xmm1, %xmm0 -; X32-NEXT: retl +; SSE-LABEL: test_mm_hsub_epi16: +; SSE: # %bb.0: +; SSE-NEXT: phsubw %xmm1, %xmm0 +; SSE-NEXT: ret{{[l|q]}} ; -; X64-LABEL: test_mm_hsub_epi16: -; X64: # %bb.0: -; X64-NEXT: phsubw %xmm1, %xmm0 -; X64-NEXT: retq +; AVX-LABEL: test_mm_hsub_epi16: +; AVX: # %bb.0: +; AVX-NEXT: vphsubw %xmm1, %xmm0, %xmm0 +; AVX-NEXT: ret{{[l|q]}} %arg0 = bitcast <2 x i64> %a0 to <8 x i16> %arg1 = bitcast <2 x i64> %a1 to <8 x i16> %call = call <8 x i16> @llvm.x86.ssse3.phsub.w.128(<8 x i16> %arg0, <8 x i16> %arg1) @@ -172,15 +174,15 @@ define <2 x i64> @test_mm_hsub_epi16(<2 x i64> %a0, <2 x i64> %a1) { declare <8 x i16> @llvm.x86.ssse3.phsub.w.128(<8 x i16>, <8 x i16>) nounwind readnone define <2 x i64> @test_mm_hsub_epi32(<2 x i64> %a0, <2 x i64> %a1) { -; X32-LABEL: test_mm_hsub_epi32: -; X32: # %bb.0: -; X32-NEXT: phsubd %xmm1, %xmm0 -; X32-NEXT: retl +; SSE-LABEL: test_mm_hsub_epi32: +; SSE: # %bb.0: +; SSE-NEXT: phsubd %xmm1, %xmm0 +; SSE-NEXT: ret{{[l|q]}} ; -; X64-LABEL: test_mm_hsub_epi32: -; X64: # %bb.0: -; X64-NEXT: phsubd %xmm1, %xmm0 -; X64-NEXT: retq +; AVX-LABEL: test_mm_hsub_epi32: +; AVX: # %bb.0: +; AVX-NEXT: vphsubd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: ret{{[l|q]}} %arg0 = bitcast <2 x i64> %a0 to <4 x i32> %arg1 = bitcast <2 x i64> %a1 to <4 x i32> %call = call <4 x i32> @llvm.x86.ssse3.phsub.d.128(<4 x i32> %arg0, <4 x i32> %arg1) @@ -190,15 +192,15 @@ define <2 x i64> @test_mm_hsub_epi32(<2 x i64> %a0, <2 x i64> %a1) { declare <4 x i32> @llvm.x86.ssse3.phsub.d.128(<4 x i32>, <4 x i32>) nounwind readnone define <2 x i64> @test_mm_hsubs_epi16(<2 x i64> %a0, <2 x i64> %a1) { -; X32-LABEL: test_mm_hsubs_epi16: -; X32: # %bb.0: -; X32-NEXT: phsubsw %xmm1, %xmm0 -; X32-NEXT: retl +; SSE-LABEL: test_mm_hsubs_epi16: +; SSE: # %bb.0: +; SSE-NEXT: phsubsw %xmm1, %xmm0 +; SSE-NEXT: ret{{[l|q]}} ; -; X64-LABEL: test_mm_hsubs_epi16: 
-; X64: # %bb.0: -; X64-NEXT: phsubsw %xmm1, %xmm0 -; X64-NEXT: retq +; AVX-LABEL: test_mm_hsubs_epi16: +; AVX: # %bb.0: +; AVX-NEXT: vphsubsw %xmm1, %xmm0, %xmm0 +; AVX-NEXT: ret{{[l|q]}} %arg0 = bitcast <2 x i64> %a0 to <8 x i16> %arg1 = bitcast <2 x i64> %a1 to <8 x i16> %call = call <8 x i16> @llvm.x86.ssse3.phsub.sw.128(<8 x i16> %arg0, <8 x i16> %arg1) @@ -208,15 +210,15 @@ define <2 x i64> @test_mm_hsubs_epi16(<2 x i64> %a0, <2 x i64> %a1) { declare <8 x i16> @llvm.x86.ssse3.phsub.sw.128(<8 x i16>, <8 x i16>) nounwind readnone define <2 x i64> @test_mm_maddubs_epi16(<2 x i64> %a0, <2 x i64> %a1) { -; X32-LABEL: test_mm_maddubs_epi16: -; X32: # %bb.0: -; X32-NEXT: pmaddubsw %xmm1, %xmm0 -; X32-NEXT: retl +; SSE-LABEL: test_mm_maddubs_epi16: +; SSE: # %bb.0: +; SSE-NEXT: pmaddubsw %xmm1, %xmm0 +; SSE-NEXT: ret{{[l|q]}} ; -; X64-LABEL: test_mm_maddubs_epi16: -; X64: # %bb.0: -; X64-NEXT: pmaddubsw %xmm1, %xmm0 -; X64-NEXT: retq +; AVX-LABEL: test_mm_maddubs_epi16: +; AVX: # %bb.0: +; AVX-NEXT: vpmaddubsw %xmm1, %xmm0, %xmm0 +; AVX-NEXT: ret{{[l|q]}} %arg0 = bitcast <2 x i64> %a0 to <16 x i8> %arg1 = bitcast <2 x i64> %a1 to <16 x i8> %call = call <8 x i16> @llvm.x86.ssse3.pmadd.ub.sw.128(<16 x i8> %arg0, <16 x i8> %arg1) @@ -226,15 +228,15 @@ define <2 x i64> @test_mm_maddubs_epi16(<2 x i64> %a0, <2 x i64> %a1) { declare <8 x i16> @llvm.x86.ssse3.pmadd.ub.sw.128(<16 x i8>, <16 x i8>) nounwind readnone define <2 x i64> @test_mm_mulhrs_epi16(<2 x i64> %a0, <2 x i64> %a1) { -; X32-LABEL: test_mm_mulhrs_epi16: -; X32: # %bb.0: -; X32-NEXT: pmulhrsw %xmm1, %xmm0 -; X32-NEXT: retl +; SSE-LABEL: test_mm_mulhrs_epi16: +; SSE: # %bb.0: +; SSE-NEXT: pmulhrsw %xmm1, %xmm0 +; SSE-NEXT: ret{{[l|q]}} ; -; X64-LABEL: test_mm_mulhrs_epi16: -; X64: # %bb.0: -; X64-NEXT: pmulhrsw %xmm1, %xmm0 -; X64-NEXT: retq +; AVX-LABEL: test_mm_mulhrs_epi16: +; AVX: # %bb.0: +; AVX-NEXT: vpmulhrsw %xmm1, %xmm0, %xmm0 +; AVX-NEXT: ret{{[l|q]}} %arg0 = bitcast <2 x i64> %a0 to <8 x i16> %arg1 = bitcast <2 x i64> %a1 to <8 x i16> %call = call <8 x i16> @llvm.x86.ssse3.pmul.hr.sw.128(<8 x i16> %arg0, <8 x i16> %arg1) @@ -244,15 +246,15 @@ define <2 x i64> @test_mm_mulhrs_epi16(<2 x i64> %a0, <2 x i64> %a1) { declare <8 x i16> @llvm.x86.ssse3.pmul.hr.sw.128(<8 x i16>, <8 x i16>) nounwind readnone define <2 x i64> @test_mm_shuffle_epi8(<2 x i64> %a0, <2 x i64> %a1) { -; X32-LABEL: test_mm_shuffle_epi8: -; X32: # %bb.0: -; X32-NEXT: pshufb %xmm1, %xmm0 -; X32-NEXT: retl +; SSE-LABEL: test_mm_shuffle_epi8: +; SSE: # %bb.0: +; SSE-NEXT: pshufb %xmm1, %xmm0 +; SSE-NEXT: ret{{[l|q]}} ; -; X64-LABEL: test_mm_shuffle_epi8: -; X64: # %bb.0: -; X64-NEXT: pshufb %xmm1, %xmm0 -; X64-NEXT: retq +; AVX-LABEL: test_mm_shuffle_epi8: +; AVX: # %bb.0: +; AVX-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX-NEXT: ret{{[l|q]}} %arg0 = bitcast <2 x i64> %a0 to <16 x i8> %arg1 = bitcast <2 x i64> %a1 to <16 x i8> %call = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %arg0, <16 x i8> %arg1) @@ -262,15 +264,15 @@ define <2 x i64> @test_mm_shuffle_epi8(<2 x i64> %a0, <2 x i64> %a1) { declare <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8>, <16 x i8>) nounwind readnone define <2 x i64> @test_mm_sign_epi8(<2 x i64> %a0, <2 x i64> %a1) { -; X32-LABEL: test_mm_sign_epi8: -; X32: # %bb.0: -; X32-NEXT: psignb %xmm1, %xmm0 -; X32-NEXT: retl +; SSE-LABEL: test_mm_sign_epi8: +; SSE: # %bb.0: +; SSE-NEXT: psignb %xmm1, %xmm0 +; SSE-NEXT: ret{{[l|q]}} ; -; X64-LABEL: test_mm_sign_epi8: -; X64: # %bb.0: -; X64-NEXT: psignb %xmm1, %xmm0 -; X64-NEXT: retq +; 
AVX-LABEL: test_mm_sign_epi8: +; AVX: # %bb.0: +; AVX-NEXT: vpsignb %xmm1, %xmm0, %xmm0 +; AVX-NEXT: ret{{[l|q]}} %arg0 = bitcast <2 x i64> %a0 to <16 x i8> %arg1 = bitcast <2 x i64> %a1 to <16 x i8> %call = call <16 x i8> @llvm.x86.ssse3.psign.b.128(<16 x i8> %arg0, <16 x i8> %arg1) @@ -280,15 +282,15 @@ define <2 x i64> @test_mm_sign_epi8(<2 x i64> %a0, <2 x i64> %a1) { declare <16 x i8> @llvm.x86.ssse3.psign.b.128(<16 x i8>, <16 x i8>) nounwind readnone define <2 x i64> @test_mm_sign_epi16(<2 x i64> %a0, <2 x i64> %a1) { -; X32-LABEL: test_mm_sign_epi16: -; X32: # %bb.0: -; X32-NEXT: psignw %xmm1, %xmm0 -; X32-NEXT: retl +; SSE-LABEL: test_mm_sign_epi16: +; SSE: # %bb.0: +; SSE-NEXT: psignw %xmm1, %xmm0 +; SSE-NEXT: ret{{[l|q]}} ; -; X64-LABEL: test_mm_sign_epi16: -; X64: # %bb.0: -; X64-NEXT: psignw %xmm1, %xmm0 -; X64-NEXT: retq +; AVX-LABEL: test_mm_sign_epi16: +; AVX: # %bb.0: +; AVX-NEXT: vpsignw %xmm1, %xmm0, %xmm0 +; AVX-NEXT: ret{{[l|q]}} %arg0 = bitcast <2 x i64> %a0 to <8 x i16> %arg1 = bitcast <2 x i64> %a1 to <8 x i16> %call = call <8 x i16> @llvm.x86.ssse3.psign.w.128(<8 x i16> %arg0, <8 x i16> %arg1) @@ -298,15 +300,15 @@ define <2 x i64> @test_mm_sign_epi16(<2 x i64> %a0, <2 x i64> %a1) { declare <8 x i16> @llvm.x86.ssse3.psign.w.128(<8 x i16>, <8 x i16>) nounwind readnone define <2 x i64> @test_mm_sign_epi32(<2 x i64> %a0, <2 x i64> %a1) { -; X32-LABEL: test_mm_sign_epi32: -; X32: # %bb.0: -; X32-NEXT: psignd %xmm1, %xmm0 -; X32-NEXT: retl +; SSE-LABEL: test_mm_sign_epi32: +; SSE: # %bb.0: +; SSE-NEXT: psignd %xmm1, %xmm0 +; SSE-NEXT: ret{{[l|q]}} ; -; X64-LABEL: test_mm_sign_epi32: -; X64: # %bb.0: -; X64-NEXT: psignd %xmm1, %xmm0 -; X64-NEXT: retq +; AVX-LABEL: test_mm_sign_epi32: +; AVX: # %bb.0: +; AVX-NEXT: vpsignd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: ret{{[l|q]}} %arg0 = bitcast <2 x i64> %a0 to <4 x i32> %arg1 = bitcast <2 x i64> %a1 to <4 x i32> %call = call <4 x i32> @llvm.x86.ssse3.psign.d.128(<4 x i32> %arg0, <4 x i32> %arg1) diff --git a/llvm/test/CodeGen/X86/ssse3-intrinsics-x86.ll b/llvm/test/CodeGen/X86/ssse3-intrinsics-x86.ll index 66265d63a97..629a759332a 100644 --- a/llvm/test/CodeGen/X86/ssse3-intrinsics-x86.ll +++ b/llvm/test/CodeGen/X86/ssse3-intrinsics-x86.ll @@ -1,23 +1,26 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=i386-apple-darwin -mattr=-avx,+ssse3 -show-mc-encoding | FileCheck %s --check-prefix=SSE -; RUN: llc < %s -mtriple=i386-apple-darwin -mattr=+avx2 -show-mc-encoding | FileCheck %s --check-prefix=VCHECK --check-prefix=AVX2 -; RUN: llc < %s -mtriple=i386-apple-darwin -mcpu=skx -show-mc-encoding | FileCheck %s --check-prefix=VCHECK --check-prefix=SKX +; RUN: llc < %s -mtriple=i386-apple-darwin -mattr=+ssse3 -show-mc-encoding | FileCheck %s --check-prefixes=CHECK,X86,SSE,X86-SSE +; RUN: llc < %s -mtriple=i386-apple-darwin -mattr=+avx -show-mc-encoding | FileCheck %s --check-prefixes=CHECK,X86,AVX,X86-AVX,AVX1,X86-AVX1 +; RUN: llc < %s -mtriple=i386-apple-darwin -mattr=+avx512f,+avx512bw,+avx512dq,+avx512vl -show-mc-encoding | FileCheck %s --check-prefixes=CHECK,X86,AVX,X86-AVX,AVX512,X86-AVX512 +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+ssse3 -show-mc-encoding | FileCheck %s --check-prefixes=CHECK,X64,SSE,X64-SSE +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx -show-mc-encoding | FileCheck %s --check-prefixes=CHECK,X64,AVX,X64-AVX,AVX1,X64-AVX1 +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f,+avx512bw,+avx512dq,+avx512vl -show-mc-encoding | 
FileCheck %s --check-prefixes=CHECK,X64,AVX,X64-AVX,AVX512,X64-AVX512 define <16 x i8> @test_x86_ssse3_pabs_b_128(<16 x i8> %a0) { ; SSE-LABEL: test_x86_ssse3_pabs_b_128: ; SSE: ## %bb.0: ; SSE-NEXT: pabsb %xmm0, %xmm0 ## encoding: [0x66,0x0f,0x38,0x1c,0xc0] -; SSE-NEXT: retl ## encoding: [0xc3] +; SSE-NEXT: ret{{[l|q]}} ## encoding: [0xc3] ; -; AVX2-LABEL: test_x86_ssse3_pabs_b_128: -; AVX2: ## %bb.0: -; AVX2-NEXT: vpabsb %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x1c,0xc0] -; AVX2-NEXT: retl ## encoding: [0xc3] +; AVX1-LABEL: test_x86_ssse3_pabs_b_128: +; AVX1: ## %bb.0: +; AVX1-NEXT: vpabsb %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x1c,0xc0] +; AVX1-NEXT: ret{{[l|q]}} ## encoding: [0xc3] ; -; SKX-LABEL: test_x86_ssse3_pabs_b_128: -; SKX: ## %bb.0: -; SKX-NEXT: vpabsb %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x1c,0xc0] -; SKX-NEXT: retl ## encoding: [0xc3] +; AVX512-LABEL: test_x86_ssse3_pabs_b_128: +; AVX512: ## %bb.0: +; AVX512-NEXT: vpabsb %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x1c,0xc0] +; AVX512-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call <16 x i8> @llvm.x86.ssse3.pabs.b.128(<16 x i8> %a0) ; <<16 x i8>> [#uses=1] ret <16 x i8> %res } @@ -28,17 +31,17 @@ define <4 x i32> @test_x86_ssse3_pabs_d_128(<4 x i32> %a0) { ; SSE-LABEL: test_x86_ssse3_pabs_d_128: ; SSE: ## %bb.0: ; SSE-NEXT: pabsd %xmm0, %xmm0 ## encoding: [0x66,0x0f,0x38,0x1e,0xc0] -; SSE-NEXT: retl ## encoding: [0xc3] +; SSE-NEXT: ret{{[l|q]}} ## encoding: [0xc3] ; -; AVX2-LABEL: test_x86_ssse3_pabs_d_128: -; AVX2: ## %bb.0: -; AVX2-NEXT: vpabsd %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x1e,0xc0] -; AVX2-NEXT: retl ## encoding: [0xc3] +; AVX1-LABEL: test_x86_ssse3_pabs_d_128: +; AVX1: ## %bb.0: +; AVX1-NEXT: vpabsd %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x1e,0xc0] +; AVX1-NEXT: ret{{[l|q]}} ## encoding: [0xc3] ; -; SKX-LABEL: test_x86_ssse3_pabs_d_128: -; SKX: ## %bb.0: -; SKX-NEXT: vpabsd %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x1e,0xc0] -; SKX-NEXT: retl ## encoding: [0xc3] +; AVX512-LABEL: test_x86_ssse3_pabs_d_128: +; AVX512: ## %bb.0: +; AVX512-NEXT: vpabsd %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x1e,0xc0] +; AVX512-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call <4 x i32> @llvm.x86.ssse3.pabs.d.128(<4 x i32> %a0) ; <<4 x i32>> [#uses=1] ret <4 x i32> %res } @@ -49,17 +52,17 @@ define <8 x i16> @test_x86_ssse3_pabs_w_128(<8 x i16> %a0) { ; SSE-LABEL: test_x86_ssse3_pabs_w_128: ; SSE: ## %bb.0: ; SSE-NEXT: pabsw %xmm0, %xmm0 ## encoding: [0x66,0x0f,0x38,0x1d,0xc0] -; SSE-NEXT: retl ## encoding: [0xc3] +; SSE-NEXT: ret{{[l|q]}} ## encoding: [0xc3] ; -; AVX2-LABEL: test_x86_ssse3_pabs_w_128: -; AVX2: ## %bb.0: -; AVX2-NEXT: vpabsw %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x1d,0xc0] -; AVX2-NEXT: retl ## encoding: [0xc3] +; AVX1-LABEL: test_x86_ssse3_pabs_w_128: +; AVX1: ## %bb.0: +; AVX1-NEXT: vpabsw %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x1d,0xc0] +; AVX1-NEXT: ret{{[l|q]}} ## encoding: [0xc3] ; -; SKX-LABEL: test_x86_ssse3_pabs_w_128: -; SKX: ## %bb.0: -; SKX-NEXT: vpabsw %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x1d,0xc0] -; SKX-NEXT: retl ## encoding: [0xc3] +; AVX512-LABEL: test_x86_ssse3_pabs_w_128: +; AVX512: ## %bb.0: +; AVX512-NEXT: vpabsw %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x1d,0xc0] +; AVX512-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call <8 x i16> @llvm.x86.ssse3.pabs.w.128(<8 x i16> %a0) ; <<8 x i16>> [#uses=1] ret <8 x i16> 
%res } @@ -70,12 +73,12 @@ define <4 x i32> @test_x86_ssse3_phadd_d_128(<4 x i32> %a0, <4 x i32> %a1) { ; SSE-LABEL: test_x86_ssse3_phadd_d_128: ; SSE: ## %bb.0: ; SSE-NEXT: phaddd %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x38,0x02,0xc1] -; SSE-NEXT: retl ## encoding: [0xc3] +; SSE-NEXT: ret{{[l|q]}} ## encoding: [0xc3] ; -; VCHECK-LABEL: test_x86_ssse3_phadd_d_128: -; VCHECK: ## %bb.0: -; VCHECK-NEXT: vphaddd %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x02,0xc1] -; VCHECK-NEXT: retl ## encoding: [0xc3] +; AVX-LABEL: test_x86_ssse3_phadd_d_128: +; AVX: ## %bb.0: +; AVX-NEXT: vphaddd %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x02,0xc1] +; AVX-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call <4 x i32> @llvm.x86.ssse3.phadd.d.128(<4 x i32> %a0, <4 x i32> %a1) ; <<4 x i32>> [#uses=1] ret <4 x i32> %res } @@ -86,12 +89,12 @@ define <8 x i16> @test_x86_ssse3_phadd_sw_128(<8 x i16> %a0, <8 x i16> %a1) { ; SSE-LABEL: test_x86_ssse3_phadd_sw_128: ; SSE: ## %bb.0: ; SSE-NEXT: phaddsw %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x38,0x03,0xc1] -; SSE-NEXT: retl ## encoding: [0xc3] +; SSE-NEXT: ret{{[l|q]}} ## encoding: [0xc3] ; -; VCHECK-LABEL: test_x86_ssse3_phadd_sw_128: -; VCHECK: ## %bb.0: -; VCHECK-NEXT: vphaddsw %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x03,0xc1] -; VCHECK-NEXT: retl ## encoding: [0xc3] +; AVX-LABEL: test_x86_ssse3_phadd_sw_128: +; AVX: ## %bb.0: +; AVX-NEXT: vphaddsw %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x03,0xc1] +; AVX-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call <8 x i16> @llvm.x86.ssse3.phadd.sw.128(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1] ret <8 x i16> %res } @@ -102,12 +105,12 @@ define <8 x i16> @test_x86_ssse3_phadd_w_128(<8 x i16> %a0, <8 x i16> %a1) { ; SSE-LABEL: test_x86_ssse3_phadd_w_128: ; SSE: ## %bb.0: ; SSE-NEXT: phaddw %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x38,0x01,0xc1] -; SSE-NEXT: retl ## encoding: [0xc3] +; SSE-NEXT: ret{{[l|q]}} ## encoding: [0xc3] ; -; VCHECK-LABEL: test_x86_ssse3_phadd_w_128: -; VCHECK: ## %bb.0: -; VCHECK-NEXT: vphaddw %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x01,0xc1] -; VCHECK-NEXT: retl ## encoding: [0xc3] +; AVX-LABEL: test_x86_ssse3_phadd_w_128: +; AVX: ## %bb.0: +; AVX-NEXT: vphaddw %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x01,0xc1] +; AVX-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call <8 x i16> @llvm.x86.ssse3.phadd.w.128(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1] ret <8 x i16> %res } @@ -118,12 +121,12 @@ define <4 x i32> @test_x86_ssse3_phsub_d_128(<4 x i32> %a0, <4 x i32> %a1) { ; SSE-LABEL: test_x86_ssse3_phsub_d_128: ; SSE: ## %bb.0: ; SSE-NEXT: phsubd %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x38,0x06,0xc1] -; SSE-NEXT: retl ## encoding: [0xc3] +; SSE-NEXT: ret{{[l|q]}} ## encoding: [0xc3] ; -; VCHECK-LABEL: test_x86_ssse3_phsub_d_128: -; VCHECK: ## %bb.0: -; VCHECK-NEXT: vphsubd %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x06,0xc1] -; VCHECK-NEXT: retl ## encoding: [0xc3] +; AVX-LABEL: test_x86_ssse3_phsub_d_128: +; AVX: ## %bb.0: +; AVX-NEXT: vphsubd %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x06,0xc1] +; AVX-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call <4 x i32> @llvm.x86.ssse3.phsub.d.128(<4 x i32> %a0, <4 x i32> %a1) ; <<4 x i32>> [#uses=1] ret <4 x i32> %res } @@ -134,12 +137,12 @@ define <8 x i16> @test_x86_ssse3_phsub_sw_128(<8 x i16> %a0, <8 x i16> %a1) { ; SSE-LABEL: test_x86_ssse3_phsub_sw_128: ; SSE: ## %bb.0: ; SSE-NEXT: phsubsw %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x38,0x07,0xc1] -; SSE-NEXT: retl ## encoding: [0xc3] 
+; SSE-NEXT: ret{{[l|q]}} ## encoding: [0xc3] ; -; VCHECK-LABEL: test_x86_ssse3_phsub_sw_128: -; VCHECK: ## %bb.0: -; VCHECK-NEXT: vphsubsw %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x07,0xc1] -; VCHECK-NEXT: retl ## encoding: [0xc3] +; AVX-LABEL: test_x86_ssse3_phsub_sw_128: +; AVX: ## %bb.0: +; AVX-NEXT: vphsubsw %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x07,0xc1] +; AVX-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call <8 x i16> @llvm.x86.ssse3.phsub.sw.128(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1] ret <8 x i16> %res } @@ -150,12 +153,12 @@ define <8 x i16> @test_x86_ssse3_phsub_w_128(<8 x i16> %a0, <8 x i16> %a1) { ; SSE-LABEL: test_x86_ssse3_phsub_w_128: ; SSE: ## %bb.0: ; SSE-NEXT: phsubw %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x38,0x05,0xc1] -; SSE-NEXT: retl ## encoding: [0xc3] +; SSE-NEXT: ret{{[l|q]}} ## encoding: [0xc3] ; -; VCHECK-LABEL: test_x86_ssse3_phsub_w_128: -; VCHECK: ## %bb.0: -; VCHECK-NEXT: vphsubw %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x05,0xc1] -; VCHECK-NEXT: retl ## encoding: [0xc3] +; AVX-LABEL: test_x86_ssse3_phsub_w_128: +; AVX: ## %bb.0: +; AVX-NEXT: vphsubw %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x05,0xc1] +; AVX-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call <8 x i16> @llvm.x86.ssse3.phsub.w.128(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1] ret <8 x i16> %res } @@ -166,17 +169,17 @@ define <8 x i16> @test_x86_ssse3_pmadd_ub_sw_128(<16 x i8> %a0, <16 x i8> %a1) { ; SSE-LABEL: test_x86_ssse3_pmadd_ub_sw_128: ; SSE: ## %bb.0: ; SSE-NEXT: pmaddubsw %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x38,0x04,0xc1] -; SSE-NEXT: retl ## encoding: [0xc3] +; SSE-NEXT: ret{{[l|q]}} ## encoding: [0xc3] ; -; AVX2-LABEL: test_x86_ssse3_pmadd_ub_sw_128: -; AVX2: ## %bb.0: -; AVX2-NEXT: vpmaddubsw %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x04,0xc1] -; AVX2-NEXT: retl ## encoding: [0xc3] +; AVX1-LABEL: test_x86_ssse3_pmadd_ub_sw_128: +; AVX1: ## %bb.0: +; AVX1-NEXT: vpmaddubsw %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x04,0xc1] +; AVX1-NEXT: ret{{[l|q]}} ## encoding: [0xc3] ; -; SKX-LABEL: test_x86_ssse3_pmadd_ub_sw_128: -; SKX: ## %bb.0: -; SKX-NEXT: vpmaddubsw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x04,0xc1] -; SKX-NEXT: retl ## encoding: [0xc3] +; AVX512-LABEL: test_x86_ssse3_pmadd_ub_sw_128: +; AVX512: ## %bb.0: +; AVX512-NEXT: vpmaddubsw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x04,0xc1] +; AVX512-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call <8 x i16> @llvm.x86.ssse3.pmadd.ub.sw.128(<16 x i8> %a0, <16 x i8> %a1) ; <<8 x i16>> [#uses=1] ret <8 x i16> %res } @@ -185,27 +188,46 @@ declare <8 x i16> @llvm.x86.ssse3.pmadd.ub.sw.128(<16 x i8>, <16 x i8>) nounwind ; Make sure we don't commute this operation. 
define <8 x i16> @test_x86_ssse3_pmadd_ub_sw_128_load_op0(<16 x i8>* %ptr, <16 x i8> %a1) { -; SSE-LABEL: test_x86_ssse3_pmadd_ub_sw_128_load_op0: -; SSE: ## %bb.0: -; SSE-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] -; SSE-NEXT: movdqa (%eax), %xmm1 ## encoding: [0x66,0x0f,0x6f,0x08] -; SSE-NEXT: pmaddubsw %xmm0, %xmm1 ## encoding: [0x66,0x0f,0x38,0x04,0xc8] -; SSE-NEXT: movdqa %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x6f,0xc1] -; SSE-NEXT: retl ## encoding: [0xc3] +; X86-SSE-LABEL: test_x86_ssse3_pmadd_ub_sw_128_load_op0: +; X86-SSE: ## %bb.0: +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] +; X86-SSE-NEXT: movdqa (%eax), %xmm1 ## encoding: [0x66,0x0f,0x6f,0x08] +; X86-SSE-NEXT: pmaddubsw %xmm0, %xmm1 ## encoding: [0x66,0x0f,0x38,0x04,0xc8] +; X86-SSE-NEXT: movdqa %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x6f,0xc1] +; X86-SSE-NEXT: retl ## encoding: [0xc3] +; +; X86-AVX1-LABEL: test_x86_ssse3_pmadd_ub_sw_128_load_op0: +; X86-AVX1: ## %bb.0: +; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] +; X86-AVX1-NEXT: vmovdqa (%eax), %xmm1 ## encoding: [0xc5,0xf9,0x6f,0x08] +; X86-AVX1-NEXT: vpmaddubsw %xmm0, %xmm1, %xmm0 ## encoding: [0xc4,0xe2,0x71,0x04,0xc0] +; X86-AVX1-NEXT: retl ## encoding: [0xc3] +; +; X86-AVX512-LABEL: test_x86_ssse3_pmadd_ub_sw_128_load_op0: +; X86-AVX512: ## %bb.0: +; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] +; X86-AVX512-NEXT: vmovdqa (%eax), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0x08] +; X86-AVX512-NEXT: vpmaddubsw %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0x04,0xc0] +; X86-AVX512-NEXT: retl ## encoding: [0xc3] +; +; X64-SSE-LABEL: test_x86_ssse3_pmadd_ub_sw_128_load_op0: +; X64-SSE: ## %bb.0: +; X64-SSE-NEXT: movdqa (%rdi), %xmm1 ## encoding: [0x66,0x0f,0x6f,0x0f] +; X64-SSE-NEXT: pmaddubsw %xmm0, %xmm1 ## encoding: [0x66,0x0f,0x38,0x04,0xc8] +; X64-SSE-NEXT: movdqa %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x6f,0xc1] +; X64-SSE-NEXT: retq ## encoding: [0xc3] ; -; AVX2-LABEL: test_x86_ssse3_pmadd_ub_sw_128_load_op0: -; AVX2: ## %bb.0: -; AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] -; AVX2-NEXT: vmovdqa (%eax), %xmm1 ## encoding: [0xc5,0xf9,0x6f,0x08] -; AVX2-NEXT: vpmaddubsw %xmm0, %xmm1, %xmm0 ## encoding: [0xc4,0xe2,0x71,0x04,0xc0] -; AVX2-NEXT: retl ## encoding: [0xc3] +; X64-AVX1-LABEL: test_x86_ssse3_pmadd_ub_sw_128_load_op0: +; X64-AVX1: ## %bb.0: +; X64-AVX1-NEXT: vmovdqa (%rdi), %xmm1 ## encoding: [0xc5,0xf9,0x6f,0x0f] +; X64-AVX1-NEXT: vpmaddubsw %xmm0, %xmm1, %xmm0 ## encoding: [0xc4,0xe2,0x71,0x04,0xc0] +; X64-AVX1-NEXT: retq ## encoding: [0xc3] ; -; SKX-LABEL: test_x86_ssse3_pmadd_ub_sw_128_load_op0: -; SKX: ## %bb.0: -; SKX-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] -; SKX-NEXT: vmovdqa (%eax), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0x08] -; SKX-NEXT: vpmaddubsw %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0x04,0xc0] -; SKX-NEXT: retl ## encoding: [0xc3] +; X64-AVX512-LABEL: test_x86_ssse3_pmadd_ub_sw_128_load_op0: +; X64-AVX512: ## %bb.0: +; X64-AVX512-NEXT: vmovdqa (%rdi), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0x0f] +; X64-AVX512-NEXT: vpmaddubsw %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0x04,0xc0] +; X64-AVX512-NEXT: retq ## encoding: [0xc3] %a0 = load <16 x i8>, <16 x i8>* %ptr %res = call <8 x i16> @llvm.x86.ssse3.pmadd.ub.sw.128(<16 x 
i8> %a0, <16 x i8> %a1) ; <<8 x i16>> [#uses=1] ret <8 x i16> %res @@ -216,17 +238,17 @@ define <8 x i16> @test_x86_ssse3_pmul_hr_sw_128(<8 x i16> %a0, <8 x i16> %a1) { ; SSE-LABEL: test_x86_ssse3_pmul_hr_sw_128: ; SSE: ## %bb.0: ; SSE-NEXT: pmulhrsw %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x38,0x0b,0xc1] -; SSE-NEXT: retl ## encoding: [0xc3] +; SSE-NEXT: ret{{[l|q]}} ## encoding: [0xc3] ; -; AVX2-LABEL: test_x86_ssse3_pmul_hr_sw_128: -; AVX2: ## %bb.0: -; AVX2-NEXT: vpmulhrsw %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x0b,0xc1] -; AVX2-NEXT: retl ## encoding: [0xc3] +; AVX1-LABEL: test_x86_ssse3_pmul_hr_sw_128: +; AVX1: ## %bb.0: +; AVX1-NEXT: vpmulhrsw %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x0b,0xc1] +; AVX1-NEXT: ret{{[l|q]}} ## encoding: [0xc3] ; -; SKX-LABEL: test_x86_ssse3_pmul_hr_sw_128: -; SKX: ## %bb.0: -; SKX-NEXT: vpmulhrsw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x0b,0xc1] -; SKX-NEXT: retl ## encoding: [0xc3] +; AVX512-LABEL: test_x86_ssse3_pmul_hr_sw_128: +; AVX512: ## %bb.0: +; AVX512-NEXT: vpmulhrsw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x0b,0xc1] +; AVX512-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call <8 x i16> @llvm.x86.ssse3.pmul.hr.sw.128(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1] ret <8 x i16> %res } @@ -237,17 +259,17 @@ define <16 x i8> @test_x86_ssse3_pshuf_b_128(<16 x i8> %a0, <16 x i8> %a1) { ; SSE-LABEL: test_x86_ssse3_pshuf_b_128: ; SSE: ## %bb.0: ; SSE-NEXT: pshufb %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x38,0x00,0xc1] -; SSE-NEXT: retl ## encoding: [0xc3] +; SSE-NEXT: ret{{[l|q]}} ## encoding: [0xc3] ; -; AVX2-LABEL: test_x86_ssse3_pshuf_b_128: -; AVX2: ## %bb.0: -; AVX2-NEXT: vpshufb %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x00,0xc1] -; AVX2-NEXT: retl ## encoding: [0xc3] +; AVX1-LABEL: test_x86_ssse3_pshuf_b_128: +; AVX1: ## %bb.0: +; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x00,0xc1] +; AVX1-NEXT: ret{{[l|q]}} ## encoding: [0xc3] ; -; SKX-LABEL: test_x86_ssse3_pshuf_b_128: -; SKX: ## %bb.0: -; SKX-NEXT: vpshufb %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x00,0xc1] -; SKX-NEXT: retl ## encoding: [0xc3] +; AVX512-LABEL: test_x86_ssse3_pshuf_b_128: +; AVX512: ## %bb.0: +; AVX512-NEXT: vpshufb %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x00,0xc1] +; AVX512-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1] ret <16 x i8> %res } @@ -258,12 +280,12 @@ define <16 x i8> @test_x86_ssse3_psign_b_128(<16 x i8> %a0, <16 x i8> %a1) { ; SSE-LABEL: test_x86_ssse3_psign_b_128: ; SSE: ## %bb.0: ; SSE-NEXT: psignb %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x38,0x08,0xc1] -; SSE-NEXT: retl ## encoding: [0xc3] +; SSE-NEXT: ret{{[l|q]}} ## encoding: [0xc3] ; -; VCHECK-LABEL: test_x86_ssse3_psign_b_128: -; VCHECK: ## %bb.0: -; VCHECK-NEXT: vpsignb %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x08,0xc1] -; VCHECK-NEXT: retl ## encoding: [0xc3] +; AVX-LABEL: test_x86_ssse3_psign_b_128: +; AVX: ## %bb.0: +; AVX-NEXT: vpsignb %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x08,0xc1] +; AVX-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call <16 x i8> @llvm.x86.ssse3.psign.b.128(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1] ret <16 x i8> %res } @@ -274,12 +296,12 @@ define <4 x i32> @test_x86_ssse3_psign_d_128(<4 x i32> %a0, <4 x i32> %a1) { ; SSE-LABEL: test_x86_ssse3_psign_d_128: ; SSE: ## 
%bb.0: ; SSE-NEXT: psignd %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x38,0x0a,0xc1] -; SSE-NEXT: retl ## encoding: [0xc3] +; SSE-NEXT: ret{{[l|q]}} ## encoding: [0xc3] ; -; VCHECK-LABEL: test_x86_ssse3_psign_d_128: -; VCHECK: ## %bb.0: -; VCHECK-NEXT: vpsignd %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x0a,0xc1] -; VCHECK-NEXT: retl ## encoding: [0xc3] +; AVX-LABEL: test_x86_ssse3_psign_d_128: +; AVX: ## %bb.0: +; AVX-NEXT: vpsignd %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x0a,0xc1] +; AVX-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call <4 x i32> @llvm.x86.ssse3.psign.d.128(<4 x i32> %a0, <4 x i32> %a1) ; <<4 x i32>> [#uses=1] ret <4 x i32> %res } @@ -290,12 +312,12 @@ define <8 x i16> @test_x86_ssse3_psign_w_128(<8 x i16> %a0, <8 x i16> %a1) { ; SSE-LABEL: test_x86_ssse3_psign_w_128: ; SSE: ## %bb.0: ; SSE-NEXT: psignw %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x38,0x09,0xc1] -; SSE-NEXT: retl ## encoding: [0xc3] +; SSE-NEXT: ret{{[l|q]}} ## encoding: [0xc3] ; -; VCHECK-LABEL: test_x86_ssse3_psign_w_128: -; VCHECK: ## %bb.0: -; VCHECK-NEXT: vpsignw %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x09,0xc1] -; VCHECK-NEXT: retl ## encoding: [0xc3] +; AVX-LABEL: test_x86_ssse3_psign_w_128: +; AVX: ## %bb.0: +; AVX-NEXT: vpsignw %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x09,0xc1] +; AVX-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call <8 x i16> @llvm.x86.ssse3.psign.w.128(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1] ret <8 x i16> %res } |
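Per the autogeneration NOTE at the top of each file, these CHECK blocks are not maintained by hand: they are rewritten by utils/update_llc_test_checks.py, which reruns every RUN line and emits assertions under the most specific prefix shared by runs that produce identical output. A typical invocation against one of the files touched here looks roughly like the following; the --llc-binary value and the build/bin path are assumptions about a local build tree, not something recorded in this commit:

  # Rerun all RUN lines and regenerate the CHECK lines in place
  python llvm/utils/update_llc_test_checks.py \
      --llc-binary=build/bin/llc \
      llvm/test/CodeGen/X86/ssse3-intrinsics-x86.ll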

