15 files changed, 419 insertions, 252 deletions
diff --git a/llvm/test/CodeGen/X86/avx-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/avx-intrinsics-fast-isel.ll
index 0e445ba3a4e..2cb6449f79a 100644
--- a/llvm/test/CodeGen/X86/avx-intrinsics-fast-isel.ll
+++ b/llvm/test/CodeGen/X86/avx-intrinsics-fast-isel.ll
@@ -3018,10 +3018,12 @@ define <4 x double> @test_mm256_sqrt_pd(<4 x double> %a0) nounwind {
 ; X64:       # %bb.0:
 ; X64-NEXT:    vsqrtpd %ymm0, %ymm0
 ; X64-NEXT:    retq
-  %res = call <4 x double> @llvm.x86.avx.sqrt.pd.256(<4 x double> %a0)
-  ret <4 x double> %res
+entry:
+  %0 = tail call <4 x double> @llvm.sqrt.v4f64(<4 x double> %a0) #2
+  ret <4 x double> %0
 }
-declare <4 x double> @llvm.x86.avx.sqrt.pd.256(<4 x double>) nounwind readnone
+
+declare <4 x double> @llvm.sqrt.v4f64(<4 x double>) #1
 
 define <8 x float> @test_mm256_sqrt_ps(<8 x float> %a0) nounwind {
 ; X32-LABEL: test_mm256_sqrt_ps:
@@ -3033,10 +3035,12 @@ define <8 x float> @test_mm256_sqrt_ps(<8 x float> %a0) nounwind {
 ; X64:       # %bb.0:
 ; X64-NEXT:    vsqrtps %ymm0, %ymm0
 ; X64-NEXT:    retq
-  %res = call <8 x float> @llvm.x86.avx.sqrt.ps.256(<8 x float> %a0)
-  ret <8 x float> %res
+entry:
+  %0 = tail call <8 x float> @llvm.sqrt.v8f32(<8 x float> %a0) #2
+  ret <8 x float> %0
 }
-declare <8 x float> @llvm.x86.avx.sqrt.ps.256(<8 x float>) nounwind readnone
+
+declare <8 x float> @llvm.sqrt.v8f32(<8 x float>) #1
 
 define void @test_mm256_store_pd(double* %a0, <4 x double> %a1) nounwind {
 ; X32-LABEL: test_mm256_store_pd:
diff --git a/llvm/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll b/llvm/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll
index 21612506f85..2c6571b00d0 100644
--- a/llvm/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll
+++ b/llvm/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll
@@ -6,6 +6,36 @@
 
 ; We don't check any vinsertf128 variant with immediate 0 because that's just a blend.
 
+define <4 x double> @test_x86_avx_sqrt_pd_256(<4 x double> %a0) {
+; AVX-LABEL: test_x86_avx_sqrt_pd_256:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vsqrtpd %ymm0, %ymm0 # encoding: [0xc5,0xfd,0x51,0xc0]
+; AVX-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
+;
+; AVX512VL-LABEL: test_x86_avx_sqrt_pd_256:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vsqrtpd %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x51,0xc0]
+; AVX512VL-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
+  %res = call <4 x double> @llvm.x86.avx.sqrt.pd.256(<4 x double> %a0) ; <<4 x double>> [#uses=1]
+  ret <4 x double> %res
+}
+declare <4 x double> @llvm.x86.avx.sqrt.pd.256(<4 x double>) nounwind readnone
+
+define <8 x float> @test_x86_avx_sqrt_ps_256(<8 x float> %a0) {
+; AVX-LABEL: test_x86_avx_sqrt_ps_256:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vsqrtps %ymm0, %ymm0 # encoding: [0xc5,0xfc,0x51,0xc0]
+; AVX-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
+;
+; AVX512VL-LABEL: test_x86_avx_sqrt_ps_256:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vsqrtps %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x51,0xc0]
+; AVX512VL-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
+  %res = call <8 x float> @llvm.x86.avx.sqrt.ps.256(<8 x float> %a0) ; <<8 x float>> [#uses=1]
+  ret <8 x float> %res
+}
+declare <8 x float> @llvm.x86.avx.sqrt.ps.256(<8 x float>) nounwind readnone
+
 define <4 x double> @test_x86_avx_vinsertf128_pd_256_1(<4 x double> %a0, <2 x double> %a1) {
 ; AVX-LABEL: test_x86_avx_vinsertf128_pd_256_1:
 ; AVX:       # %bb.0:
diff --git a/llvm/test/CodeGen/X86/avx-intrinsics-x86.ll b/llvm/test/CodeGen/X86/avx-intrinsics-x86.ll
index ace9ae24f27..2fd2b863859 100644
--- a/llvm/test/CodeGen/X86/avx-intrinsics-x86.ll
+++ b/llvm/test/CodeGen/X86/avx-intrinsics-x86.ll
@@ -622,39 +622,6 @@ define <8 x float> @test_x86_avx_rsqrt_ps_256(<8 x float> %a0) {
 }
 declare <8 x float> @llvm.x86.avx.rsqrt.ps.256(<8 x float>) nounwind readnone
 
-
-define <4 x double> @test_x86_avx_sqrt_pd_256(<4 x double> %a0) {
-; AVX-LABEL: test_x86_avx_sqrt_pd_256:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vsqrtpd %ymm0, %ymm0 # encoding: [0xc5,0xfd,0x51,0xc0]
-; AVX-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
-;
-; AVX512VL-LABEL: test_x86_avx_sqrt_pd_256:
-; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vsqrtpd %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x51,0xc0]
-; AVX512VL-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
-  %res = call <4 x double> @llvm.x86.avx.sqrt.pd.256(<4 x double> %a0) ; <<4 x double>> [#uses=1]
-  ret <4 x double> %res
-}
-declare <4 x double> @llvm.x86.avx.sqrt.pd.256(<4 x double>) nounwind readnone
-
-
-define <8 x float> @test_x86_avx_sqrt_ps_256(<8 x float> %a0) {
-; AVX-LABEL: test_x86_avx_sqrt_ps_256:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vsqrtps %ymm0, %ymm0 # encoding: [0xc5,0xfc,0x51,0xc0]
-; AVX-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
-;
-; AVX512VL-LABEL: test_x86_avx_sqrt_ps_256:
-; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vsqrtps %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x51,0xc0]
-; AVX512VL-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
-  %res = call <8 x float> @llvm.x86.avx.sqrt.ps.256(<8 x float> %a0) ; <<8 x float>> [#uses=1]
-  ret <8 x float> %res
-}
-declare <8 x float> @llvm.x86.avx.sqrt.ps.256(<8 x float>) nounwind readnone
-
-
 define <2 x double> @test_x86_avx_vpermilvar_pd(<2 x double> %a0, <2 x i64> %a1) {
 ; AVX-LABEL: test_x86_avx_vpermilvar_pd:
 ; AVX:       # %bb.0:
diff --git a/llvm/test/CodeGen/X86/avx512vl-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/avx512vl-intrinsics-fast-isel.ll
index b119dbcdc9c..b12493a8918 100644
--- a/llvm/test/CodeGen/X86/avx512vl-intrinsics-fast-isel.ll
+++ b/llvm/test/CodeGen/X86/avx512vl-intrinsics-fast-isel.ll
@@ -6375,6 +6375,182 @@ declare <4 x double> @llvm.fma.v4f64(<4 x double>, <4 x double>, <4 x double>) #
 declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>) #8
 declare <8 x float> @llvm.fma.v8f32(<8 x float>, <8 x float>, <8 x float>) #8
 
+define <2 x double> @test_mm_mask_sqrt_pd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A) {
+; X32-LABEL: test_mm_mask_sqrt_pd:
+; X32:       # %bb.0: # %entry
+; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
+; X32-NEXT:    kmovw %eax, %k1
+; X32-NEXT:    vsqrtpd %xmm1, %xmm0 {%k1}
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_mask_sqrt_pd:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    kmovw %edi, %k1
+; X64-NEXT:    vsqrtpd %xmm1, %xmm0 {%k1}
+; X64-NEXT:    retq
+entry:
+  %0 = tail call <2 x double> @llvm.sqrt.v2f64(<2 x double> %__A) #2
+  %1 = bitcast i8 %__U to <8 x i1>
+  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
+  %2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> %__W
+  ret <2 x double> %2
+}
+
+declare <2 x double> @llvm.sqrt.v2f64(<2 x double>)
+
+define <2 x double> @test_mm_maskz_sqrt_pd(i8 zeroext %__U, <2 x double> %__A) {
+; X32-LABEL: test_mm_maskz_sqrt_pd:
+; X32:       # %bb.0: # %entry
+; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
+; X32-NEXT:    kmovw %eax, %k1
+; X32-NEXT:    vsqrtpd %xmm0, %xmm0 {%k1} {z}
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_maskz_sqrt_pd:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    kmovw %edi, %k1
+; X64-NEXT:    vsqrtpd %xmm0, %xmm0 {%k1} {z}
+; X64-NEXT:    retq
+entry:
+  %0 = tail call <2 x double> @llvm.sqrt.v2f64(<2 x double> %__A) #2
+  %1 = bitcast i8 %__U to <8 x i1>
+  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
+  %2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> zeroinitializer
+  ret <2 x double> %2
+}
+
+define <4 x double> @test_mm256_mask_sqrt_pd(<4 x double> %__W, i8 zeroext %__U, <4 x double> %__A) {
+; X32-LABEL: test_mm256_mask_sqrt_pd:
+; X32:       # %bb.0: # %entry
+; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
+; X32-NEXT:    kmovw %eax, %k1
+; X32-NEXT:    vsqrtpd %ymm1, %ymm0 {%k1}
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm256_mask_sqrt_pd:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    kmovw %edi, %k1
+; X64-NEXT:    vsqrtpd %ymm1, %ymm0 {%k1}
+; X64-NEXT:    retq
+entry:
+  %0 = tail call <4 x double> @llvm.sqrt.v4f64(<4 x double> %__A) #2
+  %1 = bitcast i8 %__U to <8 x i1>
+  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> %__W
+  ret <4 x double> %2
+}
+
+declare <4 x double> @llvm.sqrt.v4f64(<4 x double>)
+
+define <4 x double> @test_mm256_maskz_sqrt_pd(i8 zeroext %__U, <4 x double> %__A) {
+; X32-LABEL: test_mm256_maskz_sqrt_pd:
+; X32:       # %bb.0: # %entry
+; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
+; X32-NEXT:    kmovw %eax, %k1
+; X32-NEXT:    vsqrtpd %ymm0, %ymm0 {%k1} {z}
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm256_maskz_sqrt_pd:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    kmovw %edi, %k1
+; X64-NEXT:    vsqrtpd %ymm0, %ymm0 {%k1} {z}
+; X64-NEXT:    retq
+entry:
+  %0 = tail call <4 x double> @llvm.sqrt.v4f64(<4 x double> %__A) #2
+  %1 = bitcast i8 %__U to <8 x i1>
+  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> zeroinitializer
+  ret <4 x double> %2
+}
+
+define <4 x float> @test_mm_mask_sqrt_ps(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A) {
+; X32-LABEL: test_mm_mask_sqrt_ps:
+; X32:       # %bb.0: # %entry
+; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
+; X32-NEXT:    kmovw %eax, %k1
+; X32-NEXT:    vsqrtps %xmm1, %xmm0 {%k1}
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_mask_sqrt_ps:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    kmovw %edi, %k1
+; X64-NEXT:    vsqrtps %xmm1, %xmm0 {%k1}
+; X64-NEXT:    retq
+entry:
+  %0 = tail call <4 x float> @llvm.sqrt.v4f32(<4 x float> %__A) #2
+  %1 = bitcast i8 %__U to <8 x i1>
+  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> %__W
+  ret <4 x float> %2
+}
+
+declare <4 x float> @llvm.sqrt.v4f32(<4 x float>)
+
+define <4 x float> @test_mm_maskz_sqrt_ps(i8 zeroext %__U, <4 x float> %__A) {
+; X32-LABEL: test_mm_maskz_sqrt_ps:
+; X32:       # %bb.0: # %entry
+; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
+; X32-NEXT:    kmovw %eax, %k1
+; X32-NEXT:    vsqrtps %xmm0, %xmm0 {%k1} {z}
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_maskz_sqrt_ps:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    kmovw %edi, %k1
+; X64-NEXT:    vsqrtps %xmm0, %xmm0 {%k1} {z}
+; X64-NEXT:    retq
+entry:
+  %0 = tail call <4 x float> @llvm.sqrt.v4f32(<4 x float> %__A) #2
+  %1 = bitcast i8 %__U to <8 x i1>
+  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> zeroinitializer
+  ret <4 x float> %2
+}
+
+define <8 x float> @test_mm256_mask_sqrt_ps(<8 x float> %__W, i8 zeroext %__U, <8 x float> %__A) {
+; X32-LABEL: test_mm256_mask_sqrt_ps:
+; X32:       # %bb.0: # %entry
+; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
+; X32-NEXT:    kmovw %eax, %k1
+; X32-NEXT:    vsqrtps %ymm1, %ymm0 {%k1}
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm256_mask_sqrt_ps:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    kmovw %edi, %k1
+; X64-NEXT:    vsqrtps %ymm1, %ymm0 {%k1}
+; X64-NEXT:    retq
+entry:
+  %0 = tail call <8 x float> @llvm.sqrt.v8f32(<8 x float> %__A) #2
+  %1 = bitcast i8 %__U to <8 x i1>
+  %2 = select <8 x i1> %1, <8 x float> %0, <8 x float> %__W
+  ret <8 x float> %2
+}
+
+define <8 x float> @test_mm256_maskz_sqrt_ps(i8 zeroext %__U, <8 x float> %__A) {
+; X32-LABEL: test_mm256_maskz_sqrt_ps:
+; X32:       # %bb.0: # %entry
+; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
+; X32-NEXT:    kmovw %eax, %k1
+; X32-NEXT:    vsqrtps %ymm0, %ymm0 {%k1} {z}
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm256_maskz_sqrt_ps:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    kmovw %edi, %k1
+; X64-NEXT:    vsqrtps %ymm0, %ymm0 {%k1} {z}
+; X64-NEXT:    retq
+entry:
+  %0 = tail call <8 x float> @llvm.sqrt.v8f32(<8 x float> %__A) #2
+  %1 = bitcast i8 %__U to <8 x i1>
+  %2 = select <8 x i1> %1, <8 x float> %0, <8 x float> zeroinitializer
+  ret <8 x float> %2
+}
+
+declare <8 x float> @llvm.sqrt.v8f32(<8 x float>)
+
+declare <4 x float> @llvm.x86.sse2.cvtdq2ps(<4 x i32>)
+declare <8 x float> @llvm.x86.avx.cvtdq2.ps.256(<8 x i32>)
 declare <4 x i32> @llvm.x86.avx512.mask.cvtpd2dq.128(<2 x double>, <4 x i32>, i8)
 declare <4 x i32> @llvm.x86.avx.cvt.pd2dq.256(<4 x double>)
 declare <4 x float> @llvm.x86.avx512.mask.cvtpd2ps(<2 x double>, <4 x float>, i8)
diff --git a/llvm/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll
index de021a369ac..a13148599d1 100644
--- a/llvm/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll
+++ b/llvm/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll
@@ -12070,3 +12070,40 @@ define <8 x i32> @test_expand_load_d_256(i8* %addr, <8 x i32> %data) {
   %res = call <8 x i32> @llvm.x86.avx512.mask.expand.load.d.256(i8* %addr, <8 x i32> %data, i8 -1)
   ret <8 x i32> %res
 }
+
+define <4 x double> @test_sqrt_pd_256(<4 x double> %a0, i8 %mask) {
+; X86-LABEL: test_sqrt_pd_256:
+; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
+; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
+; X86-NEXT:    vsqrtpd %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0xfd,0xa9,0x51,0xc0]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_sqrt_pd_256:
+; X64:       # %bb.0:
+; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT:    vsqrtpd %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0xfd,0xa9,0x51,0xc0]
+; X64-NEXT:    retq # encoding: [0xc3]
+  %res = call <4 x double> @llvm.x86.avx512.mask.sqrt.pd.256(<4 x double> %a0,  <4 x double> zeroinitializer, i8 %mask)
+  ret <4 x double> %res
+}
+declare <4 x double> @llvm.x86.avx512.mask.sqrt.pd.256(<4 x double>, <4 x double>, i8) nounwind readnone
+
+define <8 x float> @test_sqrt_ps_256(<8 x float> %a0, i8 %mask) {
+; X86-LABEL: test_sqrt_ps_256:
+; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
+; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
+; X86-NEXT:    vsqrtps %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0xa9,0x51,0xc0]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_sqrt_ps_256:
+; X64:       # %bb.0:
+; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT:    vsqrtps %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0xa9,0x51,0xc0]
+; X64-NEXT:    retq # encoding: [0xc3]
+  %res = call <8 x float> @llvm.x86.avx512.mask.sqrt.ps.256(<8 x float> %a0, <8 x float> zeroinitializer, i8 %mask)
+  ret <8 x float> %res
+}
+
+declare <8 x float> @llvm.x86.avx512.mask.sqrt.ps.256(<8 x float>, <8 x float>, i8) nounwind readnone
diff --git a/llvm/test/CodeGen/X86/avx512vl-intrinsics.ll b/llvm/test/CodeGen/X86/avx512vl-intrinsics.ll
index 19de714ce8f..6b4b68fb4c1 100644
--- a/llvm/test/CodeGen/X86/avx512vl-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/avx512vl-intrinsics.ll
@@ -999,43 +999,6 @@ define <4 x float> @test_mm512_min_ps_128(<4 x float> %a0, <4 x float> %a1, i8 %
 }
 declare <4 x float> @llvm.x86.sse.min.ps(<4 x float>, <4 x float>)
 
-define <4 x double> @test_sqrt_pd_256(<4 x double> %a0, i8 %mask) {
-; X86-LABEL: test_sqrt_pd_256:
-; X86:       # %bb.0:
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
-; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
-; X86-NEXT:    vsqrtpd %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0xfd,0xa9,0x51,0xc0]
-; X86-NEXT:    retl # encoding: [0xc3]
-;
-; X64-LABEL: test_sqrt_pd_256:
-; X64:       # %bb.0:
-; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
-; X64-NEXT:    vsqrtpd %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0xfd,0xa9,0x51,0xc0]
-; X64-NEXT:    retq # encoding: [0xc3]
-  %res = call <4 x double> @llvm.x86.avx512.mask.sqrt.pd.256(<4 x double> %a0,  <4 x double> zeroinitializer, i8 %mask)
-  ret <4 x double> %res
-}
-declare <4 x double> @llvm.x86.avx512.mask.sqrt.pd.256(<4 x double>, <4 x double>, i8) nounwind readnone
-
-define <8 x float> @test_sqrt_ps_256(<8 x float> %a0, i8 %mask) {
-; X86-LABEL: test_sqrt_ps_256:
-; X86:       # %bb.0:
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
-; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
-; X86-NEXT:    vsqrtps %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0xa9,0x51,0xc0]
-; X86-NEXT:    retl # encoding: [0xc3]
-;
-; X64-LABEL: test_sqrt_ps_256:
-; X64:       # %bb.0:
-; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
-; X64-NEXT:    vsqrtps %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0xa9,0x51,0xc0]
-; X64-NEXT:    retq # encoding: [0xc3]
-  %res = call <8 x float> @llvm.x86.avx512.mask.sqrt.ps.256(<8 x float> %a0, <8 x float> zeroinitializer, i8 %mask)
-  ret <8 x float> %res
-}
-
-declare <8 x float> @llvm.x86.avx512.mask.sqrt.ps.256(<8 x float>, <8 x float>, i8) nounwind readnone
-
 define <4 x double> @test_getexp_pd_256(<4 x double> %a0) {
 ; CHECK-LABEL: test_getexp_pd_256:
 ; CHECK:       # %bb.0:
diff --git a/llvm/test/CodeGen/X86/fold-load-unops.ll b/llvm/test/CodeGen/X86/fold-load-unops.ll
index 7feb66525e2..c77c6adf2e8 100644
--- a/llvm/test/CodeGen/X86/fold-load-unops.ll
+++ b/llvm/test/CodeGen/X86/fold-load-unops.ll
@@ -165,12 +165,14 @@ define float @sqrtss_size(float* %a) optsize{
 define <4 x float> @sqrtss_full_size(<4 x float>* %a) optsize{
 ; SSE-LABEL: sqrtss_full_size:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    sqrtss (%rdi), %xmm0
+; SSE-NEXT:    movaps (%rdi), %xmm0
+; SSE-NEXT:    sqrtss %xmm0, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: sqrtss_full_size:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vsqrtss (%rdi), %xmm0, %xmm0
+; AVX-NEXT:    vmovaps (%rdi), %xmm0
+; AVX-NEXT:    vsqrtss %xmm0, %xmm0, %xmm0
 ; AVX-NEXT:    retq
     %ld = load <4 x float>, <4 x float>* %a
     %res = tail call <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float> %ld)
@@ -197,12 +199,14 @@ define double @sqrtsd_size(double* %a) optsize {
 define <2 x double> @sqrtsd_full_size(<2 x double>* %a) optsize {
 ; SSE-LABEL: sqrtsd_full_size:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    sqrtsd (%rdi), %xmm0
+; SSE-NEXT:    movapd (%rdi), %xmm0
+; SSE-NEXT:    sqrtsd %xmm0, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: sqrtsd_full_size:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vsqrtsd (%rdi), %xmm0, %xmm0
+; AVX-NEXT:    vmovapd (%rdi), %xmm0
+; AVX-NEXT:    vsqrtsd %xmm0, %xmm0, %xmm0
 ; AVX-NEXT:    retq
     %ld = load <2 x double>, <2 x double>* %a
     %res = tail call <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double> %ld)
diff --git a/llvm/test/CodeGen/X86/sse-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/sse-intrinsics-fast-isel.ll
index 16dedb74366..a660293d6f7 100644
--- a/llvm/test/CodeGen/X86/sse-intrinsics-fast-isel.ll
+++ b/llvm/test/CodeGen/X86/sse-intrinsics-fast-isel.ll
@@ -2075,10 +2075,10 @@ define <4 x float> @test_mm_sqrt_ps(<4 x float> %a0) {
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vsqrtps %xmm0, %xmm0
 ; AVX-NEXT:    ret{{[l|q]}}
-  %res = call <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float> %a0)
+  %res = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %a0)
   ret <4 x float> %res
 }
-declare <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float>) nounwind readnone
+declare <4 x float> @llvm.sqrt.v4f32(<4 x float>) nounwind readnone
 
 define <4 x float> @test_mm_sqrt_ss(<4 x float> %a0) {
 ; SSE-LABEL: test_mm_sqrt_ss:
@@ -2090,10 +2090,12 @@ define <4 x float> @test_mm_sqrt_ss(<4 x float> %a0) {
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vsqrtss %xmm0, %xmm0, %xmm0
 ; AVX-NEXT:    ret{{[l|q]}}
-  %sqrt = call <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float> %a0)
-  ret <4 x float> %sqrt
+  %ext = extractelement <4 x float> %a0, i32 0
+  %sqrt = call float @llvm.sqrt.f32(float %ext)
+  %ins = insertelement <4 x float> %a0, float %sqrt, i32 0
+  ret <4 x float> %ins
 }
-declare <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float>) nounwind readnone
+declare float @llvm.sqrt.f32(float) nounwind readnone
 
 define void @test_mm_store_ps(float *%a0, <4 x float> %a1) {
 ; X86-SSE-LABEL: test_mm_store_ps:
diff --git a/llvm/test/CodeGen/X86/sse-intrinsics-x86-upgrade.ll b/llvm/test/CodeGen/X86/sse-intrinsics-x86-upgrade.ll
index 60a455ae148..d153d2f42d6 100644
--- a/llvm/test/CodeGen/X86/sse-intrinsics-x86-upgrade.ll
+++ b/llvm/test/CodeGen/X86/sse-intrinsics-x86-upgrade.ll
@@ -6,6 +6,44 @@
 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx -show-mc-encoding | FileCheck %s --check-prefixes=CHECK,X64,AVX,X64-AVX,AVX1,X64-AVX1
 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f,+avx512bw,+avx512dq,+avx512vl -show-mc-encoding | FileCheck %s --check-prefixes=CHECK,X64,AVX,X64-AVX,AVX512,X64-AVX512
 
+
+define <4 x float> @test_x86_sse_sqrt_ps(<4 x float> %a0) {
+; SSE-LABEL: test_x86_sse_sqrt_ps:
+; SSE:       ## %bb.0:
+; SSE-NEXT:    sqrtps %xmm0, %xmm0 ## encoding: [0x0f,0x51,0xc0]
+; SSE-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
+;
+; AVX1-LABEL: test_x86_sse_sqrt_ps:
+; AVX1:       ## %bb.0:
+; AVX1-NEXT:    vsqrtps %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x51,0xc0]
+; AVX1-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
+;
+; AVX512-LABEL: test_x86_sse_sqrt_ps:
+; AVX512:       ## %bb.0:
+; AVX512-NEXT:    vsqrtps %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x51,0xc0]
+; AVX512-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
+  %res = call <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float> %a0) ; <<4 x float>> [#uses=1]
+  ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float>) nounwind readnone
+
+
+define <4 x float> @test_x86_sse_sqrt_ss(<4 x float> %a0) {
+; SSE-LABEL: test_x86_sse_sqrt_ss:
+; SSE:       ## %bb.0:
+; SSE-NEXT:    sqrtss %xmm0, %xmm0 ## encoding: [0xf3,0x0f,0x51,0xc0]
+; SSE-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
+;
+; AVX-LABEL: test_x86_sse_sqrt_ss:
+; AVX:       ## %bb.0:
+; AVX-NEXT:    vsqrtss %xmm0, %xmm0, %xmm0 ## encoding: [0xc5,0xfa,0x51,0xc0]
+; AVX-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
+  %res = call <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float> %a0) ; <<4 x float>> [#uses=1]
+  ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float>) nounwind readnone
+
+
 define void @test_x86_sse_storeu_ps(i8* %a0, <4 x float> %a1) {
 ; X86-SSE-LABEL: test_x86_sse_storeu_ps:
 ; X86-SSE:       ## %bb.0:
diff --git a/llvm/test/CodeGen/X86/sse-intrinsics-x86.ll b/llvm/test/CodeGen/X86/sse-intrinsics-x86.ll
index 0014da6b2ec..63ccda1d66b 100644
--- a/llvm/test/CodeGen/X86/sse-intrinsics-x86.ll
+++ b/llvm/test/CodeGen/X86/sse-intrinsics-x86.ll
@@ -448,48 +448,6 @@ define <4 x float> @test_x86_sse_rsqrt_ss(<4 x float> %a0) {
 declare <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float>) nounwind readnone
 
 
-define <4 x float> @test_x86_sse_sqrt_ps(<4 x float> %a0) {
-; SSE-LABEL: test_x86_sse_sqrt_ps:
-; SSE:       ## %bb.0:
-; SSE-NEXT:    sqrtps %xmm0, %xmm0 ## encoding: [0x0f,0x51,0xc0]
-; SSE-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
-;
-; AVX1-LABEL: test_x86_sse_sqrt_ps:
-; AVX1:       ## %bb.0:
-; AVX1-NEXT:    vsqrtps %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x51,0xc0]
-; AVX1-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
-;
-; AVX512-LABEL: test_x86_sse_sqrt_ps:
-; AVX512:       ## %bb.0:
-; AVX512-NEXT:    vsqrtps %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x51,0xc0]
-; AVX512-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
-  %res = call <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float> %a0) ; <<4 x float>> [#uses=1]
-  ret <4 x float> %res
-}
-declare <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float>) nounwind readnone
-
-
-define <4 x float> @test_x86_sse_sqrt_ss(<4 x float> %a0) {
-; SSE-LABEL: test_x86_sse_sqrt_ss:
-; SSE:       ## %bb.0:
-; SSE-NEXT:    sqrtss %xmm0, %xmm0 ## encoding: [0xf3,0x0f,0x51,0xc0]
-; SSE-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
-;
-; AVX1-LABEL: test_x86_sse_sqrt_ss:
-; AVX1:       ## %bb.0:
-; AVX1-NEXT:    vsqrtss %xmm0, %xmm0, %xmm0 ## encoding: [0xc5,0xfa,0x51,0xc0]
-; AVX1-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
-;
-; AVX512-LABEL: test_x86_sse_sqrt_ss:
-; AVX512:       ## %bb.0:
-; AVX512-NEXT:    vsqrtss %xmm0, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x51,0xc0]
-; AVX512-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
-  %res = call <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float> %a0) ; <<4 x float>> [#uses=1]
-  ret <4 x float> %res
-}
-declare <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float>) nounwind readnone
-
-
 define void @test_x86_sse_stmxcsr(i8* %a0) {
 ; X86-SSE-LABEL: test_x86_sse_stmxcsr:
 ; X86-SSE:       ## %bb.0:
diff --git a/llvm/test/CodeGen/X86/sse-scalar-fp-arith.ll b/llvm/test/CodeGen/X86/sse-scalar-fp-arith.ll
index 1a294daf1ea..39bbb8f6538 100644
--- a/llvm/test/CodeGen/X86/sse-scalar-fp-arith.ll
+++ b/llvm/test/CodeGen/X86/sse-scalar-fp-arith.ll
@@ -83,26 +83,22 @@ define <4 x float> @test_div_ss(<4 x float> %a, <4 x float> %b) {
 define <4 x float> @test_sqrt_ss(<4 x float> %a) {
 ; SSE2-LABEL: test_sqrt_ss:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    sqrtss %xmm0, %xmm1
-; SSE2-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; SSE2-NEXT:    sqrtss %xmm0, %xmm0
 ; SSE2-NEXT:    ret{{[l|q]}}
 ;
 ; SSE41-LABEL: test_sqrt_ss:
 ; SSE41:       # %bb.0:
-; SSE41-NEXT:    sqrtss %xmm0, %xmm1
-; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; SSE41-NEXT:    sqrtss %xmm0, %xmm0
 ; SSE41-NEXT:    ret{{[l|q]}}
 ;
 ; AVX1-LABEL: test_sqrt_ss:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vsqrtss %xmm0, %xmm0, %xmm1
-; AVX1-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; AVX1-NEXT:    vsqrtss %xmm0, %xmm0, %xmm0
 ; AVX1-NEXT:    ret{{[l|q]}}
 ;
 ; AVX512-LABEL: test_sqrt_ss:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vsqrtss %xmm0, %xmm0, %xmm1
-; AVX512-NEXT:    vmovss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; AVX512-NEXT:    vsqrtss %xmm0, %xmm0, %xmm0
 ; AVX512-NEXT:    ret{{[l|q]}}
   %1 = extractelement <4 x float> %a, i32 0
   %2 = call float @llvm.sqrt.f32(float %1)
@@ -182,26 +178,22 @@ define <2 x double> @test_div_sd(<2 x double> %a, <2 x double> %b) {
 define <2 x double> @test_sqrt_sd(<2 x double> %a) {
 ; SSE2-LABEL: test_sqrt_sd:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    sqrtsd %xmm0, %xmm1
-; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE2-NEXT:    sqrtsd %xmm0, %xmm0
 ; SSE2-NEXT:    ret{{[l|q]}}
 ;
 ; SSE41-LABEL: test_sqrt_sd:
 ; SSE41:       # %bb.0:
-; SSE41-NEXT:    sqrtsd %xmm0, %xmm1
-; SSE41-NEXT:    blendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE41-NEXT:    sqrtsd %xmm0, %xmm0
 ; SSE41-NEXT:    ret{{[l|q]}}
 ;
 ; AVX1-LABEL: test_sqrt_sd:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vsqrtsd %xmm0, %xmm0, %xmm1
-; AVX1-NEXT:    vblendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; AVX1-NEXT:    vsqrtsd %xmm0, %xmm0, %xmm0
 ; AVX1-NEXT:    ret{{[l|q]}}
 ;
 ; AVX512-LABEL: test_sqrt_sd:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vsqrtsd %xmm0, %xmm0, %xmm1
-; AVX512-NEXT:    vmovsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; AVX512-NEXT:    vsqrtsd %xmm0, %xmm0, %xmm0
 ; AVX512-NEXT:    ret{{[l|q]}}
   %1 = extractelement <2 x double> %a, i32 0
   %2 = call double @llvm.sqrt.f64(double %1)
diff --git a/llvm/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll
index abc45b02d0d..31046e94c34 100644
--- a/llvm/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll
+++ b/llvm/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll
@@ -3720,10 +3720,10 @@ define <2 x double> @test_mm_sqrt_pd(<2 x double> %a0) nounwind {
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vsqrtpd %xmm0, %xmm0
 ; AVX-NEXT:    ret{{[l|q]}}
-  %res = call <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double> %a0)
+  %res = call <2 x double> @llvm.sqrt.v2f64(<2 x double> %a0)
   ret <2 x double> %res
 }
-declare <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double>) nounwind readnone
+declare <2 x double> @llvm.sqrt.v2f64(<2 x double>) nounwind readnone
 
 define <2 x double> @test_mm_sqrt_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
 ; SSE-LABEL: test_mm_sqrt_sd:
@@ -3736,14 +3736,12 @@ define <2 x double> @test_mm_sqrt_sd(<2 x double> %a0, <2 x double> %a1) nounwin
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vsqrtsd %xmm0, %xmm1, %xmm0
 ; AVX-NEXT:    ret{{[l|q]}}
-  %call = call <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double> %a0)
-  %ext0 = extractelement <2 x double> %call, i32 0
-  %ins0 = insertelement <2 x double> undef, double %ext0, i32 0
-  %ext1 = extractelement <2 x double> %a1, i32 1
-  %ins1 = insertelement <2 x double> %ins0, double %ext1, i32 1
-  ret <2 x double> %ins1
+  %ext = extractelement <2 x double> %a0, i32 0
+  %sqrt = call double @llvm.sqrt.f64(double %ext)
+  %ins = insertelement <2 x double> %a1, double %sqrt, i32 0
+  ret <2 x double> %ins
 }
-declare <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double>) nounwind readnone
+declare double @llvm.sqrt.f64(double) nounwind readnone
 
 define <2 x i64> @test_mm_sra_epi16(<2 x i64> %a0, <2 x i64> %a1) {
 ; SSE-LABEL: test_mm_sra_epi16:
diff --git a/llvm/test/CodeGen/X86/sse2-intrinsics-x86-upgrade.ll b/llvm/test/CodeGen/X86/sse2-intrinsics-x86-upgrade.ll
index 91b19fc4fc4..f29b474ea0b 100644
--- a/llvm/test/CodeGen/X86/sse2-intrinsics-x86-upgrade.ll
+++ b/llvm/test/CodeGen/X86/sse2-intrinsics-x86-upgrade.ll
@@ -6,6 +6,89 @@
 ; RUN: llc < %s -disable-peephole -mtriple=x86_64-apple-darwin -mattr=+avx -show-mc-encoding | FileCheck %s --check-prefixes=CHECK,X64,AVX,X64-AVX,AVX1,X64-AVX1
 ; RUN: llc < %s -disable-peephole -mtriple=x86_64-apple-darwin -mattr=+avx512f,+avx512bw,+avx512dq,+avx512vl -show-mc-encoding | FileCheck %s --check-prefixes=CHECK,X64,AVX,X64-AVX,AVX512,X64-AVX512
 
+
+define <2 x double> @test_x86_sse2_sqrt_pd(<2 x double> %a0) {
+; SSE-LABEL: test_x86_sse2_sqrt_pd:
+; SSE:       ## %bb.0:
+; SSE-NEXT:    sqrtpd %xmm0, %xmm0 ## encoding: [0x66,0x0f,0x51,0xc0]
+; SSE-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
+;
+; AVX1-LABEL: test_x86_sse2_sqrt_pd:
+; AVX1:       ## %bb.0:
+; AVX1-NEXT:    vsqrtpd %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x51,0xc0]
+; AVX1-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
+;
+; AVX512-LABEL: test_x86_sse2_sqrt_pd:
+; AVX512:       ## %bb.0:
+; AVX512-NEXT:    vsqrtpd %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x51,0xc0]
+; AVX512-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
+  %res = call <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double> %a0) ; <<2 x double>> [#uses=1]
+  ret <2 x double> %res
+}
+declare <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double>) nounwind readnone
+
+
+define <2 x double> @test_x86_sse2_sqrt_sd(<2 x double> %a0) {
+; SSE-LABEL: test_x86_sse2_sqrt_sd:
+; SSE:       ## %bb.0:
+; SSE-NEXT:    sqrtsd %xmm0, %xmm0 ## encoding: [0xf2,0x0f,0x51,0xc0]
+; SSE-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
+;
+; AVX-LABEL: test_x86_sse2_sqrt_sd:
+; AVX:       ## %bb.0:
+; AVX-NEXT:    vsqrtsd %xmm0, %xmm0, %xmm0 ## encoding: [0xc5,0xfb,0x51,0xc0]
+; AVX-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
+  %res = call <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double> %a0) ; <<2 x double>> [#uses=1]
+  ret <2 x double> %res
+}
+declare <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double>) nounwind readnone
+
+
+define <2 x double> @test_x86_sse2_sqrt_sd_vec_load(<2 x double>* %a0) {
+; X86-SSE-LABEL: test_x86_sse2_sqrt_sd_vec_load:
+; X86-SSE:       ## %bb.0:
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
+; X86-SSE-NEXT:    movapd (%eax), %xmm0 ## encoding: [0x66,0x0f,0x28,0x00]
+; X86-SSE-NEXT:    sqrtsd %xmm0, %xmm0 ## encoding: [0xf2,0x0f,0x51,0xc0]
+; X86-SSE-NEXT:    retl ## encoding: [0xc3]
+;
+; X86-AVX1-LABEL: test_x86_sse2_sqrt_sd_vec_load:
+; X86-AVX1:       ## %bb.0:
+; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
+; X86-AVX1-NEXT:    vmovapd (%eax), %xmm0 ## encoding: [0xc5,0xf9,0x28,0x00]
+; X86-AVX1-NEXT:    vsqrtsd %xmm0, %xmm0, %xmm0 ## encoding: [0xc5,0xfb,0x51,0xc0]
+; X86-AVX1-NEXT:    retl ## encoding: [0xc3]
+;
+; X86-AVX512-LABEL: test_x86_sse2_sqrt_sd_vec_load:
+; X86-AVX512:       ## %bb.0:
+; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
+; X86-AVX512-NEXT:    vmovapd (%eax), %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0x00]
+; X86-AVX512-NEXT:    vsqrtsd %xmm0, %xmm0, %xmm0 ## encoding: [0xc5,0xfb,0x51,0xc0]
+; X86-AVX512-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-SSE-LABEL: test_x86_sse2_sqrt_sd_vec_load:
+; X64-SSE:       ## %bb.0:
+; X64-SSE-NEXT:    movapd (%rdi), %xmm0 ## encoding: [0x66,0x0f,0x28,0x07]
+; X64-SSE-NEXT:    sqrtsd %xmm0, %xmm0 ## encoding: [0xf2,0x0f,0x51,0xc0]
+; X64-SSE-NEXT:    retq ## encoding: [0xc3]
+;
+; X64-AVX1-LABEL: test_x86_sse2_sqrt_sd_vec_load:
+; X64-AVX1:       ## %bb.0:
+; X64-AVX1-NEXT:    vmovapd (%rdi), %xmm0 ## encoding: [0xc5,0xf9,0x28,0x07]
+; X64-AVX1-NEXT:    vsqrtsd %xmm0, %xmm0, %xmm0 ## encoding: [0xc5,0xfb,0x51,0xc0]
+; X64-AVX1-NEXT:    retq ## encoding: [0xc3]
+;
+; X64-AVX512-LABEL: test_x86_sse2_sqrt_sd_vec_load:
+; X64-AVX512:       ## %bb.0:
+; X64-AVX512-NEXT:    vmovapd (%rdi), %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0x07]
+; X64-AVX512-NEXT:    vsqrtsd %xmm0, %xmm0, %xmm0 ## encoding: [0xc5,0xfb,0x51,0xc0]
+; X64-AVX512-NEXT:    retq ## encoding: [0xc3]
+  %a1 = load <2 x double>, <2 x double>* %a0, align 16
+  %res = call <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double> %a1) ; <<2 x double>> [#uses=1]
+  ret <2 x double> %res
+}
+
+
 define <2 x i64> @test_x86_sse2_psll_dq_bs(<2 x i64> %a0) {
 ; SSE-LABEL: test_x86_sse2_psll_dq_bs:
 ; SSE:       ## %bb.0:
@@ -241,8 +324,8 @@ define void @test_x86_sse2_storeu_pd(i8* %a0, <2 x double> %a1) {
 ; X86-SSE:       ## %bb.0:
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
 ; X86-SSE-NEXT:    xorpd %xmm1, %xmm1 ## encoding: [0x66,0x0f,0x57,0xc9]
-; X86-SSE-NEXT:    movhpd LCPI8_0, %xmm1 ## encoding: [0x66,0x0f,0x16,0x0d,A,A,A,A]
-; X86-SSE-NEXT:    ## fixup A - offset: 4, value: LCPI8_0, kind: FK_Data_4
+; X86-SSE-NEXT:    movhpd LCPI11_0, %xmm1 ## encoding: [0x66,0x0f,0x16,0x0d,A,A,A,A]
+; X86-SSE-NEXT:    ## fixup A - offset: 4, value: LCPI11_0, kind: FK_Data_4
 ; X86-SSE-NEXT:    ## xmm1 = xmm1[0],mem[0]
 ; X86-SSE-NEXT:    addpd %xmm0, %xmm1 ## encoding: [0x66,0x0f,0x58,0xc8]
 ; X86-SSE-NEXT:    movupd %xmm1, (%eax) ## encoding: [0x66,0x0f,0x11,0x08]
@@ -252,8 +335,8 @@ define void @test_x86_sse2_storeu_pd(i8* %a0, <2 x double> %a1) {
 ; X86-AVX1:       ## %bb.0:
 ; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
 ; X86-AVX1-NEXT:    vxorpd %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf1,0x57,0xc9]
-; X86-AVX1-NEXT:    vmovhpd LCPI8_0, %xmm1, %xmm1 ## encoding: [0xc5,0xf1,0x16,0x0d,A,A,A,A]
-; X86-AVX1-NEXT:    ## fixup A - offset: 4, value: LCPI8_0, kind: FK_Data_4
+; X86-AVX1-NEXT:    vmovhpd LCPI11_0, %xmm1, %xmm1 ## encoding: [0xc5,0xf1,0x16,0x0d,A,A,A,A]
+; X86-AVX1-NEXT:    ## fixup A - offset: 4, value: LCPI11_0, kind: FK_Data_4
 ; X86-AVX1-NEXT:    ## xmm1 = xmm1[0],mem[0]
 ; X86-AVX1-NEXT:    vaddpd %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x58,0xc1]
 ; X86-AVX1-NEXT:    vmovupd %xmm0, (%eax) ## encoding: [0xc5,0xf9,0x11,0x00]
@@ -262,8 +345,8 @@ define void @test_x86_sse2_storeu_pd(i8* %a0, <2 x double> %a1) {
 ; X86-AVX512-LABEL: test_x86_sse2_storeu_pd:
 ; X86-AVX512:       ## %bb.0:
 ; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; X86-AVX512-NEXT:    vmovsd LCPI8_0, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xfb,0x10,0x0d,A,A,A,A]
-; X86-AVX512-NEXT:    ## fixup A - offset: 4, value: LCPI8_0, kind: FK_Data_4
+; X86-AVX512-NEXT:    vmovsd LCPI11_0, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xfb,0x10,0x0d,A,A,A,A]
+; X86-AVX512-NEXT:    ## fixup A - offset: 4, value: LCPI11_0, kind: FK_Data_4
 ; X86-AVX512-NEXT:    ## xmm1 = mem[0],zero
 ; X86-AVX512-NEXT:    vpslldq $8, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0x73,0xf9,0x08]
 ; X86-AVX512-NEXT:    ## xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7]
@@ -275,7 +358,7 @@ define void @test_x86_sse2_storeu_pd(i8* %a0, <2 x double> %a1) {
 ; X64-SSE:       ## %bb.0:
 ; X64-SSE-NEXT:    xorpd %xmm1, %xmm1 ## encoding: [0x66,0x0f,0x57,0xc9]
 ; X64-SSE-NEXT:    movhpd {{.*}}(%rip), %xmm1 ## encoding: [0x66,0x0f,0x16,0x0d,A,A,A,A]
-; X64-SSE-NEXT:    ## fixup A - offset: 4, value: LCPI8_0-4, kind: reloc_riprel_4byte
+; X64-SSE-NEXT:    ## fixup A - offset: 4, value: LCPI11_0-4, kind: reloc_riprel_4byte
 ; X64-SSE-NEXT:    ## xmm1 = xmm1[0],mem[0]
 ; X64-SSE-NEXT:    addpd %xmm0, %xmm1 ## encoding: [0x66,0x0f,0x58,0xc8]
 ; X64-SSE-NEXT:    movupd %xmm1, (%rdi) ## encoding: [0x66,0x0f,0x11,0x0f]
@@ -285,7 +368,7 @@ define void @test_x86_sse2_storeu_pd(i8* %a0, <2 x double> %a1) {
 ; X64-AVX1:       ## %bb.0:
 ; X64-AVX1-NEXT:    vxorpd %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf1,0x57,0xc9]
 ; X64-AVX1-NEXT:    vmovhpd {{.*}}(%rip), %xmm1, %xmm1 ## encoding: [0xc5,0xf1,0x16,0x0d,A,A,A,A]
-; X64-AVX1-NEXT:    ## fixup A - offset: 4, value: LCPI8_0-4, kind: reloc_riprel_4byte
+; X64-AVX1-NEXT:    ## fixup A - offset: 4, value: LCPI11_0-4, kind: reloc_riprel_4byte
 ; X64-AVX1-NEXT:    ## xmm1 = xmm1[0],mem[0]
 ; X64-AVX1-NEXT:    vaddpd %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x58,0xc1]
 ; X64-AVX1-NEXT:    vmovupd %xmm0, (%rdi) ## encoding: [0xc5,0xf9,0x11,0x07]
@@ -294,7 +377,7 @@ define void @test_x86_sse2_storeu_pd(i8* %a0, <2 x double> %a1) {
 ; X64-AVX512-LABEL: test_x86_sse2_storeu_pd:
 ; X64-AVX512:       ## %bb.0:
 ; X64-AVX512-NEXT:    vmovsd {{.*}}(%rip), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xfb,0x10,0x0d,A,A,A,A]
-; X64-AVX512-NEXT:    ## fixup A - offset: 4, value: LCPI8_0-4, kind: reloc_riprel_4byte
+; X64-AVX512-NEXT:    ## fixup A - offset: 4, value: LCPI11_0-4, kind: reloc_riprel_4byte
 ; X64-AVX512-NEXT:    ## xmm1 = mem[0],zero
 ; X64-AVX512-NEXT:    vpslldq $8, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0x73,0xf9,0x08]
 ; X64-AVX512-NEXT:    ## xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7]
diff --git a/llvm/test/CodeGen/X86/sse2-intrinsics-x86.ll b/llvm/test/CodeGen/X86/sse2-intrinsics-x86.ll
index 337e3ac4ab6..5e7b351ad13 100644
--- a/llvm/test/CodeGen/X86/sse2-intrinsics-x86.ll
+++ b/llvm/test/CodeGen/X86/sse2-intrinsics-x86.ll
@@ -1607,93 +1607,6 @@ define <8 x i16> @test_x86_sse2_psubus_w(<8 x i16> %a0, <8 x i16> %a1) {
 declare <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16>, <8 x i16>) nounwind readnone
 
 
-define <2 x double> @test_x86_sse2_sqrt_pd(<2 x double> %a0) {
-; SSE-LABEL: test_x86_sse2_sqrt_pd:
-; SSE:       ## %bb.0:
-; SSE-NEXT:    sqrtpd %xmm0, %xmm0 ## encoding: [0x66,0x0f,0x51,0xc0]
-; SSE-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
-;
-; AVX1-LABEL: test_x86_sse2_sqrt_pd:
-; AVX1:       ## %bb.0:
-; AVX1-NEXT:    vsqrtpd %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x51,0xc0]
-; AVX1-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
-;
-; AVX512-LABEL: test_x86_sse2_sqrt_pd:
-; AVX512:       ## %bb.0:
-; AVX512-NEXT:    vsqrtpd %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x51,0xc0]
-; AVX512-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
-  %res = call <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double> %a0) ; <<2 x double>> [#uses=1]
-  ret <2 x double> %res
-}
-declare <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double>) nounwind readnone
-
-
-define <2 x double> @test_x86_sse2_sqrt_sd(<2 x double> %a0) {
-; SSE-LABEL: test_x86_sse2_sqrt_sd:
-; SSE:       ## %bb.0:
-; SSE-NEXT:    sqrtsd %xmm0, %xmm0 ## encoding: [0xf2,0x0f,0x51,0xc0]
-; SSE-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
-;
-; AVX1-LABEL: test_x86_sse2_sqrt_sd:
-; AVX1:       ## %bb.0:
-; AVX1-NEXT:    vsqrtsd %xmm0, %xmm0, %xmm0 ## encoding: [0xc5,0xfb,0x51,0xc0]
-; AVX1-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
-;
-; AVX512-LABEL: test_x86_sse2_sqrt_sd:
-; AVX512:       ## %bb.0:
-; AVX512-NEXT:    vsqrtsd %xmm0, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfb,0x51,0xc0]
-; AVX512-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
-  %res = call <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double> %a0) ; <<2 x double>> [#uses=1]
-  ret <2 x double> %res
-}
-declare <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double>) nounwind readnone
-
-
-define <2 x double> @test_x86_sse2_sqrt_sd_vec_load(<2 x double>* %a0) {
-; X86-SSE-LABEL: test_x86_sse2_sqrt_sd_vec_load:
-; X86-SSE:       ## %bb.0:
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; X86-SSE-NEXT:    movapd (%eax), %xmm0 ## encoding: [0x66,0x0f,0x28,0x00]
-; X86-SSE-NEXT:    sqrtsd %xmm0, %xmm0 ## encoding: [0xf2,0x0f,0x51,0xc0]
-; X86-SSE-NEXT:    retl ## encoding: [0xc3]
-;
-; X86-AVX1-LABEL: test_x86_sse2_sqrt_sd_vec_load:
-; X86-AVX1:       ## %bb.0:
-; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; X86-AVX1-NEXT:    vmovapd (%eax), %xmm0 ## encoding: [0xc5,0xf9,0x28,0x00]
-; X86-AVX1-NEXT:    vsqrtsd %xmm0, %xmm0, %xmm0 ## encoding: [0xc5,0xfb,0x51,0xc0]
-; X86-AVX1-NEXT:    retl ## encoding: [0xc3]
-;
-; X86-AVX512-LABEL: test_x86_sse2_sqrt_sd_vec_load:
-; X86-AVX512:       ## %bb.0:
-; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; X86-AVX512-NEXT:    vmovapd (%eax), %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0x00]
-; X86-AVX512-NEXT:    vsqrtsd %xmm0, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfb,0x51,0xc0]
-; X86-AVX512-NEXT:    retl ## encoding: [0xc3]
-;
-; X64-SSE-LABEL: test_x86_sse2_sqrt_sd_vec_load:
-; X64-SSE:       ## %bb.0:
-; X64-SSE-NEXT:    movapd (%rdi), %xmm0 ## encoding: [0x66,0x0f,0x28,0x07]
-; X64-SSE-NEXT:    sqrtsd %xmm0, %xmm0 ## encoding: [0xf2,0x0f,0x51,0xc0]
-; X64-SSE-NEXT:    retq ## encoding: [0xc3]
-;
-; X64-AVX1-LABEL: test_x86_sse2_sqrt_sd_vec_load:
-; X64-AVX1:       ## %bb.0:
-; X64-AVX1-NEXT:    vmovapd (%rdi), %xmm0 ## encoding: [0xc5,0xf9,0x28,0x07]
-; X64-AVX1-NEXT:    vsqrtsd %xmm0, %xmm0, %xmm0 ## encoding: [0xc5,0xfb,0x51,0xc0]
-; X64-AVX1-NEXT:    retq ## encoding: [0xc3]
-;
-; X64-AVX512-LABEL: test_x86_sse2_sqrt_sd_vec_load:
-; X64-AVX512:       ## %bb.0:
-; X64-AVX512-NEXT:    vmovapd (%rdi), %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0x07]
-; X64-AVX512-NEXT:    vsqrtsd %xmm0, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfb,0x51,0xc0]
-; X64-AVX512-NEXT:    retq ## encoding: [0xc3]
-  %a1 = load <2 x double>, <2 x double>* %a0, align 16
-  %res = call <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double> %a1) ; <<2 x double>> [#uses=1]
-  ret <2 x double> %res
-}
-
-
 define i32 @test_x86_sse2_ucomieq_sd(<2 x double> %a0, <2 x double> %a1) {
 ; SSE-LABEL: test_x86_sse2_ucomieq_sd:
 ; SSE:       ## %bb.0:
diff --git a/llvm/test/CodeGen/X86/sse_partial_update.ll b/llvm/test/CodeGen/X86/sse_partial_update.ll
index d6929930844..db575d62380 100644
--- a/llvm/test/CodeGen/X86/sse_partial_update.ll
+++ b/llvm/test/CodeGen/X86/sse_partial_update.ll
@@ -54,9 +54,10 @@ declare <4 x float> @llvm.x86.sse.rcp.ss(<4 x float>) nounwind readnone
 define void @sqrtss(<4 x float> %a) nounwind uwtable ssp {
 ; CHECK-LABEL: sqrtss:
 ; CHECK:       ## %bb.0: ## %entry
-; CHECK-NEXT:    sqrtss %xmm0, %xmm0
-; CHECK-NEXT:    cvtss2sd %xmm0, %xmm2
+; CHECK-NEXT:    sqrtss %xmm0, %xmm1
+; CHECK-NEXT:    cvtss2sd %xmm1, %xmm2
 ; CHECK-NEXT:    movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; CHECK-NEXT:    xorps %xmm1, %xmm1
 ; CHECK-NEXT:    cvtss2sd %xmm0, %xmm1
 ; CHECK-NEXT:    movaps %xmm2, %xmm0
 ; CHECK-NEXT:    jmp _callee ## TAILCALL
@@ -75,9 +76,10 @@ declare <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float>) nounwind readnone
 define void @sqrtsd(<2 x double> %a) nounwind uwtable ssp {
 ; CHECK-LABEL: sqrtsd:
 ; CHECK:       ## %bb.0: ## %entry
-; CHECK-NEXT:    sqrtsd %xmm0, %xmm0
-; CHECK-NEXT:    cvtsd2ss %xmm0, %xmm2
+; CHECK-NEXT:    sqrtsd %xmm0, %xmm1
+; CHECK-NEXT:    cvtsd2ss %xmm1, %xmm2
 ; CHECK-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; CHECK-NEXT:    xorps %xmm1, %xmm1
 ; CHECK-NEXT:    cvtsd2ss %xmm0, %xmm1
 ; CHECK-NEXT:    movaps %xmm2, %xmm0
 ; CHECK-NEXT:    jmp _callee2 ## TAILCALL