Diffstat (limited to 'llvm/test')
| -rw-r--r-- | llvm/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll | 36 |
| -rw-r--r-- | llvm/test/CodeGen/X86/avx-intrinsics-x86.ll | 24 |
| -rw-r--r-- | llvm/test/CodeGen/X86/avx-vinsertf128.ll | 42 |
| -rw-r--r-- | llvm/test/CodeGen/X86/unaligned-32-byte-memops.ll | 52 |
4 files changed, 47 insertions, 107 deletions
diff --git a/llvm/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll b/llvm/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll
index 8d04c16879a..a5ad7ba2616 100644
--- a/llvm/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll
+++ b/llvm/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll
@@ -1,5 +1,41 @@
 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -march=x86 -mcpu=corei7-avx | FileCheck %s
 
+; We don't check any vinsertf128 variant with immediate 0 because that's just a blend.
+
+define <4 x double> @test_x86_avx_vinsertf128_pd_256_1(<4 x double> %a0, <2 x double> %a1) {
+  ; CHECK-LABEL: test_x86_avx_vinsertf128_pd_256_1:
+  ; CHECK: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+  %res = call <4 x double> @llvm.x86.avx.vinsertf128.pd.256(<4 x double> %a0, <2 x double> %a1, i8 1)
+  ret <4 x double> %res
+}
+declare <4 x double> @llvm.x86.avx.vinsertf128.pd.256(<4 x double>, <2 x double>, i8) nounwind readnone
+
+define <8 x float> @test_x86_avx_vinsertf128_ps_256_1(<8 x float> %a0, <4 x float> %a1) {
+  ; CHECK-LABEL: test_x86_avx_vinsertf128_ps_256_1:
+  ; CHECK: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+  %res = call <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float> %a0, <4 x float> %a1, i8 1)
+  ret <8 x float> %res
+}
+declare <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float>, <4 x float>, i8) nounwind readnone
+
+define <8 x i32> @test_x86_avx_vinsertf128_si_256_1(<8 x i32> %a0, <4 x i32> %a1) {
+  ; CHECK-LABEL: test_x86_avx_vinsertf128_si_256_1:
+  ; CHECK: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+  %res = call <8 x i32> @llvm.x86.avx.vinsertf128.si.256(<8 x i32> %a0, <4 x i32> %a1, i8 1)
+  ret <8 x i32> %res
+}
+
+; Verify that high bits of the immediate are masked off. This should be the equivalent
+; of a vinsertf128 $0 which should be optimized into a blend, so just check that it's
+; not a vinsertf128 $1.
+define <8 x i32> @test_x86_avx_vinsertf128_si_256_2(<8 x i32> %a0, <4 x i32> %a1) {
+  ; CHECK-LABEL: test_x86_avx_vinsertf128_si_256_2:
+  ; CHECK-NOT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+  %res = call <8 x i32> @llvm.x86.avx.vinsertf128.si.256(<8 x i32> %a0, <4 x i32> %a1, i8 2)
+  ret <8 x i32> %res
+}
+declare <8 x i32> @llvm.x86.avx.vinsertf128.si.256(<8 x i32>, <4 x i32>, i8) nounwind readnone
+
 define <4 x double> @test_x86_avx_blend_pd_256(<4 x double> %a0, <4 x double> %a1) {
   ; CHECK: vblendpd
   %res = call <4 x double> @llvm.x86.avx.blend.pd.256(<4 x double> %a0, <4 x double> %a1, i32 7) ; <<4 x double>> [#uses=1]
diff --git a/llvm/test/CodeGen/X86/avx-intrinsics-x86.ll b/llvm/test/CodeGen/X86/avx-intrinsics-x86.ll
index 3716cf84989..96d80ea7ae6 100644
--- a/llvm/test/CodeGen/X86/avx-intrinsics-x86.ll
+++ b/llvm/test/CodeGen/X86/avx-intrinsics-x86.ll
@@ -2187,30 +2187,6 @@ define <4 x i32> @test_x86_avx_vextractf128_si_256(<8 x i32> %a0) {
 declare <4 x i32> @llvm.x86.avx.vextractf128.si.256(<8 x i32>, i8) nounwind readnone
 
 
-define <4 x double> @test_x86_avx_vinsertf128_pd_256(<4 x double> %a0, <2 x double> %a1) {
-  ; CHECK: vinsertf128
-  %res = call <4 x double> @llvm.x86.avx.vinsertf128.pd.256(<4 x double> %a0, <2 x double> %a1, i8 7) ; <<4 x double>> [#uses=1]
-  ret <4 x double> %res
-}
-declare <4 x double> @llvm.x86.avx.vinsertf128.pd.256(<4 x double>, <2 x double>, i8) nounwind readnone
-
-
-define <8 x float> @test_x86_avx_vinsertf128_ps_256(<8 x float> %a0, <4 x float> %a1) {
-  ; CHECK: vinsertf128
-  %res = call <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float> %a0, <4 x float> %a1, i8 7) ; <<8 x float>> [#uses=1]
-  ret <8 x float> %res
-}
-declare <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float>, <4 x float>, i8) nounwind readnone
-
-
-define <8 x i32> @test_x86_avx_vinsertf128_si_256(<8 x i32> %a0, <4 x i32> %a1) {
-  ; CHECK: vinsertf128
-  %res = call <8 x i32> @llvm.x86.avx.vinsertf128.si.256(<8 x i32> %a0, <4 x i32> %a1, i8 7) ; <<8 x i32>> [#uses=1]
-  ret <8 x i32> %res
-}
-declare <8 x i32> @llvm.x86.avx.vinsertf128.si.256(<8 x i32>, <4 x i32>, i8) nounwind readnone
-
-
 define <4 x double> @test_x86_avx_vperm2f128_pd_256(<4 x double> %a0, <4 x double> %a1) {
   ; CHECK: vperm2f128
   %res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 7) ; <<4 x double>> [#uses=1]
diff --git a/llvm/test/CodeGen/X86/avx-vinsertf128.ll b/llvm/test/CodeGen/X86/avx-vinsertf128.ll
index d0f8f4ebaea..38389de7a8a 100644
--- a/llvm/test/CodeGen/X86/avx-vinsertf128.ll
+++ b/llvm/test/CodeGen/X86/avx-vinsertf128.ll
@@ -1,6 +1,6 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck %s
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck -check-prefix=CHECK-SSE %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx | FileCheck %s
 
+; CHECK-LABEL: A:
 ; CHECK-NOT: vunpck
 ; CHECK: vinsertf128 $1
 define <8 x float> @A(<8 x float> %a) nounwind uwtable readnone ssp {
@@ -9,6 +9,7 @@ entry:
   ret <8 x float> %shuffle
 }
 
+; CHECK-LABEL: B:
 ; CHECK-NOT: vunpck
 ; CHECK: vinsertf128 $1
 define <4 x double> @B(<4 x double> %a) nounwind uwtable readnone ssp {
@@ -22,7 +23,7 @@ declare <2 x double> @llvm.x86.sse2.min.pd(<2 x double>, <2 x double>) nounwind
 declare <2 x double> @llvm.x86.sse2.min.sd(<2 x double>, <2 x double>) nounwind readnone
 
 ; Just check that no crash happens
-; CHECK-SSE: _insert_crash
+; CHECK-LABEL: _insert_crash:
 define void @insert_crash() nounwind {
 allocas:
   %v1.i.i451 = shufflevector <4 x double> zeroinitializer, <4 x double> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
@@ -39,7 +40,7 @@ allocas:
 
 ;; DAG Combine must remove useless vinsertf128 instructions
 
-; CHECK: DAGCombineA
+; CHECK-LABEL: DAGCombineA:
 ; CHECK-NOT: vinsertf128 $1
 define <4 x i32> @DAGCombineA(<4 x i32> %v1) nounwind readonly {
   %1 = shufflevector <4 x i32> %v1, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -47,7 +48,7 @@ define <4 x i32> @DAGCombineA(<4 x i32> %v1) nounwind readonly {
   ret <4 x i32> %2
 }
 
-; CHECK: DAGCombineB
+; CHECK-LABEL: DAGCombineB:
 ; CHECK: vpaddd %xmm
 ; CHECK-NOT: vinsertf128 $1
 ; CHECK: vpaddd %xmm
@@ -57,14 +58,7 @@ define <8 x i32> @DAGCombineB(<8 x i32> %v1, <8 x i32> %v2) nounwind readonly {
   ret <8 x i32> %2
 }
 
-; CHECK: insert_pd
-define <4 x double> @insert_pd(<4 x double> %a0, <2 x double> %a1) {
-; CHECK: vinsertf128
-%res = call <4 x double> @llvm.x86.avx.vinsertf128.pd.256(<4 x double> %a0, <2 x double> %a1, i8 0)
-ret <4 x double> %res
-}
-
-; CHECK: insert_undef_pd
+; CHECK-LABEL: insert_undef_pd:
 define <4 x double> @insert_undef_pd(<4 x double> %a0, <2 x double> %a1) {
 ; CHECK: vmovaps %ymm1, %ymm0
 %res = call <4 x double> @llvm.x86.avx.vinsertf128.pd.256(<4 x double> undef, <2 x double> %a1, i8 0)
@@ -73,14 +67,7 @@ ret <4 x double> %res
 declare <4 x double> @llvm.x86.avx.vinsertf128.pd.256(<4 x double>, <2 x double>, i8) nounwind readnone
 
 
-; CHECK: insert_ps
-define <8 x float> @insert_ps(<8 x float> %a0, <4 x float> %a1) {
-; CHECK: vinsertf128
-%res = call <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float> %a0, <4 x float> %a1, i8 0)
-ret <8 x float> %res
-}
-
-; CHECK: insert_undef_ps
+; CHECK-LABEL: insert_undef_ps:
 define <8 x float> @insert_undef_ps(<8 x float> %a0, <4 x float> %a1) {
 ; CHECK: vmovaps %ymm1, %ymm0
 %res = call <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float> undef, <4 x float> %a1, i8 0)
@@ -89,14 +76,7 @@ ret <8 x float> %res
 declare <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float>, <4 x float>, i8) nounwind readnone
 
 
-; CHECK: insert_si
-define <8 x i32> @insert_si(<8 x i32> %a0, <4 x i32> %a1) {
-; CHECK: vinsertf128
-%res = call <8 x i32> @llvm.x86.avx.vinsertf128.si.256(<8 x i32> %a0, <4 x i32> %a1, i8 0)
-ret <8 x i32> %res
-}
-
-; CHECK: insert_undef_si
+; CHECK-LABEL: insert_undef_si:
 define <8 x i32> @insert_undef_si(<8 x i32> %a0, <4 x i32> %a1) {
 ; CHECK: vmovaps %ymm1, %ymm0
 %res = call <8 x i32> @llvm.x86.avx.vinsertf128.si.256(<8 x i32> undef, <4 x i32> %a1, i8 0)
@@ -105,7 +85,7 @@ ret <8 x i32> %res
 declare <8 x i32> @llvm.x86.avx.vinsertf128.si.256(<8 x i32>, <4 x i32>, i8) nounwind readnone
 
 ; rdar://10643481
-; CHECK: vinsertf128_combine
+; CHECK-LABEL: vinsertf128_combine:
 define <8 x float> @vinsertf128_combine(float* nocapture %f) nounwind uwtable readonly ssp {
 ; CHECK-NOT: vmovaps
 ; CHECK: vinsertf128
@@ -118,7 +98,7 @@ entry:
 }
 
 ; rdar://11076953
-; CHECK: vinsertf128_ucombine
+; CHECK-LABEL: vinsertf128_ucombine:
 define <8 x float> @vinsertf128_ucombine(float* nocapture %f) nounwind uwtable readonly ssp {
 ; CHECK-NOT: vmovups
 ; CHECK: vinsertf128
diff --git a/llvm/test/CodeGen/X86/unaligned-32-byte-memops.ll b/llvm/test/CodeGen/X86/unaligned-32-byte-memops.ll
index a44d44d1b69..b337a80b84b 100644
--- a/llvm/test/CodeGen/X86/unaligned-32-byte-memops.ll
+++ b/llvm/test/CodeGen/X86/unaligned-32-byte-memops.ll
@@ -48,58 +48,6 @@ define void @store32bytes(<8 x float> %A, <8 x float>* %P) {
 ; Merge two consecutive 16-byte subvector loads into a single 32-byte load
 ; if it's faster.
 
-declare <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float>, <4 x float>, i8)
-
-; Use the vinsertf128 intrinsic to model source code
-; that explicitly uses AVX intrinsics.
-define <8 x float> @combine_16_byte_loads(<4 x float>* %ptr) {
-  ; CHECK-LABEL: combine_16_byte_loads
-
-  ; SANDYB: vmovups
-  ; SANDYB-NEXT: vinsertf128
-  ; SANDYB-NEXT: retq
-
-  ; BTVER2: vmovups
-  ; BTVER2-NEXT: retq
-
-  ; HASWELL: vmovups
-  ; HASWELL-NEXT: retq
-
-  %ptr1 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 1
-  %ptr2 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 2
-  %v1 = load <4 x float>, <4 x float>* %ptr1, align 1
-  %v2 = load <4 x float>, <4 x float>* %ptr2, align 1
-  %shuffle = shufflevector <4 x float> %v1, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
-  %v3 = tail call <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float> %shuffle, <4 x float> %v2, i8 1)
-  ret <8 x float> %v3
-}
-
-; Swap the operands of the shufflevector and vinsertf128 to ensure that the
-; pattern still matches.
-define <8 x float> @combine_16_byte_loads_swap(<4 x float>* %ptr) {
-  ; CHECK-LABEL: combine_16_byte_loads_swap
-
-  ; SANDYB: vmovups
-  ; SANDYB-NEXT: vinsertf128
-  ; SANDYB-NEXT: retq
-
-  ; BTVER2: vmovups
-  ; BTVER2-NEXT: retq
-
-  ; HASWELL: vmovups
-  ; HASWELL-NEXT: retq
-
-  %ptr1 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 2
-  %ptr2 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 3
-  %v1 = load <4 x float>, <4 x float>* %ptr1, align 1
-  %v2 = load <4 x float>, <4 x float>* %ptr2, align 1
-  %shuffle = shufflevector <4 x float> %v2, <4 x float> undef, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 0, i32 1, i32 2, i32 3>
-  %v3 = tail call <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float> %shuffle, <4 x float> %v1, i8 0)
-  ret <8 x float> %v3
-}
-
-; Replace the vinsertf128 intrinsic with a shufflevector as might be
-; expected from auto-vectorized code.
 define <8 x float> @combine_16_byte_loads_no_intrinsic(<4 x float>* %ptr) {
   ; CHECK-LABEL: combine_16_byte_loads_no_intrinsic
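
As a side note on the comments in the first file ("immediate 0 ... is just a blend", "high bits of the immediate are masked off"): the following is a minimal standalone sketch of the generic-shuffle form of a 128-bit insert, not taken from this commit, with an illustrative function name. The first shufflevector widens the 128-bit operand; selecting it for the upper lane is the vinsertf128 $1 case, while selecting it for the lower lane is just a blend of the low half, which is why the immediate-0 variants are not checked for a vinsertf128 instruction.

define <8 x float> @insert128_sketch(<8 x float> %a, <4 x float> %b) {
  ; Widen %b to 256 bits so it can be shuffled together with %a.
  %wide = shufflevector <4 x float> %b, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
  ; Immediate 1: keep the low half of %a and put %b in the high half.
  %hi = shufflevector <8 x float> %a, <8 x float> %wide, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
  ; Immediate 0: put %b in the low half and keep the high half of %a, a plain blend.
  %lo = shufflevector <8 x float> %a, <8 x float> %wide, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
  ; Combine both results so neither value is dead.
  %res = fadd <8 x float> %hi, %lo
  ret <8 x float> %res
}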
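
Relatedly, here is a hedged sketch of the shufflevector-only pattern that the remaining combine_16_byte_loads_no_intrinsic test exercises; the actual test body is truncated in this diff, and the function name below is illustrative. Two consecutive unaligned 16-byte loads are concatenated into one <8 x float>, which a target with fast unaligned 32-byte memory accesses may lower to a single vmovups.

define <8 x float> @concat_loads_sketch(<4 x float>* %ptr) {
  ; Load two adjacent <4 x float> values with no alignment guarantee.
  %ptr2 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 1
  %v1 = load <4 x float>, <4 x float>* %ptr, align 1
  %v2 = load <4 x float>, <4 x float>* %ptr2, align 1
  ; Concatenate them; the backend may merge this into one 32-byte load.
  %res = shufflevector <4 x float> %v1, <4 x float> %v2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x float> %res
}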

