summaryrefslogtreecommitdiffstats
path: root/llvm/test/CodeGen/X86
diff options
context:
space:
mode:
author Filipe Cabecinhas <me@filcab.net> 2014-05-19 19:45:57 +0000
committer Filipe Cabecinhas <me@filcab.net> 2014-05-19 19:45:57 +0000
commit dc9210276637bf67091ff8885f0c2c0e08fadbcb (patch)
tree 84c9e171289e944f35f147ef6666ebeb2f76e963 /llvm/test/CodeGen/X86
parent 9f7d14756df13b907e29b61160b1cb6981013bb3 (diff)
download bcm5719-llvm-dc9210276637bf67091ff8885f0c2c0e08fadbcb.tar.gz
bcm5719-llvm-dc9210276637bf67091ff8885f0c2c0e08fadbcb.zip
Added more insertps optimizations
Summary:
When inserting an element that's coming from a vector load or a broadcast of a vector (or scalar) load, combine the load into the insertps instruction.
Added PerformINSERTPSCombine for the case where we need to fix the load (load of a vector + insertps with a non-zero CountS).
Added patterns for the broadcasts.
Also added tests for SSE4.1, AVX, and AVX2.
Reviewers: delena, nadav, craig.topper
Subscribers: llvm-commits
Differential Revision: http://reviews.llvm.org/D3581
llvm-svn: 209156
Diffstat (limited to 'llvm/test/CodeGen/X86')
-rw-r--r-- llvm/test/CodeGen/X86/avx.ll 113
-rw-r--r-- llvm/test/CodeGen/X86/fold-load-vec.ll 2
-rw-r--r-- llvm/test/CodeGen/X86/sse41.ll 108
3 files changed, 221 insertions, 2 deletions
diff --git a/llvm/test/CodeGen/X86/avx.ll b/llvm/test/CodeGen/X86/avx.ll
index 208e3844f11..6069c14f0d8 100644
--- a/llvm/test/CodeGen/X86/avx.ll
+++ b/llvm/test/CodeGen/X86/avx.ll
@@ -1,4 +1,5 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx | FileCheck %s
+; RUN: llc < %s -mtriple=i686-apple-darwin -mcpu=corei7-avx | FileCheck %s -check-prefix=X32 --check-prefix=CHECK
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx | FileCheck %s -check-prefix=X64 --check-prefix=CHECK
define <4 x i32> @blendvb_fallback_v4i32(<4 x i1> %mask, <4 x i32> %x, <4 x i32> %y) {
; CHECK-LABEL: @blendvb_fallback_v4i32
@@ -23,3 +24,113 @@ define <8 x float> @blendvb_fallback_v8f32(<8 x i1> %mask, <8 x float> %x, <8 x
%ret = select <8 x i1> %mask, <8 x float> %x, <8 x float> %y
ret <8 x float> %ret
}
+
+declare <4 x float> @llvm.x86.sse41.insertps(<4 x float>, <4 x float>, i32) nounwind readnone
+
+define <4 x float> @insertps_from_vector_load(<4 x float> %a, <4 x float>* nocapture readonly %pb) {
+; CHECK-LABEL: insertps_from_vector_load:
+; The vector load should fold into insertps; on X32 only the pointer argument's load from the stack may precede it.
+; X32: movl 4(%esp), %eax
+; CHECK-NOT: mov
+; CHECK: insertps $48
+; CHECK-NEXT: ret
+ %1 = load <4 x float>* %pb, align 16
+ %2 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %1, i32 48)
+ ret <4 x float> %2
+}
+
+;; Use a non-zero CountS for insertps, so the folded load needs a non-zero address offset (4 here).
+define <4 x float> @insertps_from_vector_load_offset(<4 x float> %a, <4 x float>* nocapture readonly %pb) {
+; CHECK-LABEL: insertps_from_vector_load_offset:
+; On X32, account for the pointer argument's move from the stack to a register.
+; X32: movl 4(%esp), %eax
+; CHECK-NOT: mov
+;; Match the memory operand as well as the immediate: the folded load's offset is what this test verifies.
+; CHECK: insertps $96, 4(%{{...}}), %
+; CHECK-NEXT: ret
+ %1 = load <4 x float>* %pb, align 16
+ %2 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %1, i32 96)
+ ret <4 x float> %2
+}
+
+define <4 x float> @insertps_from_vector_load_offset_2(<4 x float> %a, <4 x float>* nocapture readonly %pb, i64 %index) {
+; CHECK-LABEL: insertps_from_vector_load_offset_2:
+; On X32, account for both stack arguments' moves to registers (the pointer and the index).
+; X32: movl 4(%esp), %eax
+; X32: movl 8(%esp), %ecx
+; CHECK-NOT: mov
+;; Match the base+index memory operand as well: the folded load's offset (12) is what this test verifies.
+; CHECK: vinsertps $192, 12(%{{...}},%{{...}}), %
+; CHECK-NEXT: ret
+ %1 = getelementptr inbounds <4 x float>* %pb, i64 %index
+ %2 = load <4 x float>* %1, align 16
+ %3 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %2, i32 192)
+ ret <4 x float> %3
+}
+
+define <4 x float> @insertps_from_broadcast_loadf32(<4 x float> %a, float* nocapture readonly %fb, i64 %index) {
+; CHECK-LABEL: insertps_from_broadcast_loadf32:
+; A broadcast of a scalar load should fold into insertps; on X32 the two stack arguments are moved to registers first.
+; X32: movl 8(%esp), %eax
+; X32: movl 4(%esp), %ecx
+; CHECK-NOT: mov
+; CHECK: insertps $48
+; CHECK-NEXT: ret
+ %1 = getelementptr inbounds float* %fb, i64 %index
+ %2 = load float* %1, align 4
+ %3 = insertelement <4 x float> undef, float %2, i32 0
+ %4 = insertelement <4 x float> %3, float %2, i32 1
+ %5 = insertelement <4 x float> %4, float %2, i32 2
+ %6 = insertelement <4 x float> %5, float %2, i32 3
+ %7 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %6, i32 48)
+ ret <4 x float> %7
+}
+
+define <4 x float> @insertps_from_broadcast_loadv4f32(<4 x float> %a, <4 x float>* nocapture readonly %b) {
+; CHECK-LABEL: insertps_from_broadcast_loadv4f32:
+; A broadcast of element 0 of a vector load should fold into insertps; on X32, account for the pointer argument's move to a register.
+; X32: movl 4(%esp), %{{...}}
+; CHECK-NOT: mov
+; CHECK: insertps $48
+; CHECK-NEXT: ret
+ %1 = load <4 x float>* %b, align 4
+ %2 = extractelement <4 x float> %1, i32 0
+ %3 = insertelement <4 x float> undef, float %2, i32 0
+ %4 = insertelement <4 x float> %3, float %2, i32 1
+ %5 = insertelement <4 x float> %4, float %2, i32 2
+ %6 = insertelement <4 x float> %5, float %2, i32 3
+ %7 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %6, i32 48)
+ ret <4 x float> %7
+}
+
+;; FIXME: We're emitting an extraneous pshufd/vbroadcast when the broadcast value has several users.
+define <4 x float> @insertps_from_broadcast_multiple_use(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %d, float* nocapture readonly %fb, i64 %index) {
+; CHECK-LABEL: insertps_from_broadcast_multiple_use:
+; On X32, account for the two stack arguments' moves to registers before the broadcast.
+; X32: movl 8(%esp), %eax
+; X32: movl 4(%esp), %ecx
+; CHECK: vbroadcastss
+; CHECK-NOT: mov
+; CHECK: insertps $48
+; CHECK: insertps $48
+; CHECK: insertps $48
+; CHECK: insertps $48
+; CHECK: vaddps
+; CHECK: vaddps
+; CHECK: vaddps
+; CHECK-NEXT: ret
+ %1 = getelementptr inbounds float* %fb, i64 %index
+ %2 = load float* %1, align 4
+ %3 = insertelement <4 x float> undef, float %2, i32 0
+ %4 = insertelement <4 x float> %3, float %2, i32 1
+ %5 = insertelement <4 x float> %4, float %2, i32 2
+ %6 = insertelement <4 x float> %5, float %2, i32 3
+ %7 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %6, i32 48)
+ %8 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %b, <4 x float> %6, i32 48)
+ %9 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %c, <4 x float> %6, i32 48)
+ %10 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %d, <4 x float> %6, i32 48)
+ %11 = fadd <4 x float> %7, %8
+ %12 = fadd <4 x float> %9, %10
+ %13 = fadd <4 x float> %11, %12
+ ret <4 x float> %13
+}
diff --git a/llvm/test/CodeGen/X86/fold-load-vec.ll b/llvm/test/CodeGen/X86/fold-load-vec.ll
index e85d8f78c05..96c5be4f752 100644
--- a/llvm/test/CodeGen/X86/fold-load-vec.ll
+++ b/llvm/test/CodeGen/X86/fold-load-vec.ll
@@ -5,7 +5,7 @@
; loads from m32.
define void @sample_test(<4 x float>* %source, <2 x float>* %dest) nounwind {
; CHECK: sample_test
-; CHECK: movaps
+; CHECK-NOT: movaps
; CHECK: insertps
entry:
%source.addr = alloca <4 x float>*, align 8
diff --git a/llvm/test/CodeGen/X86/sse41.ll b/llvm/test/CodeGen/X86/sse41.ll
index 3652d8c0d02..a3c62016c43 100644
--- a/llvm/test/CodeGen/X86/sse41.ll
+++ b/llvm/test/CodeGen/X86/sse41.ll
@@ -584,3 +584,111 @@ define <8 x i16> @blendvb_fallback(<8 x i1> %mask, <8 x i16> %x, <8 x i16> %y) {
%ret = select <8 x i1> %mask, <8 x i16> %x, <8 x i16> %y
ret <8 x i16> %ret
}
+
+define <4 x float> @insertps_from_vector_load(<4 x float> %a, <4 x float>* nocapture readonly %pb) {
+; CHECK-LABEL: insertps_from_vector_load:
+; The vector load should fold into insertps; on X32 only the pointer argument's load from the stack may precede it.
+; X32: movl 4(%esp), %eax
+; CHECK-NOT: mov
+; CHECK: insertps $48
+; CHECK-NEXT: ret
+ %1 = load <4 x float>* %pb, align 16
+ %2 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %1, i32 48)
+ ret <4 x float> %2
+}
+
+;; Use a non-zero CountS for insertps, so the folded load needs a non-zero address offset (4 here).
+define <4 x float> @insertps_from_vector_load_offset(<4 x float> %a, <4 x float>* nocapture readonly %pb) {
+; CHECK-LABEL: insertps_from_vector_load_offset:
+; On X32, account for the pointer argument's move from the stack to a register.
+; X32: movl 4(%esp), %eax
+; CHECK-NOT: mov
+;; Match the memory operand as well as the immediate: the folded load's offset is what this test verifies.
+; CHECK: insertps $96, 4(%{{...}}), %
+; CHECK-NEXT: ret
+ %1 = load <4 x float>* %pb, align 16
+ %2 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %1, i32 96)
+ ret <4 x float> %2
+}
+
+define <4 x float> @insertps_from_vector_load_offset_2(<4 x float> %a, <4 x float>* nocapture readonly %pb, i64 %index) {
+; CHECK-LABEL: insertps_from_vector_load_offset_2:
+; On X32, account for both stack arguments' moves to registers (the pointer and the index).
+; X32: movl 4(%esp), %eax
+; X32: movl 8(%esp), %ecx
+; CHECK-NOT: mov
+;; Match the base+index memory operand as well: the folded load's offset (12) is what this test verifies.
+; CHECK: insertps $192, 12(%{{...}},%{{...}}), %
+; CHECK-NEXT: ret
+ %1 = getelementptr inbounds <4 x float>* %pb, i64 %index
+ %2 = load <4 x float>* %1, align 16
+ %3 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %2, i32 192)
+ ret <4 x float> %3
+}
+
+define <4 x float> @insertps_from_broadcast_loadf32(<4 x float> %a, float* nocapture readonly %fb, i64 %index) {
+; CHECK-LABEL: insertps_from_broadcast_loadf32:
+; A broadcast of a scalar load should fold into insertps; on X32 the two stack arguments are moved to registers first.
+; X32: movl 8(%esp), %eax
+; X32: movl 4(%esp), %ecx
+; CHECK-NOT: mov
+; CHECK: insertps $48
+; CHECK-NEXT: ret
+ %1 = getelementptr inbounds float* %fb, i64 %index
+ %2 = load float* %1, align 4
+ %3 = insertelement <4 x float> undef, float %2, i32 0
+ %4 = insertelement <4 x float> %3, float %2, i32 1
+ %5 = insertelement <4 x float> %4, float %2, i32 2
+ %6 = insertelement <4 x float> %5, float %2, i32 3
+ %7 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %6, i32 48)
+ ret <4 x float> %7
+}
+
+define <4 x float> @insertps_from_broadcast_loadv4f32(<4 x float> %a, <4 x float>* nocapture readonly %b) {
+; CHECK-LABEL: insertps_from_broadcast_loadv4f32:
+; A broadcast of element 0 of a vector load should fold into insertps; on X32, account for the pointer argument's move to a register.
+; X32: movl 4(%esp), %{{...}}
+; CHECK-NOT: mov
+; CHECK: insertps $48
+; CHECK-NEXT: ret
+ %1 = load <4 x float>* %b, align 4
+ %2 = extractelement <4 x float> %1, i32 0
+ %3 = insertelement <4 x float> undef, float %2, i32 0
+ %4 = insertelement <4 x float> %3, float %2, i32 1
+ %5 = insertelement <4 x float> %4, float %2, i32 2
+ %6 = insertelement <4 x float> %5, float %2, i32 3
+ %7 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %6, i32 48)
+ ret <4 x float> %7
+}
+
+;; FIXME: We're emitting an extraneous pshufd/vbroadcast when the broadcast value has several users.
+define <4 x float> @insertps_from_broadcast_multiple_use(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %d, float* nocapture readonly %fb, i64 %index) {
+; CHECK-LABEL: insertps_from_broadcast_multiple_use:
+; On X32, account for the two stack arguments' moves to registers before the scalar load.
+; X32: movl 8(%esp), %eax
+; X32: movl 4(%esp), %ecx
+; CHECK: movss
+; CHECK-NOT: mov
+; CHECK: insertps $48
+; CHECK: insertps $48
+; CHECK: insertps $48
+; CHECK: insertps $48
+; CHECK: addps
+; CHECK: addps
+; CHECK: addps
+; CHECK-NEXT: ret
+ %1 = getelementptr inbounds float* %fb, i64 %index
+ %2 = load float* %1, align 4
+ %3 = insertelement <4 x float> undef, float %2, i32 0
+ %4 = insertelement <4 x float> %3, float %2, i32 1
+ %5 = insertelement <4 x float> %4, float %2, i32 2
+ %6 = insertelement <4 x float> %5, float %2, i32 3
+ %7 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %6, i32 48)
+ %8 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %b, <4 x float> %6, i32 48)
+ %9 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %c, <4 x float> %6, i32 48)
+ %10 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %d, <4 x float> %6, i32 48)
+ %11 = fadd <4 x float> %7, %8
+ %12 = fadd <4 x float> %9, %10
+ %13 = fadd <4 x float> %11, %12
+ ret <4 x float> %13
+}
OpenPOWER on IntegriCloud