| author | Filipe Cabecinhas <me@filcab.net> | 2014-05-19 19:45:57 +0000 |
|---|---|---|
| committer | Filipe Cabecinhas <me@filcab.net> | 2014-05-19 19:45:57 +0000 |
| commit | dc9210276637bf67091ff8885f0c2c0e08fadbcb (patch) | |
| tree | 84c9e171289e944f35f147ef6666ebeb2f76e963 /llvm/test/CodeGen/X86 | |
| parent | 9f7d14756df13b907e29b61160b1cb6981013bb3 (diff) | |
Added more insertps optimizations
Summary:
When inserting an element that comes from a vector load, or from a broadcast
of a vector (or scalar) load, combine the load into the insertps
instruction.
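
As an illustration (a sketch, not part of the patch; the function name and
exact registers are hypothetical), the combine turns a separate load feeding
a register-form insertps into a single memory-operand insertps:

    define <4 x float> @example(<4 x float> %a, <4 x float>* %pb) {
      ; Load the whole source vector, then insert its element 0 into %a.
      %v = load <4 x float>* %pb, align 16
      %r = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %v, i32 48)
      ret <4 x float> %r
    }
    declare <4 x float> @llvm.x86.sse41.insertps(<4 x float>, <4 x float>, i32) nounwind readnone
    ; Previously this lowered to a separate load (e.g. movaps) feeding a
    ; register-form insertps; with this combine it should fold to something
    ; like: insertps $48, (%rdi), %xmm0
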
Added PerformINSERTPSCombine for the case where we need to fix the load
(load of a vector + insertps with a non-zero CountS).
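
For the non-zero CountS case, a sketch of the shape being fixed up (inferred
from the insertps_from_vector_load_offset test added below; @offset_example
is a hypothetical name): immediate 96 has CountS = 1, and the memory form of
insertps ignores CountS, so the combine bakes the element select into the
address by offsetting the load by CountS * 4 bytes:

    define <4 x float> @offset_example(<4 x float> %a, <4 x float>* %pb) {
      ; Element 1 of the loaded vector is inserted, per CountS in imm8 = 96.
      %v = load <4 x float>* %pb, align 16
      %r = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %v, i32 96)
      ret <4 x float> %r
    }
    declare <4 x float> @llvm.x86.sse41.insertps(<4 x float>, <4 x float>, i32) nounwind readnone
    ; Expected to fold to a single instruction along the lines of:
    ;   insertps $96, 4(%rdi), %xmm0
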
Added patterns for the broadcasts.
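
The broadcast patterns match a loaded scalar splatted into all lanes and then
fed to insertps; since insertps reads only one element, the load can be used
directly. A sketch of the IR shape (modelled on the
insertps_from_broadcast_loadf32 test below; @broadcast_example is a
hypothetical name):

    define <4 x float> @broadcast_example(<4 x float> %a, float* %fb) {
      ; Splat the loaded scalar into all four lanes, then insert.
      %f = load float* %fb, align 4
      %v0 = insertelement <4 x float> undef, float %f, i32 0
      %v1 = insertelement <4 x float> %v0, float %f, i32 1
      %v2 = insertelement <4 x float> %v1, float %f, i32 2
      %v3 = insertelement <4 x float> %v2, float %f, i32 3
      %r = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %v3, i32 48)
      ret <4 x float> %r
    }
    declare <4 x float> @llvm.x86.sse41.insertps(<4 x float>, <4 x float>, i32) nounwind readnone
    ; With the splat folded away this should again become something like:
    ;   insertps $48, (%rdi), %xmm0
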
Also added tests for SSE4.1, AVX, and AVX2.
Reviewers: delena, nadav, craig.topper
Subscribers: llvm-commits
Differential Revision: http://reviews.llvm.org/D3581
llvm-svn: 209156
Diffstat (limited to 'llvm/test/CodeGen/X86')
| -rw-r--r-- | llvm/test/CodeGen/X86/avx.ll | 113 |
| -rw-r--r-- | llvm/test/CodeGen/X86/fold-load-vec.ll | 2 |
| -rw-r--r-- | llvm/test/CodeGen/X86/sse41.ll | 108 |
3 files changed, 221 insertions, 2 deletions
diff --git a/llvm/test/CodeGen/X86/avx.ll b/llvm/test/CodeGen/X86/avx.ll
index 208e3844f11..6069c14f0d8 100644
--- a/llvm/test/CodeGen/X86/avx.ll
+++ b/llvm/test/CodeGen/X86/avx.ll
@@ -1,4 +1,5 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx | FileCheck %s
+; RUN: llc < %s -mtriple=i686-apple-darwin -mcpu=corei7-avx | FileCheck %s -check-prefix=X32 --check-prefix=CHECK
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx | FileCheck %s -check-prefix=X64 --check-prefix=CHECK
 
 define <4 x i32> @blendvb_fallback_v4i32(<4 x i1> %mask, <4 x i32> %x, <4 x i32> %y) {
 ; CHECK-LABEL: @blendvb_fallback_v4i32
@@ -23,3 +24,113 @@ define <8 x float> @blendvb_fallback_v8f32(<8 x i1> %mask, <8 x float> %x, <8 x
   %ret = select <8 x i1> %mask, <8 x float> %x, <8 x float> %y
   ret <8 x float> %ret
 }
+
+declare <4 x float> @llvm.x86.sse41.insertps(<4 x float>, <4 x float>, i32) nounwind readnone
+
+define <4 x float> @insertps_from_vector_load(<4 x float> %a, <4 x float>* nocapture readonly %pb) {
+; CHECK-LABEL: insertps_from_vector_load:
+; On X32, account for the argument's move to registers
+; X32: movl 4(%esp), %eax
+; CHECK-NOT: mov
+; CHECK: insertps $48
+; CHECK-NEXT: ret
+  %1 = load <4 x float>* %pb, align 16
+  %2 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %1, i32 48)
+  ret <4 x float> %2
+}
+
+;; Use a non-zero CountS for insertps
+define <4 x float> @insertps_from_vector_load_offset(<4 x float> %a, <4 x float>* nocapture readonly %pb) {
+; CHECK-LABEL: insertps_from_vector_load_offset:
+; On X32, account for the argument's move to registers
+; X32: movl 4(%esp), %eax
+; CHECK-NOT: mov
+;; Try to match a bit more of the instr, since we need the load's offset.
+; CHECK: insertps $96, 4(%{{...}}), %
+; CHECK-NEXT: ret
+  %1 = load <4 x float>* %pb, align 16
+  %2 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %1, i32 96)
+  ret <4 x float> %2
+}
+
+define <4 x float> @insertps_from_vector_load_offset_2(<4 x float> %a, <4 x float>* nocapture readonly %pb, i64 %index) {
+; CHECK-LABEL: insertps_from_vector_load_offset_2:
+; On X32, account for the argument's move to registers
+; X32: movl 4(%esp), %eax
+; X32: movl 8(%esp), %ecx
+; CHECK-NOT: mov
+;; Try to match a bit more of the instr, since we need the load's offset.
+; CHECK: vinsertps $192, 12(%{{...}},%{{...}}), %
+; CHECK-NEXT: ret
+  %1 = getelementptr inbounds <4 x float>* %pb, i64 %index
+  %2 = load <4 x float>* %1, align 16
+  %3 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %2, i32 192)
+  ret <4 x float> %3
+}
+
+define <4 x float> @insertps_from_broadcast_loadf32(<4 x float> %a, float* nocapture readonly %fb, i64 %index) {
+; CHECK-LABEL: insertps_from_broadcast_loadf32:
+; On X32, account for the arguments' move to registers
+; X32: movl 8(%esp), %eax
+; X32: movl 4(%esp), %ecx
+; CHECK-NOT: mov
+; CHECK: insertps $48
+; CHECK-NEXT: ret
+  %1 = getelementptr inbounds float* %fb, i64 %index
+  %2 = load float* %1, align 4
+  %3 = insertelement <4 x float> undef, float %2, i32 0
+  %4 = insertelement <4 x float> %3, float %2, i32 1
+  %5 = insertelement <4 x float> %4, float %2, i32 2
+  %6 = insertelement <4 x float> %5, float %2, i32 3
+  %7 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %6, i32 48)
+  ret <4 x float> %7
+}
+
+define <4 x float> @insertps_from_broadcast_loadv4f32(<4 x float> %a, <4 x float>* nocapture readonly %b) {
+; CHECK-LABEL: insertps_from_broadcast_loadv4f32:
+; On X32, account for the arguments' move to registers
+; X32: movl 4(%esp), %{{...}}
+; CHECK-NOT: mov
+; CHECK: insertps $48
+; CHECK-NEXT: ret
+  %1 = load <4 x float>* %b, align 4
+  %2 = extractelement <4 x float> %1, i32 0
+  %3 = insertelement <4 x float> undef, float %2, i32 0
+  %4 = insertelement <4 x float> %3, float %2, i32 1
+  %5 = insertelement <4 x float> %4, float %2, i32 2
+  %6 = insertelement <4 x float> %5, float %2, i32 3
+  %7 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %6, i32 48)
+  ret <4 x float> %7
+}
+
+;; FIXME: We're emitting an extraneous pshufd/vbroadcast.
+define <4 x float> @insertps_from_broadcast_multiple_use(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %d, float* nocapture readonly %fb, i64 %index) {
+; CHECK-LABEL: insertps_from_broadcast_multiple_use:
+; On X32, account for the arguments' move to registers
+; X32: movl 8(%esp), %eax
+; X32: movl 4(%esp), %ecx
+; CHECK: vbroadcastss
+; CHECK-NOT: mov
+; CHECK: insertps $48
+; CHECK: insertps $48
+; CHECK: insertps $48
+; CHECK: insertps $48
+; CHECK: vaddps
+; CHECK: vaddps
+; CHECK: vaddps
+; CHECK-NEXT: ret
+  %1 = getelementptr inbounds float* %fb, i64 %index
+  %2 = load float* %1, align 4
+  %3 = insertelement <4 x float> undef, float %2, i32 0
+  %4 = insertelement <4 x float> %3, float %2, i32 1
+  %5 = insertelement <4 x float> %4, float %2, i32 2
+  %6 = insertelement <4 x float> %5, float %2, i32 3
+  %7 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %6, i32 48)
+  %8 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %b, <4 x float> %6, i32 48)
+  %9 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %c, <4 x float> %6, i32 48)
+  %10 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %d, <4 x float> %6, i32 48)
+  %11 = fadd <4 x float> %7, %8
+  %12 = fadd <4 x float> %9, %10
+  %13 = fadd <4 x float> %11, %12
+  ret <4 x float> %13
+}
diff --git a/llvm/test/CodeGen/X86/fold-load-vec.ll b/llvm/test/CodeGen/X86/fold-load-vec.ll
index e85d8f78c05..96c5be4f752 100644
--- a/llvm/test/CodeGen/X86/fold-load-vec.ll
+++ b/llvm/test/CodeGen/X86/fold-load-vec.ll
@@ -5,7 +5,7 @@
 ; loads from m32.
 define void @sample_test(<4 x float>* %source, <2 x float>* %dest) nounwind {
 ; CHECK: sample_test
-; CHECK: movaps
+; CHECK-NOT: movaps
 ; CHECK: insertps
 entry:
   %source.addr = alloca <4 x float>*, align 8
diff --git a/llvm/test/CodeGen/X86/sse41.ll b/llvm/test/CodeGen/X86/sse41.ll
index 3652d8c0d02..a3c62016c43 100644
--- a/llvm/test/CodeGen/X86/sse41.ll
+++ b/llvm/test/CodeGen/X86/sse41.ll
@@ -584,3 +584,111 @@ define <8 x i16> @blendvb_fallback(<8 x i1> %mask, <8 x i16> %x, <8 x i16> %y) {
   %ret = select <8 x i1> %mask, <8 x i16> %x, <8 x i16> %y
   ret <8 x i16> %ret
 }
+
+define <4 x float> @insertps_from_vector_load(<4 x float> %a, <4 x float>* nocapture readonly %pb) {
+; CHECK-LABEL: insertps_from_vector_load:
+; On X32, account for the argument's move to registers
+; X32: movl 4(%esp), %eax
+; CHECK-NOT: mov
+; CHECK: insertps $48
+; CHECK-NEXT: ret
+  %1 = load <4 x float>* %pb, align 16
+  %2 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %1, i32 48)
+  ret <4 x float> %2
+}
+
+;; Use a non-zero CountS for insertps
+define <4 x float> @insertps_from_vector_load_offset(<4 x float> %a, <4 x float>* nocapture readonly %pb) {
+; CHECK-LABEL: insertps_from_vector_load_offset:
+; On X32, account for the argument's move to registers
+; X32: movl 4(%esp), %eax
+; CHECK-NOT: mov
+;; Try to match a bit more of the instr, since we need the load's offset.
+; CHECK: insertps $96, 4(%{{...}}), %
+; CHECK-NEXT: ret
+  %1 = load <4 x float>* %pb, align 16
+  %2 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %1, i32 96)
+  ret <4 x float> %2
+}
+
+define <4 x float> @insertps_from_vector_load_offset_2(<4 x float> %a, <4 x float>* nocapture readonly %pb, i64 %index) {
+; CHECK-LABEL: insertps_from_vector_load_offset_2:
+; On X32, account for the argument's move to registers
+; X32: movl 4(%esp), %eax
+; X32: movl 8(%esp), %ecx
+; CHECK-NOT: mov
+;; Try to match a bit more of the instr, since we need the load's offset.
+; CHECK: insertps $192, 12(%{{...}},%{{...}}), %
+; CHECK-NEXT: ret
+  %1 = getelementptr inbounds <4 x float>* %pb, i64 %index
+  %2 = load <4 x float>* %1, align 16
+  %3 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %2, i32 192)
+  ret <4 x float> %3
+}
+
+define <4 x float> @insertps_from_broadcast_loadf32(<4 x float> %a, float* nocapture readonly %fb, i64 %index) {
+; CHECK-LABEL: insertps_from_broadcast_loadf32:
+; On X32, account for the arguments' move to registers
+; X32: movl 8(%esp), %eax
+; X32: movl 4(%esp), %ecx
+; CHECK-NOT: mov
+; CHECK: insertps $48
+; CHECK-NEXT: ret
+  %1 = getelementptr inbounds float* %fb, i64 %index
+  %2 = load float* %1, align 4
+  %3 = insertelement <4 x float> undef, float %2, i32 0
+  %4 = insertelement <4 x float> %3, float %2, i32 1
+  %5 = insertelement <4 x float> %4, float %2, i32 2
+  %6 = insertelement <4 x float> %5, float %2, i32 3
+  %7 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %6, i32 48)
+  ret <4 x float> %7
+}
+
+define <4 x float> @insertps_from_broadcast_loadv4f32(<4 x float> %a, <4 x float>* nocapture readonly %b) {
+; CHECK-LABEL: insertps_from_broadcast_loadv4f32:
+; On X32, account for the arguments' move to registers
+; X32: movl 4(%esp), %{{...}}
+; CHECK-NOT: mov
+; CHECK: insertps $48
+; CHECK-NEXT: ret
+  %1 = load <4 x float>* %b, align 4
+  %2 = extractelement <4 x float> %1, i32 0
+  %3 = insertelement <4 x float> undef, float %2, i32 0
+  %4 = insertelement <4 x float> %3, float %2, i32 1
+  %5 = insertelement <4 x float> %4, float %2, i32 2
+  %6 = insertelement <4 x float> %5, float %2, i32 3
+  %7 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %6, i32 48)
+  ret <4 x float> %7
+}
+
+;; FIXME: We're emitting an extraneous pshufd/vbroadcast.
+define <4 x float> @insertps_from_broadcast_multiple_use(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %d, float* nocapture readonly %fb, i64 %index) {
+; CHECK-LABEL: insertps_from_broadcast_multiple_use:
+; On X32, account for the arguments' move to registers
+; X32: movl 8(%esp), %eax
+; X32: movl 4(%esp), %ecx
+; CHECK: movss
+; CHECK-NOT: mov
+; CHECK: insertps $48
+; CHECK: insertps $48
+; CHECK: insertps $48
+; CHECK: insertps $48
+; CHECK: addps
+; CHECK: addps
+; CHECK: addps
+; CHECK-NEXT: ret
+  %1 = getelementptr inbounds float* %fb, i64 %index
+  %2 = load float* %1, align 4
+  %3 = insertelement <4 x float> undef, float %2, i32 0
+  %4 = insertelement <4 x float> %3, float %2, i32 1
+  %5 = insertelement <4 x float> %4, float %2, i32 2
+  %6 = insertelement <4 x float> %5, float %2, i32 3
+  %7 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %6, i32 48)
+  %8 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %b, <4 x float> %6, i32 48)
+  %9 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %c, <4 x float> %6, i32 48)
+  %10 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %d, <4 x float> %6, i32 48)
+  %11 = fadd <4 x float> %7, %8
+  %12 = fadd <4 x float> %9, %10
+  %13 = fadd <4 x float> %11, %12
+  ret <4 x float> %13
+}

