| author | Simon Pilgrim <llvm-dev@redking.me.uk> | 2015-11-04 20:48:09 +0000 |
|---|---|---|
| committer | Simon Pilgrim <llvm-dev@redking.me.uk> | 2015-11-04 20:48:09 +0000 |
| commit | 7e6606f4f1ee5e6a9cddced7c8572d6f55cf8cbe (patch) | |
| tree | 4c28a457ee55378cd41bfcca952217ca466be7f2 /llvm/test/CodeGen | |
| parent | b11b440f8e354b858579af454e72988a3e5d768d (diff) | |
| download | bcm5719-llvm-7e6606f4f1ee5e6a9cddced7c8572d6f55cf8cbe.tar.gz bcm5719-llvm-7e6606f4f1ee5e6a9cddced7c8572d6f55cf8cbe.zip | |
[X86][SSE] Add general memory folding for (V)INSERTPS instruction
This patch improves the memory folding of the inserted float element for the (V)INSERTPS instruction.
The existing implementation lives in the DAGCombiner and relies on narrowing a whole vector load into a scalar load (which is then converted back into a vector) in the hope that folding can occur later on. Not only has this proven problematic for debug builds, it also prevents other memory folds (notably stack reloads) from happening.
This patch removes the old implementation and moves the folding code to the X86 foldMemoryOperand handler. A new private 'special case' function - foldMemoryOperandCustom - has been added to deal with memory folding of instructions that can't simply use the lookup tables; (V)INSERTPS is the first of several instructions that could be handled this way.
It also extends the memory operand folding code with an additional pointer offset that allows an existing memory address to be modified - in this case, to convert the address of the whole vector into the explicit address of the scalar element that will be inserted.
Unlike the previous implementation, we now set the insertion source index to zero. Although this index is ignored for the (V)INSERTPSrm version, anything that relied on shuffle decodes (such as unfolding of insertps loads) was previously calculating the source address incorrectly - I've added a test for this in insertps-unfold-load-bug.ll.
Differential Revision: http://reviews.llvm.org/D13988
llvm-svn: 252074
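To make the folding concrete, here is a minimal, self-contained C++ sketch of the immediate and address rewrite described above. It is an illustration only, not the X86InstrInfo code; the helper name foldInsertPSLoad and the struct are invented for this example. It relies solely on the documented (V)INSERTPS immediate layout: bits [7:6] select the source element (CountS), bits [5:4] the destination element (CountD), and bits [3:0] the zero mask.

```cpp
// Sketch of the rewrite applied when folding the (V)INSERTPS source vector
// load down to a single scalar element (illustrative only, not LLVM code).
#include <cassert>
#include <cstdint>
#include <cstdio>

struct InsertPSFold {
  unsigned PtrOffset; // extra byte offset applied to the existing address
  uint8_t MemImm;     // immediate to use on the memory (rm) form
};

static InsertPSFold foldInsertPSLoad(uint8_t Imm) {
  unsigned SrcIdx = (Imm >> 6) & 0x3;          // CountS: element read from the source vector
  return {SrcIdx * 4u,                         // 4 bytes per f32 element
          static_cast<uint8_t>(Imm & 0x3F)};   // CountS := 0, keep CountD and the zero mask
}

int main() {
  // Immediate 209 (0xD1): source element 3, destination element 1, zero mask 0x1.
  InsertPSFold F = foldInsertPSLoad(209);
  assert(F.PtrOffset == 12 && F.MemImm == 17);
  std::printf("offset=%u imm=%u\n", F.PtrOffset, static_cast<unsigned>(F.MemImm));
  return 0;
}
```

With the immediate 209 used by the new stack-folding tests below, the sketch yields a 12-byte pointer offset and a rewritten immediate of 17 (0x11), which matches the `$17` those tests check for on the folded reload.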
Diffstat (limited to 'llvm/test/CodeGen')
| mode | path | lines changed |
|---|---|---|
| -rw-r--r-- | llvm/test/CodeGen/X86/avx.ll | 6 |
| -rw-r--r-- | llvm/test/CodeGen/X86/insertps-unfold-load-bug.ll | 33 |
| -rw-r--r-- | llvm/test/CodeGen/X86/sse41.ll | 16 |
| -rw-r--r-- | llvm/test/CodeGen/X86/stack-folding-fp-avx1.ll | 10 |
| -rw-r--r-- | llvm/test/CodeGen/X86/stack-folding-fp-sse42.ll | 10 |
5 files changed, 62 insertions, 13 deletions
diff --git a/llvm/test/CodeGen/X86/avx.ll b/llvm/test/CodeGen/X86/avx.ll
index f71ec5c10e6..341dd867e4f 100644
--- a/llvm/test/CodeGen/X86/avx.ll
+++ b/llvm/test/CodeGen/X86/avx.ll
@@ -32,7 +32,7 @@ define <4 x float> @insertps_from_vector_load(<4 x float> %a, <4 x float>* nocap
 ; On X32, account for the argument's move to registers
 ; X32: movl 4(%esp), %eax
 ; CHECK-NOT: mov
-; CHECK: insertps $48
+; CHECK: vinsertps $48, (%{{...}}), {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
 ; CHECK-NEXT: ret
   %1 = load <4 x float>, <4 x float>* %pb, align 16
   %2 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %1, i32 48)
@@ -46,7 +46,7 @@ define <4 x float> @insertps_from_vector_load_offset(<4 x float> %a, <4 x float>
 ; X32: movl 4(%esp), %eax
 ; CHECK-NOT: mov
 ;; Try to match a bit more of the instr, since we need the load's offset.
-; CHECK: insertps $96, 4(%{{...}}), %
+; CHECK: vinsertps $32, 4(%{{...}}), {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
 ; CHECK-NEXT: ret
   %1 = load <4 x float>, <4 x float>* %pb, align 16
   %2 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %1, i32 96)
@@ -60,7 +60,7 @@ define <4 x float> @insertps_from_vector_load_offset_2(<4 x float> %a, <4 x floa
 ; X32: movl 8(%esp), %ecx
 ; CHECK-NOT: mov
 ;; Try to match a bit more of the instr, since we need the load's offset.
-; CHECK: vinsertps $192, 12(%{{...}},%{{...}}), %
+; CHECK: vinsertps $0, 12(%{{...}},%{{...}}), {{.*#+}} xmm0 = mem[0],xmm0[1,2,3]
 ; CHECK-NEXT: ret
   %1 = getelementptr inbounds <4 x float>, <4 x float>* %pb, i64 %index
   %2 = load <4 x float>, <4 x float>* %1, align 16
diff --git a/llvm/test/CodeGen/X86/insertps-unfold-load-bug.ll b/llvm/test/CodeGen/X86/insertps-unfold-load-bug.ll
new file mode 100644
index 00000000000..bf7c4bc4d7b
--- /dev/null
+++ b/llvm/test/CodeGen/X86/insertps-unfold-load-bug.ll
@@ -0,0 +1,33 @@
+; RUN: llc -mtriple=i686-unknown-unknown -mattr=+sse4.1 < %s | FileCheck %s -check-prefix=X32
+; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 < %s | FileCheck %s -check-prefix=X64
+
+; Test for case where insertps was folding the load of the insertion element, but a later optimization
+; was then manipulating the load.
+
+define <4 x float> @insertps_unfold(<4 x float>* %v0, <4 x float>* %v1) {
+; X32-LABEL: insertps_unfold:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X32-NEXT: movaps (%eax), %xmm0
+; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
+; X32-NEXT: addps %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: insertps_unfold:
+; X64: # BB#0:
+; X64-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X64-NEXT: movaps (%rdi), %xmm0
+; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
+; X64-NEXT: addps %xmm1, %xmm0
+; X64-NEXT: retq
+  %a = getelementptr inbounds <4 x float>, <4 x float>* %v1, i64 0, i64 1
+  %b = load float, float* %a, align 4
+  %c = insertelement <4 x float> undef, float %b, i32 0
+  %d = load <4 x float>, <4 x float>* %v1, align 16
+  %e = load <4 x float>, <4 x float>* %v0, align 16
+  %f = shufflevector <4 x float> %e, <4 x float> %d, <4 x i32> <i32 0, i32 1, i32 2, i32 5>
+  %g = fadd <4 x float> %c, %f
+  ret <4 x float> %g
+}
diff --git a/llvm/test/CodeGen/X86/sse41.ll b/llvm/test/CodeGen/X86/sse41.ll
index d624c8dcbb4..0a83a9753b8 100644
--- a/llvm/test/CodeGen/X86/sse41.ll
+++ b/llvm/test/CodeGen/X86/sse41.ll
@@ -794,12 +794,12 @@ define <4 x float> @insertps_from_vector_load(<4 x float> %a, <4 x float>* nocap
 ; X32-LABEL: insertps_from_vector_load:
 ; X32: ## BB#0:
 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
+; X32-NEXT: insertps $48, (%{{...}}), {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
 ; X32-NEXT: retl
 ;
 ; X64-LABEL: insertps_from_vector_load:
 ; X64: ## BB#0:
-; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
+; X64-NEXT: insertps $48, (%{{...}}), {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
 ; X64-NEXT: retq
   %1 = load <4 x float>, <4 x float>* %pb, align 16
   %2 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %1, i32 48)
@@ -812,12 +812,12 @@ define <4 x float> @insertps_from_vector_load_offset(<4 x float> %a, <4 x float>
 ; X32-LABEL: insertps_from_vector_load_offset:
 ; X32: ## BB#0:
 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],mem[1],xmm0[3]
+; X32-NEXT: insertps $32, 4(%{{...}}), {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
 ; X32-NEXT: retl
 ;
 ; X64-LABEL: insertps_from_vector_load_offset:
 ; X64: ## BB#0:
-; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],mem[1],xmm0[3]
+; X64-NEXT: insertps $32, 4(%{{...}}), {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
 ; X64-NEXT: retq
   %1 = load <4 x float>, <4 x float>* %pb, align 16
   %2 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %1, i32 96)
@@ -831,13 +831,13 @@ define <4 x float> @insertps_from_vector_load_offset_2(<4 x float> %a, <4 x floa
 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
 ; X32-NEXT: shll $4, %ecx
-; X32-NEXT: insertps {{.*#+}} xmm0 = mem[3],xmm0[1,2,3]
+; X32-NEXT: insertps $0, 12(%{{...}},%{{...}}), {{.*#+}} xmm0 = mem[0],xmm0[1,2,3]
 ; X32-NEXT: retl
 ;
 ; X64-LABEL: insertps_from_vector_load_offset_2:
 ; X64: ## BB#0:
 ; X64-NEXT: shlq $4, %rsi
-; X64-NEXT: insertps {{.*#+}} xmm0 = mem[3],xmm0[1,2,3]
+; X64-NEXT: insertps $0, 12(%{{...}},%{{...}}), {{.*#+}} xmm0 = mem[0],xmm0[1,2,3]
 ; X64-NEXT: retq
   %1 = getelementptr inbounds <4 x float>, <4 x float>* %pb, i64 %index
   %2 = load <4 x float>, <4 x float>* %1, align 16
@@ -968,12 +968,12 @@ define <4 x float> @pr20087(<4 x float> %a, <4 x float> *%ptr) {
 ; X32-LABEL: pr20087:
 ; X32: ## BB#0:
 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm0[2],mem[2]
+; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm0[2],mem[0]
 ; X32-NEXT: retl
 ;
 ; X64-LABEL: pr20087:
 ; X64: ## BB#0:
-; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm0[2],mem[2]
+; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm0[2],mem[0]
 ; X64-NEXT: retq
   %load = load <4 x float> , <4 x float> *%ptr
   %ret = shufflevector <4 x float> %load, <4 x float> %a, <4 x i32> <i32 4, i32 undef, i32 6, i32 2>
diff --git a/llvm/test/CodeGen/X86/stack-folding-fp-avx1.ll b/llvm/test/CodeGen/X86/stack-folding-fp-avx1.ll
index c7a2143b5b2..b86ec0ea22f 100644
--- a/llvm/test/CodeGen/X86/stack-folding-fp-avx1.ll
+++ b/llvm/test/CodeGen/X86/stack-folding-fp-avx1.ll
@@ -946,7 +946,15 @@ define <8 x float> @stack_fold_insertf128(<4 x float> %a0, <4 x float> %a1) {
   ret <8 x float> %2
 }
 
-; TODO stack_fold_insertps
+define <4 x float> @stack_fold_insertps(<4 x float> %a0, <4 x float> %a1) {
+  ;CHECK-LABEL: stack_fold_insertps
+  ;CHECK: vinsertps $17, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  ;CHECK-NEXT: {{.*#+}} xmm0 = zero,mem[0],xmm0[2,3]
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a0, <4 x float> %a1, i8 209)
+  ret <4 x float> %2
+}
+declare <4 x float> @llvm.x86.sse41.insertps(<4 x float>, <4 x float>, i8) nounwind readnone
 
 define <2 x double> @stack_fold_maxpd(<2 x double> %a0, <2 x double> %a1) {
   ;CHECK-LABEL: stack_fold_maxpd
diff --git a/llvm/test/CodeGen/X86/stack-folding-fp-sse42.ll b/llvm/test/CodeGen/X86/stack-folding-fp-sse42.ll
index e90a89d36c0..105115bc7d2 100644
--- a/llvm/test/CodeGen/X86/stack-folding-fp-sse42.ll
+++ b/llvm/test/CodeGen/X86/stack-folding-fp-sse42.ll
@@ -637,7 +637,15 @@ define <4 x float> @stack_fold_hsubps(<4 x float> %a0, <4 x float> %a1) {
 }
 declare <4 x float> @llvm.x86.sse3.hsub.ps(<4 x float>, <4 x float>) nounwind readnone
 
-; TODO stack_fold_insertps
+define <4 x float> @stack_fold_insertps(<4 x float> %a0, <4 x float> %a1) {
+  ;CHECK-LABEL: stack_fold_insertps
+  ;CHECK: insertps $17, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  ;CHECK-NEXT: {{.*#+}} xmm0 = zero,mem[0],xmm0[2,3]
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a0, <4 x float> %a1, i8 209)
+  ret <4 x float> %2
+}
+declare <4 x float> @llvm.x86.sse41.insertps(<4 x float>, <4 x float>, i8) nounwind readnone
 
 define <2 x double> @stack_fold_maxpd(<2 x double> %a0, <2 x double> %a1) {
   ;CHECK-LABEL: stack_fold_maxpd

