summaryrefslogtreecommitdiffstats
path: root/llvm/test/CodeGen/X86
diff options
context:
space:
mode:
authorJustin Bogner <mail@justinbogner.com>2019-03-27 20:35:56 +0000
committerJustin Bogner <mail@justinbogner.com>2019-03-27 20:35:56 +0000
commitb1650f0da92bc9256627a1a692f847c6e1b1d210 (patch)
treec9fe46d35b9eb80d5b4434b2309cdcd766da0916 /llvm/test/CodeGen/X86
parentee9f2ae5b913cf571997091c4d7cac99eccd29a0 (diff)
downloadbcm5719-llvm-b1650f0da92bc9256627a1a692f847c6e1b1d210.tar.gz
bcm5719-llvm-b1650f0da92bc9256627a1a692f847c6e1b1d210.zip
[LegalizeVectorTypes] Allow single loads and stores for more short vectors
When lowering a load or store for TypeWidenVector, the type legalizer would use a single load or store if the associated integer type was legal or promoted. E.g. it loads a v4i8 as an i32 if i32 is legal/promotable. (See https://reviews.llvm.org/rL236528 for reference.) This applies that behaviour to vector types. If the vector type is TypePromoteInteger, the element type is going to be TypePromoteInteger as well, which will lead to a single promoting load rather than N individual promoting loads. For instance, if we have a v3i1, we would now have a load of v4i1 instead of 3 loads of i1. Patch by Guillaume Marques. Thanks! Differential Revision: https://reviews.llvm.org/D56201 llvm-svn: 357120
Diffstat (limited to 'llvm/test/CodeGen/X86')
-rw-r--r--llvm/test/CodeGen/X86/load-local-v3i1.ll70
-rw-r--r--llvm/test/CodeGen/X86/widen_arith-3.ll6
-rw-r--r--llvm/test/CodeGen/X86/widen_cast-2.ll3
-rw-r--r--llvm/test/CodeGen/X86/widen_cast-3.ll3
-rw-r--r--llvm/test/CodeGen/X86/widen_load-2.ll31
5 files changed, 84 insertions, 29 deletions
diff --git a/llvm/test/CodeGen/X86/load-local-v3i1.ll b/llvm/test/CodeGen/X86/load-local-v3i1.ll
new file mode 100644
index 00000000000..88b87c273e8
--- /dev/null
+++ b/llvm/test/CodeGen/X86/load-local-v3i1.ll
@@ -0,0 +1,70 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.2 | FileCheck %s
+
+; widen a v3i1 to v4i1 to do a vector load/store. We would previously
+; reconstruct the said v3i1 from the first element of the vector by filling all
+; the lanes of the vector with that first element, which was obviously wrong.
+; This was done in the type-legalizing of the DAG, when legalizing the load.
+
+; Function Attrs: argmemonly nounwind readonly
+declare <3 x i32> @llvm.masked.load.v3i32.p1v3i32(<3 x i32> addrspace(1)*, i32, <3 x i1>, <3 x i32>)
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.masked.store.v3i32.p1v3i32(<3 x i32>, <3 x i32> addrspace(1)*, i32, <3 x i1>)
+
+define <3 x i32> @masked_load_v3(i32 addrspace(1)*, <3 x i1>) {
+entry:
+ %2 = bitcast i32 addrspace(1)* %0 to <3 x i32> addrspace(1)*
+ %3 = call <3 x i32> @llvm.masked.load.v3i32.p1v3i32(<3 x i32> addrspace(1)* %2, i32 4, <3 x i1> %1, <3 x i32> undef)
+ ret <3 x i32> %3
+}
+
+define void @masked_store4_v3(<3 x i32>, i32 addrspace(1)*, <3 x i1>) {
+entry:
+ %3 = bitcast i32 addrspace(1)* %1 to <3 x i32> addrspace(1)*
+ call void @llvm.masked.store.v3i32.p1v3i32(<3 x i32> %0, <3 x i32> addrspace(1)* %3, i32 4, <3 x i1> %2)
+ ret void
+}
+
+define void @local_load_v3i1(i32 addrspace(1)* %out, i32 addrspace(1)* %in, <3 x i1>* %predicate_ptr) nounwind {
+; CHECK-LABEL: local_load_v3i1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: pushq %rbp
+; CHECK-NEXT: pushq %r15
+; CHECK-NEXT: pushq %r14
+; CHECK-NEXT: pushq %rbx
+; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: movq %rdi, %r14
+; CHECK-NEXT: movzbl (%rdx), %ebp
+; CHECK-NEXT: movl %ebp, %eax
+; CHECK-NEXT: shrl %eax
+; CHECK-NEXT: andl $1, %eax
+; CHECK-NEXT: movl %ebp, %ecx
+; CHECK-NEXT: andl $1, %ecx
+; CHECK-NEXT: movd %ecx, %xmm0
+; CHECK-NEXT: pinsrd $1, %eax, %xmm0
+; CHECK-NEXT: shrl $2, %ebp
+; CHECK-NEXT: andl $1, %ebp
+; CHECK-NEXT: pinsrd $2, %ebp, %xmm0
+; CHECK-NEXT: movd %xmm0, %ebx
+; CHECK-NEXT: pextrd $1, %xmm0, %r15d
+; CHECK-NEXT: movq %rsi, %rdi
+; CHECK-NEXT: movl %ebx, %esi
+; CHECK-NEXT: movl %r15d, %edx
+; CHECK-NEXT: movl %ebp, %ecx
+; CHECK-NEXT: callq masked_load_v3
+; CHECK-NEXT: movq %r14, %rdi
+; CHECK-NEXT: movl %ebx, %esi
+; CHECK-NEXT: movl %r15d, %edx
+; CHECK-NEXT: movl %ebp, %ecx
+; CHECK-NEXT: callq masked_store4_v3
+; CHECK-NEXT: addq $8, %rsp
+; CHECK-NEXT: popq %rbx
+; CHECK-NEXT: popq %r14
+; CHECK-NEXT: popq %r15
+; CHECK-NEXT: popq %rbp
+; CHECK-NEXT: retq
+ %predicate = load <3 x i1>, <3 x i1>* %predicate_ptr
+ %load1 = call <3 x i32> @masked_load_v3(i32 addrspace(1)* %in, <3 x i1> %predicate)
+ call void @masked_store4_v3(<3 x i32> %load1, i32 addrspace(1)* %out, <3 x i1> %predicate)
+ ret void
+}
diff --git a/llvm/test/CodeGen/X86/widen_arith-3.ll b/llvm/test/CodeGen/X86/widen_arith-3.ll
index aa656de2342..3e455f7f14c 100644
--- a/llvm/test/CodeGen/X86/widen_arith-3.ll
+++ b/llvm/test/CodeGen/X86/widen_arith-3.ll
@@ -12,7 +12,7 @@ define void @update(<3 x i16>* %dst, <3 x i16>* %src, i32 %n) nounwind {
; CHECK-NEXT: pushl %ebp
; CHECK-NEXT: movl %esp, %ebp
; CHECK-NEXT: andl $-8, %esp
-; CHECK-NEXT: subl $40, %esp
+; CHECK-NEXT: subl $32, %esp
; CHECK-NEXT: movl {{\.LCPI.*}}, %eax
; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; CHECK-NEXT: pcmpeqd %xmm0, %xmm0
@@ -26,9 +26,7 @@ define void @update(<3 x i16>* %dst, <3 x i16>* %src, i32 %n) nounwind {
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: movl 12(%ebp), %edx
; CHECK-NEXT: movl 8(%ebp), %ecx
-; CHECK-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; CHECK-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
-; CHECK-NEXT: pinsrd $2, 4(%edx,%eax,8), %xmm2
+; CHECK-NEXT: pmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; CHECK-NEXT: psubd %xmm0, %xmm2
; CHECK-NEXT: pextrw $4, %xmm2, 4(%ecx,%eax,8)
; CHECK-NEXT: pshufb %xmm1, %xmm2
diff --git a/llvm/test/CodeGen/X86/widen_cast-2.ll b/llvm/test/CodeGen/X86/widen_cast-2.ll
index e7780912cd9..0bbcd391d22 100644
--- a/llvm/test/CodeGen/X86/widen_cast-2.ll
+++ b/llvm/test/CodeGen/X86/widen_cast-2.ll
@@ -21,9 +21,8 @@ define void @convert(<7 x i32>* %dst, <14 x i16>* %src) nounwind {
; CHECK-NEXT: movdqa 16(%edx,%eax), %xmm2
; CHECK-NEXT: psubw %xmm0, %xmm1
; CHECK-NEXT: psubw %xmm0, %xmm2
-; CHECK-NEXT: movd %xmm2, 16(%ecx,%eax)
-; CHECK-NEXT: pextrd $1, %xmm2, 20(%ecx,%eax)
; CHECK-NEXT: pextrd $2, %xmm2, 24(%ecx,%eax)
+; CHECK-NEXT: movq %xmm2, 16(%ecx,%eax)
; CHECK-NEXT: movdqa %xmm1, (%ecx,%eax)
; CHECK-NEXT: incl (%esp)
; CHECK-NEXT: cmpl $3, (%esp)
diff --git a/llvm/test/CodeGen/X86/widen_cast-3.ll b/llvm/test/CodeGen/X86/widen_cast-3.ll
index 18a04c48a59..a4d37823dfc 100644
--- a/llvm/test/CodeGen/X86/widen_cast-3.ll
+++ b/llvm/test/CodeGen/X86/widen_cast-3.ll
@@ -11,8 +11,7 @@ define void @convert(<12 x i8>* %dst.addr, <3 x i32> %src) nounwind {
; X86-NEXT: pcmpeqd %xmm1, %xmm1
; X86-NEXT: psubd %xmm1, %xmm0
; X86-NEXT: pextrd $2, %xmm0, 8(%eax)
-; X86-NEXT: pextrd $1, %xmm0, 4(%eax)
-; X86-NEXT: movd %xmm0, (%eax)
+; X86-NEXT: movq %xmm0, (%eax)
; X86-NEXT: retl
;
; X64-LABEL: convert:
diff --git a/llvm/test/CodeGen/X86/widen_load-2.ll b/llvm/test/CodeGen/X86/widen_load-2.ll
index 23b68b26980..ea8f4ff0528 100644
--- a/llvm/test/CodeGen/X86/widen_load-2.ll
+++ b/llvm/test/CodeGen/X86/widen_load-2.ll
@@ -15,8 +15,7 @@ define void @add3i32(%i32vec3* sret %ret, %i32vec3* %ap, %i32vec3* %bp) {
; X86-NEXT: movdqa (%edx), %xmm0
; X86-NEXT: paddd (%ecx), %xmm0
; X86-NEXT: pextrd $2, %xmm0, 8(%eax)
-; X86-NEXT: pextrd $1, %xmm0, 4(%eax)
-; X86-NEXT: movd %xmm0, (%eax)
+; X86-NEXT: movq %xmm0, (%eax)
; X86-NEXT: retl $4
;
; X64-LABEL: add3i32:
@@ -40,16 +39,13 @@ define void @add3i32_2(%i32vec3* sret %ret, %i32vec3* %ap, %i32vec3* %bp) {
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-NEXT: pinsrd $1, 4(%edx), %xmm0
+; X86-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; X86-NEXT: pinsrd $2, 8(%edx), %xmm0
-; X86-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X86-NEXT: pinsrd $1, 4(%ecx), %xmm1
+; X86-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
; X86-NEXT: pinsrd $2, 8(%ecx), %xmm1
; X86-NEXT: paddd %xmm0, %xmm1
-; X86-NEXT: pextrd $1, %xmm1, 4(%eax)
+; X86-NEXT: movq %xmm1, (%eax)
; X86-NEXT: pextrd $2, %xmm1, 8(%eax)
-; X86-NEXT: movd %xmm1, (%eax)
; X86-NEXT: retl $4
;
; X64-LABEL: add3i32_2:
@@ -81,9 +77,8 @@ define void @add7i32(%i32vec7* sret %ret, %i32vec7* %ap, %i32vec7* %bp) {
; X86-NEXT: movdqa 16(%edx), %xmm1
; X86-NEXT: paddd (%ecx), %xmm0
; X86-NEXT: paddd 16(%ecx), %xmm1
-; X86-NEXT: movd %xmm1, 16(%eax)
-; X86-NEXT: pextrd $1, %xmm1, 20(%eax)
; X86-NEXT: pextrd $2, %xmm1, 24(%eax)
+; X86-NEXT: movq %xmm1, 16(%eax)
; X86-NEXT: movdqa %xmm0, (%eax)
; X86-NEXT: retl $4
;
@@ -151,16 +146,12 @@ define void @add3i16(%i16vec3* nocapture sret %ret, %i16vec3* %ap, %i16vec3* %bp
; X86-NEXT: pushl %ebp
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: andl $-8, %esp
-; X86-NEXT: subl $24, %esp
+; X86-NEXT: subl $8, %esp
; X86-NEXT: movl 8(%ebp), %eax
; X86-NEXT: movl 16(%ebp), %ecx
; X86-NEXT: movl 12(%ebp), %edx
-; X86-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; X86-NEXT: pinsrd $2, 4(%edx), %xmm0
-; X86-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X86-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
-; X86-NEXT: pinsrd $2, 4(%ecx), %xmm1
+; X86-NEXT: pmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; X86-NEXT: pmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; X86-NEXT: paddd %xmm0, %xmm1
; X86-NEXT: pextrw $4, %xmm1, 4(%eax)
; X86-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
@@ -225,8 +216,7 @@ define void @add12i16(%i16vec12* nocapture sret %ret, %i16vec12* %ap, %i16vec12*
; X86-NEXT: movdqa 16(%edx), %xmm1
; X86-NEXT: paddw (%ecx), %xmm0
; X86-NEXT: paddw 16(%ecx), %xmm1
-; X86-NEXT: movd %xmm1, 16(%eax)
-; X86-NEXT: pextrd $1, %xmm1, 20(%eax)
+; X86-NEXT: movq %xmm1, 16(%eax)
; X86-NEXT: movdqa %xmm0, (%eax)
; X86-NEXT: retl $4
;
@@ -331,11 +321,10 @@ define void @add31i8(%i8vec31* nocapture sret %ret, %i8vec31* %ap, %i8vec31* %bp
; X86-NEXT: movdqa 16(%edx), %xmm1
; X86-NEXT: paddb (%ecx), %xmm0
; X86-NEXT: paddb 16(%ecx), %xmm1
-; X86-NEXT: movd %xmm1, 16(%eax)
-; X86-NEXT: pextrd $1, %xmm1, 20(%eax)
; X86-NEXT: pextrd $2, %xmm1, 24(%eax)
; X86-NEXT: pextrw $6, %xmm1, 28(%eax)
; X86-NEXT: pextrb $14, %xmm1, 30(%eax)
+; X86-NEXT: movq %xmm1, 16(%eax)
; X86-NEXT: movdqa %xmm0, (%eax)
; X86-NEXT: retl $4
;
OpenPOWER on IntegriCloud