summaryrefslogtreecommitdiffstats
path: root/llvm
diff options
context:
space:
mode:
authorChris Lattner <sabre@nondot.org>2007-12-29 19:31:47 +0000
committerChris Lattner <sabre@nondot.org>2007-12-29 19:31:47 +0000
commitd2b8a36f0eef1c1214f6090c789e689bf84379bb (patch)
treed0b4e63eebea08ad99f6a64a4b85c7a090f3906f /llvm
parent0d90c8f01699197b0a7252263fc9f0162647c776 (diff)
downloadbcm5719-llvm-d2b8a36f0eef1c1214f6090c789e689bf84379bb.tar.gz
bcm5719-llvm-d2b8a36f0eef1c1214f6090c789e689bf84379bb.zip
One readme entry is done, one is really easy (Evan, want to investigate
eliminating the llvm.x86.sse2.loadl.pd intrinsic?), one shuffle optzn may be done (if shufps is better than pinsrw, Evan, please review), and we already know about LICM of simple instructions. llvm-svn: 45407
Diffstat (limited to 'llvm')
-rw-r--r--llvm/lib/Target/X86/README-SSE.txt102
-rw-r--r--llvm/test/CodeGen/X86/vec_set-8.ll10
2 files changed, 35 insertions, 77 deletions
diff --git a/llvm/lib/Target/X86/README-SSE.txt b/llvm/lib/Target/X86/README-SSE.txt
index b4fc53ad0aa..cadfc20bbb1 100644
--- a/llvm/lib/Target/X86/README-SSE.txt
+++ b/llvm/lib/Target/X86/README-SSE.txt
@@ -456,6 +456,18 @@ icc generates:
So icc is smart enough to know that B is in memory so it doesn't load it and
store it back to stack.
+This should be fixed by eliminating the llvm.x86.sse2.loadl.pd intrinsic,
+lowering it to a load+insertelement instead. We already match the load+shuffle
+as movlpd, so this should be easy. We already get optimal code for:
+
+define void @test2(<2 x double>* %r, <2 x double>* %A, double %B) {
+entry:
+ %tmp2 = load <2 x double>* %A, align 16
+ %tmp8 = insertelement <2 x double> %tmp2, double %B, i32 0
+ store <2 x double> %tmp8, <2 x double>* %r, align 16
+ ret void
+}
+
//===---------------------------------------------------------------------===//
__m128d test1( __m128d A, __m128d B) {
@@ -476,10 +488,10 @@ Don't know if unpckhpd is faster. But it is shorter.
This code generates ugly code, probably due to costs being off or something:
-void %test(float* %P, <4 x float>* %P2 ) {
+define void @test(float* %P, <4 x float>* %P2 ) {
%xFloat0.688 = load float* %P
- %loadVector37.712 = load <4 x float>* %P2
- %inFloat3.713 = insertelement <4 x float> %loadVector37.712, float 0.000000e+00, uint 3
+ %tmp = load <4 x float>* %P2
+ %inFloat3.713 = insertelement <4 x float> %tmp, float 0.0, i32 3
store <4 x float> %inFloat3.713, <4 x float>* %P2
ret void
}
@@ -487,17 +499,16 @@ void %test(float* %P, <4 x float>* %P2 ) {
Generates:
_test:
- pxor %xmm0, %xmm0
- movd %xmm0, %eax ;; EAX = 0!
- movl 8(%esp), %ecx
- movaps (%ecx), %xmm0
- pinsrw $6, %eax, %xmm0
- shrl $16, %eax ;; EAX = 0 again!
- pinsrw $7, %eax, %xmm0
- movaps %xmm0, (%ecx)
- ret
+ movl 8(%esp), %eax
+ movaps (%eax), %xmm0
+ pxor %xmm1, %xmm1
+ movaps %xmm0, %xmm2
+ shufps $50, %xmm1, %xmm2
+ shufps $132, %xmm2, %xmm0
+ movaps %xmm0, (%eax)
+ ret
-It would be better to generate:
+Would it be better to generate:
_test:
movl 8(%esp), %ecx
@@ -508,7 +519,7 @@ _test:
movaps %xmm0, (%ecx)
ret
-or use pxor (to make a zero vector) and shuffle (to insert it).
+?
//===---------------------------------------------------------------------===//
@@ -576,32 +587,6 @@ swizzle:
//===---------------------------------------------------------------------===//
-This code:
-
-#include <emmintrin.h>
-__m128i test(long long i) { return _mm_cvtsi64x_si128(i); }
-
-Should turn into a single 'movq %rdi, %xmm0' instruction. Instead, we
-get this (on x86-64):
-
-_test:
- movd %rdi, %xmm1
- xorps %xmm0, %xmm0
- movsd %xmm1, %xmm0
- ret
-
-The LLVM IR is:
-
-target triple = "x86_64-apple-darwin8"
-define <2 x i64> @test(i64 %i) {
-entry:
- %tmp10 = insertelement <2 x i64> undef, i64 %i, i32 0
- %tmp11 = insertelement <2 x i64> %tmp10, i64 0, i32 1
- ret <2 x i64> %tmp11
-}
-
-//===---------------------------------------------------------------------===//
-
These functions should produce the same code:
#include <emmintrin.h>
@@ -671,43 +656,6 @@ beneficial because it prevents the load from being folded into the multiply.
//===---------------------------------------------------------------------===//
-In this loop:
-
-bb49: ; preds = %bb49, %bb49.preheader
- %indvar = phi i32 [ 0, %bb49.preheader ], [ %indvar.next, %bb49 ] ; <i32> [#uses=2]
- %dp.089.0.rec = shl i32 %indvar, 3 ; <i32> [#uses=2]
- %dp.089.0 = getelementptr i32* %tmp89, i32 %dp.089.0.rec ; <i32*> [#uses=1]
- %tmp5051 = bitcast i32* %dp.089.0 to <2 x i64>* ; <<2 x i64>*> [#uses=1]
- store <2 x i64> zeroinitializer, <2 x i64>* %tmp5051, align 16
- %dp.089.0.sum105 = or i32 %dp.089.0.rec, 4 ; <i32> [#uses=1]
- %tmp56 = getelementptr i32* %tmp89, i32 %dp.089.0.sum105 ; <i32*> [#uses=1]
- %tmp5657 = bitcast i32* %tmp56 to <2 x i64>* ; <<2 x i64>*> [#uses=1]
- store <2 x i64> zeroinitializer, <2 x i64>* %tmp5657, align 16
- %indvar.next = add i32 %indvar, 1 ; <i32> [#uses=2]
- %exitcond = icmp eq i32 %indvar.next, %tmp98 ; <i1> [#uses=1]
- br i1 %exitcond, label %bb72, label %bb49
-
-we get:
-
-LBB5_6: # bb49.preheader
- shlw $2, %si
- decw %si
- movzwl %si, %eax
- incl %eax
- xorl %ecx, %ecx
-LBB5_7: # bb49
- xorps %xmm0, %xmm0 # (1)
- movaps %xmm0, (%edx)
- movaps %xmm0, 16(%edx)
- addl $32, %edx
- incl %ecx
- cmpl %eax, %ecx
- jne LBB4_7 # bb47
-
-The instruction at (1) can be moved out of the main body of the loop.
-
-//===---------------------------------------------------------------------===//
-
These functions:
#include <xmmintrin.h>
diff --git a/llvm/test/CodeGen/X86/vec_set-8.ll b/llvm/test/CodeGen/X86/vec_set-8.ll
new file mode 100644
index 00000000000..cca436bf643
--- /dev/null
+++ b/llvm/test/CodeGen/X86/vec_set-8.ll
@@ -0,0 +1,10 @@
+; RUN: llvm-as < %s | llc -march=x86-64 | not grep movsd
+; RUN: llvm-as < %s | llc -march=x86-64 | grep {movd.*%rdi,.*%xmm0}
+
+define <2 x i64> @test(i64 %i) nounwind {
+entry:
+ %tmp10 = insertelement <2 x i64> undef, i64 %i, i32 0
+ %tmp11 = insertelement <2 x i64> %tmp10, i64 0, i32 1
+ ret <2 x i64> %tmp11
+}
+
OpenPOWER on IntegriCloud