From b078350872693f37726e78caa1c413dd736cff4e Mon Sep 17 00:00:00 2001
From: Nadav Rotem <nadav.rotem@intel.com>
Date: Sun, 1 Apr 2012 19:31:22 +0000
Subject: This commit contains a few changes that had to go in together.

1. Simplify xor/and/or (bitcast(A), bitcast(B)) -> bitcast(op (A,B))
   (and also scalar_to_vector).

2. Xor/and/or are indifferent to the swizzle operation (shuffle of one src).
   Simplify xor/and/or (shuff(A), shuff(B)) -> shuff(op (A, B))

3. Optimize swizzles of shuffles:  shuff(shuff(x, y), undef) -> shuff(x, y).

4. Fix an X86ISelLowering optimization which was very bitcast-sensitive.

Code which was previously compiled to this:

movd    (%rsi), %xmm0
movdqa  .LCPI0_0(%rip), %xmm2
pshufb  %xmm2, %xmm0
movd    (%rdi), %xmm1
pshufb  %xmm2, %xmm1
pxor    %xmm0, %xmm1
pshufb  .LCPI0_1(%rip), %xmm1
movd    %xmm1, (%rdi)
ret

Now compiles to this:

movl    (%rsi), %eax
xorl    %eax, (%rdi)
ret

llvm-svn: 153848
---
 llvm/test/CodeGen/ARM/reg_sequence.ll      |  2 +-
 llvm/test/CodeGen/CellSPU/rotate_ops.ll    |  2 +-
 llvm/test/CodeGen/X86/2011-10-27-tstore.ll | 12 ++++++------
 llvm/test/CodeGen/X86/SwizzleShuff.ll      | 14 ++++++++++++++
 llvm/test/CodeGen/X86/vec_shuffle-37.ll    | 10 +++++-----
 llvm/test/CodeGen/X86/widen_shuffle-1.ll   |  2 +-
 6 files changed, 28 insertions(+), 14 deletions(-)
 create mode 100644 llvm/test/CodeGen/X86/SwizzleShuff.ll

(limited to 'llvm/test/CodeGen')
diff --git a/llvm/test/CodeGen/ARM/reg_sequence.ll b/llvm/test/CodeGen/ARM/reg_sequence.ll
index 78b4e7ea84c..05794e4ebdd 100644
--- a/llvm/test/CodeGen/ARM/reg_sequence.ll
+++ b/llvm/test/CodeGen/ARM/reg_sequence.ll
@@ -273,7 +273,7 @@ define arm_aapcs_vfpcc i32 @t10() nounwind {
 entry:
 ; CHECK: t10:
 ; CHECK: vmov.i32 q[[Q0:[0-9]+]], #0x3f000000
-; CHECK: vmul.f32 q8, q8, d0[0]
+; CHECK: vmul.f32 q8, q8, d[[DREG:[0-1]+]]
 ; CHECK: vadd.f32 q8, q8, q8
   %0 = shufflevector <4 x float> zeroinitializer, <4 x float> undef, <4 x i32> zeroinitializer ; <<4 x float>> [#uses=1]
   %1 = insertelement <4 x float> %0, float undef, i32 1 ; <<4 x float>> [#uses=1]
diff --git a/llvm/test/CodeGen/CellSPU/rotate_ops.ll b/llvm/test/CodeGen/CellSPU/rotate_ops.ll
index 97709352760..8b7af20b4a9 100644
--- a/llvm/test/CodeGen/CellSPU/rotate_ops.ll
+++ b/llvm/test/CodeGen/CellSPU/rotate_ops.ll
@@ -1,5 +1,5 @@
 ; RUN: llc < %s -march=cellspu -o %t1.s
-; RUN: grep rot          %t1.s | count 86
+; RUN: grep rot          %t1.s | count 85
 ; RUN: grep roth         %t1.s | count 8
 ; RUN: grep roti.*5      %t1.s | count 1
 ; RUN: grep roti.*27     %t1.s | count 1
diff --git a/llvm/test/CodeGen/X86/2011-10-27-tstore.ll b/llvm/test/CodeGen/X86/2011-10-27-tstore.ll
index 016e02c3d5d..1712f345653 100644
--- a/llvm/test/CodeGen/X86/2011-10-27-tstore.ll
+++ b/llvm/test/CodeGen/X86/2011-10-27-tstore.ll
@@ -3,14 +3,14 @@
 target triple = "x86_64-unknown-linux-gnu"
 
 ;CHECK: ltstore
-;CHECK: pshufd
-;CHECK: pshufd
-;CHECK: ret
-define void @ltstore() {
+;CHECK: movq
+;CHECK-NEXT: movq
+;CHECK-NEXT: ret
+define void @ltstore(<4 x i32>* %pIn, <2 x i32>* %pOut) {
 entry:
-  %in = load <4 x i32>* undef
+  %in = load <4 x i32>* %pIn
   %j = shufflevector <4 x i32> %in, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
-  store <2 x i32> %j, <2 x i32>* undef
+  store <2 x i32> %j, <2 x i32>* %pOut
   ret void
 }
 
diff --git a/llvm/test/CodeGen/X86/SwizzleShuff.ll b/llvm/test/CodeGen/X86/SwizzleShuff.ll
new file mode 100644
index 00000000000..11b702e3d1b
--- /dev/null
+++ b/llvm/test/CodeGen/X86/SwizzleShuff.ll
@@ -0,0 +1,14 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck %s
+
+; Check that we perform a scalar XOR on i32.
+
+; CHECK: pull_bitcast
+; CHECK: xorl
+; CHECK: ret
+define void @pull_bitcast (<4 x i8>* %pA, <4 x i8>* %pB) {
+  %A = load <4 x i8>* %pA
+  %B = load <4 x i8>* %pB
+  %C = xor <4 x i8> %A, %B
+  store <4 x i8> %C, <4 x i8>* %pA
+  ret void
+}
diff --git a/llvm/test/CodeGen/X86/vec_shuffle-37.ll b/llvm/test/CodeGen/X86/vec_shuffle-37.ll
index 06083989382..65486cb80c9 100644
--- a/llvm/test/CodeGen/X86/vec_shuffle-37.ll
+++ b/llvm/test/CodeGen/X86/vec_shuffle-37.ll
@@ -27,11 +27,11 @@ entry:
 define void @t02(<8 x i32>* %source, <2 x i32>* %dest) nounwind noinline {
 entry:
 ; CHECK: t02
-; CHECK: movaps
-; CHECK: shufps
-; CHECK: pshufd
-; CHECK: movq
-; CHECK: ret
+; CHECK: mov
+; CHECK-NEXT: mov
+; CHECK-NEXT: mov
+; CHECK-NEXT: mov
+; CHECK-NEXT: ret
   %0 = bitcast <8 x i32>* %source to <4 x i32>*
   %arrayidx = getelementptr inbounds <4 x i32>* %0, i64 3
   %tmp2 = load <4 x i32>* %arrayidx, align 16
diff --git a/llvm/test/CodeGen/X86/widen_shuffle-1.ll b/llvm/test/CodeGen/X86/widen_shuffle-1.ll
index 7bebb274f6e..94200537168 100644
--- a/llvm/test/CodeGen/X86/widen_shuffle-1.ll
+++ b/llvm/test/CodeGen/X86/widen_shuffle-1.ll
@@ -33,7 +33,7 @@ entry:
 define void @shuf3(<4 x float> %tmp10, <4 x float> %vecinit15, <4 x float>* %dst) nounwind {
 entry:
 ; CHECK: shuf3:
-; CHECK: shufps
+; CHECK: shufd
   %shuffle.i.i.i12 = shufflevector <4 x float> %tmp10, <4 x float> %vecinit15, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
   %tmp25.i.i = shufflevector <4 x float> %shuffle.i.i.i12, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2> 
   %tmp1.i.i = shufflevector <3 x float> %tmp25.i.i, <3 x float> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-- 
cgit v1.2.3