[X86][SSE] Vectorized i8 and i16 shift operators

This patch ensures that SHL/SRL/SRA shifts for i8 and i16 vectors avoid scalarization. It builds on the existing i8 SHL vectorized implementation of moving the shift bits up to the sign bit position and separating the 4, 2 & 1 bit shifts, with several improvements:

1 - SSE41 targets can use (v)pblendvb directly with the sign bit instead of performing a comparison to feed into a VSELECT node.
2 - pre-SSE41 targets were masking + comparing with a 0x80 constant - we avoid this by using the fact that a set sign bit means a negative integer, which can be compared against zero to then feed into VSELECT, avoiding the need for a constant mask (zero generation is much cheaper).
3 - SRA i8 needs to be unpacked to the upper byte of an i16 so that the i16 psraw instruction can be correctly used for sign extension - we have to do more work than for SHL/SRL but perf tests indicate that this is still beneficial.

The i16 implementation is similar to but simpler than the i8 one - we have to do 8, 4, 2 & 1 bit shifts but less shift masking is involved. However, SSE41 use of (v)pblendvb requires that the i16 shift amount is splatted to both bytes.

Tested on SSE2, SSE41 and AVX machines.

Differential Revision: http://reviews.llvm.org/D9474

llvm-svn: 239509
author: Simon Pilgrim <llvm-dev@redking.me.uk> 2015-06-11 07:46:37 +0000
committer: Simon Pilgrim <llvm-dev@redking.me.uk> 2015-06-11 07:46:37 +0000
commit: 5965680d533900065a4f4b14d8ea5f025bb44ce3 (patch)
tree: 09606be8248552a49355532b9f524fdc6f4f9d6f /llvm/test/Analysis
parent: 2e8ffa3b4496d124c542679f9367b22abdbc5daf (diff)
download: bcm5719-llvm-5965680d533900065a4f4b14d8ea5f025bb44ce3.tar.gz
bcm5719-llvm-5965680d533900065a4f4b14d8ea5f025bb44ce3.zip
3 files changed, 36 insertions, 36 deletions
diff --git a/llvm/test/Analysis/CostModel/X86/testshiftashr.ll b/llvm/test/Analysis/CostModel/X86/testshiftashr.ll
index d96a92fe2a8..ced2ffed455 100644
--- a/llvm/test/Analysis/CostModel/X86/testshiftashr.ll
+++ b/llvm/test/Analysis/CostModel/X86/testshiftashr.ll
@@ -29,9 +29,9 @@ entry:
 define %shifttype8i16 @shift8i16(%shifttype8i16 %a, %shifttype8i16 %b) {
 entry:
   ; SSE2: shift8i16
-  ; SSE2: cost of 80 {{.*}} ashr
+  ; SSE2: cost of 32 {{.*}} ashr
   ; SSE2-CODEGEN: shift8i16
-  ; SSE2-CODEGEN: sarw %cl
+  ; SSE2-CODEGEN: psraw
 
   %0 = ashr %shifttype8i16 %a , %b
   ret %shifttype8i16 %0
@@ -41,9 +41,9 @@ entry:
 define %shifttype16i16 @shift16i16(%shifttype16i16 %a, %shifttype16i16 %b) {
 entry:
   ; SSE2: shift16i16
-  ; SSE2: cost of 160 {{.*}} ashr
+  ; SSE2: cost of 64 {{.*}} ashr
   ; SSE2-CODEGEN: shift16i16
-  ; SSE2-CODEGEN: sarw %cl
+  ; SSE2-CODEGEN: psraw
 
   %0 = ashr %shifttype16i16 %a , %b
   ret %shifttype16i16 %0
@@ -53,9 +53,9 @@ entry:
 define %shifttype32i16 @shift32i16(%shifttype32i16 %a, %shifttype32i16 %b) {
 entry:
   ; SSE2: shift32i16
-  ; SSE2: cost of 320 {{.*}} ashr
+  ; SSE2: cost of 128 {{.*}} ashr
   ; SSE2-CODEGEN: shift32i16
-  ; SSE2-CODEGEN: sarw %cl
+  ; SSE2-CODEGEN: psraw
 
   %0 = ashr %shifttype32i16 %a , %b
   ret %shifttype32i16 %0
@@ -209,9 +209,9 @@ entry:
 define %shifttype8i8 @shift8i8(%shifttype8i8 %a, %shifttype8i8 %b) {
 entry:
   ; SSE2: shift8i8
-  ; SSE2: cost of 80 {{.*}} ashr
+  ; SSE2: cost of 32 {{.*}} ashr
   ; SSE2-CODEGEN: shift8i8
-  ; SSE2-CODEGEN: sarw %cl
+  ; SSE2-CODEGEN: psraw
 
   %0 = ashr %shifttype8i8 %a , %b
   ret %shifttype8i8 %0
@@ -221,9 +221,9 @@ entry:
 define %shifttype16i8 @shift16i8(%shifttype16i8 %a, %shifttype16i8 %b) {
 entry:
   ; SSE2: shift16i8
-  ; SSE2: cost of 160 {{.*}} ashr
+  ; SSE2: cost of 54 {{.*}} ashr
   ; SSE2-CODEGEN: shift16i8
-  ; SSE2-CODEGEN: sarb %cl
+  ; SSE2-CODEGEN: psraw
 
   %0 = ashr %shifttype16i8 %a , %b
   ret %shifttype16i8 %0
@@ -233,9 +233,9 @@ entry:
 define %shifttype32i8 @shift32i8(%shifttype32i8 %a, %shifttype32i8 %b) {
 entry:
   ; SSE2: shift32i8
-  ; SSE2: cost of 320 {{.*}} ashr
+  ; SSE2: cost of 108 {{.*}} ashr
   ; SSE2-CODEGEN: shift32i8
-  ; SSE2-CODEGEN: sarb %cl
+  ; SSE2-CODEGEN: psraw
 
   %0 = ashr %shifttype32i8 %a , %b
   ret %shifttype32i8 %0
diff --git a/llvm/test/Analysis/CostModel/X86/testshiftlshr.ll b/llvm/test/Analysis/CostModel/X86/testshiftlshr.ll
index 78bf0a60830..0bc60eacac9 100644
--- a/llvm/test/Analysis/CostModel/X86/testshiftlshr.ll
+++ b/llvm/test/Analysis/CostModel/X86/testshiftlshr.ll
@@ -29,9 +29,9 @@ entry:
 define %shifttype8i16 @shift8i16(%shifttype8i16 %a, %shifttype8i16 %b) {
 entry:
   ; SSE2: shift8i16
-  ; SSE2: cost of 80 {{.*}} lshr
+  ; SSE2: cost of 32 {{.*}} lshr
   ; SSE2-CODEGEN: shift8i16
-  ; SSE2-CODEGEN: shrl %cl
+  ; SSE2-CODEGEN: psrlw
 
   %0 = lshr %shifttype8i16 %a , %b
   ret %shifttype8i16 %0
@@ -41,9 +41,9 @@ entry:
 define %shifttype16i16 @shift16i16(%shifttype16i16 %a, %shifttype16i16 %b) {
 entry:
   ; SSE2: shift16i16
-  ; SSE2: cost of 160 {{.*}} lshr
+  ; SSE2: cost of 64 {{.*}} lshr
   ; SSE2-CODEGEN: shift16i16
-  ; SSE2-CODEGEN: shrl %cl
+  ; SSE2-CODEGEN: psrlw
 
   %0 = lshr %shifttype16i16 %a , %b
   ret %shifttype16i16 %0
@@ -53,9 +53,9 @@ entry:
 define %shifttype32i16 @shift32i16(%shifttype32i16 %a, %shifttype32i16 %b) {
 entry:
   ; SSE2: shift32i16
-  ; SSE2: cost of 320 {{.*}} lshr
+  ; SSE2: cost of 128 {{.*}} lshr
   ; SSE2-CODEGEN: shift32i16
-  ; SSE2-CODEGEN: shrl %cl
+  ; SSE2-CODEGEN: psrlw
 
   %0 = lshr %shifttype32i16 %a , %b
   ret %shifttype32i16 %0
@@ -209,9 +209,9 @@ entry:
 define %shifttype8i8 @shift8i8(%shifttype8i8 %a, %shifttype8i8 %b) {
 entry:
   ; SSE2: shift8i8
-  ; SSE2: cost of 80 {{.*}} lshr
+  ; SSE2: cost of 32 {{.*}} lshr
   ; SSE2-CODEGEN: shift8i8
-  ; SSE2-CODEGEN: shrl %cl
+  ; SSE2-CODEGEN: psrlw
 
   %0 = lshr %shifttype8i8 %a , %b
   ret %shifttype8i8 %0
@@ -221,9 +221,9 @@ entry:
 define %shifttype16i8 @shift16i8(%shifttype16i8 %a, %shifttype16i8 %b) {
 entry:
   ; SSE2: shift16i8
-  ; SSE2: cost of 160 {{.*}} lshr
+  ; SSE2: cost of 26 {{.*}} lshr
   ; SSE2-CODEGEN: shift16i8
-  ; SSE2-CODEGEN: shrb %cl
+  ; SSE2-CODEGEN: psrlw
 
   %0 = lshr %shifttype16i8 %a , %b
   ret %shifttype16i8 %0
@@ -233,9 +233,9 @@ entry:
 define %shifttype32i8 @shift32i8(%shifttype32i8 %a, %shifttype32i8 %b) {
 entry:
   ; SSE2: shift32i8
-  ; SSE2: cost of 320 {{.*}} lshr
+  ; SSE2: cost of 52 {{.*}} lshr
   ; SSE2-CODEGEN: shift32i8
-  ; SSE2-CODEGEN: shrb %cl
+  ; SSE2-CODEGEN: psrlw
 
   %0 = lshr %shifttype32i8 %a , %b
   ret %shifttype32i8 %0
diff --git a/llvm/test/Analysis/CostModel/X86/testshiftshl.ll b/llvm/test/Analysis/CostModel/X86/testshiftshl.ll
index c36e0f5dfdf..d4e33818932 100644
--- a/llvm/test/Analysis/CostModel/X86/testshiftshl.ll
+++ b/llvm/test/Analysis/CostModel/X86/testshiftshl.ll
@@ -29,9 +29,9 @@ entry:
 define %shifttype8i16 @shift8i16(%shifttype8i16 %a, %shifttype8i16 %b) {
 entry:
   ; SSE2: shift8i16
-  ; SSE2: cost of 80 {{.*}} shl
+  ; SSE2: cost of 32 {{.*}} shl
   ; SSE2-CODEGEN: shift8i16
-  ; SSE2-CODEGEN: shll %cl
+  ; SSE2-CODEGEN: psllw
 
   %0 = shl %shifttype8i16 %a , %b
   ret %shifttype8i16 %0
@@ -41,9 +41,9 @@ entry:
 define %shifttype16i16 @shift16i16(%shifttype16i16 %a, %shifttype16i16 %b) {
 entry:
   ; SSE2: shift16i16
-  ; SSE2: cost of 160 {{.*}} shl
+  ; SSE2: cost of 64 {{.*}} shl
   ; SSE2-CODEGEN: shift16i16
-  ; SSE2-CODEGEN: shll %cl
+  ; SSE2-CODEGEN: psllw
 
   %0 = shl %shifttype16i16 %a , %b
   ret %shifttype16i16 %0
@@ -53,9 +53,9 @@ entry:
 define %shifttype32i16 @shift32i16(%shifttype32i16 %a, %shifttype32i16 %b) {
 entry:
   ; SSE2: shift32i16
-  ; SSE2: cost of 320 {{.*}} shl
+  ; SSE2: cost of 128 {{.*}} shl
   ; SSE2-CODEGEN: shift32i16
-  ; SSE2-CODEGEN: shll %cl
+  ; SSE2-CODEGEN: psllw
 
   %0 = shl %shifttype32i16 %a , %b
   ret %shifttype32i16 %0
@@ -209,9 +209,9 @@ entry:
 define %shifttype8i8 @shift8i8(%shifttype8i8 %a, %shifttype8i8 %b) {
 entry:
   ; SSE2: shift8i8
-  ; SSE2: cost of 80 {{.*}} shl
+  ; SSE2: cost of 32 {{.*}} shl
   ; SSE2-CODEGEN: shift8i8
-  ; SSE2-CODEGEN: shll
+  ; SSE2-CODEGEN: psllw
 
   %0 = shl %shifttype8i8 %a , %b
   ret %shifttype8i8 %0
@@ -221,9 +221,9 @@ entry:
 define %shifttype16i8 @shift16i8(%shifttype16i8 %a, %shifttype16i8 %b) {
 entry:
   ; SSE2: shift16i8
-  ; SSE2: cost of 30 {{.*}} shl
+  ; SSE2: cost of 26 {{.*}} shl
   ; SSE2-CODEGEN: shift16i8
-  ; SSE2-CODEGEN: cmpeqb
+  ; SSE2-CODEGEN: psllw
 
   %0 = shl %shifttype16i8 %a , %b
   ret %shifttype16i8 %0
@@ -233,9 +233,9 @@ entry:
 define %shifttype32i8 @shift32i8(%shifttype32i8 %a, %shifttype32i8 %b) {
 entry:
   ; SSE2: shift32i8
-  ; SSE2: cost of 60 {{.*}} shl
+  ; SSE2: cost of 52 {{.*}} shl
   ; SSE2-CODEGEN: shift32i8
-  ; SSE2-CODEGEN: cmpeqb
+  ; SSE2-CODEGEN: psllw
 
   %0 = shl %shifttype32i8 %a , %b
   ret %shifttype32i8 %0
author	Simon Pilgrim <llvm-dev@redking.me.uk>	2015-06-11 07:46:37 +0000
committer	Simon Pilgrim <llvm-dev@redking.me.uk>	2015-06-11 07:46:37 +0000
commit	5965680d533900065a4f4b14d8ea5f025bb44ce3 (patch)
tree	09606be8248552a49355532b9f524fdc6f4f9d6f /llvm/test/Analysis
parent	2e8ffa3b4496d124c542679f9367b22abdbc5daf (diff)
download	bcm5719-llvm-5965680d533900065a4f4b14d8ea5f025bb44ce3.tar.gz bcm5719-llvm-5965680d533900065a4f4b14d8ea5f025bb44ce3.zip