summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Simon Pilgrim <llvm-dev@redking.me.uk>, 2017-01-16 11:30:41 +0000
committer: Simon Pilgrim <llvm-dev@redking.me.uk>, 2017-01-16 11:30:41 +0000
commit: 73a68c25a07bed73e6288f7288c0bc3fa3c60ecc (patch)
tree: 09b180e38a0daaa82c5f8f916db29465aaf4018b
parent: 59d725cabf44b53be32ab4402e59f89e52b51f26 (diff)
download: bcm5719-llvm-73a68c25a07bed73e6288f7288c0bc3fa3c60ecc.tar.gz
          bcm5719-llvm-73a68c25a07bed73e6288f7288c0bc3fa3c60ecc.zip
[InstCombine][SSE] Add DemandedElts support for PSHUFB instructions
Simplify a pshufb shuffle mask based on the elements of the mask that are actually demanded.

Differential Revision: https://reviews.llvm.org/D28745

llvm-svn: 292101
-rw-r--r--  llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp            | 12
-rw-r--r--  llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp | 10
-rw-r--r--  llvm/test/Transforms/InstCombine/x86-pshufb.ll                  | 23
3 files changed, 30 insertions, 15 deletions
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
index 2ef82ba3ed8..ec2ebaaed88 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -2315,10 +2315,20 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
case Intrinsic::x86_ssse3_pshuf_b_128:
case Intrinsic::x86_avx2_pshuf_b:
- case Intrinsic::x86_avx512_pshuf_b_512:
+ case Intrinsic::x86_avx512_pshuf_b_512: {
if (Value *V = simplifyX86pshufb(*II, *Builder))
return replaceInstUsesWith(*II, V);
+
+ unsigned VWidth = II->getType()->getVectorNumElements();
+ APInt UndefElts(VWidth, 0);
+ APInt DemandedElts = APInt::getAllOnesValue(VWidth);
+ if (Value *V = SimplifyDemandedVectorElts(II, DemandedElts, UndefElts)) {
+ if (V != II)
+ return replaceInstUsesWith(*II, V);
+ return II;
+ }
break;
+ }
case Intrinsic::x86_avx_vpermilvar_ps:
case Intrinsic::x86_avx_vpermilvar_ps_256:
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
index 8b930bd95df..95100d074b4 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
@@ -1472,6 +1472,16 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts,
break;
}
+ case Intrinsic::x86_ssse3_pshuf_b_128:
+ case Intrinsic::x86_avx2_pshuf_b:
+ case Intrinsic::x86_avx512_pshuf_b_512: {
+ Value *Op1 = II->getArgOperand(1);
+ TmpV = SimplifyDemandedVectorElts(Op1, DemandedElts, UndefElts,
+ Depth + 1);
+ if (TmpV) { II->setArgOperand(1, TmpV); MadeChange = true; }
+ break;
+ }
+
// SSE4A instructions leave the upper 64-bits of the 128-bit result
// in an undefined state.
case Intrinsic::x86_sse4a_extrq:
diff --git a/llvm/test/Transforms/InstCombine/x86-pshufb.ll b/llvm/test/Transforms/InstCombine/x86-pshufb.ll
index 7da216f7e48..f181ef57fe2 100644
--- a/llvm/test/Transforms/InstCombine/x86-pshufb.ll
+++ b/llvm/test/Transforms/InstCombine/x86-pshufb.ll
@@ -469,15 +469,12 @@ define <64 x i8> @fold_with_allundef_elts_avx512(<64 x i8> %InVec) {
}
; Demanded elts tests.
-; FIXME: Missed opportunities to pass demanded elts through the pshufb shuffle mask
define <16 x i8> @demanded_elts_insertion(<16 x i8> %InVec, <16 x i8> %BaseMask, i8 %M0, i8 %M15) {
; CHECK-LABEL: @demanded_elts_insertion(
-; CHECK-NEXT: [[TMP1:%.*]] = insertelement <16 x i8> %BaseMask, i8 %M0, i32 0
-; CHECK-NEXT: [[TMP2:%.*]] = insertelement <16 x i8> [[TMP1]], i8 %M15, i32 15
-; CHECK-NEXT: [[TMP3:%.*]] = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %InVec, <16 x i8> [[TMP2]])
-; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <16 x i8> [[TMP3]], <16 x i8> undef, <16 x i32> <i32 undef, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 undef>
-; CHECK-NEXT: ret <16 x i8> [[TMP4]]
+; CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %InVec, <16 x i8> %BaseMask)
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> undef, <16 x i32> <i32 undef, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 undef>
+; CHECK-NEXT: ret <16 x i8> [[TMP2]]
;
%1 = insertelement <16 x i8> %BaseMask, i8 %M0, i32 0
%2 = insertelement <16 x i8> %1, i8 %M15, i32 15
@@ -489,9 +486,8 @@ define <16 x i8> @demanded_elts_insertion(<16 x i8> %InVec, <16 x i8> %BaseMask,
define <32 x i8> @demanded_elts_insertion_avx2(<32 x i8> %InVec, <32 x i8> %BaseMask, i8 %M0, i8 %M22) {
; CHECK-LABEL: @demanded_elts_insertion_avx2(
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <32 x i8> %BaseMask, i8 %M0, i32 0
-; CHECK-NEXT: [[TMP2:%.*]] = insertelement <32 x i8> [[TMP1]], i8 %M22, i32 22
-; CHECK-NEXT: [[TMP3:%.*]] = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %InVec, <32 x i8> [[TMP2]])
-; CHECK-NEXT: ret <32 x i8> [[TMP3]]
+; CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %InVec, <32 x i8> [[TMP1]])
+; CHECK-NEXT: ret <32 x i8> [[TMP2]]
;
%1 = insertelement <32 x i8> %BaseMask, i8 %M0, i32 0
%2 = insertelement <32 x i8> %1, i8 %M22, i32 22
@@ -502,11 +498,10 @@ define <32 x i8> @demanded_elts_insertion_avx2(<32 x i8> %InVec, <32 x i8> %Base
define <64 x i8> @demanded_elts_insertion_avx512(<64 x i8> %InVec, <64 x i8> %BaseMask, i8 %M0, i8 %M30) {
; CHECK-LABEL: @demanded_elts_insertion_avx512(
-; CHECK-NEXT: [[TMP1:%.*]] = insertelement <64 x i8> %BaseMask, i8 %M0, i32 0
-; CHECK-NEXT: [[TMP2:%.*]] = insertelement <64 x i8> [[TMP1]], i8 %M30, i32 30
-; CHECK-NEXT: [[TMP3:%.*]] = tail call <64 x i8> @llvm.x86.avx512.pshuf.b.512(<64 x i8> %InVec, <64 x i8> [[TMP2]])
-; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <64 x i8> [[TMP3]], <64 x i8> undef, <64 x i32> zeroinitializer
-; CHECK-NEXT: ret <64 x i8> [[TMP4]]
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <64 x i8> undef, i8 %M0, i32 0
+; CHECK-NEXT: [[TMP2:%.*]] = tail call <64 x i8> @llvm.x86.avx512.pshuf.b.512(<64 x i8> %InVec, <64 x i8> [[TMP1]])
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <64 x i8> [[TMP2]], <64 x i8> undef, <64 x i32> zeroinitializer
+; CHECK-NEXT: ret <64 x i8> [[TMP3]]
;
%1 = insertelement <64 x i8> %BaseMask, i8 %M0, i32 0
%2 = insertelement <64 x i8> %1, i8 %M30, i32 30
OpenPOWER on IntegriCloud