diff options
| author | Craig Topper <craig.topper@intel.com> | 2018-01-14 19:24:10 +0000 |
|---|---|---|
| committer | Craig Topper <craig.topper@intel.com> | 2018-01-14 19:24:10 +0000 |
| commit | 7197a452fc79138b991f89941e1968b4d544e796 (patch) | |
| tree | 35b0e47bb6fccb07aeb060fffbf6595e4a9c7e5d /llvm/lib/IR/AutoUpgrade.cpp | |
| parent | f517f1a5160577f03d03820020a81d85447bc2fe (diff) | |
| download | bcm5719-llvm-7197a452fc79138b991f89941e1968b4d544e796.tar.gz bcm5719-llvm-7197a452fc79138b991f89941e1968b4d544e796.zip | |
[X86] Autoupgrade kunpck intrinsics using vector operations instead of scalar operations
Summary: This patch changes the kunpck intrinsic autoupgrade to use vXi1 shufflevector operations to perform vector extracts and concats. This more closely matches the definition of the kunpck instructions. Currently we rely on a DAG combine to turn the scalar shift/and/or code into a concat vectors operation. By doing it in the IR we get this for free.
Reviewers: spatel, RKSimon, zvi, jina.nahias
Reviewed By: RKSimon
Subscribers: llvm-commits
Differential Revision: https://reviews.llvm.org/D42018
llvm-svn: 322462
Diffstat (limited to 'llvm/lib/IR/AutoUpgrade.cpp')
| -rw-r--r-- | llvm/lib/IR/AutoUpgrade.cpp | 22 |
1 files changed, 17 insertions, 5 deletions
diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp index 98fd616d0fc..bf96e772394 100644 --- a/llvm/lib/IR/AutoUpgrade.cpp +++ b/llvm/lib/IR/AutoUpgrade.cpp @@ -1070,11 +1070,23 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) { Rep = EmitX86Select(Builder, CI->getArgOperand(2), Rep, CI->getArgOperand(1)); } else if (IsX86 && (Name.startswith("avx512.kunpck"))) { - uint64_t Shift = CI->getType()->getScalarSizeInBits() / 2; - uint64_t And = (1ULL << Shift) - 1; - Value* LowBits = Builder.CreateAnd(CI->getArgOperand(0), And); - Value* HighBits = Builder.CreateShl(CI->getArgOperand(1), Shift); - Rep = Builder.CreateOr(LowBits, HighBits); + unsigned NumElts = CI->getType()->getScalarSizeInBits(); + Value *LHS = getX86MaskVec(Builder, CI->getArgOperand(0), NumElts); + Value *RHS = getX86MaskVec(Builder, CI->getArgOperand(1), NumElts); + uint32_t Indices[64]; + for (unsigned i = 0; i != NumElts; ++i) + Indices[i] = i; + + // First extract half of each vector. This gives better codegen than + // doing it in a single shuffle. + LHS = Builder.CreateShuffleVector(LHS, LHS, + makeArrayRef(Indices, NumElts / 2)); + RHS = Builder.CreateShuffleVector(RHS, RHS, + makeArrayRef(Indices, NumElts / 2)); + // Concat the vectors. + Rep = Builder.CreateShuffleVector(LHS, RHS, + makeArrayRef(Indices, NumElts)); + Rep = Builder.CreateBitCast(Rep, CI->getType()); } else if (IsX86 && (Name == "sse.add.ss" || Name == "sse2.add.sd")) { Type *I32Ty = Type::getInt32Ty(C); Value *Elt0 = Builder.CreateExtractElement(CI->getArgOperand(0), |

