summaryrefslogtreecommitdiffstats
path: root/llvm/lib/IR/AutoUpgrade.cpp
diff options
context:
space:
mode:
authorCraig Topper <craig.topper@intel.com>2018-01-14 19:24:10 +0000
committerCraig Topper <craig.topper@intel.com>2018-01-14 19:24:10 +0000
commit7197a452fc79138b991f89941e1968b4d544e796 (patch)
tree35b0e47bb6fccb07aeb060fffbf6595e4a9c7e5d /llvm/lib/IR/AutoUpgrade.cpp
parentf517f1a5160577f03d03820020a81d85447bc2fe (diff)
downloadbcm5719-llvm-7197a452fc79138b991f89941e1968b4d544e796.tar.gz
bcm5719-llvm-7197a452fc79138b991f89941e1968b4d544e796.zip
[X86] Autoupgrade kunpck intrinsics using vector operations instead of scalar operations
Summary: This patch changes the kunpck intrinsic autoupgrade to use vXi1 shufflevector operations to perform vector extracts and concats. This more closely matches the definition of the kunpck instructions. Currently we rely on a DAG combine to turn the scalar shift/and/or code into a concat vectors operation. By doing it in the IR we get this for free. Reviewers: spatel, RKSimon, zvi, jina.nahias Reviewed By: RKSimon Subscribers: llvm-commits Differential Revision: https://reviews.llvm.org/D42018 llvm-svn: 322462
Diffstat (limited to 'llvm/lib/IR/AutoUpgrade.cpp')
-rw-r--r-- llvm/lib/IR/AutoUpgrade.cpp | 22
1 file changed, 17 insertions(+), 5 deletions(-)
diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp
index 98fd616d0fc..bf96e772394 100644
--- a/llvm/lib/IR/AutoUpgrade.cpp
+++ b/llvm/lib/IR/AutoUpgrade.cpp
@@ -1070,11 +1070,23 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
Rep = EmitX86Select(Builder, CI->getArgOperand(2), Rep,
CI->getArgOperand(1));
} else if (IsX86 && (Name.startswith("avx512.kunpck"))) {
- uint64_t Shift = CI->getType()->getScalarSizeInBits() / 2;
- uint64_t And = (1ULL << Shift) - 1;
- Value* LowBits = Builder.CreateAnd(CI->getArgOperand(0), And);
- Value* HighBits = Builder.CreateShl(CI->getArgOperand(1), Shift);
- Rep = Builder.CreateOr(LowBits, HighBits);
+ unsigned NumElts = CI->getType()->getScalarSizeInBits();
+ Value *LHS = getX86MaskVec(Builder, CI->getArgOperand(0), NumElts);
+ Value *RHS = getX86MaskVec(Builder, CI->getArgOperand(1), NumElts);
+ uint32_t Indices[64];
+ for (unsigned i = 0; i != NumElts; ++i)
+ Indices[i] = i;
+
+ // First extract half of each vector. This gives better codegen than
+ // doing it in a single shuffle.
+ LHS = Builder.CreateShuffleVector(LHS, LHS,
+ makeArrayRef(Indices, NumElts / 2));
+ RHS = Builder.CreateShuffleVector(RHS, RHS,
+ makeArrayRef(Indices, NumElts / 2));
+ // Concat the vectors.
+ Rep = Builder.CreateShuffleVector(LHS, RHS,
+ makeArrayRef(Indices, NumElts));
+ Rep = Builder.CreateBitCast(Rep, CI->getType());
} else if (IsX86 && (Name == "sse.add.ss" || Name == "sse2.add.sd")) {
Type *I32Ty = Type::getInt32Ty(C);
Value *Elt0 = Builder.CreateExtractElement(CI->getArgOperand(0),
OpenPOWER on IntegriCloud