summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--llvm/lib/CodeGen/InterleavedAccessPass.cpp112
-rw-r--r--llvm/test/CodeGen/AArch64/aarch64-interleaved-accesses-extract-user.ll86
-rw-r--r--llvm/test/CodeGen/AArch64/aarch64-interleaved-accesses.ll12
-rw-r--r--llvm/test/CodeGen/ARM/arm-interleaved-accesses-extract-user.ll86
-rw-r--r--llvm/test/CodeGen/ARM/arm-interleaved-accesses.ll12
5 files changed, 303 insertions, 5 deletions
diff --git a/llvm/lib/CodeGen/InterleavedAccessPass.cpp b/llvm/lib/CodeGen/InterleavedAccessPass.cpp
index 518e79543ba..3f111197685 100644
--- a/llvm/lib/CodeGen/InterleavedAccessPass.cpp
+++ b/llvm/lib/CodeGen/InterleavedAccessPass.cpp
@@ -40,6 +40,7 @@
//===----------------------------------------------------------------------===//
#include "llvm/CodeGen/Passes.h"
+#include "llvm/IR/Dominators.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/MathExtras.h"
@@ -65,7 +66,7 @@ class InterleavedAccess : public FunctionPass {
public:
static char ID;
InterleavedAccess(const TargetMachine *TM = nullptr)
- : FunctionPass(ID), TM(TM), TLI(nullptr) {
+ : FunctionPass(ID), DT(nullptr), TM(TM), TLI(nullptr) {
initializeInterleavedAccessPass(*PassRegistry::getPassRegistry());
}
@@ -73,7 +74,13 @@ public:
bool runOnFunction(Function &F) override;
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ }
+
private:
+ DominatorTree *DT;
const TargetMachine *TM;
const TargetLowering *TLI;
@@ -84,13 +91,26 @@ private:
/// \brief Transform an interleaved store into target specific intrinsics.
bool lowerInterleavedStore(StoreInst *SI,
SmallVector<Instruction *, 32> &DeadInsts);
+
+ /// \brief Returns true if the uses of an interleaved load by the
+ /// extractelement instructions in \p Extracts can be replaced by uses of the
+ /// shufflevector instructions in \p Shuffles instead. If so, the necessary
+ /// replacements are also performed.
+ bool tryReplaceExtracts(ArrayRef<ExtractElementInst *> Extracts,
+ ArrayRef<ShuffleVectorInst *> Shuffles);
};
} // end anonymous namespace.
char InterleavedAccess::ID = 0;
-INITIALIZE_TM_PASS(InterleavedAccess, "interleaved-access",
- "Lower interleaved memory accesses to target specific intrinsics",
- false, false)
+INITIALIZE_TM_PASS_BEGIN(
+ InterleavedAccess, "interleaved-access",
+ "Lower interleaved memory accesses to target specific intrinsics", false,
+ false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_TM_PASS_END(
+ InterleavedAccess, "interleaved-access",
+ "Lower interleaved memory accesses to target specific intrinsics", false,
+ false)
FunctionPass *llvm::createInterleavedAccessPass(const TargetMachine *TM) {
return new InterleavedAccess(TM);
@@ -179,9 +199,18 @@ bool InterleavedAccess::lowerInterleavedLoad(
return false;
SmallVector<ShuffleVectorInst *, 4> Shuffles;
+ SmallVector<ExtractElementInst *, 4> Extracts;
- // Check if all users of this load are shufflevectors.
+ // Check if all users of this load are shufflevectors. If we encounter any
+ // users that are extractelement instructions, we save them to later check if
+ // they can be modified to extract from one of the shufflevectors instead of
+ // the load.
for (auto UI = LI->user_begin(), E = LI->user_end(); UI != E; UI++) {
+ auto *Extract = dyn_cast<ExtractElementInst>(*UI);
+ if (Extract && isa<ConstantInt>(Extract->getIndexOperand())) {
+ Extracts.push_back(Extract);
+ continue;
+ }
ShuffleVectorInst *SVI = dyn_cast<ShuffleVectorInst>(*UI);
if (!SVI || !isa<UndefValue>(SVI->getOperand(1)))
return false;
@@ -217,6 +246,11 @@ bool InterleavedAccess::lowerInterleavedLoad(
Indices.push_back(Index);
}
+ // Try to modify users of the load that are extractelement instructions to
+ // use the shufflevector instructions instead of the load.
+ if (!tryReplaceExtracts(Extracts, Shuffles))
+ return false;
+
DEBUG(dbgs() << "IA: Found an interleaved load: " << *LI << "\n");
// Try to create target specific intrinsics to replace the load and shuffles.
@@ -230,6 +264,73 @@ bool InterleavedAccess::lowerInterleavedLoad(
return true;
}
+bool InterleavedAccess::tryReplaceExtracts(
+ ArrayRef<ExtractElementInst *> Extracts,
+ ArrayRef<ShuffleVectorInst *> Shuffles) {
+
+ // If there aren't any extractelement instructions to modify, there's nothing
+ // to do.
+ if (Extracts.empty())
+ return true;
+
+ // Maps extractelement instructions to vector-index pairs. The extractelement
+ // instructions will be modified to use the new vector and index operands.
+ DenseMap<ExtractElementInst *, std::pair<Value *, int>> ReplacementMap;
+
+ for (auto *Extract : Extracts) {
+
+ // The vector index that is extracted.
+ auto *IndexOperand = cast<ConstantInt>(Extract->getIndexOperand());
+ auto Index = IndexOperand->getSExtValue();
+
+ // Look for a suitable shufflevector instruction. The goal is to modify the
+ // extractelement instruction (which uses an interleaved load) to use one
+ // of the shufflevector instructions instead of the load.
+ for (auto *Shuffle : Shuffles) {
+
+ // If the shufflevector instruction doesn't dominate the extract, we
+ // can't create a use of it.
+ if (!DT->dominates(Shuffle, Extract))
+ continue;
+
+ // Inspect the indices of the shufflevector instruction. If the shuffle
+ // selects the same index that is extracted, we can modify the
+ // extractelement instruction.
+ SmallVector<int, 4> Indices;
+ Shuffle->getShuffleMask(Indices);
+ for (unsigned I = 0; I < Indices.size(); ++I)
+ if (Indices[I] == Index) {
+ assert(Extract->getOperand(0) == Shuffle->getOperand(0) &&
+ "Vector operations do not match");
+ ReplacementMap[Extract] = std::make_pair(Shuffle, I);
+ break;
+ }
+
+ // If we found a suitable shufflevector instruction, stop looking.
+ if (ReplacementMap.count(Extract))
+ break;
+ }
+
+ // If we did not find a suitable shufflevector instruction, the
+ // extractelement instruction cannot be modified, so we must give up.
+ if (!ReplacementMap.count(Extract))
+ return false;
+ }
+
+ // Finally, perform the replacements.
+ IRBuilder<> Builder(Extracts[0]->getContext());
+ for (auto &Replacement : ReplacementMap) {
+ auto *Extract = Replacement.first;
+ auto *Vector = Replacement.second.first;
+ auto Index = Replacement.second.second;
+ Builder.SetInsertPoint(Extract);
+ Extract->replaceAllUsesWith(Builder.CreateExtractElement(Vector, Index));
+ Extract->eraseFromParent();
+ }
+
+ return true;
+}
+
bool InterleavedAccess::lowerInterleavedStore(
StoreInst *SI, SmallVector<Instruction *, 32> &DeadInsts) {
if (!SI->isSimple())
@@ -262,6 +363,7 @@ bool InterleavedAccess::runOnFunction(Function &F) {
DEBUG(dbgs() << "*** " << getPassName() << ": " << F.getName() << "\n");
+ DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
TLI = TM->getSubtargetImpl(F)->getTargetLowering();
MaxFactor = TLI->getMaxSupportedInterleaveFactor();
diff --git a/llvm/test/CodeGen/AArch64/aarch64-interleaved-accesses-extract-user.ll b/llvm/test/CodeGen/AArch64/aarch64-interleaved-accesses-extract-user.ll
new file mode 100644
index 00000000000..8628c4288c6
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/aarch64-interleaved-accesses-extract-user.ll
@@ -0,0 +1,86 @@
+; RUN: opt < %s -mtriple=aarch64 -interleaved-access -S | FileCheck %s
+
+; CHECK-LABEL: @extract_user_basic(
+; CHECK: %ldN = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0v4i32
+; CHECK: %[[R:.+]] = extractvalue { <4 x i32>, <4 x i32> } %ldN, 0
+; CHECK: extractelement <4 x i32> %[[R]], i64 1
+define void @extract_user_basic(<8 x i32>* %A, i1 %C) {
+entry:
+ %L = load <8 x i32>, <8 x i32>* %A, align 8
+ %S = shufflevector <8 x i32> %L, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+ br i1 %C, label %if.then, label %if.merge
+
+if.then:
+ %E = extractelement <8 x i32> %L, i32 2
+ br label %if.merge
+
+if.merge:
+ ret void
+}
+
+; CHECK-LABEL: @extract_user_multi(
+; CHECK: %ldN = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0v4i32
+; CHECK: %[[R:.+]] = extractvalue { <4 x i32>, <4 x i32> } %ldN, 0
+; CHECK: extractelement <4 x i32> %[[R]], i64 0
+; CHECK: extractelement <4 x i32> %[[R]], i64 1
+define void @extract_user_multi(<8 x i32>* %A, i1 %C) {
+entry:
+ %L = load <8 x i32>, <8 x i32>* %A, align 8
+ %S = shufflevector <8 x i32> %L, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+ br i1 %C, label %if.then, label %if.merge
+
+if.then:
+ %E1 = extractelement <8 x i32> %L, i32 0
+ br label %if.merge
+
+if.merge:
+ %E2 = extractelement <8 x i32> %L, i32 2
+ ret void
+}
+
+; CHECK-LABEL: @extract_user_multi_no_dom(
+; CHECK-NOT: %ldN = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0v4i32
+define void @extract_user_multi_no_dom(<8 x i32>* %A, i1 %C) {
+entry:
+ %L = load <8 x i32>, <8 x i32>* %A, align 8
+ %E1 = extractelement <8 x i32> %L, i32 0
+ br i1 %C, label %if.then, label %if.merge
+
+if.then:
+ %S = shufflevector <8 x i32> %L, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+ %E2 = extractelement <8 x i32> %L, i32 2
+ br label %if.merge
+
+if.merge:
+ ret void
+}
+
+; CHECK-LABEL: @extract_user_wrong_const_index(
+; CHECK-NOT: %ldN = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0v4i32
+define void @extract_user_wrong_const_index(<8 x i32>* %A) {
+entry:
+ %L = load <8 x i32>, <8 x i32>* %A, align 8
+ %S = shufflevector <8 x i32> %L, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+ %E = extractelement <8 x i32> %L, i32 1
+ ret void
+}
+
+; CHECK-LABEL: @extract_user_undef_index(
+; CHECK-NOT: %ldN = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0v4i32
+define void @extract_user_undef_index(<8 x i32>* %A) {
+entry:
+ %L = load <8 x i32>, <8 x i32>* %A, align 8
+ %S = shufflevector <8 x i32> %L, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+ %E = extractelement <8 x i32> %L, i32 undef
+ ret void
+}
+
+; CHECK-LABEL: @extract_user_var_index(
+; CHECK-NOT: %ldN = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0v4i32
+define void @extract_user_var_index(<8 x i32>* %A, i32 %I) {
+entry:
+ %L = load <8 x i32>, <8 x i32>* %A, align 8
+ %S = shufflevector <8 x i32> %L, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+ %E = extractelement <8 x i32> %L, i32 %I
+ ret void
+}
diff --git a/llvm/test/CodeGen/AArch64/aarch64-interleaved-accesses.ll b/llvm/test/CodeGen/AArch64/aarch64-interleaved-accesses.ll
index 1bc2a3ccb1c..845050156ba 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-interleaved-accesses.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-interleaved-accesses.ll
@@ -268,3 +268,15 @@ define void @store_illegal_factor2(<3 x float>* %p, <3 x float> %v) nounwind {
store <3 x float> %tmp1, <3 x float>* %p, align 16
ret void
}
+
+; NEON-LABEL: load_factor2_with_extract_user:
+; NEON: ld2 { v0.4s, v1.4s }, [x0]
+; NEON: mov w0, v0.s[1]
+; NONEON-LABEL: load_factor2_with_extract_user:
+; NONEON-NOT: ld2
+define i32 @load_factor2_with_extract_user(<8 x i32>* %a) {
+ %1 = load <8 x i32>, <8 x i32>* %a, align 8
+ %2 = shufflevector <8 x i32> %1, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+ %3 = extractelement <8 x i32> %1, i32 2
+ ret i32 %3
+}
diff --git a/llvm/test/CodeGen/ARM/arm-interleaved-accesses-extract-user.ll b/llvm/test/CodeGen/ARM/arm-interleaved-accesses-extract-user.ll
new file mode 100644
index 00000000000..620cb635641
--- /dev/null
+++ b/llvm/test/CodeGen/ARM/arm-interleaved-accesses-extract-user.ll
@@ -0,0 +1,86 @@
+; RUN: opt < %s -mtriple=arm-eabi -mattr=+neon -interleaved-access -S | FileCheck %s
+
+; CHECK-LABEL: @extract_user_basic(
+; CHECK: %vldN = call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2.v4i32.p0i8
+; CHECK: %[[R:.+]] = extractvalue { <4 x i32>, <4 x i32> } %vldN, 0
+; CHECK: extractelement <4 x i32> %[[R]], i64 1
+define void @extract_user_basic(<8 x i32>* %A, i1 %C) {
+entry:
+ %L = load <8 x i32>, <8 x i32>* %A, align 8
+ %S = shufflevector <8 x i32> %L, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+ br i1 %C, label %if.then, label %if.merge
+
+if.then:
+ %E = extractelement <8 x i32> %L, i32 2
+ br label %if.merge
+
+if.merge:
+ ret void
+}
+
+; CHECK-LABEL: @extract_user_multi(
+; CHECK: %vldN = call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2.v4i32.p0i8
+; CHECK: %[[R:.+]] = extractvalue { <4 x i32>, <4 x i32> } %vldN, 0
+; CHECK: extractelement <4 x i32> %[[R]], i64 0
+; CHECK: extractelement <4 x i32> %[[R]], i64 1
+define void @extract_user_multi(<8 x i32>* %A, i1 %C) {
+entry:
+ %L = load <8 x i32>, <8 x i32>* %A, align 8
+ %S = shufflevector <8 x i32> %L, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+ br i1 %C, label %if.then, label %if.merge
+
+if.then:
+ %E1 = extractelement <8 x i32> %L, i32 0
+ br label %if.merge
+
+if.merge:
+ %E2 = extractelement <8 x i32> %L, i32 2
+ ret void
+}
+
+; CHECK-LABEL: @extract_user_multi_no_dom(
+; CHECK-NOT: %vldN = call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2.v4i32.p0i8
+define void @extract_user_multi_no_dom(<8 x i32>* %A, i1 %C) {
+entry:
+ %L = load <8 x i32>, <8 x i32>* %A, align 8
+ %E1 = extractelement <8 x i32> %L, i32 0
+ br i1 %C, label %if.then, label %if.merge
+
+if.then:
+ %S = shufflevector <8 x i32> %L, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+ %E2 = extractelement <8 x i32> %L, i32 2
+ br label %if.merge
+
+if.merge:
+ ret void
+}
+
+; CHECK-LABEL: @extract_user_wrong_const_index(
+; CHECK-NOT: %vldN = call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2.v4i32.p0i8
+define void @extract_user_wrong_const_index(<8 x i32>* %A) {
+entry:
+ %L = load <8 x i32>, <8 x i32>* %A, align 8
+ %S = shufflevector <8 x i32> %L, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+ %E = extractelement <8 x i32> %L, i32 1
+ ret void
+}
+
+; CHECK-LABEL: @extract_user_undef_index(
+; CHECK-NOT: %vldN = call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2.v4i32.p0i8
+define void @extract_user_undef_index(<8 x i32>* %A) {
+entry:
+ %L = load <8 x i32>, <8 x i32>* %A, align 8
+ %S = shufflevector <8 x i32> %L, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+ %E = extractelement <8 x i32> %L, i32 undef
+ ret void
+}
+
+; CHECK-LABEL: @extract_user_var_index(
+; CHECK-NOT: %vldN = call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2.v4i32.p0i8
+define void @extract_user_var_index(<8 x i32>* %A, i32 %I) {
+entry:
+ %L = load <8 x i32>, <8 x i32>* %A, align 8
+ %S = shufflevector <8 x i32> %L, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+ %E = extractelement <8 x i32> %L, i32 %I
+ ret void
+}
diff --git a/llvm/test/CodeGen/ARM/arm-interleaved-accesses.ll b/llvm/test/CodeGen/ARM/arm-interleaved-accesses.ll
index 002e71f6d9b..6f3d537176c 100644
--- a/llvm/test/CodeGen/ARM/arm-interleaved-accesses.ll
+++ b/llvm/test/CodeGen/ARM/arm-interleaved-accesses.ll
@@ -304,3 +304,15 @@ define void @store_illegal_factor2(<3 x float>* %p, <3 x float> %v) nounwind {
store <3 x float> %tmp1, <3 x float>* %p, align 16
ret void
}
+
+; NEON-LABEL: load_factor2_with_extract_user:
+; NEON: vld2.32 {d16, d17, d18, d19}, [r0:64]
+; NEON: vmov.32 r0, d16[1]
+; NONEON-LABEL: load_factor2_with_extract_user:
+; NONEON-NOT: vld2
+define i32 @load_factor2_with_extract_user(<8 x i32>* %a) {
+ %1 = load <8 x i32>, <8 x i32>* %a, align 8
+ %2 = shufflevector <8 x i32> %1, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+ %3 = extractelement <8 x i32> %1, i32 2
+ ret i32 %3
+}
OpenPOWER on IntegriCloud