Diffstat (limited to 'llvm')
-rw-r--r--  llvm/include/llvm/Analysis/TargetTransformInfo.h                    13
-rw-r--r--  llvm/include/llvm/Analysis/TargetTransformInfoImpl.h                14
-rw-r--r--  llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h  16
-rw-r--r--  llvm/lib/Analysis/TargetTransformInfo.cpp                           10
-rw-r--r--  llvm/lib/Target/X86/X86TargetTransformInfo.cpp                      35
-rw-r--r--  llvm/lib/Target/X86/X86TargetTransformInfo.h                         2
-rw-r--r--  llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp         32
-rw-r--r--  llvm/lib/Transforms/Vectorize/LoopVectorize.cpp                      2
-rw-r--r--  llvm/test/Transforms/LoopVectorize/X86/nontemporal.ll              112
-rw-r--r--  llvm/test/Transforms/LoopVectorize/nontemporal.ll                   10
10 files changed, 234 insertions, 12 deletions
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 52d4cb73c5b..f53b17df012 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -531,6 +531,11 @@ public:
/// Return true if the target supports masked store.
bool isLegalMaskedStore(Type *DataType) const;
/// Return true if the target supports masked load.
bool isLegalMaskedLoad(Type *DataType) const;
+ /// Return true if the target supports nontemporal store.
+ bool isLegalNTStore(Type *DataType, unsigned Alignment) const;
+ /// Return true if the target supports nontemporal load.
+ bool isLegalNTLoad(Type *DataType, unsigned Alignment) const;
+
/// Return true if the target supports masked scatter.
bool isLegalMaskedScatter(Type *DataType) const;
/// Return true if the target supports masked gather.
@@ -1118,6 +1123,8 @@ public:
virtual bool shouldFavorBackedgeIndex(const Loop *L) const = 0;
virtual bool isLegalMaskedStore(Type *DataType) = 0;
virtual bool isLegalMaskedLoad(Type *DataType) = 0;
+ virtual bool isLegalNTStore(Type *DataType, unsigned Alignment) = 0;
+ virtual bool isLegalNTLoad(Type *DataType, unsigned Alignment) = 0;
virtual bool isLegalMaskedScatter(Type *DataType) = 0;
virtual bool isLegalMaskedGather(Type *DataType) = 0;
virtual bool isLegalMaskedCompressStore(Type *DataType) = 0;
@@ -1373,6 +1380,12 @@ public:
bool isLegalMaskedLoad(Type *DataType) override {
return Impl.isLegalMaskedLoad(DataType);
}
+ bool isLegalNTStore(Type *DataType, unsigned Alignment) override {
+ return Impl.isLegalNTStore(DataType, Alignment);
+ }
+ bool isLegalNTLoad(Type *DataType, unsigned Alignment) override {
+ return Impl.isLegalNTLoad(DataType, Alignment);
+ }
bool isLegalMaskedScatter(Type *DataType) override {
return Impl.isLegalMaskedScatter(DataType);
}
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index 2527495adc0..f8b36ec43a3 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -229,6 +229,20 @@ public:
bool isLegalMaskedLoad(Type *DataType) { return false; }
+ bool isLegalNTStore(Type *DataType, unsigned Alignment) {
+ // By default, assume nontemporal memory stores are available for stores
+ // that are aligned and have a size that is a power of 2.
+ unsigned DataSize = DL.getTypeStoreSize(DataType);
+ return Alignment >= DataSize && isPowerOf2_32(DataSize);
+ }
+
+ bool isLegalNTLoad(Type *DataType, unsigned Alignment) {
+ // By default, assume nontemporal memory loads are available for loads that
+ // are aligned and have a size that is a power of 2.
+ unsigned DataSize = DL.getTypeStoreSize(DataType);
+ return Alignment >= DataSize && isPowerOf2_32(DataSize);
+ }
+
bool isLegalMaskedScatter(Type *DataType) { return false; }
bool isLegalMaskedGather(Type *DataType) { return false; }
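The default implementation above reduces to a simple two-clause predicate: the access must be aligned to at least its own store size, and that size must be a power of two. A minimal standalone C++ sketch of the same rule (an illustration, not the LLVM API; DataSize stands in for DL.getTypeStoreSize(DataType)):

#include <cstdint>

// Sketch of the default isLegalNTStore/isLegalNTLoad rule: an access is
// assumed legal iff it is aligned to at least its own size and that size
// is a power of 2.
static bool isPow2(uint32_t X) { return X != 0 && (X & (X - 1)) == 0; }

static bool defaultIsLegalNTAccess(unsigned DataSize, unsigned Alignment) {
  return Alignment >= DataSize && isPow2(DataSize);
}

int main() {
  bool A = defaultIsLegalNTAccess(16, 16); // e.g. <4 x i32> at align 16: legal
  bool B = defaultIsLegalNTAccess(16, 4);  // same type, underaligned: not legal
  bool C = defaultIsLegalNTAccess(12, 16); // 12 bytes, not a power of 2: not legal
  return (A && !B && !C) ? 0 : 1;          // exits 0 if the rule behaves as stated
}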
diff --git a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
index 4089bfab754..b144006e262 100644
--- a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
+++ b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
@@ -205,12 +205,13 @@ class LoopVectorizationLegality {
public:
LoopVectorizationLegality(
Loop *L, PredicatedScalarEvolution &PSE, DominatorTree *DT,
- TargetLibraryInfo *TLI, AliasAnalysis *AA, Function *F,
- std::function<const LoopAccessInfo &(Loop &)> *GetLAA, LoopInfo *LI,
- OptimizationRemarkEmitter *ORE, LoopVectorizationRequirements *R,
- LoopVectorizeHints *H, DemandedBits *DB, AssumptionCache *AC)
- : TheLoop(L), LI(LI), PSE(PSE), TLI(TLI), DT(DT), GetLAA(GetLAA),
- ORE(ORE), Requirements(R), Hints(H), DB(DB), AC(AC) {}
+ TargetTransformInfo *TTI, TargetLibraryInfo *TLI, AliasAnalysis *AA,
+ Function *F, std::function<const LoopAccessInfo &(Loop &)> *GetLAA,
+ LoopInfo *LI, OptimizationRemarkEmitter *ORE,
+ LoopVectorizationRequirements *R, LoopVectorizeHints *H, DemandedBits *DB,
+ AssumptionCache *AC)
+ : TheLoop(L), LI(LI), PSE(PSE), TTI(TTI), TLI(TLI), DT(DT),
+ GetLAA(GetLAA), ORE(ORE), Requirements(R), Hints(H), DB(DB), AC(AC) {}
/// ReductionList contains the reduction descriptors for all
/// of the reductions that were found in the loop.
@@ -402,6 +403,9 @@ private:
/// unrolling.
PredicatedScalarEvolution &PSE;
+ /// Target Transform Info.
+ TargetTransformInfo *TTI;
+
/// Target Library Info.
TargetLibraryInfo *TLI;
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index 763b6841878..a55c1be1a09 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -183,6 +183,16 @@ bool TargetTransformInfo::isLegalMaskedLoad(Type *DataType) const {
return TTIImpl->isLegalMaskedLoad(DataType);
}
+bool TargetTransformInfo::isLegalNTStore(Type *DataType,
+ unsigned Alignment) const {
+ return TTIImpl->isLegalNTStore(DataType, Alignment);
+}
+
+bool TargetTransformInfo::isLegalNTLoad(Type *DataType,
+ unsigned Alignment) const {
+ return TTIImpl->isLegalNTLoad(DataType, Alignment);
+}
+
bool TargetTransformInfo::isLegalMaskedGather(Type *DataType) const {
return TTIImpl->isLegalMaskedGather(DataType);
}
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
index 2b9a61d4c87..08e46ed2ce3 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -3143,6 +3143,41 @@ bool X86TTIImpl::isLegalMaskedStore(Type *DataType) {
return isLegalMaskedLoad(DataType);
}
+bool X86TTIImpl::isLegalNTLoad(Type *DataType, unsigned Alignment) {
+ unsigned DataSize = DL.getTypeStoreSize(DataType);
+ // The only supported nontemporal loads are for aligned vectors of 16 or 32
+ // bytes. Note that 32-byte nontemporal vector loads are supported by AVX2
+ // (the equivalent stores only require AVX).
+ if (Alignment >= DataSize && (DataSize == 16 || DataSize == 32))
+ return DataSize == 16 ? ST->hasSSE1() : ST->hasAVX2();
+
+ return false;
+}
+
+bool X86TTIImpl::isLegalNTStore(Type *DataType, unsigned Alignment) {
+ unsigned DataSize = DL.getTypeStoreSize(DataType);
+
+ // SSE4A supports nontemporal stores of float and double at arbitrary
+ // alignment.
+ if (ST->hasSSE4A() && (DataType->isFloatTy() || DataType->isDoubleTy()))
+ return true;
+
+ // Besides the SSE4A subtarget exception above, only aligned stores are
+ // available nontemporally on any other subtarget, and only stores with a
+ // size of 4..32 bytes (powers of 2 only) are permitted.
+ if (Alignment < DataSize || DataSize < 4 || DataSize > 32 ||
+ !isPowerOf2_32(DataSize))
+ return false;
+
+ // 32-byte vector nontemporal stores are supported by AVX (the equivalent
+ // loads require AVX2).
+ if (DataSize == 32)
+ return ST->hasAVX();
+ else if (DataSize == 16)
+ return ST->hasSSE1();
+ return true;
+}
+
bool X86TTIImpl::isLegalMaskedExpandLoad(Type *DataTy) {
if (!isa<VectorType>(DataTy))
return false;
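To make the X86 store rules above concrete, here is a minimal standalone C++ sketch of the store-side logic (an illustration, not the LLVM API; the Has* flags stand in for the ST->has*() subtarget queries and DataSize for DL.getTypeStoreSize(DataType)). The main() walks the case the vectorizer probes for a plain align-4 i32 nontemporal store: the 2-element vector is 8 bytes, but the scalar loop only guarantees 4-byte alignment, so the store is rejected:

#include <cstdint>

static bool isPow2(uint32_t X) { return X != 0 && (X & (X - 1)) == 0; }

// Mirror of the X86 isLegalNTStore logic above, under the stated assumptions.
static bool x86IsLegalNTStore(unsigned DataSize, unsigned Alignment,
                              bool IsScalarFloatOrDouble, bool HasSSE4A,
                              bool HasSSE1, bool HasAVX) {
  // SSE4A stores scalar float/double nontemporally at any alignment
  // (MOVNTSS/MOVNTSD).
  if (HasSSE4A && IsScalarFloatOrDouble)
    return true;
  // Otherwise only aligned, power-of-2 sizes of 4..32 bytes qualify.
  if (Alignment < DataSize || DataSize < 4 || DataSize > 32 || !isPow2(DataSize))
    return false;
  if (DataSize == 32)
    return HasAVX;  // 32-byte vector NT stores need AVX.
  if (DataSize == 16)
    return HasSSE1; // 16-byte vector NT stores need SSE1 (MOVNTPS).
  return true;      // 4- and 8-byte scalar NT stores (e.g. MOVNTI).
}

int main() {
  // <2 x i32> is 8 bytes, but the loop only guarantees align 4: rejected.
  bool Legal = x86IsLegalNTStore(/*DataSize=*/8, /*Alignment=*/4,
                                 /*IsScalarFloatOrDouble=*/false,
                                 /*HasSSE4A=*/false, /*HasSSE1=*/true,
                                 /*HasAVX=*/false);
  return Legal ? 1 : 0; // expect 0: not legal, so the loop stays scalar
}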
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.h b/llvm/lib/Target/X86/X86TargetTransformInfo.h
index 351a4f22060..f43155e3838 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.h
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.h
@@ -186,6 +186,8 @@ public:
bool canMacroFuseCmp();
bool isLegalMaskedLoad(Type *DataType);
bool isLegalMaskedStore(Type *DataType);
+ bool isLegalNTLoad(Type *DataType, unsigned Alignment);
+ bool isLegalNTStore(Type *DataType, unsigned Alignment);
bool isLegalMaskedGather(Type *DataType);
bool isLegalMaskedScatter(Type *DataType);
bool isLegalMaskedExpandLoad(Type *DataType);
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
index e5713c4355f..6ef8dc2d3cd 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -767,6 +767,38 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
return false;
}
+ // For nontemporal stores, check that a nontemporal vector version is
+ // supported on the target.
+ if (ST->getMetadata(LLVMContext::MD_nontemporal)) {
+ // Arbitrarily try a vector of 2 elements.
+ Type *VecTy = VectorType::get(T, /*NumElements=*/2);
+ assert(VecTy && "did not find vectorized version of stored type");
+ unsigned Alignment = getLoadStoreAlignment(ST);
+ if (!TTI->isLegalNTStore(VecTy, Alignment)) {
+ reportVectorizationFailure(
+ "nontemporal store instruction cannot be vectorized",
+ "nontemporal store instruction cannot be vectorized",
+ "CantVectorizeNontemporalStore", ST);
+ return false;
+ }
+ }
+
+ } else if (auto *LD = dyn_cast<LoadInst>(&I)) {
+ if (LD->getMetadata(LLVMContext::MD_nontemporal)) {
+ // For nontemporal loads, check that a nontemporal vector version is
+ // supported on the target (arbitrarily try a vector of 2 elements).
+ Type *VecTy = VectorType::get(I.getType(), /*NumElements=*/2);
+ assert(VecTy && "did not find vectorized version of load type");
+ unsigned Alignment = getLoadStoreAlignment(LD);
+ if (!TTI->isLegalNTLoad(VecTy, Alignment)) {
+ reportVectorizationFailure(
+ "nontemporal load instruction cannot be vectorized",
+ "nontemporal load instruction cannot be vectorized",
+ "CantVectorizeNontemporalLoad", LD);
+ return false;
+ }
+ }
+
// FP instructions can allow unsafe algebra, thus vectorizable by
// non-IEEE-754 compliant SIMD units.
// This applies to floating-point math operations and calls, not memory
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 01104f68bb3..7b3b9ddfad4 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7275,7 +7275,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
// Check if it is legal to vectorize the loop.
LoopVectorizationRequirements Requirements(*ORE);
- LoopVectorizationLegality LVL(L, PSE, DT, TLI, AA, F, GetLAA, LI, ORE,
+ LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE,
&Requirements, &Hints, DB, AC);
if (!LVL.canVectorize(EnableVPlanNativePath)) {
LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
diff --git a/llvm/test/Transforms/LoopVectorize/X86/nontemporal.ll b/llvm/test/Transforms/LoopVectorize/X86/nontemporal.ll
new file mode 100644
index 00000000000..c83ca291c49
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/X86/nontemporal.ll
@@ -0,0 +1,112 @@
+; RUN: opt < %s -loop-vectorize -force-vector-width=4 -S | FileCheck %s
+
+; The three test cases below are all based on modified versions of a simple copy-loop:
+;
+; void foo(unsigned *src, unsigned *dst, unsigned nElts) {
+; for (unsigned i = 0; i < nElts; ++i) {
+; unsigned tmp = src[i];
+; dst[i] = tmp;
+; }
+; }
+;
+; In the first version, there are no nontemporal stores or loads, and so vectorization
+; is safely done.
+;
+; In the second version, the store into dst[i] has the nontemporal hint. The alignment
+; of 'unsigned' on X86_64 is 4, so the vector store generally will not be aligned to the
+; vector size (16 here). Unaligned nontemporal vector stores are not supported on X86_64,
+; and so vectorization is suppressed (if the loop were vectorized, the nontemporal hint
+; would not be honored in the final code-gen).
+;
+; The third version is analogous to the second, except that rather than the store, it is
+; the load from 'src[i]' that has the nontemporal hint. Vectorization is suppressed in
+; this case because (like stores) unaligned nontemporal vector loads are not supported on
+; X86_64.
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64"
+
+; CHECK-LABEL: @vectorTest(
+define void @vectorTest(i32* noalias readonly %src, i32* noalias %dst, i32 %nElts) {
+entry:
+ %cmp8 = icmp eq i32 %nElts, 0
+ br i1 %cmp8, label %for.cond.cleanup, label %for.body.preheader
+
+for.body.preheader: ; preds = %entry
+ %wide.trip.count = zext i32 %nElts to i64
+ br label %for.body
+
+for.cond.cleanup: ; preds = %for.body, %entry
+ ret void
+
+for.body: ; preds = %for.body, %for.body.preheader
+ %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+; Check that we vectorized the load, and that there is no nontemporal hint.
+; CHECK: %wide.load = load <4 x i32>, <4 x i32>* %{{[0-9]+}}, align 4{{$}}
+ %arrayidx = getelementptr inbounds i32, i32* %src, i64 %indvars.iv
+ %0 = load i32, i32* %arrayidx, align 4
+; Check that we vectorized the store, and that there is no nontemporal hint.
+; CHECK: store <4 x i32> %wide.load, <4 x i32>* %{{[0-9]+}}, align 4{{$}}
+ %arrayidx2 = getelementptr inbounds i32, i32* %dst, i64 %indvars.iv
+ store i32 %0, i32* %arrayidx2, align 4
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
+ br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
+
+; CHECK-LABEL: @vectorNTStoreTest(
+; Check that the vectorized type of the store does not appear.
+; CHECK-NOT: 4 x i32
+define void @vectorNTStoreTest(i32* noalias readonly %src, i32* noalias %dst, i32 %nElts) {
+entry:
+ %cmp8 = icmp eq i32 %nElts, 0
+ br i1 %cmp8, label %for.cond.cleanup, label %for.body.preheader
+
+for.body.preheader: ; preds = %entry
+ %wide.trip.count = zext i32 %nElts to i64
+ br label %for.body
+
+for.cond.cleanup: ; preds = %for.body, %entry
+ ret void
+
+for.body: ; preds = %for.body, %for.body.preheader
+ %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+ %arrayidx = getelementptr inbounds i32, i32* %src, i64 %indvars.iv
+ %0 = load i32, i32* %arrayidx, align 4
+ %arrayidx2 = getelementptr inbounds i32, i32* %dst, i64 %indvars.iv
+; Check that the store is not vectorized and that we don't lose the !nontemporal hint in it.
+; CHECK: store i32 %{{[0-9]+}}, i32* %arrayidx2, align 4, !nontemporal !4
+ store i32 %0, i32* %arrayidx2, align 4, !nontemporal !0
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
+ br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
+
+; CHECK-LABEL: @vectorNTLoadTest(
+; Check that the vectorized type of the load does not appear.
+; CHECK-NOT: 4 x i32
+define void @vectorNTLoadTest(i32* noalias readonly %src, i32* noalias %dst, i32 %nElts) {
+entry:
+ %cmp8 = icmp eq i32 %nElts, 0
+ br i1 %cmp8, label %for.cond.cleanup, label %for.body.preheader
+
+for.body.preheader: ; preds = %entry
+ %wide.trip.count = zext i32 %nElts to i64
+ br label %for.body
+
+for.cond.cleanup: ; preds = %for.body, %entry
+ ret void
+
+for.body: ; preds = %for.body, %for.body.preheader
+ %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+ %arrayidx = getelementptr inbounds i32, i32* %src, i64 %indvars.iv
+; Check that the load is not vectorized and that we don't lose the !nontemporal hint in it.
+; CHECK: load i32, i32* %arrayidx, align 4, !nontemporal !4
+ %0 = load i32, i32* %arrayidx, align 4, !nontemporal !0
+ %arrayidx2 = getelementptr inbounds i32, i32* %dst, i64 %indvars.iv
+ store i32 %0, i32* %arrayidx2, align 4
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
+ br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
+
+!0 = !{i32 1}
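For context on where the !nontemporal metadata in these tests comes from (a front-end aside, not part of this patch): with Clang, the scalar copy-loop can be written using the __builtin_nontemporal_load and __builtin_nontemporal_store builtins, which attach this metadata to the generated loads and stores:

// Hedged C++ sketch of the source-level copy-loop with nontemporal hints;
// compiled with Clang, the load and store below carry !nontemporal metadata.
void copyNT(unsigned *src, unsigned *dst, unsigned nElts) {
  for (unsigned i = 0; i < nElts; ++i) {
    unsigned tmp = __builtin_nontemporal_load(&src[i]);
    __builtin_nontemporal_store(tmp, &dst[i]);
  }
}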
diff --git a/llvm/test/Transforms/LoopVectorize/nontemporal.ll b/llvm/test/Transforms/LoopVectorize/nontemporal.ll
index b5719e12e12..5df8b83f78a 100644
--- a/llvm/test/Transforms/LoopVectorize/nontemporal.ll
+++ b/llvm/test/Transforms/LoopVectorize/nontemporal.ll
@@ -14,19 +14,19 @@ for.body.preheader: ; preds = %entry
for.body: ; preds = %for.body.preheader, %for.body
%indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
-; Check that we don't lose !nontemporal hint when vectorizing loads.
-; CHECK: %wide.load{{[0-9]*}} = load <4 x float>, <4 x float>* %{{[0-9]+}}, align 4, !nontemporal !0
+; Check that we don't lose the !nontemporal hint when attempting to vectorize the loads.
+; CHECK: load {{.*}} align 4, !nontemporal !0
%arrayidx = getelementptr inbounds float, float* %b, i64 %indvars.iv
%0 = load float, float* %arrayidx, align 4, !nontemporal !0
; Check that we don't introduce !nontemporal hint when the original scalar loads didn't have it.
-; CHECK: %wide.load{{[0-9]+}} = load <4 x float>, <4 x float>* %{{[0-9]+}}, align 4{{$}}
+; CHECK: load {{.*}} align 4{{$}}
%arrayidx2 = getelementptr inbounds float, float* %c, i64 %indvars.iv
%1 = load float, float* %arrayidx2, align 4
%add = fadd float %0, %1
-; Check that we don't lose !nontemporal hint when vectorizing stores.
-; CHECK: store <4 x float> %{{[0-9]+}}, <4 x float>* %{{[0-9]+}}, align 4, !nontemporal !0
+; Check that we don't lose the !nontemporal hint when attempting to vectorize the stores.
+; CHECK: store {{.*}} align 4, !nontemporal !0
%arrayidx4 = getelementptr inbounds float, float* %a, i64 %indvars.iv
store float %add, float* %arrayidx4, align 4, !nontemporal !0