Diffstat (limited to 'llvm')
-rw-r--r--  llvm/include/llvm/Analysis/TargetTransformInfo.h                    13
-rw-r--r--  llvm/include/llvm/Analysis/TargetTransformInfoImpl.h                14
-rw-r--r--  llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h  16
-rw-r--r--  llvm/lib/Analysis/TargetTransformInfo.cpp                           10
-rw-r--r--  llvm/lib/Target/X86/X86TargetTransformInfo.cpp                      35
-rw-r--r--  llvm/lib/Target/X86/X86TargetTransformInfo.h                         2
-rw-r--r--  llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp         32
-rw-r--r--  llvm/lib/Transforms/Vectorize/LoopVectorize.cpp                      2
-rw-r--r--  llvm/test/Transforms/LoopVectorize/X86/nontemporal.ll              112
-rw-r--r--  llvm/test/Transforms/LoopVectorize/nontemporal.ll                   10
10 files changed, 234 insertions, 12 deletions
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 52d4cb73c5b..f53b17df012 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -531,6 +531,11 @@ public:
/// Return true if the target supports masked store.
bool isLegalMaskedStore(Type *DataType) const;
/// Return true if the target supports masked load.
bool isLegalMaskedLoad(Type *DataType) const;
+ /// Return true if the target supports nontemporal store.
+ bool isLegalNTStore(Type *DataType, unsigned Alignment) const;
+ /// Return true if the target supports nontemporal load.
+ bool isLegalNTLoad(Type *DataType, unsigned Alignment) const;
+
/// Return true if the target supports masked scatter.
bool isLegalMaskedScatter(Type *DataType) const;
/// Return true if the target supports masked gather.
@@ -1118,6 +1123,8 @@ public:
virtual bool shouldFavorBackedgeIndex(const Loop *L) const = 0;
virtual bool isLegalMaskedStore(Type *DataType) = 0;
virtual bool isLegalMaskedLoad(Type *DataType) = 0;
+ virtual bool isLegalNTStore(Type *DataType, unsigned Alignment) = 0;
+ virtual bool isLegalNTLoad(Type *DataType, unsigned Alignment) = 0;
virtual bool isLegalMaskedScatter(Type *DataType) = 0;
virtual bool isLegalMaskedGather(Type *DataType) = 0;
virtual bool isLegalMaskedCompressStore(Type *DataType) = 0;
@@ -1373,6 +1380,12 @@ public:
bool isLegalMaskedLoad(Type *DataType) override {
return Impl.isLegalMaskedLoad(DataType);
}
+ bool isLegalNTStore(Type *DataType, unsigned Alignment) override {
+ return Impl.isLegalNTStore(DataType, Alignment);
+ }
+ bool isLegalNTLoad(Type *DataType, unsigned Alignment) override {
+ return Impl.isLegalNTLoad(DataType, Alignment);
+ }
bool isLegalMaskedScatter(Type *DataType) override {
return Impl.isLegalMaskedScatter(DataType);
}
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index 2527495adc0..f8b36ec43a3 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -229,6 +229,20 @@ public:
bool isLegalMaskedLoad(Type *DataType) { return false; }
+ bool isLegalNTStore(Type *DataType, unsigned Alignment) {
+ // By default, assume nontemporal memory stores are available for stores
+ // that are aligned and have a size that is a power of 2.
+ unsigned DataSize = DL.getTypeStoreSize(DataType);
+ return Alignment >= DataSize && isPowerOf2_32(DataSize);
+ }
+
+ bool isLegalNTLoad(Type *DataType, unsigned Alignment) {
+ // By default, assume nontemporal memory loads are available for loads that
+ // are aligned and have a size that is a power of 2.
+ unsigned DataSize = DL.getTypeStoreSize(DataType);
+ return Alignment >= DataSize && isPowerOf2_32(DataSize);
+ }
+
bool isLegalMaskedScatter(Type *DataType) { return false; }
bool isLegalMaskedGather(Type *DataType) { return false; }
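The default implementation above reduces to a simple two-clause predicate: the access must be aligned to at least its own store size, and that size must be a power of two. A minimal standalone C++ sketch of the same rule (an illustration, not the LLVM API; DataSize stands in for DL.getTypeStoreSize(DataType)):

#include <cstdint>

// Sketch of the default isLegalNTStore/isLegalNTLoad rule: an access is
// assumed legal iff it is aligned to at least its own size and that size
// is a power of 2.
static bool isPow2(uint32_t X) { return X != 0 && (X & (X - 1)) == 0; }

static bool defaultIsLegalNTAccess(unsigned DataSize, unsigned Alignment) {
  return Alignment >= DataSize && isPow2(DataSize);
}

int main() {
  bool A = defaultIsLegalNTAccess(16, 16); // e.g. <4 x i32> at align 16: legal
  bool B = defaultIsLegalNTAccess(16, 4);  // same type, underaligned: not legal
  bool C = defaultIsLegalNTAccess(12, 16); // 12 bytes, not a power of 2: not legal
  return (A && !B && !C) ? 0 : 1;          // exits 0 if the rule behaves as stated
}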
diff --git a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
index 4089bfab754..b144006e262 100644
--- a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
+++ b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
@@ -205,12 +205,13 @@ class LoopVectorizationLegality {
public:
LoopVectorizationLegality(
Loop *L, PredicatedScalarEvolution &PSE, DominatorTree *DT,
- TargetLibraryInfo *TLI, AliasAnalysis *AA, Function *F,
- std::function<const LoopAccessInfo &(Loop &)> *GetLAA, LoopInfo *LI,
- OptimizationRemarkEmitter *ORE, LoopVectorizationRequirements *R,
- LoopVectorizeHints *H, DemandedBits *DB, AssumptionCache *AC)
- : TheLoop(L), LI(LI), PSE(PSE), TLI(TLI), DT(DT), GetLAA(GetLAA),
- ORE(ORE), Requirements(R), Hints(H), DB(DB), AC(AC) {}
+ TargetTransformInfo *TTI, TargetLibraryInfo *TLI, AliasAnalysis *AA,
+ Function *F, std::function<const LoopAccessInfo &(Loop &)> *GetLAA,
+ LoopInfo *LI, OptimizationRemarkEmitter *ORE,
+ LoopVectorizationRequirements *R, LoopVectorizeHints *H, DemandedBits *DB,
+ AssumptionCache *AC)
+ : TheLoop(L), LI(LI), PSE(PSE), TTI(TTI), TLI(TLI), DT(DT),
+ GetLAA(GetLAA), ORE(ORE), Requirements(R), Hints(H), DB(DB), AC(AC) {}
/// ReductionList contains the reduction descriptors for all
/// of the reductions that were found in the loop.
@@ -402,6 +403,9 @@ private:
/// unrolling.
PredicatedScalarEvolution &PSE;
+ /// Target Transform Info.
+ TargetTransformInfo *TTI;
+
/// Target Library Info.
TargetLibraryInfo *TLI;
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index 763b6841878..a55c1be1a09 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -183,6 +183,16 @@ bool TargetTransformInfo::isLegalMaskedLoad(Type *DataType) const {
return TTIImpl->isLegalMaskedLoad(DataType);
}
+bool TargetTransformInfo::isLegalNTStore(Type *DataType,
+ unsigned Alignment) const {
+ return TTIImpl->isLegalNTStore(DataType, Alignment);
+}
+
+bool TargetTransformInfo::isLegalNTLoad(Type *DataType,
+ unsigned Alignment) const {
+ return TTIImpl->isLegalNTLoad(DataType, Alignment);
+}
+
bool TargetTransformInfo::isLegalMaskedGather(Type *DataType) const {
return TTIImpl->isLegalMaskedGather(DataType);
}
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
index 2b9a61d4c87..08e46ed2ce3 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -3143,6 +3143,41 @@ bool X86TTIImpl::isLegalMaskedStore(Type *DataType) {
return isLegalMaskedLoad(DataType);
}
+bool X86TTIImpl::isLegalNTLoad(Type *DataType, unsigned Alignment) {
+ unsigned DataSize = DL.getTypeStoreSize(DataType);
+ // The only supported nontemporal loads are for aligned vectors of 16 or 32
+ // bytes. Note that 32-byte nontemporal vector loads are supported by AVX2
+ // (the equivalent stores only require AVX).
+ if (Alignment >= DataSize && (DataSize == 16 || DataSize == 32))
+ return DataSize == 16 ? ST->hasSSE1() : ST->hasAVX2();
+
+ return false;
+}
+
+bool X86TTIImpl::isLegalNTStore(Type *DataType, unsigned Alignment) {
+ unsigned DataSize = DL.getTypeStoreSize(DataType);
+
+ // SSE4A supports nontemporal stores of float and double at arbitrary
+ // alignment.
+ if (ST->hasSSE4A() && (DataType->isFloatTy() || DataType->isDoubleTy()))
+ return true;
+
+ // Besides the SSE4A subtarget exception above, only aligned stores are
+ // available nontemporally on any other subtarget, and only stores with a
+ // size of 4..32 bytes (powers of 2 only) are permitted.
+ if (Alignment < DataSize || DataSize < 4 || DataSize > 32 ||
+ !isPowerOf2_32(DataSize))
+ return false;
+
+ // 32-byte vector nontemporal stores are supported by AVX (the equivalent
+ // loads require AVX2).
+ if (DataSize == 32)
+ return ST->hasAVX();
+ else if (DataSize == 16)
+ return ST->hasSSE1();
+ return true;
+}
+
bool X86TTIImpl::isLegalMaskedExpandLoad(Type *DataTy) {
if (!isa<VectorType>(DataTy))
return false;
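To make the X86 store rules above concrete, here is a minimal standalone C++ sketch of the store-side logic (an illustration, not the LLVM API; the Has* flags stand in for the ST->has*() subtarget queries and DataSize for DL.getTypeStoreSize(DataType)). The main() walks the case the vectorizer probes for a plain align-4 i32 nontemporal store: the 2-element vector is 8 bytes, but the scalar loop only guarantees 4-byte alignment, so the store is rejected:

#include <cstdint>

static bool isPow2(uint32_t X) { return X != 0 && (X & (X - 1)) == 0; }

// Mirror of the X86 isLegalNTStore logic above, under the stated assumptions.
static bool x86IsLegalNTStore(unsigned DataSize, unsigned Alignment,
                              bool IsScalarFloatOrDouble, bool HasSSE4A,
                              bool HasSSE1, bool HasAVX) {
  // SSE4A stores scalar float/double nontemporally at any alignment
  // (MOVNTSS/MOVNTSD).
  if (HasSSE4A && IsScalarFloatOrDouble)
    return true;
  // Otherwise only aligned, power-of-2 sizes of 4..32 bytes qualify.
  if (Alignment < DataSize || DataSize < 4 || DataSize > 32 || !isPow2(DataSize))
    return false;
  if (DataSize == 32)
    return HasAVX;  // 32-byte vector NT stores need AVX.
  if (DataSize == 16)
    return HasSSE1; // 16-byte vector NT stores need SSE1 (MOVNTPS).
  return true;      // 4- and 8-byte scalar NT stores (e.g. MOVNTI).
}

int main() {
  // <2 x i32> is 8 bytes, but the loop only guarantees align 4: rejected.
  bool Legal = x86IsLegalNTStore(/*DataSize=*/8, /*Alignment=*/4,
                                 /*IsScalarFloatOrDouble=*/false,
                                 /*HasSSE4A=*/false, /*HasSSE1=*/true,
                                 /*HasAVX=*/false);
  return Legal ? 1 : 0; // expect 0: not legal, so the loop stays scalar
}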
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.h b/llvm/lib/Target/X86/X86TargetTransformInfo.h
index 351a4f22060..f43155e3838 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.h
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.h
@@ -186,6 +186,8 @@ public:
bool canMacroFuseCmp();
bool isLegalMaskedLoad(Type *DataType);
bool isLegalMaskedStore(Type *DataType);
+ bool isLegalNTLoad(Type *DataType, unsigned Alignment);
+ bool isLegalNTStore(Type *DataType, unsigned Alignment);
bool isLegalMaskedGather(Type *DataType);
bool isLegalMaskedScatter(Type *DataType);
bool isLegalMaskedExpandLoad(Type *DataType);
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
index e5713c4355f..6ef8dc2d3cd 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -767,6 +767,38 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
return false;
}
+ // For nontemporal stores, check that a nontemporal vector version is
+ // supported on the target.
+ if (ST->getMetadata(LLVMContext::MD_nontemporal)) {
+ // Arbitrarily try a vector of 2 elements.
+ Type *VecTy = VectorType::get(T, /*NumElements=*/2);
+ assert(VecTy && "did not find vectorized version of stored type");
+ unsigned Alignment = getLoadStoreAlignment(ST);
+ if (!TTI->isLegalNTStore(VecTy, Alignment)) {
+ reportVectorizationFailure(
+ "nontemporal store instruction cannot be vectorized",
+ "nontemporal store instruction cannot be vectorized",
+ "CantVectorizeNontemporalStore", ST);
+ return false;
+ }
+ }
+
+ } else if (auto *LD = dyn_cast<LoadInst>(&I)) {
+ if (LD->getMetadata(LLVMContext::MD_nontemporal)) {
+ // For nontemporal loads, check that a nontemporal vector version is
+ // supported on the target (arbitrarily try a vector of 2 elements).
+ Type *VecTy = VectorType::get(I.getType(), /*NumElements=*/2);
+ assert(VecTy && "did not find vectorized version of load type");
+ unsigned Alignment = getLoadStoreAlignment(LD);
+ if (!TTI->isLegalNTLoad(VecTy, Alignment)) {
+ reportVectorizationFailure(
+ "nontemporal load instruction cannot be vectorized",
+ "nontemporal load instruction cannot be vectorized",
+ "CantVectorizeNontemporalLoad", LD);
+ return false;
+ }
+ }
+
// FP instructions can allow unsafe algebra, thus vectorizable by
// non-IEEE-754 compliant SIMD units.
// This applies to floating-point math operations and calls, not memory
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 01104f68bb3..7b3b9ddfad4 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7275,7 +7275,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
// Check if it is legal to vectorize the loop.
LoopVectorizationRequirements Requirements(*ORE);
- LoopVectorizationLegality LVL(L, PSE, DT, TLI, AA, F, GetLAA, LI, ORE,
+ LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE,
&Requirements, &Hints, DB, AC);
if (!LVL.canVectorize(EnableVPlanNativePath)) {
LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
diff --git a/llvm/test/Transforms/LoopVectorize/X86/nontemporal.ll b/llvm/test/Transforms/LoopVectorize/X86/nontemporal.ll
new file mode 100644
index 00000000000..c83ca291c49
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/X86/nontemporal.ll
@@ -0,0 +1,112 @@
+; RUN: opt < %s -loop-vectorize -force-vector-width=4 -S | FileCheck %s
+
+; The three test cases below are all based on modified versions of a simple copy-loop:
+;
+; void foo(unsigned *src, unsigned *dst, unsigned nElts) {
+; for (unsigned i = 0; i < nElts; ++i) {
+; unsigned tmp = src[i];
+; dst[i] = tmp;
+; }
+; }
+;
+; In the first version, there are no nontemporal stores or loads, and so vectorization
+; is safely done.
+;
+; In the second version, the store into dst[i] has the nontemporal hint. The alignment
+; of 'unsigned' on X86_64 is 4, so the vector store generally will not be aligned to the
+; vector size (16 here). Unaligned nontemporal vector stores are not supported on X86_64,
+; and so vectorization is suppressed (if the loop were vectorized, the nontemporal hint
+; would not be honored in the final code-gen).
+;
+; The third version is analogous to the second, except that rather than the store, it is
+; the load from 'src[i]' that has the nontemporal hint. Vectorization is suppressed in
+; this case because (like stores) unaligned nontemporal vector loads are not supported on
+; X86_64.
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64"
+
+; CHECK-LABEL: @vectorTest(
+define void @vectorTest(i32* noalias readonly %src, i32* noalias %dst, i32 %nElts) {
+entry:
+ %cmp8 = icmp eq i32 %nElts, 0
+ br i1 %cmp8, label %for.cond.cleanup, label %for.body.preheader
+
+for.body.preheader: ; preds = %entry
+ %wide.trip.count = zext i32 %nElts to i64
+ br label %for.body
+
+for.cond.cleanup: ; preds = %for.body, %entry
+ ret void
+
+for.body: ; preds = %for.body, %for.body.preheader
+ %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+; Check that we vectorized the load, and that there is no nontemporal hint.
+; CHECK: %wide.load = load <4 x i32>, <4 x i32>* %{{[0-9]+}}, align 4{{$}}
+ %arrayidx = getelementptr inbounds i32, i32* %src, i64 %indvars.iv
+ %0 = load i32, i32* %arrayidx, align 4
+; Check that we vectorized the store, and that there is no nontemporal hint.
+; CHECK: store <4 x i32> %wide.load, <4 x i32>* %{{[0-9]+}}, align 4{{$}}
+ %arrayidx2 = getelementptr inbounds i32, i32* %dst, i64 %indvars.iv
+ store i32 %0, i32* %arrayidx2, align 4
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
+ br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
+
+; CHECK-LABEL: @vectorNTStoreTest(
+; Check that the vectorized type of the store does not appear.
+; CHECK-NOT: 4 x i32
+define void @vectorNTStoreTest(i32* noalias readonly %src, i32* noalias %dst, i32 %nElts) {
+entry:
+ %cmp8 = icmp eq i32 %nElts, 0
+ br i1 %cmp8, label %for.cond.cleanup, label %for.body.preheader
+
+for.body.preheader: ; preds = %entry
+ %wide.trip.count = zext i32 %nElts to i64
+ br label %for.body
+
+for.cond.cleanup: ; preds = %for.body, %entry
+ ret void
+
+for.body: ; preds = %for.body, %for.body.preheader
+ %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+ %arrayidx = getelementptr inbounds i32, i32* %src, i64 %indvars.iv
+ %0 = load i32, i32* %arrayidx, align 4
+ %arrayidx2 = getelementptr inbounds i32, i32* %dst, i64 %indvars.iv
+; Check that the store is not vectorized and that we don't lose the !nontemporal hint in it.
+; CHECK: store i32 %{{[0-9]+}}, i32* %arrayidx2, align 4, !nontemporal !4
+ store i32 %0, i32* %arrayidx2, align 4, !nontemporal !0
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
+ br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
+
+; CHECK-LABEL: @vectorNTLoadTest(
+; Check that the vectorized type of the load does not appear.
+; CHECK-NOT: 4 x i32
+define void @vectorNTLoadTest(i32* noalias readonly %src, i32* noalias %dst, i32 %nElts) {
+entry:
+ %cmp8 = icmp eq i32 %nElts, 0
+ br i1 %cmp8, label %for.cond.cleanup, label %for.body.preheader
+
+for.body.preheader: ; preds = %entry
+ %wide.trip.count = zext i32 %nElts to i64
+ br label %for.body
+
+for.cond.cleanup: ; preds = %for.body, %entry
+ ret void
+
+for.body: ; preds = %for.body, %for.body.preheader
+ %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+ %arrayidx = getelementptr inbounds i32, i32* %src, i64 %indvars.iv
+; Check that the load is not vectorized and that we don't lose the !nontemporal hint in it.
+; CHECK: load i32, i32* %arrayidx, align 4, !nontemporal !4
+ %0 = load i32, i32* %arrayidx, align 4, !nontemporal !0
+ %arrayidx2 = getelementptr inbounds i32, i32* %dst, i64 %indvars.iv
+ store i32 %0, i32* %arrayidx2, align 4
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
+ br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
+
+!0 = !{i32 1}
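For context on where the !nontemporal metadata in these tests comes from (a front-end aside, not part of this patch): with Clang, the scalar copy-loop can be written using the __builtin_nontemporal_load and __builtin_nontemporal_store builtins, which attach this metadata to the generated loads and stores:

// Hedged C++ sketch of the source-level copy-loop with nontemporal hints;
// compiled with Clang, the load and store below carry !nontemporal metadata.
void copyNT(unsigned *src, unsigned *dst, unsigned nElts) {
  for (unsigned i = 0; i < nElts; ++i) {
    unsigned tmp = __builtin_nontemporal_load(&src[i]);
    __builtin_nontemporal_store(tmp, &dst[i]);
  }
}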
diff --git a/llvm/test/Transforms/LoopVectorize/nontemporal.ll b/llvm/test/Transforms/LoopVectorize/nontemporal.ll
index b5719e12e12..5df8b83f78a 100644
--- a/llvm/test/Transforms/LoopVectorize/nontemporal.ll
+++ b/llvm/test/Transforms/LoopVectorize/nontemporal.ll
@@ -14,19 +14,19 @@ for.body.preheader: ; preds = %entry
for.body: ; preds = %for.body.preheader, %for.body
%indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
-; Check that we don't lose !nontemporal hint when vectorizing loads.
-; CHECK: %wide.load{{[0-9]*}} = load <4 x float>, <4 x float>* %{{[0-9]+}}, align 4, !nontemporal !0
+; Check that we don't lose the !nontemporal hint when attempting to vectorize the loads.
+; CHECK: load {{.*}} align 4, !nontemporal !0
%arrayidx = getelementptr inbounds float, float* %b, i64 %indvars.iv
%0 = load float, float* %arrayidx, align 4, !nontemporal !0
; Check that we don't introduce !nontemporal hint when the original scalar loads didn't have it.
-; CHECK: %wide.load{{[0-9]+}} = load <4 x float>, <4 x float>* %{{[0-9]+}}, align 4{{$}}
+; CHECK: load {{.*}} align 4{{$}}
%arrayidx2 = getelementptr inbounds float, float* %c, i64 %indvars.iv
%1 = load float, float* %arrayidx2, align 4
%add = fadd float %0, %1
-; Check that we don't lose !nontemporal hint when vectorizing stores.
-; CHECK: store <4 x float> %{{[0-9]+}}, <4 x float>* %{{[0-9]+}}, align 4, !nontemporal !0
+; Check that we don't lose the !nontemporal hint when attempting to vectorize the stores.
+; CHECK: store {{.*}} align 4, !nontemporal !0
%arrayidx4 = getelementptr inbounds float, float* %a, i64 %indvars.iv
store float %add, float* %arrayidx4, align 4, !nontemporal !0