[NVPTX] Honor alignment on vector loads/stores

We were not considering the stated alignment on vector loads/stores, leading us to generate vector instructions even when we do not have sufficient alignment. Now, for IR like: %1 = load <4 x float>, <4 x float>* %ptr, align 4 we will generate correct, conservative PTX like: ld.f32 ... [%ptr] ld.f32 ... [%ptr+4] ld.f32 ... [%ptr+8] ld.f32 ... [%ptr+12] Or if we have an alignment of 8 (for example), we can generate code like: ld.v2.f32 ... [%ptr] ld.v2.f32 ... [%ptr+8] llvm-svn: 213186
author: Justin Holewinski <jholewinski@nvidia.com> 2014-07-16 19:45:35 +0000
committer: Justin Holewinski <jholewinski@nvidia.com> 2014-07-16 19:45:35 +0000
commit: ac451066f48820a0be4bccba0a64b7c2e2dc0c35 (patch)
tree: 9f5cd54b26dbc5609bb35425a60f23ca5fb76bcb
parent: a2e5deb86dd5279a806876a31332666f338aba55 (diff)
download: bcm5719-llvm-ac451066f48820a0be4bccba0a64b7c2e2dc0c35.tar.gz
bcm5719-llvm-ac451066f48820a0be4bccba0a64b7c2e2dc0c35.zip
2 files changed, 108 insertions, 5 deletions
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index cb452ff0725..91e24bb617d 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -1494,6 +1494,21 @@ NVPTXTargetLowering::LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const {
       break;
     }
 
+    MemSDNode *MemSD = cast<MemSDNode>(N);
+    const DataLayout *TD = getDataLayout();
+
+    unsigned Align = MemSD->getAlignment();
+    unsigned PrefAlign =
+      TD->getPrefTypeAlignment(ValVT.getTypeForEVT(*DAG.getContext()));
+    if (Align < PrefAlign) {
+      // This store is not sufficiently aligned, so bail out and let this vector
+      // store be scalarized.  Note that we may still be able to emit smaller
+      // vector stores.  For example, if we are storing a <4 x float> with an
+      // alignment of 8, this check will fail but the legalizer will try again
+      // with 2 x <2 x float>, which will succeed with an alignment of 8.
+      return SDValue();
+    }
+
     unsigned Opcode = 0;
     EVT EltVT = ValVT.getVectorElementType();
     unsigned NumElts = ValVT.getVectorNumElements();
@@ -1536,8 +1551,6 @@ NVPTXTargetLowering::LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const {
       Ops.push_back(N->getOperand(i));
     }
 
-    MemSDNode *MemSD = cast<MemSDNode>(N);
-
     SDValue NewSt = DAG.getMemIntrinsicNode(
         Opcode, DL, DAG.getVTList(MVT::Other), Ops,
         MemSD->getMemoryVT(), MemSD->getMemOperand());
@@ -3046,6 +3059,7 @@ SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N,
 
 /// ReplaceVectorLoad - Convert vector loads into multi-output scalar loads.
 static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG,
+                              const DataLayout *TD,
                               SmallVectorImpl<SDValue> &Results) {
   EVT ResVT = N->getValueType(0);
   SDLoc DL(N);
@@ -3073,6 +3087,20 @@ static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG,
     break;
   }
 
+  LoadSDNode *LD = cast<LoadSDNode>(N);
+
+  unsigned Align = LD->getAlignment();
+  unsigned PrefAlign =
+    TD->getPrefTypeAlignment(ResVT.getTypeForEVT(*DAG.getContext()));
+  if (Align < PrefAlign) {
+    // This load is not sufficiently aligned, so bail out and let this vector
+    // load be scalarized.  Note that we may still be able to emit smaller
+    // vector loads.  For example, if we are loading a <4 x float> with an
+    // alignment of 8, this check will fail but the legalizer will try again
+    // with 2 x <2 x float>, which will succeed with an alignment of 8.
+    return;
+  }
+
   EVT EltVT = ResVT.getVectorElementType();
   unsigned NumElts = ResVT.getVectorNumElements();
 
@@ -3109,8 +3137,6 @@ static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG,
   for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i)
     OtherOps.push_back(N->getOperand(i));
 
-  LoadSDNode *LD = cast<LoadSDNode>(N);
-
   // The select routine does not have access to the LoadSDNode instance, so
   // pass along the extension information
   OtherOps.push_back(DAG.getIntPtrConstant(LD->getExtensionType()));
@@ -3283,7 +3309,7 @@ void NVPTXTargetLowering::ReplaceNodeResults(
   default:
     report_fatal_error("Unhandled custom legalization");
   case ISD::LOAD:
-    ReplaceLoadVector(N, DAG, Results);
+    ReplaceLoadVector(N, DAG, getDataLayout(), Results);
     return;
   case ISD::INTRINSIC_W_CHAIN:
     ReplaceINTRINSIC_W_CHAIN(N, DAG, Results);
diff --git a/llvm/test/CodeGen/NVPTX/misaligned-vector-ldst.ll b/llvm/test/CodeGen/NVPTX/misaligned-vector-ldst.ll
new file mode 100644
index 00000000000..90c9c4306de
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/misaligned-vector-ldst.ll
@@ -0,0 +1,77 @@
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
+target triple = "nvptx64-nvidia-cuda"
+
+; CHECK-LABEL: t1
+define <4 x float> @t1(i8* %p1) {
+; CHECK-NOT: ld.v4
+; CHECK-NOT: ld.v2
+; CHECK-NOT: ld.f32
+; CHECK: ld.u8
+  %cast = bitcast i8* %p1 to <4 x float>*
+  %r = load <4 x float>* %cast, align 1
+  ret <4 x float> %r
+}
+
+; CHECK-LABEL: t2
+define <4 x float> @t2(i8* %p1) {
+; CHECK-NOT: ld.v4
+; CHECK-NOT: ld.v2
+; CHECK: ld.f32
+  %cast = bitcast i8* %p1 to <4 x float>*
+  %r = load <4 x float>* %cast, align 4
+  ret <4 x float> %r
+}
+
+; CHECK-LABEL: t3
+define <4 x float> @t3(i8* %p1) {
+; CHECK-NOT: ld.v4
+; CHECK: ld.v2
+  %cast = bitcast i8* %p1 to <4 x float>*
+  %r = load <4 x float>* %cast, align 8
+  ret <4 x float> %r
+}
+
+; CHECK-LABEL: t4
+define <4 x float> @t4(i8* %p1) {
+; CHECK: ld.v4
+  %cast = bitcast i8* %p1 to <4 x float>*
+  %r = load <4 x float>* %cast, align 16
+  ret <4 x float> %r
+}
+
+
+; CHECK-LABEL: s1
+define void @s1(<4 x float>* %p1, <4 x float> %v) {
+; CHECK-NOT: st.v4
+; CHECK-NOT: st.v2
+; CHECK-NOT: st.f32
+; CHECK: st.u8
+  store <4 x float> %v, <4 x float>* %p1, align 1
+  ret void
+}
+
+; CHECK-LABEL: s2
+define void @s2(<4 x float>* %p1, <4 x float> %v) {
+; CHECK-NOT: st.v4
+; CHECK-NOT: st.v2
+; CHECK: st.f32
+  store <4 x float> %v, <4 x float>* %p1, align 4
+  ret void
+}
+
+; CHECK-LABEL: s3
+define void @s3(<4 x float>* %p1, <4 x float> %v) {
+; CHECK-NOT: st.v4
+  store <4 x float> %v, <4 x float>* %p1, align 8
+  ret void
+}
+
+; CHECK-LABEL: s4
+define void @s4(<4 x float>* %p1, <4 x float> %v) {
+; CHECK: st.v4
+  store <4 x float> %v, <4 x float>* %p1, align 16
+  ret void
+}
+
author	Justin Holewinski <jholewinski@nvidia.com>	2014-07-16 19:45:35 +0000
committer	Justin Holewinski <jholewinski@nvidia.com>	2014-07-16 19:45:35 +0000
commit	ac451066f48820a0be4bccba0a64b7c2e2dc0c35 (patch)
tree	9f5cd54b26dbc5609bb35425a60f23ca5fb76bcb
parent	a2e5deb86dd5279a806876a31332666f338aba55 (diff)
download	bcm5719-llvm-ac451066f48820a0be4bccba0a64b7c2e2dc0c35.tar.gz bcm5719-llvm-ac451066f48820a0be4bccba0a64b7c2e2dc0c35.zip