summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorJustin Holewinski <jholewinski@nvidia.com>2014-07-16 19:45:35 +0000
committerJustin Holewinski <jholewinski@nvidia.com>2014-07-16 19:45:35 +0000
commitac451066f48820a0be4bccba0a64b7c2e2dc0c35 (patch)
tree9f5cd54b26dbc5609bb35425a60f23ca5fb76bcb
parenta2e5deb86dd5279a806876a31332666f338aba55 (diff)
downloadbcm5719-llvm-ac451066f48820a0be4bccba0a64b7c2e2dc0c35.tar.gz
bcm5719-llvm-ac451066f48820a0be4bccba0a64b7c2e2dc0c35.zip
[NVPTX] Honor alignment on vector loads/stores
We were not considering the stated alignment on vector loads/stores, leading us to generate vector instructions even when we do not have sufficient alignment. Now, for IR like: %1 = load <4 x float>, <4 x float>* %ptr, align 4 we will generate correct, conservative PTX like: ld.f32 ... [%ptr] ld.f32 ... [%ptr+4] ld.f32 ... [%ptr+8] ld.f32 ... [%ptr+12] Or if we have an alignment of 8 (for example), we can generate code like: ld.v2.f32 ... [%ptr] ld.v2.f32 ... [%ptr+8] llvm-svn: 213186
-rw-r--r--llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp36
-rw-r--r--llvm/test/CodeGen/NVPTX/misaligned-vector-ldst.ll77
2 files changed, 108 insertions, 5 deletions
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index cb452ff0725..91e24bb617d 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -1494,6 +1494,21 @@ NVPTXTargetLowering::LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const {
break;
}
+ MemSDNode *MemSD = cast<MemSDNode>(N);
+ const DataLayout *TD = getDataLayout();
+
+ unsigned Align = MemSD->getAlignment();
+ unsigned PrefAlign =
+ TD->getPrefTypeAlignment(ValVT.getTypeForEVT(*DAG.getContext()));
+ if (Align < PrefAlign) {
+ // This store is not sufficiently aligned, so bail out and let this vector
+ // store be scalarized. Note that we may still be able to emit smaller
+ // vector stores. For example, if we are storing a <4 x float> with an
+ // alignment of 8, this check will fail but the legalizer will try again
+ // with 2 x <2 x float>, which will succeed with an alignment of 8.
+ return SDValue();
+ }
+
unsigned Opcode = 0;
EVT EltVT = ValVT.getVectorElementType();
unsigned NumElts = ValVT.getVectorNumElements();
@@ -1536,8 +1551,6 @@ NVPTXTargetLowering::LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const {
Ops.push_back(N->getOperand(i));
}
- MemSDNode *MemSD = cast<MemSDNode>(N);
-
SDValue NewSt = DAG.getMemIntrinsicNode(
Opcode, DL, DAG.getVTList(MVT::Other), Ops,
MemSD->getMemoryVT(), MemSD->getMemOperand());
@@ -3046,6 +3059,7 @@ SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N,
/// ReplaceVectorLoad - Convert vector loads into multi-output scalar loads.
static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG,
+ const DataLayout *TD,
SmallVectorImpl<SDValue> &Results) {
EVT ResVT = N->getValueType(0);
SDLoc DL(N);
@@ -3073,6 +3087,20 @@ static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG,
break;
}
+ LoadSDNode *LD = cast<LoadSDNode>(N);
+
+ unsigned Align = LD->getAlignment();
+ unsigned PrefAlign =
+ TD->getPrefTypeAlignment(ResVT.getTypeForEVT(*DAG.getContext()));
+ if (Align < PrefAlign) {
+ // This load is not sufficiently aligned, so bail out and let this vector
+ // load be scalarized. Note that we may still be able to emit smaller
+ // vector loads. For example, if we are loading a <4 x float> with an
+ // alignment of 8, this check will fail but the legalizer will try again
+ // with 2 x <2 x float>, which will succeed with an alignment of 8.
+ return;
+ }
+
EVT EltVT = ResVT.getVectorElementType();
unsigned NumElts = ResVT.getVectorNumElements();
@@ -3109,8 +3137,6 @@ static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG,
for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i)
OtherOps.push_back(N->getOperand(i));
- LoadSDNode *LD = cast<LoadSDNode>(N);
-
// The select routine does not have access to the LoadSDNode instance, so
// pass along the extension information
OtherOps.push_back(DAG.getIntPtrConstant(LD->getExtensionType()));
@@ -3283,7 +3309,7 @@ void NVPTXTargetLowering::ReplaceNodeResults(
default:
report_fatal_error("Unhandled custom legalization");
case ISD::LOAD:
- ReplaceLoadVector(N, DAG, Results);
+ ReplaceLoadVector(N, DAG, getDataLayout(), Results);
return;
case ISD::INTRINSIC_W_CHAIN:
ReplaceINTRINSIC_W_CHAIN(N, DAG, Results);
diff --git a/llvm/test/CodeGen/NVPTX/misaligned-vector-ldst.ll b/llvm/test/CodeGen/NVPTX/misaligned-vector-ldst.ll
new file mode 100644
index 00000000000..90c9c4306de
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/misaligned-vector-ldst.ll
@@ -0,0 +1,77 @@
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
+target triple = "nvptx64-nvidia-cuda"
+
+; CHECK-LABEL: t1
+define <4 x float> @t1(i8* %p1) {
+; CHECK-NOT: ld.v4
+; CHECK-NOT: ld.v2
+; CHECK-NOT: ld.f32
+; CHECK: ld.u8
+ %cast = bitcast i8* %p1 to <4 x float>*
+ %r = load <4 x float>* %cast, align 1
+ ret <4 x float> %r
+}
+
+; CHECK-LABEL: t2
+define <4 x float> @t2(i8* %p1) {
+; CHECK-NOT: ld.v4
+; CHECK-NOT: ld.v2
+; CHECK: ld.f32
+ %cast = bitcast i8* %p1 to <4 x float>*
+ %r = load <4 x float>* %cast, align 4
+ ret <4 x float> %r
+}
+
+; CHECK-LABEL: t3
+define <4 x float> @t3(i8* %p1) {
+; CHECK-NOT: ld.v4
+; CHECK: ld.v2
+ %cast = bitcast i8* %p1 to <4 x float>*
+ %r = load <4 x float>* %cast, align 8
+ ret <4 x float> %r
+}
+
+; CHECK-LABEL: t4
+define <4 x float> @t4(i8* %p1) {
+; CHECK: ld.v4
+ %cast = bitcast i8* %p1 to <4 x float>*
+ %r = load <4 x float>* %cast, align 16
+ ret <4 x float> %r
+}
+
+
+; CHECK-LABEL: s1
+define void @s1(<4 x float>* %p1, <4 x float> %v) {
+; CHECK-NOT: st.v4
+; CHECK-NOT: st.v2
+; CHECK-NOT: st.f32
+; CHECK: st.u8
+ store <4 x float> %v, <4 x float>* %p1, align 1
+ ret void
+}
+
+; CHECK-LABEL: s2
+define void @s2(<4 x float>* %p1, <4 x float> %v) {
+; CHECK-NOT: st.v4
+; CHECK-NOT: st.v2
+; CHECK: st.f32
+ store <4 x float> %v, <4 x float>* %p1, align 4
+ ret void
+}
+
+; CHECK-LABEL: s3
+define void @s3(<4 x float>* %p1, <4 x float> %v) {
+; CHECK-NOT: st.v4
+ store <4 x float> %v, <4 x float>* %p1, align 8
+ ret void
+}
+
+; CHECK-LABEL: s4
+define void @s4(<4 x float>* %p1, <4 x float> %v) {
+; CHECK: st.v4
+ store <4 x float> %v, <4 x float>* %p1, align 16
+ ret void
+}
+
OpenPOWER on IntegriCloud