diff options
author | Artem Belevich <tra@google.com> | 2017-11-14 19:14:00 +0000 |
---|---|---|
committer | Artem Belevich <tra@google.com> | 2017-11-14 19:14:00 +0000 |
commit | 55dcf5e586a470d13350fcf3b0b85993c73ce024 (patch) | |
tree | cfe00c61f0f9bfd22cfa430273a422769562b26f | |
parent | 35d90aea7a476da62be5dde06330e1032bb46f56 (diff) | |
download | bcm5719-llvm-55dcf5e586a470d13350fcf3b0b85993c73ce024.tar.gz bcm5719-llvm-55dcf5e586a470d13350fcf3b0b85993c73ce024.zip |
Mark intrinsics operating on the whole warp as IntrInaccessibleMemOnly
This is needed to model the fact that these intrinsics access data from other
threads in the warp, so identical calls cannot be merged by common subexpression elimination (CSE).
llvm-svn: 318173
-rw-r--r-- | llvm/include/llvm/IR/IntrinsicsNVVM.td | 56 | ||||
-rw-r--r-- | llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp | 21 | ||||
-rw-r--r-- | llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 10 |
3 files changed, 49 insertions, 38 deletions
diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td index 249419d15d3..6f75e78ff61 100644 --- a/llvm/include/llvm/IR/IntrinsicsNVVM.td +++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td @@ -3716,41 +3716,41 @@ def int_nvvm_read_ptx_sreg_warpsize : PTXReadSRegIntrinsic_r32<"warpsize">; // shfl.down.b32 dest, val, offset, mask_and_clamp def int_nvvm_shfl_down_i32 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], - [IntrNoMem, IntrConvergent], "llvm.nvvm.shfl.down.i32">, + [IntrInaccessibleMemOnly, IntrConvergent], "llvm.nvvm.shfl.down.i32">, GCCBuiltin<"__nvvm_shfl_down_i32">; def int_nvvm_shfl_down_f32 : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_i32_ty, llvm_i32_ty], - [IntrNoMem, IntrConvergent], "llvm.nvvm.shfl.down.f32">, + [IntrInaccessibleMemOnly, IntrConvergent], "llvm.nvvm.shfl.down.f32">, GCCBuiltin<"__nvvm_shfl_down_f32">; // shfl.up.b32 dest, val, offset, mask_and_clamp def int_nvvm_shfl_up_i32 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], - [IntrNoMem, IntrConvergent], "llvm.nvvm.shfl.up.i32">, + [IntrInaccessibleMemOnly, IntrConvergent], "llvm.nvvm.shfl.up.i32">, GCCBuiltin<"__nvvm_shfl_up_i32">; def int_nvvm_shfl_up_f32 : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_i32_ty, llvm_i32_ty], - [IntrNoMem, IntrConvergent], "llvm.nvvm.shfl.up.f32">, + [IntrInaccessibleMemOnly, IntrConvergent], "llvm.nvvm.shfl.up.f32">, GCCBuiltin<"__nvvm_shfl_up_f32">; // shfl.bfly.b32 dest, val, offset, mask_and_clamp def int_nvvm_shfl_bfly_i32 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], - [IntrNoMem, IntrConvergent], "llvm.nvvm.shfl.bfly.i32">, + [IntrInaccessibleMemOnly, IntrConvergent], "llvm.nvvm.shfl.bfly.i32">, GCCBuiltin<"__nvvm_shfl_bfly_i32">; def int_nvvm_shfl_bfly_f32 : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_i32_ty, llvm_i32_ty], - [IntrNoMem, IntrConvergent], "llvm.nvvm.shfl.bfly.f32">, + [IntrInaccessibleMemOnly, IntrConvergent], 
"llvm.nvvm.shfl.bfly.f32">, GCCBuiltin<"__nvvm_shfl_bfly_f32">; // shfl.idx.b32 dest, val, lane, mask_and_clamp def int_nvvm_shfl_idx_i32 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], - [IntrNoMem, IntrConvergent], "llvm.nvvm.shfl.idx.i32">, + [IntrInaccessibleMemOnly, IntrConvergent], "llvm.nvvm.shfl.idx.i32">, GCCBuiltin<"__nvvm_shfl_idx_i32">; def int_nvvm_shfl_idx_f32 : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_i32_ty, llvm_i32_ty], - [IntrNoMem, IntrConvergent], "llvm.nvvm.shfl.idx.f32">, + [IntrInaccessibleMemOnly, IntrConvergent], "llvm.nvvm.shfl.idx.f32">, GCCBuiltin<"__nvvm_shfl_idx_f32">; // Synchronizing shfl variants available in CUDA-9. @@ -3760,41 +3760,41 @@ def int_nvvm_shfl_idx_f32 : // shfl.sync.down.b32 dest, threadmask, val, offset , mask_and_clamp def int_nvvm_shfl_sync_down_i32 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], - [IntrNoMem, IntrConvergent], "llvm.nvvm.shfl.sync.down.i32">, + [IntrInaccessibleMemOnly, IntrConvergent], "llvm.nvvm.shfl.sync.down.i32">, GCCBuiltin<"__nvvm_shfl_sync_down_i32">; def int_nvvm_shfl_sync_down_f32 : Intrinsic<[llvm_float_ty], [llvm_i32_ty, llvm_float_ty, llvm_i32_ty, llvm_i32_ty], - [IntrNoMem, IntrConvergent], "llvm.nvvm.shfl.sync.down.f32">, + [IntrInaccessibleMemOnly, IntrConvergent], "llvm.nvvm.shfl.sync.down.f32">, GCCBuiltin<"__nvvm_shfl_sync_down_f32">; // shfl.sync.up.b32 dest, threadmask, val, offset, mask_and_clamp def int_nvvm_shfl_sync_up_i32 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], - [IntrNoMem, IntrConvergent], "llvm.nvvm.shfl.sync.up.i32">, + [IntrInaccessibleMemOnly, IntrConvergent], "llvm.nvvm.shfl.sync.up.i32">, GCCBuiltin<"__nvvm_shfl_sync_up_i32">; def int_nvvm_shfl_sync_up_f32 : Intrinsic<[llvm_float_ty], [llvm_i32_ty, llvm_float_ty, llvm_i32_ty, llvm_i32_ty], - [IntrNoMem, IntrConvergent], "llvm.nvvm.shfl.sync.up.f32">, + [IntrInaccessibleMemOnly, IntrConvergent], 
"llvm.nvvm.shfl.sync.up.f32">, GCCBuiltin<"__nvvm_shfl_sync_up_f32">; // shfl.sync.bfly.b32 dest, threadmask, val, offset, mask_and_clamp def int_nvvm_shfl_sync_bfly_i32 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], - [IntrNoMem, IntrConvergent], "llvm.nvvm.shfl.sync.bfly.i32">, + [IntrInaccessibleMemOnly, IntrConvergent], "llvm.nvvm.shfl.sync.bfly.i32">, GCCBuiltin<"__nvvm_shfl_sync_bfly_i32">; def int_nvvm_shfl_sync_bfly_f32 : Intrinsic<[llvm_float_ty], [llvm_i32_ty, llvm_float_ty, llvm_i32_ty, llvm_i32_ty], - [IntrNoMem, IntrConvergent], "llvm.nvvm.shfl.sync.bfly.f32">, + [IntrInaccessibleMemOnly, IntrConvergent], "llvm.nvvm.shfl.sync.bfly.f32">, GCCBuiltin<"__nvvm_shfl_sync_bfly_f32">; // shfl.sync.idx.b32 dest, threadmask, val, lane, mask_and_clamp def int_nvvm_shfl_sync_idx_i32 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], - [IntrNoMem, IntrConvergent], "llvm.nvvm.shfl.sync.idx.i32">, + [IntrInaccessibleMemOnly, IntrConvergent], "llvm.nvvm.shfl.sync.idx.i32">, GCCBuiltin<"__nvvm_shfl_sync_idx_i32">; def int_nvvm_shfl_sync_idx_f32 : Intrinsic<[llvm_float_ty], [llvm_i32_ty, llvm_float_ty, llvm_i32_ty, llvm_i32_ty], - [IntrNoMem, IntrConvergent], "llvm.nvvm.shfl.sync.idx.f32">, + [IntrInaccessibleMemOnly, IntrConvergent], "llvm.nvvm.shfl.sync.idx.f32">, GCCBuiltin<"__nvvm_shfl_sync_idx_f32">; // @@ -3804,22 +3804,22 @@ def int_nvvm_shfl_sync_idx_f32 : // vote.all pred def int_nvvm_vote_all : Intrinsic<[llvm_i1_ty], [llvm_i1_ty], - [IntrNoMem, IntrConvergent], "llvm.nvvm.vote.all">, + [IntrInaccessibleMemOnly, IntrConvergent], "llvm.nvvm.vote.all">, GCCBuiltin<"__nvvm_vote_all">; // vote.any pred def int_nvvm_vote_any : Intrinsic<[llvm_i1_ty], [llvm_i1_ty], - [IntrNoMem, IntrConvergent], "llvm.nvvm.vote.any">, + [IntrInaccessibleMemOnly, IntrConvergent], "llvm.nvvm.vote.any">, GCCBuiltin<"__nvvm_vote_any">; // vote.uni pred def int_nvvm_vote_uni : Intrinsic<[llvm_i1_ty], [llvm_i1_ty], - 
[IntrNoMem, IntrConvergent], "llvm.nvvm.vote.uni">, + [IntrInaccessibleMemOnly, IntrConvergent], "llvm.nvvm.vote.uni">, GCCBuiltin<"__nvvm_vote_uni">; // vote.ballot pred def int_nvvm_vote_ballot : Intrinsic<[llvm_i32_ty], [llvm_i1_ty], - [IntrNoMem, IntrConvergent], "llvm.nvvm.vote.ballot">, + [IntrInaccessibleMemOnly, IntrConvergent], "llvm.nvvm.vote.ballot">, GCCBuiltin<"__nvvm_vote_ballot">; // @@ -3829,22 +3829,22 @@ def int_nvvm_vote_ballot : // vote.sync.all mask, pred def int_nvvm_vote_all_sync : Intrinsic<[llvm_i1_ty], [llvm_i32_ty, llvm_i1_ty], - [IntrNoMem, IntrConvergent], "llvm.nvvm.vote.all.sync">, + [IntrInaccessibleMemOnly, IntrConvergent], "llvm.nvvm.vote.all.sync">, GCCBuiltin<"__nvvm_vote_all_sync">; // vote.sync.any mask, pred def int_nvvm_vote_any_sync : Intrinsic<[llvm_i1_ty], [llvm_i32_ty, llvm_i1_ty], - [IntrNoMem, IntrConvergent], "llvm.nvvm.vote.any.sync">, + [IntrInaccessibleMemOnly, IntrConvergent], "llvm.nvvm.vote.any.sync">, GCCBuiltin<"__nvvm_vote_any_sync">; // vote.sync.uni mask, pred def int_nvvm_vote_uni_sync : Intrinsic<[llvm_i1_ty], [llvm_i32_ty, llvm_i1_ty], - [IntrNoMem, IntrConvergent], "llvm.nvvm.vote.uni.sync">, + [IntrInaccessibleMemOnly, IntrConvergent], "llvm.nvvm.vote.uni.sync">, GCCBuiltin<"__nvvm_vote_uni_sync">; // vote.sync.ballot mask, pred def int_nvvm_vote_ballot_sync : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i1_ty], - [IntrNoMem, IntrConvergent], "llvm.nvvm.vote.ballot.sync">, + [IntrInaccessibleMemOnly, IntrConvergent], "llvm.nvvm.vote.ballot.sync">, GCCBuiltin<"__nvvm_vote_ballot_sync">; // @@ -3853,12 +3853,12 @@ def int_nvvm_vote_ballot_sync : // match.any.sync.b32 mask, value def int_nvvm_match_any_sync_i32 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], - [IntrNoMem, IntrConvergent], "llvm.nvvm.match.any.sync.i32">, + [IntrInaccessibleMemOnly, IntrConvergent], "llvm.nvvm.match.any.sync.i32">, GCCBuiltin<"__nvvm_match_any_sync_i32">; // match.any.sync.b64 mask, value def 
int_nvvm_match_any_sync_i64 : Intrinsic<[llvm_i64_ty], [llvm_i32_ty, llvm_i64_ty], - [IntrNoMem, IntrConvergent], "llvm.nvvm.match.any.sync.i64">, + [IntrInaccessibleMemOnly, IntrConvergent], "llvm.nvvm.match.any.sync.i64">, GCCBuiltin<"__nvvm_match_any_sync_i64">; // match.all instruction have two variants -- one returns a single value, another @@ -3868,11 +3868,11 @@ def int_nvvm_match_any_sync_i64 : // match.all.sync.b32p mask, value def int_nvvm_match_all_sync_i32p : Intrinsic<[llvm_i32_ty, llvm_i1_ty], [llvm_i32_ty, llvm_i32_ty], - [IntrNoMem, IntrConvergent], "llvm.nvvm.match.all.sync.i32p">; + [IntrInaccessibleMemOnly, IntrConvergent], "llvm.nvvm.match.all.sync.i32p">; // match.all.sync.b64p mask, value def int_nvvm_match_all_sync_i64p : Intrinsic<[llvm_i64_ty, llvm_i1_ty], [llvm_i32_ty, llvm_i64_ty], - [IntrNoMem, IntrConvergent], "llvm.nvvm.match.all.sync.i64p">; + [IntrInaccessibleMemOnly, IntrConvergent], "llvm.nvvm.match.all.sync.i64p">; // // WMMA instructions diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp index a7e58fa9738..ce6b071859b 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp @@ -811,6 +811,10 @@ bool NVPTXDAGToDAGISel::tryIntrinsicChain(SDNode *N) { switch (IID) { default: return false; + case Intrinsic::nvvm_match_all_sync_i32p: + case Intrinsic::nvvm_match_all_sync_i64p: + SelectMatchAll(N); + return true; case Intrinsic::nvvm_ldg_global_f: case Intrinsic::nvvm_ldg_global_i: case Intrinsic::nvvm_ldg_global_p: @@ -1025,10 +1029,6 @@ bool NVPTXDAGToDAGISel::tryIntrinsicNoChain(SDNode *N) { case Intrinsic::nvvm_texsurf_handle_internal: SelectTexSurfHandle(N); return true; - case Intrinsic::nvvm_match_all_sync_i32p: - case Intrinsic::nvvm_match_all_sync_i64p: - SelectMatchAll(N); - return true; case Intrinsic::nvvm_wmma_mma_sync_col_col_f16_f16: case Intrinsic::nvvm_wmma_mma_sync_col_col_f16_f16_satfinite: case 
Intrinsic::nvvm_wmma_mma_sync_col_col_f16_f32: @@ -1075,12 +1075,13 @@ void NVPTXDAGToDAGISel::SelectTexSurfHandle(SDNode *N) { void NVPTXDAGToDAGISel::SelectMatchAll(SDNode *N) { SDLoc DL(N); + SDValue Chain = N->getOperand(0); enum { IS_I64 = 4, HAS_CONST_VALUE = 2, HAS_CONST_MASK = 1 }; - unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue(); + unsigned IID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue(); unsigned OpcodeIndex = (IID == Intrinsic::nvvm_match_all_sync_i64p) ? IS_I64 : 0; - SDValue MaskOp = N->getOperand(1); - SDValue ValueOp = N->getOperand(2); + SDValue MaskOp = N->getOperand(2); + SDValue ValueOp = N->getOperand(3); if (ConstantSDNode *ValueConst = dyn_cast<ConstantSDNode>(ValueOp)) { OpcodeIndex |= HAS_CONST_VALUE; ValueOp = CurDAG->getTargetConstant(ValueConst->getZExtValue(), DL, @@ -1097,9 +1098,9 @@ void NVPTXDAGToDAGISel::SelectMatchAll(SDNode *N) { NVPTX::MATCH_ALLP_SYNC_32ir, NVPTX::MATCH_ALLP_SYNC_32ii, NVPTX::MATCH_ALLP_SYNC_64rr, NVPTX::MATCH_ALLP_SYNC_64ri, NVPTX::MATCH_ALLP_SYNC_64ir, NVPTX::MATCH_ALLP_SYNC_64ii}; - SDNode *NewNode = CurDAG->getMachineNode(Opcodes[OpcodeIndex], DL, - {ValueOp->getValueType(0), MVT::i1}, - {MaskOp, ValueOp}); + SDNode *NewNode = CurDAG->getMachineNode( + Opcodes[OpcodeIndex], DL, {ValueOp->getValueType(0), MVT::i1, MVT::Other}, + {MaskOp, ValueOp}); ReplaceNode(N, NewNode); } diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp index ac4f2544fc3..f141122ec48 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -3321,6 +3321,16 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic( switch (Intrinsic) { default: return false; + case Intrinsic::nvvm_match_all_sync_i32p: + case Intrinsic::nvvm_match_all_sync_i64p: + Info.opc = ISD::INTRINSIC_W_CHAIN; + // memVT is bogus. 
These intrinsics have the IntrInaccessibleMemOnly attribute + // in order to model data exchange with other threads, but perform no real + // memory accesses. + Info.memVT = MVT::i1; + Info.readMem = true; // Our result depends on other threads' arguments. + Info.writeMem = true; // Other threads depend on our thread's argument. + return true; case Intrinsic::nvvm_wmma_load_a_f16_col: case Intrinsic::nvvm_wmma_load_a_f16_row: case Intrinsic::nvvm_wmma_load_a_f16_col_stride: |