From 5c6ab2a0b1f2da22c8ce4fbfc022f599aaa4a2a6 Mon Sep 17 00:00:00 2001 From: Artem Belevich Date: Mon, 14 Oct 2019 16:53:34 +0000 Subject: [NVPTX] Restructure shfl instrinsics and add variants that return a predicate. Also, amend constraints for non-sync variants that are no longer available on sm_70+ with PTX6.4+. Differential Revision: https://reviews.llvm.org/D68892 llvm-svn: 374790 --- llvm/lib/Target/NVPTX/NVPTXInstrInfo.td | 5 + llvm/lib/Target/NVPTX/NVPTXIntrinsics.td | 169 +++++++++++-------------------- 2 files changed, 63 insertions(+), 111 deletions(-) (limited to 'llvm/lib') diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td index e5580258d8c..fe7a84f9a36 100644 --- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td +++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td @@ -143,12 +143,17 @@ def hasPTX31 : Predicate<"Subtarget->getPTXVersion() >= 31">; def hasPTX60 : Predicate<"Subtarget->getPTXVersion() >= 60">; def hasPTX61 : Predicate<"Subtarget->getPTXVersion() >= 61">; def hasPTX63 : Predicate<"Subtarget->getPTXVersion() >= 63">; +def hasPTX64 : Predicate<"Subtarget->getPTXVersion() >= 64">; def hasSM30 : Predicate<"Subtarget->getSmVersion() >= 30">; def hasSM70 : Predicate<"Subtarget->getSmVersion() >= 70">; def hasSM72 : Predicate<"Subtarget->getSmVersion() >= 72">; def hasSM75 : Predicate<"Subtarget->getSmVersion() >= 75">; +// non-sync shfl instructions are not available on sm_70+ in PTX6.4+ +def hasSHFL : Predicate<"!(Subtarget->getSmVersion() >= 70" + "&& Subtarget->getPTXVersion() >= 64)">; + def useShortPtr : Predicate<"useShortPointers()">; def useFP16Math: Predicate<"Subtarget->allowFP16Math()">; diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td index 1752d3e0575..c52195fb044 100644 --- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td +++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td @@ -56,6 +56,10 @@ class RegSeq { []); } +class THREADMASK_INFO { + list ret = !if(sync, [0,1], [0]); +} + //----------------------------------- // Synchronization and shuffle functions //----------------------------------- @@ -129,121 +133,64 @@ def INT_BARRIER_SYNC_CNT_II : NVPTXInst<(outs), (ins i32imm:$id, i32imm:$cnt), [(int_nvvm_barrier_sync_cnt imm:$id, imm:$cnt)]>, Requires<[hasPTX60, hasSM30]>; - -// shfl.{up,down,bfly,idx}.b32 -multiclass SHFL { - // The last two parameters to shfl can be regs or imms. ptxas is smart - // enough to inline constant registers, so strictly speaking we don't need to - // handle immediates here. But it's easy enough, and it makes our ptx more - // readable. - def reg : NVPTXInst< - (outs regclass:$dst), - (ins regclass:$src, Int32Regs:$offset, Int32Regs:$mask), - !strconcat("shfl.", mode, ".b32 $dst, $src, $offset, $mask;"), - [(set regclass:$dst, (IntOp regclass:$src, Int32Regs:$offset, Int32Regs:$mask))]>; - - def imm1 : NVPTXInst< - (outs regclass:$dst), - (ins regclass:$src, i32imm:$offset, Int32Regs:$mask), - !strconcat("shfl.", mode, ".b32 $dst, $src, $offset, $mask;"), - [(set regclass:$dst, (IntOp regclass:$src, imm:$offset, Int32Regs:$mask))]>; - - def imm2 : NVPTXInst< - (outs regclass:$dst), - (ins regclass:$src, Int32Regs:$offset, i32imm:$mask), - !strconcat("shfl.", mode, ".b32 $dst, $src, $offset, $mask;"), - [(set regclass:$dst, (IntOp regclass:$src, Int32Regs:$offset, imm:$mask))]>; - - def imm3 : NVPTXInst< - (outs regclass:$dst), - (ins regclass:$src, i32imm:$offset, i32imm:$mask), - !strconcat("shfl.", mode, ".b32 $dst, $src, $offset, $mask;"), - [(set regclass:$dst, (IntOp regclass:$src, imm:$offset, imm:$mask))]>; +class SHFL_INSTR + : NVPTXInst<(outs), (ins), "?", []> { + NVPTXRegClass rc = !cond( + !eq(reg, "i32"): Int32Regs, + !eq(reg, "f32"): Float32Regs); + string IntrName = "int_nvvm_shfl_" + # !if(sync, "sync_", "") + # mode + # "_" # reg + # !if(return_pred, "p", ""); + Intrinsic Intr = !cast(IntrName); + let InOperandList = !con( + !if(sync, + !dag(ins, !if(threadmask_imm, [i32imm], [Int32Regs]), ["threadmask"]), + (ins)), + (ins rc:$src), + !dag(ins, !if(offset_imm, [i32imm], [Int32Regs]), ["offset"]), + !dag(ins, !if(mask_imm, [i32imm], [Int32Regs]), ["mask"]) + ); + let OutOperandList = !if(return_pred, (outs rc:$dst, Int1Regs:$pred), (outs rc:$dst)); + let AsmString = "shfl." + # !if(sync, "sync.", "") + # mode # ".b32\t" + # "$dst" + # !if(return_pred, "|$pred", "") # ", " + # "$src, $offset, $mask" + # !if(sync, ", $threadmask", "") + # ";" + ; + let Pattern = [!con( + !foreach(tmp, OutOperandList, + !subst(outs, set, + !subst(i32imm, imm, tmp))), + (set !foreach(tmp, InOperandList, + !subst(ins, Intr, + !subst(i32imm, imm, tmp)))) + )]; } -defm INT_SHFL_DOWN_I32 : SHFL; -defm INT_SHFL_DOWN_F32 : SHFL; -defm INT_SHFL_UP_I32 : SHFL; -defm INT_SHFL_UP_F32 : SHFL; -defm INT_SHFL_BFLY_I32 : SHFL; -defm INT_SHFL_BFLY_F32 : SHFL; -defm INT_SHFL_IDX_I32 : SHFL; -defm INT_SHFL_IDX_F32 : SHFL; - -multiclass SHFL_SYNC { - // Threadmask and the last two parameters to shfl.sync can be regs or imms. - // ptxas is smart enough to inline constant registers, so strictly speaking we - // don't need to handle immediates here. But it's easy enough, and it makes - // our ptx more readable. - def rrr : NVPTXInst< - (outs regclass:$dst), - (ins Int32Regs:$threadmask, regclass:$src, Int32Regs:$offset, Int32Regs:$mask), - !strconcat("shfl.sync.", mode, ".b32 $dst, $src, $offset, $mask, $threadmask;"), - [(set regclass:$dst, (IntOp Int32Regs:$threadmask, regclass:$src, - Int32Regs:$offset, Int32Regs:$mask))]>; - - def rri : NVPTXInst< - (outs regclass:$dst), - (ins Int32Regs:$threadmask, regclass:$src, Int32Regs:$offset, i32imm:$mask), - !strconcat("shfl.sync.", mode, ".b32 $dst, $src, $offset, $mask, $threadmask;"), - [(set regclass:$dst, (IntOp Int32Regs:$threadmask, regclass:$src, - Int32Regs:$offset, imm:$mask))]>; - - def rir : NVPTXInst< - (outs regclass:$dst), - (ins Int32Regs:$threadmask, regclass:$src, i32imm:$offset, Int32Regs:$mask), - !strconcat("shfl.sync.", mode, ".b32 $dst, $src, $offset, $mask, $threadmask;"), - [(set regclass:$dst, (IntOp Int32Regs:$threadmask, regclass:$src, - imm:$offset, Int32Regs:$mask))]>; - - def rii : NVPTXInst< - (outs regclass:$dst), - (ins Int32Regs:$threadmask, regclass:$src, i32imm:$offset, i32imm:$mask), - !strconcat("shfl.sync.", mode, ".b32 $dst, $src, $offset, $mask, $threadmask;"), - [(set regclass:$dst, (IntOp Int32Regs:$threadmask, regclass:$src, - imm:$offset, imm:$mask))]>; - - def irr : NVPTXInst< - (outs regclass:$dst), - (ins i32imm:$threadmask, regclass:$src, Int32Regs:$offset, Int32Regs:$mask), - !strconcat("shfl.sync.", mode, ".b32 $dst, $src, $offset, $mask, $threadmask;"), - [(set regclass:$dst, (IntOp imm:$threadmask, regclass:$src, - Int32Regs:$offset, Int32Regs:$mask))]>; - - def iri : NVPTXInst< - (outs regclass:$dst), - (ins i32imm:$threadmask, regclass:$src, Int32Regs:$offset, i32imm:$mask), - !strconcat("shfl.sync.", mode, ".b32 $dst, $src, $offset, $mask, $threadmask;"), - [(set regclass:$dst, (IntOp imm:$threadmask, regclass:$src, - Int32Regs:$offset, imm:$mask))]>; - - def iir : NVPTXInst< - (outs regclass:$dst), - (ins i32imm:$threadmask, regclass:$src, i32imm:$offset, Int32Regs:$mask), - !strconcat("shfl.sync.", mode, ".b32 $dst, $src, $offset, $mask, $threadmask;"), - [(set regclass:$dst, (IntOp imm:$threadmask, regclass:$src, - imm:$offset, Int32Regs:$mask))]>; - - def iii : NVPTXInst< - (outs regclass:$dst), - (ins i32imm:$threadmask, regclass:$src, i32imm:$offset, i32imm:$mask), - !strconcat("shfl.sync.", mode, ".b32 $dst, $src, $offset, $mask, $threadmask;"), - [(set regclass:$dst, (IntOp imm:$threadmask, regclass:$src, - imm:$offset, imm:$mask))]>; +foreach sync = [0, 1] in { + foreach mode = ["up", "down", "bfly", "idx"] in { + foreach regclass = ["i32", "f32"] in { + foreach return_pred = [0, 1] in { + foreach offset_imm = [0, 1] in { + foreach mask_imm = [0, 1] in { + foreach threadmask_imm = THREADMASK_INFO.ret in { + def : SHFL_INSTR, + Requires; + } + } + } + } + } + } } -// On sm_70 these don't have to be convergent, so we may eventually want to -// implement non-convergent variant of this intrinsic. -defm INT_SHFL_SYNC_DOWN_I32 : SHFL_SYNC; -defm INT_SHFL_SYNC_DOWN_F32 : SHFL_SYNC; -defm INT_SHFL_SYNC_UP_I32 : SHFL_SYNC; -defm INT_SHFL_SYNC_UP_F32 : SHFL_SYNC; -defm INT_SHFL_SYNC_BFLY_I32 : SHFL_SYNC; -defm INT_SHFL_SYNC_BFLY_F32 : SHFL_SYNC; -defm INT_SHFL_SYNC_IDX_I32 : SHFL_SYNC; -defm INT_SHFL_SYNC_IDX_F32 : SHFL_SYNC; - - // vote.{all,any,uni,ballot} multiclass VOTE { def : NVPTXInst<(outs regclass:$dest), (ins Int1Regs:$pred), -- cgit v1.2.3