diff options
author | Matt Arsenault <Matthew.Arsenault@amd.com> | 2016-05-18 16:10:19 +0000 |
---|---|---|
committer | Matt Arsenault <Matthew.Arsenault@amd.com> | 2016-05-18 16:10:19 +0000 |
commit | 1735da460b1494a9fcc40b6af8006e78fa3f1475 (patch) | |
tree | 9335825ce59f9ee06c8668ec8da90fe3db8fa623 | |
parent | 8eb336c14e4c033a3f81a3a43fe397f0879a2e68 (diff) | |
download | bcm5719-llvm-1735da460b1494a9fcc40b6af8006e78fa3f1475.tar.gz bcm5719-llvm-1735da460b1494a9fcc40b6af8006e78fa3f1475.zip |
AMDGPU: Other sizes of popcnt are fast
We can chain bcnt instructions together, so
a popcnt of any width is pretty fast.
llvm-svn: 269950
-rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h | 2 | ||||
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/ctpop64.ll | 52 | ||||
-rw-r--r-- | llvm/test/Transforms/LoopIdiom/AMDGPU/popcnt.ll | 25 |
3 files changed, 77 insertions, 2 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h index 036f627e466..25187a280e3 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h @@ -76,7 +76,7 @@ public: TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth) { assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2"); - return ST->hasBCNT(TyWidth) ? TTI::PSK_FastHardware : TTI::PSK_Software; + return TTI::PSK_FastHardware; } unsigned getNumberOfRegisters(bool Vector); diff --git a/llvm/test/CodeGen/AMDGPU/ctpop64.ll b/llvm/test/CodeGen/AMDGPU/ctpop64.ll index e9e995ecda9..b141d608658 100644 --- a/llvm/test/CodeGen/AMDGPU/ctpop64.ll +++ b/llvm/test/CodeGen/AMDGPU/ctpop64.ll @@ -7,6 +7,9 @@ declare <4 x i64> @llvm.ctpop.v4i64(<4 x i64>) nounwind readnone declare <8 x i64> @llvm.ctpop.v8i64(<8 x i64>) nounwind readnone declare <16 x i64> @llvm.ctpop.v16i64(<16 x i64>) nounwind readnone +declare i65 @llvm.ctpop.i65(i65) nounwind readnone +declare i128 @llvm.ctpop.i128(i128) nounwind readnone + ; FUNC-LABEL: {{^}}s_ctpop_i64: ; SI: s_load_dwordx2 [[SVAL:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0xb ; VI: s_load_dwordx2 [[SVAL:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c @@ -141,3 +144,52 @@ endif: store i64 %tmp5, i64 addrspace(1)* %out ret void } + +; FUNC-LABEL: {{^}}s_ctpop_i128: +; GCN: s_bcnt1_i32_b64 [[SRESULT0:s[0-9]+]], +; GCN: s_bcnt1_i32_b64 [[SRESULT1:s[0-9]+]], +; GCN: s_add_i32 s{{[0-9]+}}, [[SRESULT0]], [[SRESULT1]] +; GCN: s_endpgm +define void @s_ctpop_i128(i32 addrspace(1)* noalias %out, i128 %val) nounwind { + %ctpop = call i128 @llvm.ctpop.i128(i128 %val) nounwind readnone + %truncctpop = trunc i128 %ctpop to i32 + store i32 %truncctpop, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}s_ctpop_i65: +; GCN: s_bcnt1_i32_b64 +; GCN: s_and_b32 +; GCN: s_bcnt1_i32_b64 +; GCN: s_add_i32 +; GCN: s_endpgm +define void 
@s_ctpop_i65(i32 addrspace(1)* noalias %out, i65 %val) nounwind { + %ctpop = call i65 @llvm.ctpop.i65(i65 %val) nounwind readnone + %truncctpop = trunc i65 %ctpop to i32 + store i32 %truncctpop, i32 addrspace(1)* %out, align 4 + ret void +} + +; FIXME: Should not have extra add + +; FUNC-LABEL: {{^}}v_ctpop_i128: +; GCN-DAG: buffer_load_dwordx2 v{{\[}}[[VAL0:[0-9]+]]:[[VAL1:[0-9]+]]{{\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} +; GCN-DAG: buffer_load_dwordx2 v{{\[}}[[VAL2:[0-9]+]]:[[VAL3:[0-9]+]]{{\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8{{$}} + +; GCN: v_bcnt_u32_b32_e64 [[MIDRESULT0:v[0-9]+]], v[[VAL2]], 0 +; GCN: v_bcnt_u32_b32{{_e32|_e64}} [[MIDRESULT1:v[0-9]+]], v[[VAL3]], [[MIDRESULT0]] + +; GCN: v_bcnt_u32_b32_e64 [[MIDRESULT2:v[0-9]+]], v[[VAL0]], 0 +; GCN: v_bcnt_u32_b32{{_e32|_e64}} [[MIDRESULT3:v[0-9]+]], v[[VAL1]], [[MIDRESULT2]] + +; GCN: v_add_i32_e32 [[RESULT:v[0-9]+]], vcc, [[MIDRESULT2]], [[MIDRESULT1]] + +; GCN: buffer_store_dword [[RESULT]], +; GCN: s_endpgm +define void @v_ctpop_i128(i32 addrspace(1)* noalias %out, i128 addrspace(1)* noalias %in) nounwind { + %val = load i128, i128 addrspace(1)* %in, align 8 + %ctpop = call i128 @llvm.ctpop.i128(i128 %val) nounwind readnone + %truncctpop = trunc i128 %ctpop to i32 + store i32 %truncctpop, i32 addrspace(1)* %out, align 4 + ret void +} diff --git a/llvm/test/Transforms/LoopIdiom/AMDGPU/popcnt.ll b/llvm/test/Transforms/LoopIdiom/AMDGPU/popcnt.ll index e4301bbb06d..e594c79a3e1 100644 --- a/llvm/test/Transforms/LoopIdiom/AMDGPU/popcnt.ll +++ b/llvm/test/Transforms/LoopIdiom/AMDGPU/popcnt.ll @@ -1,4 +1,4 @@ -; RUN: opt -loop-idiom -mtriple=r600-- -mcpu=SI -S < %s | FileCheck %s +; RUN: opt -loop-idiom -mtriple=amdgcn-- -S < %s | FileCheck %s ; Mostly copied from x86 version. 
@@ -59,6 +59,29 @@ while.end: ; preds = %while.body, %entry ret i32 %c.0.lcssa } +; CHECK-LABEL: @popcount_i128 +; CHECK: entry +; CHECK: llvm.ctpop.i128 +; CHECK: ret +define i32 @popcount_i128(i128 %a) nounwind uwtable readnone ssp { +entry: + %tobool3 = icmp eq i128 %a, 0 + br i1 %tobool3, label %while.end, label %while.body + +while.body: ; preds = %entry, %while.body + %c.05 = phi i32 [ %inc, %while.body ], [ 0, %entry ] + %a.addr.04 = phi i128 [ %and, %while.body ], [ %a, %entry ] + %inc = add nsw i32 %c.05, 1 + %sub = add i128 %a.addr.04, -1 + %and = and i128 %sub, %a.addr.04 + %tobool = icmp eq i128 %and, 0 + br i1 %tobool, label %while.end, label %while.body + +while.end: ; preds = %while.body, %entry + %c.0.lcssa = phi i32 [ 0, %entry ], [ %inc, %while.body ] + ret i32 %c.0.lcssa +} + ; To recognize this pattern: ;int popcount(unsigned long long a, int mydata1, int mydata2) { ; int c = 0; |