diff options
| author | Matt Arsenault <Matthew.Arsenault@amd.com> | 2018-10-22 16:27:27 +0000 |
|---|---|---|
| committer | Matt Arsenault <Matthew.Arsenault@amd.com> | 2018-10-22 16:27:27 +0000 |
| commit | 687ec75d10bd860edb194d88d5438dcb1bc6eb92 (patch) | |
| tree | 89cfd5a61dee8ea2ffaadea0623d22eb28a86278 /llvm/test/CodeGen/AMDGPU/reduction.ll | |
| parent | b96181c2bf1d068824c6fd635c0921d0ffd20187 (diff) | |
| download | bcm5719-llvm-687ec75d10bd860edb194d88d5438dcb1bc6eb92.tar.gz bcm5719-llvm-687ec75d10bd860edb194d88d5438dcb1bc6eb92.zip | |
DAG: Change behavior of fminnum/fmaxnum nodes
Introduce new versions that follow the IEEE semantics
to help with legalization that may need quieted inputs.
There are some regressions from inserting unnecessary
canonicalizes when these nodes are matched from fast-math
fcmp + select patterns; these should be fixed in a future commit.
llvm-svn: 344914
Diffstat (limited to 'llvm/test/CodeGen/AMDGPU/reduction.ll')
| -rw-r--r-- | llvm/test/CodeGen/AMDGPU/reduction.ll | 112 |
1 file changed, 91 insertions, 21 deletions
diff --git a/llvm/test/CodeGen/AMDGPU/reduction.ll b/llvm/test/CodeGen/AMDGPU/reduction.ll index 74ca4a668f9..0c605f79d98 100644 --- a/llvm/test/CodeGen/AMDGPU/reduction.ll +++ b/llvm/test/CodeGen/AMDGPU/reduction.ll @@ -434,12 +434,23 @@ entry: } ; GCN-LABEL: {{^}}reduction_maxnum_v4f16: -; GFX9: v_pk_max_f16 [[MAX:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}{{$}} -; GFX9-NEXT: v_max_f16_sdwa v{{[0-9]+}}, [[MAX]], [[MAX]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 - -; VI: v_max_f16_sdwa -; VI-NEXT: v_max_f16_e32 -; VI-NEXT: v_max_f16_e32 +; GFX9: s_waitcnt +; GFX9-NEXT: v_pk_max_f16 [[CANON1:v[0-9]+]], v1, v1 +; GFX9-NEXT: v_pk_max_f16 [[CANON0:v[0-9]+]], v0, v0 +; GFX9-NEXT: v_pk_max_f16 [[MAX:v[0-9]+]], [[CANON0]], [[CANON1]]{{$}} + +; FIXME: Extra canonicalize leftover +; GFX9-NEXT: v_max_f16_sdwa [[TMP:v[0-9]+]], [[MAX]], [[MAX]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX9-NEXT: v_max_f16_e32 v0, [[MAX]], [[TMP]] + +; VI-DAG: v_max_f16_sdwa [[CANON1:v[0-9]+]], v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-DAG: v_max_f16_sdwa [[CANON3:v[0-9]+]], v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-DAG: v_max_f16_e32 [[CANON0:v[0-9]+]], v0, v0 +; VI-DAG: v_max_f16_e32 [[CANON2:v[0-9]+]], v1, v1 + +; VI-DAG: v_max_f16_e32 [[MAX0:v[0-9]+]], [[CANON1]], [[CANON3]] +; VI-DAG: v_max_f16_e32 [[MAX1:v[0-9]+]], [[CANON0]], [[CANON2]] +; VI: v_max_f16_e32 v0, [[MAX1]], [[MAX0]] define half @reduction_maxnum_v4f16(<4 x half> %vec4) { entry: %rdx.shuf = shufflevector <4 x half> %vec4, <4 x half> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef> @@ -451,12 +462,24 @@ entry: } ; GCN-LABEL: {{^}}reduction_minnum_v4f16: -; GFX9: v_pk_min_f16 [[MIN:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}{{$}} -; GFX9-NEXT: v_min_f16_sdwa v{{[0-9]+}}, [[MIN]], [[MIN]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9: s_waitcnt +; GFX9-NEXT: v_pk_max_f16 
[[CANON1:v[0-9]+]], v1, v1 +; GFX9-NEXT: v_pk_max_f16 [[CANON0:v[0-9]+]], v0, v0 +; GFX9-NEXT: v_pk_min_f16 [[MIN:v[0-9]+]], [[CANON0]], [[CANON1]]{{$}} + +; FIXME: Extra canonicalize leftover +; GFX9-NEXT: v_max_f16_sdwa [[TMP:v[0-9]+]], [[MIN]], [[MIN]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX9-NEXT: v_min_f16_e32 v0, [[MIN]], [[TMP]] -; VI: v_min_f16_sdwa -; VI-NEXT: v_min_f16_e32 -; VI-NEXT: v_min_f16_e32 + +; VI-DAG: v_max_f16_sdwa [[CANON1:v[0-9]+]], v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-DAG: v_max_f16_sdwa [[CANON3:v[0-9]+]], v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-DAG: v_max_f16_e32 [[CANON0:v[0-9]+]], v0, v0 +; VI-DAG: v_max_f16_e32 [[CANON2:v[0-9]+]], v1, v1 + +; VI-DAG: v_min_f16_e32 [[MAX0:v[0-9]+]], [[CANON1]], [[CANON3]] +; VI-DAG: v_min_f16_e32 [[MAX1:v[0-9]+]], [[CANON0]], [[CANON2]] +; VI: v_min_f16_e32 v0, [[MAX1]], [[MAX0]] define half @reduction_minnum_v4f16(<4 x half> %vec4) { entry: %rdx.shuf = shufflevector <4 x half> %vec4, <4 x half> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef> @@ -467,13 +490,36 @@ entry: ret half %res } +; FIXME: Need to preserve fast math flags when fmaxnum matched +; directly from the IR to avoid unnecessary quieting. 
+ ; GCN-LABEL: {{^}}reduction_fast_max_pattern_v4f16: -; GFX9: v_pk_max_f16 [[MAX:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}{{$}} -; GFX9-NEXT: v_max_f16_sdwa v{{[0-9]+}}, [[MAX]], [[MAX]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; XGFX9: v_pk_max_f16 [[MAX:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}{{$}} +; XGFX9-NEXT: v_max_f16_sdwa v{{[0-9]+}}, [[MAX]], [[MAX]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 + +; XVI: s_waitcnt +; XVI-NEXT: v_max_f16_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; XVI-NEXT: v_max_f16_e32 v0, v0, v1 +; XVI-NEXT: v_max_f16_e32 v0, v0, v2 +; XVI-NEXT: s_setpc_b64 -; VI: v_max_f16_sdwa -; VI-NEXT: v_max_f16_e32 -; VI-NEXT: v_max_f16_e32 +; GFX9: s_waitcnt +; GFX9-NEXT: v_pk_max_f16 [[CANON1:v[0-9]+]], v1, v1 +; GFX9-NEXT: v_pk_max_f16 [[CANON0:v[0-9]+]], v0, v0 +; GFX9-NEXT: v_pk_max_f16 [[MAX:v[0-9]+]], [[CANON0]], [[CANON1]]{{$}} + +; FIXME: Extra canonicalize leftover +; GFX9-NEXT: v_max_f16_sdwa [[TMP:v[0-9]+]], [[MAX]], [[MAX]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX9-NEXT: v_max_f16_e32 v0, [[MAX]], [[TMP]] + +; VI-DAG: v_max_f16_sdwa [[CANON1:v[0-9]+]], v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-DAG: v_max_f16_sdwa [[CANON3:v[0-9]+]], v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-DAG: v_max_f16_e32 [[CANON0:v[0-9]+]], v0, v0 +; VI-DAG: v_max_f16_e32 [[CANON2:v[0-9]+]], v1, v1 + +; VI-DAG: v_max_f16_e32 [[MAX0:v[0-9]+]], [[CANON1]], [[CANON3]] +; VI-DAG: v_max_f16_e32 [[MAX1:v[0-9]+]], [[CANON0]], [[CANON2]] +; VI: v_max_f16_e32 v0, [[MAX1]], [[MAX0]] define half @reduction_fast_max_pattern_v4f16(<4 x half> %vec4) { entry: %rdx.shuf = shufflevector <4 x half> %vec4, <4 x half> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef> @@ -486,13 +532,37 @@ entry: ret half %res } +; FIXME: Need to preserve fast math flags when fmaxnum matched +; directly 
from the IR to avoid unnecessary quieting. + ; GCN-LABEL: {{^}}reduction_fast_min_pattern_v4f16: -; GFX9: v_pk_min_f16 [[MIN:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}{{$}} -; GFX9-NEXT: v_min_f16_sdwa v{{[0-9]+}}, [[MIN]], [[MIN]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; XGFX9: v_pk_min_f16 [[MIN:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}{{$}} +; XGFX9-NEXT: v_min_f16_sdwa v{{[0-9]+}}, [[MIN]], [[MIN]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 + +; XVI: s_waitcnt +; XVI-NEXT: v_min_f16_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; XVI-NEXT: v_min_f16_e32 v0, v0, v1 +; XVI-NEXT: v_min_f16_e32 v0, v0, v2 +; XVI-NEXT: s_setpc_b64 + +; GFX9: s_waitcnt +; GFX9-NEXT: v_pk_max_f16 [[CANON1:v[0-9]+]], v1, v1 +; GFX9-NEXT: v_pk_max_f16 [[CANON0:v[0-9]+]], v0, v0 +; GFX9-NEXT: v_pk_min_f16 [[MIN:v[0-9]+]], [[CANON0]], [[CANON1]]{{$}} + +; FIXME: Extra canonicalize leftover +; GFX9-NEXT: v_max_f16_sdwa [[TMP:v[0-9]+]], [[MIN]], [[MIN]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX9-NEXT: v_min_f16_e32 v0, [[MIN]], [[TMP]] + + +; VI-DAG: v_max_f16_sdwa [[CANON1:v[0-9]+]], v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-DAG: v_max_f16_sdwa [[CANON3:v[0-9]+]], v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-DAG: v_max_f16_e32 [[CANON0:v[0-9]+]], v0, v0 +; VI-DAG: v_max_f16_e32 [[CANON2:v[0-9]+]], v1, v1 -; VI: v_min_f16_sdwa -; VI-NEXT: v_min_f16_e32 -; VI-NEXT: v_min_f16_e32 +; VI-DAG: v_min_f16_e32 [[MAX0:v[0-9]+]], [[CANON1]], [[CANON3]] +; VI-DAG: v_min_f16_e32 [[MAX1:v[0-9]+]], [[CANON0]], [[CANON2]] +; VI: v_min_f16_e32 v0, [[MAX1]], [[MAX0]] define half @reduction_fast_min_pattern_v4f16(<4 x half> %vec4) { entry: %rdx.shuf = shufflevector <4 x half> %vec4, <4 x half> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef> |

