author    Matt Arsenault <Matthew.Arsenault@amd.com>  2018-06-06 21:28:11 +0000
committer Matt Arsenault <Matthew.Arsenault@amd.com>  2018-06-06 21:28:11 +0000
commit    e9524f1fb33facb8abcdd7604a98590c608d92e9
tree      66e9920ace9d5e08e863b117e840a39be3e91588
parent    29407f3abe1632df8acb8c578501da1165b251bd
AMDGPU: Custom lower v2f16 fneg/fabs with illegal f16
Fixes terrible code on targets without f16 support. The legalization creates a mess that is difficult to recover from. This should also avoid randomly breaking these tests multiple times in sequence in future commits.

There are some regressions in cases where it happens to be better to pull the source modifier after the conversion.

llvm-svn: 334132
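As a rough illustration of why this lowering works (a standalone C++ sketch, not LLVM code; the helper names are invented for this example): a <2 x half> packed in an i32 keeps each element's sign in bit 15 of its 16-bit lane, so fneg and fabs reduce to an integer XOR/AND with the 0x80008000 and 0x7fff7fff masks used in the patch, with no f16 arithmetic support required.

#include <cstdint>
#include <cstdio>

// fneg: flip both per-lane sign bits (the XOR mask from the new lowering).
static uint32_t fneg_v2f16_bits(uint32_t Packed) {
  return Packed ^ 0x80008000u;
}

// fabs: clear both per-lane sign bits (the AND mask from the new lowering).
static uint32_t fabs_v2f16_bits(uint32_t Packed) {
  return Packed & 0x7fff7fffu;
}

int main() {
  // Element 0 in the low half: 0x4400 is +4.0 in IEEE half precision;
  // element 1 in the high half: 0xc400 is -4.0.
  uint32_t V = 0xc4004400u; // <+4.0, -4.0>
  printf("fneg: 0x%08x\n", fneg_v2f16_bits(V)); // 0x4400c400 -> <-4.0, +4.0>
  printf("fabs: 0x%08x\n", fabs_v2f16_bits(V)); // 0x44004400 -> <+4.0, +4.0>
}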
-rw-r--r--  llvm/lib/Target/AMDGPU/SIISelLowering.cpp  |  25
-rw-r--r--  llvm/lib/Target/AMDGPU/SIInstructions.td   |   5
-rw-r--r--  llvm/test/CodeGen/AMDGPU/fabs.f16.ll       |  69
-rw-r--r--  llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll  |   6
-rw-r--r--  llvm/test/CodeGen/AMDGPU/fneg.f16.ll       |  30
5 files changed, 97 insertions(+), 38 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index d95cdcb07f2..6df499c7905 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -531,6 +531,9 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
// Legalization hack.
setOperationAction(ISD::SELECT, MVT::v2i16, Custom);
setOperationAction(ISD::SELECT, MVT::v2f16, Custom);
+
+ setOperationAction(ISD::FNEG, MVT::v2f16, Custom);
+ setOperationAction(ISD::FABS, MVT::v2f16, Custom);
}
for (MVT VT : { MVT::v4i16, MVT::v4f16, MVT::v2i8, MVT::v4i8, MVT::v8i8 }) {
@@ -3700,6 +3703,28 @@ void SITargetLowering::ReplaceNodeResults(SDNode *N,
Results.push_back(DAG.getNode(ISD::BITCAST, SL, VT, NewSelect));
return;
}
+ case ISD::FNEG: {
+ SDLoc SL(N);
+ assert(N->getValueType(0) == MVT::v2f16);
+ SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0));
+
+ SDValue Op = DAG.getNode(ISD::XOR, SL, MVT::i32,
+ BC,
+ DAG.getConstant(0x80008000, SL, MVT::i32));
+ Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op));
+ return;
+ }
+ case ISD::FABS: {
+ SDLoc SL(N);
+ assert(N->getValueType(0) == MVT::v2f16);
+ SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0));
+
+ SDValue Op = DAG.getNode(ISD::AND, SL, MVT::i32,
+ BC,
+ DAG.getConstant(0x7fff7fff, SL, MVT::i32));
+ Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op));
+ return;
+ }
default:
break;
}
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 27efb8e7dea..67eae639e5f 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -634,6 +634,11 @@ def : GCNPat <
>;
def : GCNPat <
+ (f32 (f16_to_fp (i32 (srl_oneuse (and_oneuse i32:$src0, 0x7fff0000), (i32 16))))),
+ (V_CVT_F32_F16_e64 SRCMODS.ABS, (i32 (V_LSHRREV_B32_e64 (i32 16), i32:$src0)), DSTCLAMP.NONE, DSTOMOD.NONE)
+>;
+
+def : GCNPat <
(f32 (f16_to_fp (or_oneuse i32:$src0, 0x8000))),
(V_CVT_F32_F16_e64 SRCMODS.NEG_ABS, $src0, DSTCLAMP.NONE, DSTOMOD.NONE)
>;
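The new f16_to_fp pattern above rests on a small bit identity (an informal note, not part of the patch): ((x & 0x7fff0000) >> 16) == ((x >> 16) & 0x7fff), i.e. masking with 0x7fff0000 before the 16-bit shift clears exactly the sign bit of the high half. The shifted value is therefore already the fabs of the high f16, which is why the output pattern can fold the mask into the conversion's |src| modifier (SRCMODS.ABS on V_CVT_F32_F16_e64) instead of materializing a separate AND. A throwaway C++ check of that identity, with a hypothetical helper name:

#include <cassert>
#include <cstdint>

// Hypothetical helper: the mask-then-shift on the left is what the DAG
// pattern matches; the shift-then-mask on the right is the "abs of the
// high half" form that the selected V_CVT_F32_F16_e64 |src| modifier
// effectively computes.
static bool HighHalfAbsIdentity(uint32_t X) {
  return ((X & 0x7fff0000u) >> 16) == ((X >> 16) & 0x7fffu);
}

int main() {
  for (uint32_t X : {0x00000000u, 0xffffffffu, 0x80008000u, 0xc4004400u})
    assert(HighHalfAbsIdentity(X));
}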
diff --git a/llvm/test/CodeGen/AMDGPU/fabs.f16.ll b/llvm/test/CodeGen/AMDGPU/fabs.f16.ll
index d3e4afc8e83..3f1ffc7b70b 100644
--- a/llvm/test/CodeGen/AMDGPU/fabs.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fabs.f16.ll
@@ -28,16 +28,9 @@ define amdgpu_kernel void @s_fabs_f16(half addrspace(1)* %out, half %in) {
ret void
}
-; FIXME: Should be able to use single and
; GCN-LABEL: {{^}}s_fabs_v2f16:
-; CI: s_movk_i32 [[MASK:s[0-9]+]], 0x7fff
-; CI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]]
-; CI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16,
-; CI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]]
-; CI: v_or_b32_e32
-
-; GFX89: s_load_dword [[VAL:s[0-9]+]]
-; GFX89: s_and_b32 s{{[0-9]+}}, [[VAL]], 0x7fff7fff
+; GCN: s_load_dword [[VAL:s[0-9]+]]
+; GCN: s_and_b32 s{{[0-9]+}}, [[VAL]], 0x7fff7fff
define amdgpu_kernel void @s_fabs_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %in) {
%fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %in)
store <2 x half> %fabs, <2 x half> addrspace(1)* %out
@@ -45,18 +38,11 @@ define amdgpu_kernel void @s_fabs_v2f16(<2 x half> addrspace(1)* %out, <2 x half
}
; GCN-LABEL: {{^}}s_fabs_v4f16:
-; CI: s_movk_i32 [[MASK:s[0-9]+]], 0x7fff
-; CI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]]
-; CI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]]
-; CI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]]
-; CI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]]
-
-
-; GFX89: s_load_dword s
-; GFX89: s_load_dword s
-; GFX89: s_mov_b32 [[MASK:s[0-9]+]], 0x7fff7fff
-; GFX89: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[MASK]]
-; GFX89: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[MASK]]
+; GCN: s_load_dword s
+; GCN: s_load_dword s
+; GCN: s_mov_b32 [[MASK:s[0-9]+]], 0x7fff7fff
+; GCN: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[MASK]]
+; GCN: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[MASK]]
; GCN: {{flat|global}}_store_dwordx2
define amdgpu_kernel void @s_fabs_v4f16(<4 x half> addrspace(1)* %out, <4 x half> %in) {
@@ -108,14 +94,19 @@ define amdgpu_kernel void @fabs_free_v2f16(<2 x half> addrspace(1)* %out, i32 %i
ret void
}
-; GCN-LABEL: {{^}}v_fabs_fold_v2f16:
+; FIXME: Should do fabs after conversion to avoid converting multiple
+; times in this particular case.
+
+; GCN-LABEL: {{^}}v_fabs_fold_self_v2f16:
; GCN: {{flat|global}}_load_dword [[VAL:v[0-9]+]]
+; CI: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
+; CI: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
; CI: v_cvt_f32_f16_e32
; CI: v_cvt_f32_f16_e32
-; CI: v_mul_f32_e64 v{{[0-9]+}}, |v{{[0-9]+}}|, v{{[0-9]+}}
+; CI: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; CI: v_cvt_f16_f32
-; CI: v_mul_f32_e64 v{{[0-9]+}}, |v{{[0-9]+}}|, v{{[0-9]+}}
+; CI: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; CI: v_cvt_f16_f32
; VI: v_mul_f16_sdwa v{{[0-9]+}}, |v{{[0-9]+}}|, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
@@ -123,7 +114,7 @@ define amdgpu_kernel void @fabs_free_v2f16(<2 x half> addrspace(1)* %out, i32 %i
; GFX9: v_and_b32_e32 [[FABS:v[0-9]+]], 0x7fff7fff, [[VAL]]
; GFX9: v_pk_mul_f16 v{{[0-9]+}}, [[FABS]], v{{[0-9]+$}}
-define amdgpu_kernel void @v_fabs_fold_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @v_fabs_fold_self_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %in, i32 %tid
%val = load <2 x half>, <2 x half> addrspace(1)* %gep
@@ -133,6 +124,34 @@ define amdgpu_kernel void @v_fabs_fold_v2f16(<2 x half> addrspace(1)* %out, <2 x
ret void
}
+; GCN-LABEL: {{^}}v_fabs_fold_v2f16:
+; GCN: {{flat|global}}_load_dword [[VAL:v[0-9]+]]
+
+; CI: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16
+; CI: v_cvt_f32_f16_e32
+; CI: v_cvt_f32_f16_e32
+; CI: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
+; CI: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; CI: v_cvt_f16_f32
+; CI: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; CI: v_cvt_f16_f32
+
+; VI: v_mul_f16_sdwa v{{[0-9]+}}, |v{{[0-9]+}}|, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI: v_mul_f16_e64 v{{[0-9]+}}, |v{{[0-9]+}}|, s{{[0-9]+}}
+
+; GFX9: v_and_b32_e32 [[FABS:v[0-9]+]], 0x7fff7fff, [[VAL]]
+; GFX9: v_pk_mul_f16 v{{[0-9]+}}, [[FABS]], s{{[0-9]+$}}
+define amdgpu_kernel void @v_fabs_fold_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in, i32 %other.val) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %in, i32 %tid
+ %val = load <2 x half>, <2 x half> addrspace(1)* %gep
+ %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %val)
+ %other.val.cvt = bitcast i32 %other.val to <2 x half>
+ %fmul = fmul <2 x half> %fabs, %other.val.cvt
+ store <2 x half> %fmul, <2 x half> addrspace(1)* %out
+ ret void
+}
+
; GCN-LABEL: {{^}}v_extract_fabs_fold_v2f16:
; GCN-DAG: {{flat|global}}_load_dword [[VAL:v[0-9]+]]
; CI-DAG: v_mul_f32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}}
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll b/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll
index d7141efc82d..bb6a1643ac0 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll
@@ -123,8 +123,10 @@ define amdgpu_kernel void @fneg_fabs_v4f16(<4 x half> addrspace(1)* %out, <4 x h
}
; GCN-LABEL: {{^}}fold_user_fneg_fabs_v2f16:
-; CI: v_cvt_f32_f16_e64 v{{[0-9]+}}, -|v{{[0-9]+}}|
-; CI: v_cvt_f32_f16_e64 v{{[0-9]+}}, -|v{{[0-9]+}}|
+; CI: s_load_dword s
+; CI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80008000
+; CI: v_cvt_f32_f16_e32 v{{[0-9]+}}, s{{[0-9]+}}
+; CI: v_cvt_f32_f16_e32 v{{[0-9]+}}, s{{[0-9]+}}
; CI: v_mul_f32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}}
; CI: v_mul_f32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}}
diff --git a/llvm/test/CodeGen/AMDGPU/fneg.f16.ll b/llvm/test/CodeGen/AMDGPU/fneg.f16.ll
index b4f8bb98cd7..34f464ac4d9 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg.f16.ll
@@ -60,17 +60,9 @@ define amdgpu_kernel void @v_fneg_fold_f16(half addrspace(1)* %out, half addrspa
ret void
}
-; FIXME: Terrible code with SI/CI.
; FIXME: scalar for VI, vector for gfx9
; GCN-LABEL: {{^}}s_fneg_v2f16:
-; CI: s_mov_b32 [[MASK:s[0-9]+]], 0x8000{{$}}
-; CI: v_xor_b32_e32 v{{[0-9]+}}, [[MASK]], v{{[0-9]+}}
-; CI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
-; CI: v_xor_b32_e32 v{{[0-9]+}}, [[MASK]], v{{[0-9]+}}
-; CI: v_or_b32_e32
-
-; VI: s_xor_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80008000
-
+; CIVI: s_xor_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80008000
; GFX9: v_xor_b32_e32 v{{[0-9]+}}, 0x80008000, v{{[0-9]+}}
define amdgpu_kernel void @s_fneg_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %in) #0 {
%fneg = fsub <2 x half> <half -0.0, half -0.0>, %in
@@ -78,6 +70,18 @@ define amdgpu_kernel void @s_fneg_v2f16(<2 x half> addrspace(1)* %out, <2 x half
ret void
}
+; FIXME: vector on gfx9
+; GCN-LABEL: {{^}}s_fneg_v2f16_nonload:
+; CIVI: s_xor_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80008000
+; GFX9: v_xor_b32_e32 v{{[0-9]+}}, 0x80008000, v{{[0-9]+}}
+define amdgpu_kernel void @s_fneg_v2f16_nonload(<2 x half> addrspace(1)* %out) #0 {
+ %in = call i32 asm sideeffect "; def $0", "=s"()
+ %in.bc = bitcast i32 %in to <2 x half>
+ %fneg = fsub <2 x half> <half -0.0, half -0.0>, %in.bc
+ store <2 x half> %fneg, <2 x half> addrspace(1)* %out
+ ret void
+}
+
; GCN-LABEL: {{^}}v_fneg_v2f16:
; GCN: {{flat|global}}_load_dword [[VAL:v[0-9]+]]
; GCN: v_xor_b32_e32 v{{[0-9]+}}, 0x80008000, [[VAL]]
@@ -107,8 +111,12 @@ define amdgpu_kernel void @fneg_free_v2f16(<2 x half> addrspace(1)* %out, i32 %i
; GCN-LABEL: {{^}}v_fneg_fold_v2f16:
; GCN: {{flat|global}}_load_dword [[VAL:v[0-9]+]]
-; CI: v_cvt_f32_f16_e64 v{{[0-9]+}}, -v{{[0-9]+}}
-; CI: v_cvt_f32_f16_e64 v{{[0-9]+}}, -v{{[0-9]+}}
+; CI: v_xor_b32_e32 [[FNEG:v[0-9]+]], 0x80008000, [[VAL]]
+; CI: v_lshrrev_b32_e32
+; CI: v_lshrrev_b32_e32
+
+; CI: v_cvt_f32_f16_e32 v{{[0-9]+}}, v{{[0-9]+}}
+; CI: v_cvt_f32_f16_e32 v{{[0-9]+}}, v{{[0-9]+}}
; CI: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; CI: v_cvt_f16_f32
; CI: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}