summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp16
-rw-r--r--llvm/test/CodeGen/AMDGPU/half.ll5
-rw-r--r--llvm/test/CodeGen/AMDGPU/trunc-bitcast-vector.ll93
3 files changed, 111 insertions, 3 deletions
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 4120e79e667..cde38e479ac 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -7176,6 +7176,22 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
}
}
+ // Fold truncate of a bitcast of a vector to an extract of the low vector
+ // element.
+ //
+ // e.g. trunc (i64 (bitcast v2i32:x)) -> extract_vector_elt v2i32:x, 0
+ if (N0.getOpcode() == ISD::BITCAST && !VT.isVector()) {
+ SDValue VecSrc = N0.getOperand(0);
+ EVT SrcVT = VecSrc.getValueType();
+ if (SrcVT.isVector() && SrcVT.getScalarType() == VT) {
+ SDLoc SL(N);
+
+ EVT IdxVT = TLI.getVectorIdxTy(DAG.getDataLayout());
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, VT,
+ VecSrc, DAG.getConstant(0, SL, IdxVT));
+ }
+ }
+
// Simplify the operands using demanded-bits information.
if (!VT.isVector() &&
SimplifyDemandedBits(SDValue(N, 0)))
diff --git a/llvm/test/CodeGen/AMDGPU/half.ll b/llvm/test/CodeGen/AMDGPU/half.ll
index 76fbc6a5eae..409b2ec8a29 100644
--- a/llvm/test/CodeGen/AMDGPU/half.ll
+++ b/llvm/test/CodeGen/AMDGPU/half.ll
@@ -396,12 +396,11 @@ define void @global_extload_v2f16_to_v2f64(<2 x double> addrspace(1)* %out, <2 x
; GCN-LABEL: {{^}}global_extload_v3f16_to_v3f64:
; GCN: buffer_load_dwordx2 [[LOAD:v\[[0-9]+:[0-9]+\]]]
-; GCN: v_lshrrev_b32_e32 {{v[0-9]+}}, 16, {{v[0-9]+}}
-
; GCN: v_cvt_f32_f16_e32
; GCN: v_cvt_f32_f16_e32
+; GCN-DAG: v_lshrrev_b32_e32 {{v[0-9]+}}, 16, {{v[0-9]+}}
; GCN: v_cvt_f32_f16_e32
-; GCN-NOT: v_cvt_f32_f16_e32
+; GCN-NOT: v_cvt_f32_f16
; GCN: v_cvt_f64_f32_e32
; GCN: v_cvt_f64_f32_e32
diff --git a/llvm/test/CodeGen/AMDGPU/trunc-bitcast-vector.ll b/llvm/test/CodeGen/AMDGPU/trunc-bitcast-vector.ll
new file mode 100644
index 00000000000..20c49e2809c
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/trunc-bitcast-vector.ll
@@ -0,0 +1,93 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s
+
+; CHECK-LABEL: {{^}}trunc_i64_bitcast_v2i32:
+; CHECK: buffer_load_dword v
+; CHECK: buffer_store_dword v
+define void @trunc_i64_bitcast_v2i32(i32 addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
+ %ld = load <2 x i32>, <2 x i32> addrspace(1)* %in
+ %bc = bitcast <2 x i32> %ld to i64
+ %trunc = trunc i64 %bc to i32
+ store i32 %trunc, i32 addrspace(1)* %out
+ ret void
+}
+
+; CHECK-LABEL: {{^}}trunc_i96_bitcast_v3i32:
+; CHECK: buffer_load_dword v
+; CHECK: buffer_store_dword v
+define void @trunc_i96_bitcast_v3i32(i32 addrspace(1)* %out, <3 x i32> addrspace(1)* %in) {
+ %ld = load <3 x i32>, <3 x i32> addrspace(1)* %in
+ %bc = bitcast <3 x i32> %ld to i96
+ %trunc = trunc i96 %bc to i32
+ store i32 %trunc, i32 addrspace(1)* %out
+ ret void
+}
+
+; CHECK-LABEL: {{^}}trunc_i128_bitcast_v4i32:
+; CHECK: buffer_load_dword v
+; CHECK: buffer_store_dword v
+define void @trunc_i128_bitcast_v4i32(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
+ %ld = load <4 x i32>, <4 x i32> addrspace(1)* %in
+ %bc = bitcast <4 x i32> %ld to i128
+ %trunc = trunc i128 %bc to i32
+ store i32 %trunc, i32 addrspace(1)* %out
+ ret void
+}
+
+; Don't want load width reduced in this case.
+; CHECK-LABEL: {{^}}trunc_i16_bitcast_v2i16:
+; CHECK: buffer_load_dword [[VAL:v[0-9]+]]
+; CHECK: buffer_store_short [[VAL]]
+define void @trunc_i16_bitcast_v2i16(i16 addrspace(1)* %out, <2 x i16> addrspace(1)* %in) {
+ %ld = load <2 x i16>, <2 x i16> addrspace(1)* %in
+ %bc = bitcast <2 x i16> %ld to i32
+ %trunc = trunc i32 %bc to i16
+ store i16 %trunc, i16 addrspace(1)* %out
+ ret void
+}
+
+; FIXME: Don't want load width reduced here.
+; CHECK-LABEL: {{^}}trunc_i16_bitcast_v4i16:
+; CHECK: buffer_load_ushort [[VAL:v[0-9]+]]
+; CHECK: buffer_store_short [[VAL]]
+define void @trunc_i16_bitcast_v4i16(i16 addrspace(1)* %out, <4 x i16> addrspace(1)* %in) {
+ %ld = load <4 x i16>, <4 x i16> addrspace(1)* %in
+ %bc = bitcast <4 x i16> %ld to i64
+ %trunc = trunc i64 %bc to i16
+ store i16 %trunc, i16 addrspace(1)* %out
+ ret void
+}
+
+; FIXME: Don't want load width reduced in this case.
+; CHECK-LABEL: {{^}}trunc_i8_bitcast_v2i8:
+; CHECK: buffer_load_ubyte [[VAL:v[0-9]+]]
+; CHECK: buffer_store_byte [[VAL]]
+define void @trunc_i8_bitcast_v2i8(i8 addrspace(1)* %out, <2 x i8> addrspace(1)* %in) {
+ %ld = load <2 x i8>, <2 x i8> addrspace(1)* %in
+ %bc = bitcast <2 x i8> %ld to i16
+ %trunc = trunc i16 %bc to i8
+ store i8 %trunc, i8 addrspace(1)* %out
+ ret void
+}
+
+; CHECK-LABEL: {{^}}trunc_i32_bitcast_v4i8:
+; CHECK: buffer_load_dword [[VAL:v[0-9]+]]
+; CHECK: buffer_store_byte [[VAL]]
+define void @trunc_i32_bitcast_v4i8(i8 addrspace(1)* %out, <4 x i8> addrspace(1)* %in) {
+ %ld = load <4 x i8>, <4 x i8> addrspace(1)* %in
+ %bc = bitcast <4 x i8> %ld to i32
+ %trunc = trunc i32 %bc to i8
+ store i8 %trunc, i8 addrspace(1)* %out
+ ret void
+}
+
+; CHECK-LABEL: {{^}}trunc_i24_bitcast_v3i8:
+; CHECK: buffer_load_dword [[VAL:v[0-9]+]]
+; CHECK: buffer_store_byte [[VAL]]
+define void @trunc_i24_bitcast_v3i8(i8 addrspace(1)* %out, <3 x i8> addrspace(1)* %in) {
+ %ld = load <3 x i8>, <3 x i8> addrspace(1)* %in
+ %bc = bitcast <3 x i8> %ld to i24
+ %trunc = trunc i24 %bc to i8
+ store i8 %trunc, i8 addrspace(1)* %out
+ ret void
+}
OpenPOWER on IntegriCloud