3 files changed, 111 insertions, 3 deletions
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 4120e79e667..cde38e479ac 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -7176,6 +7176,22 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
     }
   }
 
+  // Fold truncate of a bitcast of a vector to an extract of the low vector
+  // element.
+  //
+  // e.g. trunc (i64 (bitcast v2i32:x)) -> extract_vector_elt v2i32:x, 0
+  if (N0.getOpcode() == ISD::BITCAST && !VT.isVector()) {
+    SDValue VecSrc = N0.getOperand(0);
+    EVT SrcVT = VecSrc.getValueType();
+    if (SrcVT.isVector() && SrcVT.getScalarType() == VT) {
+      SDLoc SL(N);
+
+      EVT IdxVT = TLI.getVectorIdxTy(DAG.getDataLayout());
+      return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, VT,
+                         VecSrc, DAG.getConstant(0, SL, IdxVT));
+    }
+  }
+
   // Simplify the operands using demanded-bits information.
   if (!VT.isVector() &&
       SimplifyDemandedBits(SDValue(N, 0)))
diff --git a/llvm/test/CodeGen/AMDGPU/half.ll b/llvm/test/CodeGen/AMDGPU/half.ll
index 76fbc6a5eae..409b2ec8a29 100644
--- a/llvm/test/CodeGen/AMDGPU/half.ll
+++ b/llvm/test/CodeGen/AMDGPU/half.ll
@@ -396,12 +396,11 @@ define void @global_extload_v2f16_to_v2f64(<2 x double> addrspace(1)* %out, <2 x
 ; GCN-LABEL: {{^}}global_extload_v3f16_to_v3f64:
 
 ; GCN: buffer_load_dwordx2 [[LOAD:v\[[0-9]+:[0-9]+\]]]
-; GCN: v_lshrrev_b32_e32 {{v[0-9]+}}, 16, {{v[0-9]+}}
-
 ; GCN: v_cvt_f32_f16_e32
 ; GCN: v_cvt_f32_f16_e32
+; GCN-DAG: v_lshrrev_b32_e32 {{v[0-9]+}}, 16, {{v[0-9]+}}
 ; GCN: v_cvt_f32_f16_e32
-; GCN-NOT: v_cvt_f32_f16_e32
+; GCN-NOT: v_cvt_f32_f16
 
 ; GCN: v_cvt_f64_f32_e32
 ; GCN: v_cvt_f64_f32_e32
diff --git a/llvm/test/CodeGen/AMDGPU/trunc-bitcast-vector.ll b/llvm/test/CodeGen/AMDGPU/trunc-bitcast-vector.ll
new file mode 100644
index 00000000000..20c49e2809c
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/trunc-bitcast-vector.ll
@@ -0,0 +1,93 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s
+
+; CHECK-LABEL: {{^}}trunc_i64_bitcast_v2i32:
+; CHECK: buffer_load_dword v
+; CHECK: buffer_store_dword v
+define void @trunc_i64_bitcast_v2i32(i32 addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
+  %ld = load <2 x i32>, <2 x i32> addrspace(1)* %in
+  %bc = bitcast <2 x i32> %ld to i64
+  %trunc = trunc i64 %bc to i32
+  store i32 %trunc, i32 addrspace(1)* %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}trunc_i96_bitcast_v3i32:
+; CHECK: buffer_load_dword v
+; CHECK: buffer_store_dword v
+define void @trunc_i96_bitcast_v3i32(i32 addrspace(1)* %out, <3 x i32> addrspace(1)* %in) {
+  %ld = load <3 x i32>, <3 x i32> addrspace(1)* %in
+  %bc = bitcast <3 x i32> %ld to i96
+  %trunc = trunc i96 %bc to i32
+  store i32 %trunc, i32 addrspace(1)* %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}trunc_i128_bitcast_v4i32:
+; CHECK: buffer_load_dword v
+; CHECK: buffer_store_dword v
+define void @trunc_i128_bitcast_v4i32(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
+  %ld = load <4 x i32>, <4 x i32> addrspace(1)* %in
+  %bc = bitcast <4 x i32> %ld to i128
+  %trunc = trunc i128 %bc to i32
+  store i32 %trunc, i32 addrspace(1)* %out
+  ret void
+}
+
+; Don't want load width reduced in this case.
+; CHECK-LABEL: {{^}}trunc_i16_bitcast_v2i16:
+; CHECK: buffer_load_dword [[VAL:v[0-9]+]]
+; CHECK: buffer_store_short [[VAL]]
+define void @trunc_i16_bitcast_v2i16(i16 addrspace(1)* %out, <2 x i16> addrspace(1)* %in) {
+  %ld = load <2 x i16>, <2 x i16> addrspace(1)* %in
+  %bc = bitcast <2 x i16> %ld to i32
+  %trunc = trunc i32 %bc to i16
+  store i16 %trunc, i16 addrspace(1)* %out
+  ret void
+}
+
+; FIXME: Don't want load width reduced here.
+; CHECK-LABEL: {{^}}trunc_i16_bitcast_v4i16:
+; CHECK: buffer_load_ushort [[VAL:v[0-9]+]]
+; CHECK: buffer_store_short [[VAL]]
+define void @trunc_i16_bitcast_v4i16(i16 addrspace(1)* %out, <4 x i16> addrspace(1)* %in) {
+  %ld = load <4 x i16>, <4 x i16> addrspace(1)* %in
+  %bc = bitcast <4 x i16> %ld to i64
+  %trunc = trunc i64 %bc to i16
+  store i16 %trunc, i16 addrspace(1)* %out
+  ret void
+}
+
+; FIXME: Don't want load width reduced in this case.
+; CHECK-LABEL: {{^}}trunc_i8_bitcast_v2i8:
+; CHECK: buffer_load_ubyte [[VAL:v[0-9]+]]
+; CHECK: buffer_store_byte [[VAL]]
+define void @trunc_i8_bitcast_v2i8(i8 addrspace(1)* %out, <2 x i8> addrspace(1)* %in) {
+  %ld = load <2 x i8>, <2 x i8> addrspace(1)* %in
+  %bc = bitcast <2 x i8> %ld to i16
+  %trunc = trunc i16 %bc to i8
+  store i8 %trunc, i8 addrspace(1)* %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}trunc_i32_bitcast_v4i8:
+; CHECK: buffer_load_dword [[VAL:v[0-9]+]]
+; CHECK: buffer_store_byte [[VAL]]
+define void @trunc_i32_bitcast_v4i8(i8 addrspace(1)* %out, <4 x i8> addrspace(1)* %in) {
+  %ld = load <4 x i8>, <4 x i8> addrspace(1)* %in
+  %bc = bitcast <4 x i8> %ld to i32
+  %trunc = trunc i32 %bc to i8
+  store i8 %trunc, i8 addrspace(1)* %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}trunc_i24_bitcast_v3i8:
+; CHECK: buffer_load_dword [[VAL:v[0-9]+]]
+; CHECK: buffer_store_byte [[VAL]]
+define void @trunc_i24_bitcast_v3i8(i8 addrspace(1)* %out, <3 x i8> addrspace(1)* %in) {
+  %ld = load <3 x i8>, <3 x i8> addrspace(1)* %in
+  %bc = bitcast <3 x i8> %ld to i24
+  %trunc = trunc i24 %bc to i8
+  store i8 %trunc, i8 addrspace(1)* %out
+  ret void
+}