author    Matt Arsenault <Matthew.Arsenault@amd.com>  2016-09-17 15:44:16 +0000
committer Matt Arsenault <Matthew.Arsenault@amd.com>  2016-09-17 15:44:16 +0000
commit    d99ef1144b38f41ca2e68bf666490110237ec2bf (patch)
tree      12d8684a46294d1baefee1692c486464eccde361
parent    0baa19004f5e2ec830d94256825c715f708208a8 (diff)
AMDGPU: Push bitcasts through build_vector
This reduces the number of copies and reg_sequences when using fp constant vectors. This significantly reduces the code size in local-stack-alloc-bug.ll.

llvm-svn: 281822
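The guard in the combine below requires the source and destination vectors to have the same number of elements; at equal total size that forces equal element widths, which is what makes the element-wise rewrite legal. A minimal standalone C++ sketch of that equivalence, using std::memcpy as a stand-in for bitcast (the values and names here are illustrative, not taken from the commit):

#include <cassert>
#include <cstdint>
#include <cstring>

int main() {
  // v4i32 analogue of build_vector (i32 7, i32 7, i32 7, i32 8).
  const uint32_t Ints[4] = {7, 7, 7, 8};

  // Whole-vector bitcast: reinterpret all 16 bytes as v4f32 at once.
  float WholeCast[4];
  std::memcpy(WholeCast, Ints, sizeof(Ints));

  // Element-wise bitcast: reinterpret each i32 as an f32 on its own.
  // This is the form the combine produces after pushing the bitcast
  // through the build_vector.
  float PerElt[4];
  for (int I = 0; I != 4; ++I)
    std::memcpy(&PerElt[I], &Ints[I], sizeof(float));

  // Equal element counts imply equal element widths, so the two
  // layouts agree byte for byte (memcmp sidesteps float == pitfalls).
  assert(std::memcmp(WholeCast, PerElt, sizeof(WholeCast)) == 0);
  return 0;
}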
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp      | 27
-rw-r--r--  llvm/test/CodeGen/AMDGPU/bitcast-vector-extract.ll | 69
2 files changed, 96 insertions, 0 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 22871e64a3e..e214164facb 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -2548,6 +2548,33 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
     break;
   case ISD::BITCAST: {
     EVT DestVT = N->getValueType(0);
+
+    // Push casts through vector builds. This helps avoid emitting a large
+    // number of copies when materializing floating point vector constants.
+    //
+    // vNt1 bitcast (vNt0 (build_vector t0:x, t0:y)) =>
+    //   vNt1 = build_vector (t1 (bitcast t0:x)), (t1 (bitcast t0:y))
+    if (DestVT.isVector()) {
+      SDValue Src = N->getOperand(0);
+      if (Src.getOpcode() == ISD::BUILD_VECTOR) {
+        EVT SrcVT = Src.getValueType();
+        unsigned NElts = DestVT.getVectorNumElements();
+
+        if (SrcVT.getVectorNumElements() == NElts) {
+          EVT DestEltVT = DestVT.getVectorElementType();
+
+          SmallVector<SDValue, 8> CastedElts;
+          SDLoc SL(N);
+          for (unsigned I = 0, E = SrcVT.getVectorNumElements(); I != E; ++I) {
+            SDValue Elt = Src.getOperand(I);
+            CastedElts.push_back(DAG.getNode(ISD::BITCAST, SL, DestEltVT, Elt));
+          }
+
+          return DAG.getBuildVector(DestVT, SL, CastedElts);
+        }
+      }
+    }
+
     if (DestVT.getSizeInBits() != 64 && !DestVT.isVector())
       break;
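The code-size win claimed in the commit message comes from what happens after the rewrite: once the bitcast is distributed over the elements, two vector constants that differ in a single lane share every other lane's scalar constant, and the shared ones can be CSEd. A small counting sketch, with the lane values mirroring the first test below and everything else illustrative:

#include <cstdint>
#include <iostream>
#include <set>

int main() {
  // The two v8i32 constants in the first test differ only in lane 7.
  const uint32_t Vec0[8] = {7, 7, 7, 7, 7, 7, 7, 8};
  const uint32_t Vec1[8] = {7, 7, 7, 7, 7, 7, 7, 9};

  // Treated as opaque vectors, 16 lanes would each need materializing.
  std::cout << "lanes to materialize naively: " << 2 * 8 << "\n";

  // After the combine each lane is an independent scalar constant,
  // so repeated values are CSEd across both vectors.
  std::set<uint32_t> Distinct;
  Distinct.insert(Vec0, Vec0 + 8);
  Distinct.insert(Vec1, Vec1 + 8);
  std::cout << "distinct scalar constants: " << Distinct.size() << "\n"; // 3
  return 0;
}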
diff --git a/llvm/test/CodeGen/AMDGPU/bitcast-vector-extract.ll b/llvm/test/CodeGen/AMDGPU/bitcast-vector-extract.ll
new file mode 100644
index 00000000000..2482fa761b1
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/bitcast-vector-extract.ll
@@ -0,0 +1,69 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+; The bitcast should be pushed through the build_vector so the vectors can
+; be broken down and the shared components can be CSEd.
+
+; GCN-LABEL: {{^}}store_bitcast_constant_v8i32_to_v8f32:
+; GCN: buffer_store_dwordx4
+; GCN: buffer_store_dwordx4
+; GCN-NOT: v_mov_b32
+; GCN: buffer_store_dwordx4
+; GCN-NOT: v_mov_b32
+; GCN: buffer_store_dwordx4
+define void @store_bitcast_constant_v8i32_to_v8f32(<8 x float> addrspace(1)* %out, <8 x i32> %vec) {
+ %vec0.bc = bitcast <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8> to <8 x float>
+ store volatile <8 x float> %vec0.bc, <8 x float> addrspace(1)* %out
+
+ %vec1.bc = bitcast <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 9> to <8 x float>
+ store volatile <8 x float> %vec1.bc, <8 x float> addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}store_bitcast_constant_v4i64_to_v8f32:
+; GCN: buffer_store_dwordx4
+; GCN: buffer_store_dwordx4
+; GCN-NOT: v_mov_b32
+; GCN: buffer_store_dwordx4
+; GCN-NOT: v_mov_b32
+; GCN: buffer_store_dwordx4
+define void @store_bitcast_constant_v4i64_to_v8f32(<8 x float> addrspace(1)* %out, <4 x i64> %vec) {
+ %vec0.bc = bitcast <4 x i64> <i64 7, i64 7, i64 7, i64 8> to <8 x float>
+ store volatile <8 x float> %vec0.bc, <8 x float> addrspace(1)* %out
+
+ %vec1.bc = bitcast <4 x i64> <i64 7, i64 7, i64 7, i64 9> to <8 x float>
+ store volatile <8 x float> %vec1.bc, <8 x float> addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}store_bitcast_constant_v4i64_to_v4f64:
+; GCN: buffer_store_dwordx4
+; GCN: buffer_store_dwordx4
+; GCN-NOT: v_mov_b32
+; GCN: buffer_store_dwordx4
+; GCN-NOT: v_mov_b32
+; GCN: buffer_store_dwordx4
+define void @store_bitcast_constant_v4i64_to_v4f64(<4 x double> addrspace(1)* %out, <4 x i64> %vec) {
+ %vec0.bc = bitcast <4 x i64> <i64 7, i64 7, i64 7, i64 8> to <4 x double>
+ store volatile <4 x double> %vec0.bc, <4 x double> addrspace(1)* %out
+
+ %vec1.bc = bitcast <4 x i64> <i64 7, i64 7, i64 7, i64 9> to <4 x double>
+ store volatile <4 x double> %vec1.bc, <4 x double> addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}store_bitcast_constant_v8i32_to_v16i16:
+; GCN: buffer_store_dwordx4
+; GCN: buffer_store_dwordx4
+; GCN-NOT: v_mov_b32
+; GCN: buffer_store_dwordx4
+; GCN-NOT: v_mov_b32
+; GCN: buffer_store_dwordx4
+define void @store_bitcast_constant_v8i32_to_v16i16(<8 x float> addrspace(1)* %out, <16 x i16> %vec) {
+ %vec0.bc = bitcast <16 x i16> <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 8> to <8 x float>
+ store volatile <8 x float> %vec0.bc, <8 x float> addrspace(1)* %out
+
+ %vec1.bc = bitcast <16 x i16> <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 9> to <8 x float>
+ store volatile <8 x float> %vec1.bc, <8 x float> addrspace(1)* %out
+ ret void
+}
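The v4i64 cases exercise the same sharing at dword granularity: the 32-bit AMDGPU ALUs effectively materialize a 64-bit immediate as two 32-bit moves, and on a little-endian target i64 7 occupies the dwords (7, 0). A rough sketch of that dword view; the decomposition below is my illustration of why the GCN-NOT: v_mov_b32 checks hold, not part of the test itself:

#include <cstdint>
#include <cstring>
#include <iostream>
#include <set>

int main() {
  // The v4i64 constants from the tests above.
  const uint64_t Vec0[4] = {7, 7, 7, 8};
  const uint64_t Vec1[4] = {7, 7, 7, 9};

  // View both as 32-bit pieces, the granularity the hardware moves.
  uint32_t Dwords[16];
  std::memcpy(Dwords, Vec0, sizeof(Vec0));
  std::memcpy(Dwords + 8, Vec1, sizeof(Vec1));

  // Only four distinct 32-bit values (0, 7, 8, 9) ever need a move,
  // so nothing is re-materialized between the stores.
  std::set<uint32_t> Distinct(Dwords, Dwords + 16);
  std::cout << "distinct dwords: " << Distinct.size() << "\n"; // 4
  return 0;
}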