author     Matt Arsenault <Matthew.Arsenault@amd.com>  2016-04-12 18:24:38 +0000
committer  Matt Arsenault <Matthew.Arsenault@amd.com>  2016-04-12 18:24:38 +0000
commit     3b08238f7878988a8e2737cdab42fcc334c8547f (patch)
tree       207ef8f31d957df8494a1546b119aadb761263e6
parent     15d1b4e2aab9159a8cc56b49eb68f1a82e4ebe90 (diff)
AMDGPU: Eliminate half of i64 or if one operand is zero_extend from i32
This helps clean up some of the mess when expanding unaligned 64-bit loads, which are changed to be promoted to v2i32, and it fixes situations where an or x, 0 was emitted after splitting 64-bit ors during moveToVALU. I think this could be a generic combine, but I'm not sure.

llvm-svn: 266104
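For illustration, a minimal IR pattern that this combine targets is an i64 or where one operand is a zero-extended i32 (this sketch mirrors the test added below; the function name here is hypothetical). Since the high 32 bits of the extended operand are known to be zero, only the low half of the other operand needs a real OR:

; Minimal sketch of the pattern the combine rewrites. The zero-extended
; operand can only affect the low 32 bits, so after the combine a single
; 32-bit OR is emitted for the low half and the high half passes through.
define i64 @or_zext_i32(i64 %x, i32 %y) {
  %ext = zext i32 %y to i64   ; high 32 bits of %ext are known zero
  %or = or i64 %x, %ext       ; only lo_32(%x) | %y actually needs computing
  ret i64 %or
}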
-rw-r--r--  llvm/lib/Target/AMDGPU/SIISelLowering.cpp        | 30
-rw-r--r--  llvm/test/CodeGen/AMDGPU/zext-i64-bit-operand.ll | 41
2 files changed, 71 insertions(+), 0 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index cb490921395..64975cf1809 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -2332,6 +2332,36 @@ SDValue SITargetLowering::performOrCombine(SDNode *N,
SDValue LHS = N->getOperand(0);
SDValue RHS = N->getOperand(1);
+ EVT VT = N->getValueType(0);
+ if (VT == MVT::i64) {
+ // TODO: This could be a generic combine with a predicate for extracting the
+ // high half of an integer being free.
+
+ // (or i64:x, (zero_extend i32:y)) ->
+ // i64 (bitcast (v2i32 build_vector (or i32:y, lo_32(x)), hi_32(x)))
+ if (LHS.getOpcode() == ISD::ZERO_EXTEND &&
+ RHS.getOpcode() != ISD::ZERO_EXTEND)
+ std::swap(LHS, RHS);
+
+ if (RHS.getOpcode() == ISD::ZERO_EXTEND) {
+ SDValue ExtSrc = RHS.getOperand(0);
+ EVT SrcVT = ExtSrc.getValueType();
+ if (SrcVT == MVT::i32) {
+ SDLoc SL(N);
+ SDValue LowLHS, HiBits;
+ std::tie(LowLHS, HiBits) = split64BitValue(LHS, DAG);
+ SDValue LowOr = DAG.getNode(ISD::OR, SL, MVT::i32, LowLHS, ExtSrc);
+
+ DCI.AddToWorklist(LowOr.getNode());
+ DCI.AddToWorklist(HiBits.getNode());
+
+ SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
+ LowOr, HiBits);
+ return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
+ }
+ }
+ }
+
// or (fp_class x, c1), (fp_class x, c2) -> fp_class x, (c1 | c2)
if (LHS.getOpcode() == AMDGPUISD::FP_CLASS &&
RHS.getOpcode() == AMDGPUISD::FP_CLASS) {
diff --git a/llvm/test/CodeGen/AMDGPU/zext-i64-bit-operand.ll b/llvm/test/CodeGen/AMDGPU/zext-i64-bit-operand.ll
new file mode 100644
index 00000000000..842c30b40df
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/zext-i64-bit-operand.ll
@@ -0,0 +1,41 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+; GCN-LABEL: {{^}}zext_or_operand_i64:
+; GCN: buffer_load_dwordx2 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}
+; GCN: buffer_load_dword v[[LD32:[0-9]+]]
+; GCN-NOT: _or_
+; GCN-NOT: v[[HI]]
+; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0
+; GCN: v_or_b32_e32 v[[LO]], v[[LD32]], v[[LO]]
+; GCN-NOT: _or_
+; GCN-NOT: v[[HI]]
+; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0
+; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
+define void @zext_or_operand_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in0, i32 addrspace(1)* %in1) {
+ %ld.64 = load volatile i64, i64 addrspace(1)* %in0
+ %ld.32 = load volatile i32, i32 addrspace(1)* %in1
+ %ext = zext i32 %ld.32 to i64
+ %or = or i64 %ld.64, %ext
+ store i64 %or, i64 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}zext_or_operand_commute_i64:
+; GCN: buffer_load_dwordx2 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}
+; GCN: buffer_load_dword v[[LD32:[0-9]+]]
+; GCN-NOT: _or_
+; GCN-NOT: v[[HI]]
+; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0
+; GCN: v_or_b32_e32 v[[LO]], v[[LD32]], v[[LO]]
+; GCN-NOT: v[[HI]]
+; GCN-NOT: _or_
+; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0
+; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
+define void @zext_or_operand_commute_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in0, i32 addrspace(1)* %in1) {
+ %ld.64 = load volatile i64, i64 addrspace(1)* %in0
+ %ld.32 = load volatile i32, i32 addrspace(1)* %in1
+ %ext = zext i32 %ld.32 to i64
+ %or = or i64 %ext, %ld.64
+ store i64 %or, i64 addrspace(1)* %out
+ ret void
+}