summaryrefslogtreecommitdiffstats
path: root/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
diff options
context:
space:
mode:
authorHal Finkel <hfinkel@anl.gov>2016-03-31 02:56:05 +0000
committerHal Finkel <hfinkel@anl.gov>2016-03-31 02:56:05 +0000
commit851b33a0b1e3c0377f7f8f6262e9ce6a711de235 (patch)
tree40bb9aab0ad5ea295c04d8de1b26200c57bbdbc6 /llvm/lib/Target/PowerPC/PPCISelLowering.cpp
parent8ed5cac97c1c7876fcde080bd58ee4969347a8a7 (diff)
downloadbcm5719-llvm-851b33a0b1e3c0377f7f8f6262e9ce6a711de235.tar.gz
bcm5719-llvm-851b33a0b1e3c0377f7f8f6262e9ce6a711de235.zip
[PowerPC] Load two floats directly instead of using one 64-bit integer load
When dealing with complex<float>, and similar structures with two single-precision floating-point numbers, especially when such things are being passed around by value, we'll sometimes end up loading both float values by extracting them from one 64-bit integer load. It looks like this:

  t13: i64,ch = load<LD8[%ref.tmp]> t0, t6, undef:i64
  t16: i64 = srl t13, Constant:i32<32>
  t17: i32 = truncate t16
  t18: f32 = bitcast t17
  t19: i32 = truncate t13
  t20: f32 = bitcast t19

The problem, especially before the P8, where those bitcasts aren't legal (and get expanded via the stack), is that it would have been better to use two floating-point loads directly. Here we add a target-specific DAGCombine to do just that. In short, we turn:

  ld 3, 0(5)
  stw 3, -8(1)
  rldicl 3, 3, 32, 32
  stw 3, -4(1)
  lfs 3, -4(1)
  lfs 0, -8(1)

into:

  lfs 3, 4(5)
  lfs 0, 0(5)

llvm-svn: 264988
Diffstat (limited to 'llvm/lib/Target/PowerPC/PPCISelLowering.cpp')
-rw-r--r--llvm/lib/Target/PowerPC/PPCISelLowering.cpp105
1 files changed, 105 insertions, 0 deletions
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index f3251ba8db7..d0f43434c39 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -10268,6 +10268,111 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
return expandVSXLoadForLE(N, DCI);
}
+ // We sometimes end up with a 64-bit integer load, from which we extract
+ // two single-precision floating-point numbers. This happens with
+ // std::complex<float>, and other similar structures, because of the way we
+ // canonicalize structure copies. However, if we lack direct moves,
+ // then the final bitcasts from the extracted integer values to the
+ // floating-point numbers turn into store/load pairs. Even with direct moves,
+ // just loading the two floating-point numbers is likely better.
+ auto ReplaceTwoFloatLoad = [&]() { // Returns true only after rewriting the DAG; every 'return false' happens before any node is created or replaced.
+ if (VT != MVT::i64) // The pattern starts from a 64-bit integer value.
+ return false;
+
+ if (LD->getExtensionType() != ISD::NON_EXTLOAD ||
+ LD->isVolatile()) // Only plain (non-extending), non-volatile loads are split.
+ return false;
+
+ // We're looking for a sequence like this:
+ // t13: i64,ch = load<LD8[%ref.tmp]> t0, t6, undef:i64
+ // t16: i64 = srl t13, Constant:i32<32>
+ // t17: i32 = truncate t16
+ // t18: f32 = bitcast t17
+ // t19: i32 = truncate t13
+ // t20: f32 = bitcast t19
+
+ if (!LD->hasNUsesOfValue(2, 0)) // Result 0 (the loaded i64) must have exactly two uses.
+ return false;
+ // Walk the use list, skipping uses of any result other than 0, to pick up the two users of the loaded value.
+ auto UI = LD->use_begin();
+ while (UI.getUse().getResNo() != 0) ++UI;
+ SDNode *Trunc = *UI++;
+ while (UI.getUse().getResNo() != 0) ++UI;
+ SDNode *RightShift = *UI;
+ if (Trunc->getOpcode() != ISD::TRUNCATE) // The two users come in no fixed order; normalize the names.
+ std::swap(Trunc, RightShift);
+ // One user must be a single-use truncate to i32 (the low 32 bits of the loaded value)...
+ if (Trunc->getOpcode() != ISD::TRUNCATE ||
+ Trunc->getValueType(0) != MVT::i32 ||
+ !Trunc->hasOneUse())
+ return false;
+ if (RightShift->getOpcode() != ISD::SRL || // ...and the other a single-use logical right shift by the constant 32.
+ !isa<ConstantSDNode>(RightShift->getOperand(1)) ||
+ RightShift->getConstantOperandVal(1) != 32 ||
+ !RightShift->hasOneUse())
+ return false;
+ // The shifted value must itself feed a single-use truncate to i32 (the high 32 bits of the loaded value).
+ SDNode *Trunc2 = *RightShift->use_begin();
+ if (Trunc2->getOpcode() != ISD::TRUNCATE ||
+ Trunc2->getValueType(0) != MVT::i32 ||
+ !Trunc2->hasOneUse())
+ return false;
+ // The sole user of each truncate must be a bitcast to f32.
+ SDNode *Bitcast = *Trunc->use_begin();
+ SDNode *Bitcast2 = *Trunc2->use_begin();
+
+ if (Bitcast->getOpcode() != ISD::BITCAST ||
+ Bitcast->getValueType(0) != MVT::f32)
+ return false;
+ if (Bitcast2->getOpcode() != ISD::BITCAST ||
+ Bitcast2->getValueType(0) != MVT::f32)
+ return false;
+ // Before the swap, Bitcast is fed by the low half and Bitcast2 by the high half; on little-endian the low half is the first float in memory, so exchange them.
+ if (Subtarget.isLittleEndian())
+ std::swap(Bitcast, Bitcast2);
+
+ // Bitcast has the second float (in memory-layout order) and Bitcast2
+ // has the first one.
+ // For an indexed (pre-increment) load the address actually accessed is base + offset, so materialize that sum as the new base.
+ SDValue BasePtr = LD->getBasePtr();
+ if (LD->isIndexed()) {
+ assert(LD->getAddressingMode() == ISD::PRE_INC &&
+ "Non-pre-inc AM on PPC?");
+ BasePtr =
+ DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr,
+ LD->getOffset());
+ }
+ // First float: an f32 load from the base address, on the original load's incoming chain, keeping its alignment and AA info.
+ SDValue FloatLoad =
+ DAG.getLoad(MVT::f32, dl, LD->getChain(), BasePtr,
+ LD->getPointerInfo(), false, LD->isNonTemporal(), // NOTE(review): the literal 'false' is presumably the isVolatile flag (volatile loads were rejected above) -- confirm against getLoad's signature.
+ LD->isInvariant(), LD->getAlignment(), LD->getAAInfo());
+ SDValue AddPtr =
+ DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(),
+ BasePtr, DAG.getIntPtrConstant(4, dl)); // base + 4: address of the second float.
+ SDValue FloatLoad2 = // Second float: chained on result 1 of FloatLoad, so it is ordered after the first load.
+ DAG.getLoad(MVT::f32, dl, SDValue(FloatLoad.getNode(), 1), AddPtr,
+ LD->getPointerInfo().getWithOffset(4), false,
+ LD->isNonTemporal(), LD->isInvariant(),
+ MinAlign(LD->getAlignment(), 4), LD->getAAInfo()); // Alignment is derived from the original alignment and the 4-byte offset.
+
+ if (LD->isIndexed()) {
+ // Note that DAGCombine should re-form any pre-increment load(s) from
+ // what is produced here if that makes sense.
+ DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), BasePtr); // Users of the indexed load's result 1 now get the explicitly computed address.
+ }
+ // Replace the bitcasts with the loaded values: first float (offset 0) for Bitcast2, second float (offset 4) for Bitcast.
+ DCI.CombineTo(Bitcast2, FloatLoad);
+ DCI.CombineTo(Bitcast, FloatLoad2);
+ // Redirect users of the original load's chain (result 2 when indexed, otherwise result 1) to the chain produced by the second new load.
+ DAG.ReplaceAllUsesOfValueWith(SDValue(LD, LD->isIndexed() ? 2 : 1),
+ SDValue(FloatLoad2.getNode(), 1));
+ return true;
+ };
+
+ if (ReplaceTwoFloatLoad())
+ return SDValue(N, 0);
+
EVT MemVT = LD->getMemoryVT();
Type *Ty = MemVT.getTypeForEVT(*DAG.getContext());
unsigned ABIAlignment = DAG.getDataLayout().getABITypeAlignment(Ty);
OpenPOWER on IntegriCloud