diff options
| author | Hal Finkel <hfinkel@anl.gov> | 2016-03-31 02:56:05 +0000 |
|---|---|---|
| committer | Hal Finkel <hfinkel@anl.gov> | 2016-03-31 02:56:05 +0000 |
| commit | 851b33a0b1e3c0377f7f8f6262e9ce6a711de235 (patch) | |
| tree | 40bb9aab0ad5ea295c04d8de1b26200c57bbdbc6 /llvm/lib/Target/PowerPC/PPCISelLowering.cpp | |
| parent | 8ed5cac97c1c7876fcde080bd58ee4969347a8a7 (diff) | |
| download | bcm5719-llvm-851b33a0b1e3c0377f7f8f6262e9ce6a711de235.tar.gz bcm5719-llvm-851b33a0b1e3c0377f7f8f6262e9ce6a711de235.zip | |
[PowerPC] Load two floats directly instead of using one 64-bit integer load
When dealing with complex<float>, and similar structures with two
single-precision floating-point numbers, especially when such things are being
passed around by value, we'll sometimes end up loading both float values by
extracting them from one 64-bit integer load. The resulting SelectionDAG looks like this:
t13: i64,ch = load<LD8[%ref.tmp]> t0, t6, undef:i64
t16: i64 = srl t13, Constant:i32<32>
t17: i32 = truncate t16
t18: f32 = bitcast t17
t19: i32 = truncate t13
t20: f32 = bitcast t19
The problem, especially on cores before the POWER8 (P8), where those bitcasts
aren't legal (and get expanded via the stack), is that it would have been better to use two
floating-point loads directly. Here we add a target-specific DAGCombine to do
just that. In short, we turn:
ld 3, 0(5)
stw 3, -8(1)
rldicl 3, 3, 32, 32
stw 3, -4(1)
lfs 3, -4(1)
lfs 0, -8(1)
into:
lfs 3, 4(5)
lfs 0, 0(5)
llvm-svn: 264988
Diffstat (limited to 'llvm/lib/Target/PowerPC/PPCISelLowering.cpp')
| -rw-r--r-- | llvm/lib/Target/PowerPC/PPCISelLowering.cpp | 105 |
1 file changed, 105 insertions, 0 deletions
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp index f3251ba8db7..d0f43434c39 100644 --- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp @@ -10268,6 +10268,111 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N, return expandVSXLoadForLE(N, DCI); } + // We sometimes end up with a 64-bit integer load, from which we extract + // two single-precision floating-point numbers. This happens with + // std::complex<float>, and other similar structures, because of the way we + // canonicalize structure copies. However, if we lack direct moves, + // then the final bitcasts from the extracted integer values to the + // floating-point numbers turn into store/load pairs. Even with direct moves, + // just loading the two floating-point numbers is likely better. + auto ReplaceTwoFloatLoad = [&]() { + if (VT != MVT::i64) + return false; + + if (LD->getExtensionType() != ISD::NON_EXTLOAD || + LD->isVolatile()) + return false; + + // We're looking for a sequence like this: + // t13: i64,ch = load<LD8[%ref.tmp]> t0, t6, undef:i64 + // t16: i64 = srl t13, Constant:i32<32> + // t17: i32 = truncate t16 + // t18: f32 = bitcast t17 + // t19: i32 = truncate t13 + // t20: f32 = bitcast t19 + + if (!LD->hasNUsesOfValue(2, 0)) + return false; + + auto UI = LD->use_begin(); + while (UI.getUse().getResNo() != 0) ++UI; + SDNode *Trunc = *UI++; + while (UI.getUse().getResNo() != 0) ++UI; + SDNode *RightShift = *UI; + if (Trunc->getOpcode() != ISD::TRUNCATE) + std::swap(Trunc, RightShift); + + if (Trunc->getOpcode() != ISD::TRUNCATE || + Trunc->getValueType(0) != MVT::i32 || + !Trunc->hasOneUse()) + return false; + if (RightShift->getOpcode() != ISD::SRL || + !isa<ConstantSDNode>(RightShift->getOperand(1)) || + RightShift->getConstantOperandVal(1) != 32 || + !RightShift->hasOneUse()) + return false; + + SDNode *Trunc2 = *RightShift->use_begin(); + if (Trunc2->getOpcode() != ISD::TRUNCATE 
|| + Trunc2->getValueType(0) != MVT::i32 || + !Trunc2->hasOneUse()) + return false; + + SDNode *Bitcast = *Trunc->use_begin(); + SDNode *Bitcast2 = *Trunc2->use_begin(); + + if (Bitcast->getOpcode() != ISD::BITCAST || + Bitcast->getValueType(0) != MVT::f32) + return false; + if (Bitcast2->getOpcode() != ISD::BITCAST || + Bitcast2->getValueType(0) != MVT::f32) + return false; + + if (Subtarget.isLittleEndian()) + std::swap(Bitcast, Bitcast2); + + // Bitcast has the second float (in memory-layout order) and Bitcast2 + // has the first one. + + SDValue BasePtr = LD->getBasePtr(); + if (LD->isIndexed()) { + assert(LD->getAddressingMode() == ISD::PRE_INC && + "Non-pre-inc AM on PPC?"); + BasePtr = + DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr, + LD->getOffset()); + } + + SDValue FloatLoad = + DAG.getLoad(MVT::f32, dl, LD->getChain(), BasePtr, + LD->getPointerInfo(), false, LD->isNonTemporal(), + LD->isInvariant(), LD->getAlignment(), LD->getAAInfo()); + SDValue AddPtr = + DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), + BasePtr, DAG.getIntPtrConstant(4, dl)); + SDValue FloatLoad2 = + DAG.getLoad(MVT::f32, dl, SDValue(FloatLoad.getNode(), 1), AddPtr, + LD->getPointerInfo().getWithOffset(4), false, + LD->isNonTemporal(), LD->isInvariant(), + MinAlign(LD->getAlignment(), 4), LD->getAAInfo()); + + if (LD->isIndexed()) { + // Note that DAGCombine should re-form any pre-increment load(s) from + // what is produced here if that makes sense. + DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), BasePtr); + } + + DCI.CombineTo(Bitcast2, FloatLoad); + DCI.CombineTo(Bitcast, FloatLoad2); + + DAG.ReplaceAllUsesOfValueWith(SDValue(LD, LD->isIndexed() ? 2 : 1), + SDValue(FloatLoad2.getNode(), 1)); + return true; + }; + + if (ReplaceTwoFloatLoad()) + return SDValue(N, 0); + EVT MemVT = LD->getMemoryVT(); Type *Ty = MemVT.getTypeForEVT(*DAG.getContext()); unsigned ABIAlignment = DAG.getDataLayout().getABITypeAlignment(Ty); |

