summaryrefslogtreecommitdiffstats
path: root/llvm/lib/Target
diff options
context:
space:
mode:
authorSimon Pilgrim <llvm-dev@redking.me.uk>2019-05-02 14:00:55 +0000
committerSimon Pilgrim <llvm-dev@redking.me.uk>2019-05-02 14:00:55 +0000
commitdf8daf0ef4d1d00436ea1040355bb08974c5fd66 (patch)
tree53f06514438dc635f39cb3e3ce062c0a7c01c0d7 /llvm/lib/Target
parenta4939d35070f569d4ad3e6161ea382d348d0e72a (diff)
downloadbcm5719-llvm-df8daf0ef4d1d00436ea1040355bb08974c5fd66.tar.gz
bcm5719-llvm-df8daf0ef4d1d00436ea1040355bb08974c5fd66.zip
[X86][SSE] lowerAddSubToHorizontalOp - enable ymm extraction+fold
Limiting scalar hadd/hsub generation to the lowest xmm looks to be unnecessary - we will be extracting one upper xmm regardless, and we can remove a shuffle by using the hop, which is in line with what shouldUseHorizontalOp expects to happen anyway. Testing on btver2 (the main target for fast-hops) shows this is beneficial even for float ops where we have a 'shuffle' to extract the float result: https://godbolt.org/z/0R-U-K Differential Revision: https://reviews.llvm.org/D61426 llvm-svn: 359786
Diffstat (limited to 'llvm/lib/Target')
-rw-r--r-- llvm/lib/Target/X86/X86ISelLowering.cpp 11
1 files changed, 5 insertions, 6 deletions
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 660f81b831c..1d28ccf973e 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -19037,13 +19037,12 @@ static SDValue lowerAddSubToHorizontalOp(SDValue Op, SelectionDAG &DAG,
// Creating a 256-bit horizontal op would be wasteful, and there is no 512-bit
// equivalent, so extract the 256/512-bit source op to 128-bit if we can.
- // This is free if we're extracting from the bottom lane: ymm/zmm -> xmm.
- if (NumEltsPerLane <= LExtIndex)
- return Op;
-
SDLoc DL(Op);
- if (BitWidth == 256 || BitWidth == 512)
- X = extract128BitVector(X, 0, DAG, DL);
+ if (BitWidth == 256 || BitWidth == 512) {
+ unsigned LaneIdx = LExtIndex / NumEltsPerLane;
+ X = extract128BitVector(X, LaneIdx * NumEltsPerLane, DAG, DL);
+ LExtIndex %= NumEltsPerLane;
+ }
// add (extractelt (X, 0), extractelt (X, 1)) --> extractelt (hadd X, X), 0
// add (extractelt (X, 1), extractelt (X, 0)) --> extractelt (hadd X, X), 0
OpenPOWER on IntegriCloud