-rw-r--r--  llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp |  2 +-
-rw-r--r--  llvm/lib/Target/X86/X86ISelLowering.cpp        | 31 +++++++++++++++++++++++++++++++
-rw-r--r--  llvm/test/CodeGen/X86/avx-logic.ll             | 18 ++++++++++++------
3 files changed, 44 insertions(+), 7 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 0a3744ebdcc..00c6c446d27 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -8205,7 +8205,7 @@ SDValue llvm::peekThroughOneUseBitcasts(SDValue V) {
bool llvm::isBitwiseNot(SDValue V) {
if (V.getOpcode() != ISD::XOR)
return false;
- ConstantSDNode *C = isConstOrConstSplat(V.getOperand(1));
+ ConstantSDNode *C = isConstOrConstSplat(peekThroughBitcasts(V.getOperand(1)));
return C && C->isAllOnesValue();
}
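
Why peeking through bitcasts matters here: after AVX1 type legalization, the all-ones constant of a vector 'not' frequently appears as, say, a v4i64 bitcast of a v8i32 all-ones splat, so isConstOrConstSplat() on the raw XOR operand finds no splat and the 'not' goes unrecognized. The following is a minimal standalone sketch of that failure mode, using toy Node/Op types in place of the real llvm::SDValue machinery (every name in the sketch is a stand-in, not LLVM's API):

#include <cstdint>
#include <iostream>
#include <vector>

// Toy stand-ins for the real LLVM node types; Node/Op here are hypothetical.
enum class Op { Xor, Bitcast, SplatConst };

struct Node {
  Op Opc;
  uint64_t SplatVal;                 // meaningful for SplatConst only
  std::vector<const Node *> Ops;
};

// Analogue of llvm::peekThroughBitcasts(): skip any bitcast wrappers.
static const Node *peekThroughBitcasts(const Node *V) {
  while (V->Opc == Op::Bitcast)
    V = V->Ops[0];
  return V;
}

// Analogue of the patched llvm::isBitwiseNot(): an xor whose second
// operand is an all-ones splat, possibly hidden behind a bitcast.
static bool isBitwiseNot(const Node *V) {
  if (V->Opc != Op::Xor)
    return false;
  const Node *C = peekThroughBitcasts(V->Ops[1]);
  return C->Opc == Op::SplatConst && C->SplatVal == ~0ULL;
}

int main() {
  Node AllOnes{Op::SplatConst, ~0ULL, {}};
  Node Cast{Op::Bitcast, 0, {&AllOnes}};    // e.g. v4i64 bitcast of v8i32 -1s
  Node X{Op::SplatConst, 42, {}};
  Node NotX{Op::Xor, 0, {&X, &Cast}};       // xor X, (bitcast all-ones)
  // Prints 1; without the peek, the bitcast hides the constant and this is 0.
  std::cout << isBitwiseNot(&NotX) << '\n';
  return 0;
}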
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 54e9098ea54..7fc37718409 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -40168,6 +40168,37 @@ static SDValue combineInsertSubvector(SDNode *N, SelectionDAG &DAG,
static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
+ // For AVX1 only, if we are extracting from a 256-bit and+not (which will
+ // eventually get combined/lowered into ANDNP) with a concatenated operand,
+ // split the 'and' into 128-bit ops to avoid the concatenate and extract.
+ // We let generic combining take over from there to simplify the
+ // insert/extract and 'not'.
+ // This pattern emerges during AVX1 legalization. We handle it before lowering
+ // to avoid complications like splitting constant vector loads.
+
+ // Capture the original wide type in the likely case that we need to bitcast
+ // back to this type.
+ EVT VT = N->getValueType(0);
+ EVT WideVecVT = N->getOperand(0).getValueType();
+ SDValue WideVec = peekThroughBitcasts(N->getOperand(0));
+ if (Subtarget.hasAVX() && !Subtarget.hasAVX2() && WideVecVT.isSimple() &&
+ WideVecVT.getSizeInBits() == 256 && WideVec.getOpcode() == ISD::AND) {
+ auto isConcatenatedNot = [] (SDValue V) {
+ V = peekThroughBitcasts(V);
+ if (!isBitwiseNot(V))
+ return false;
+ SDValue NotOp = V->getOperand(0);
+ return peekThroughBitcasts(NotOp).getOpcode() == ISD::CONCAT_VECTORS;
+ };
+ if (isConcatenatedNot(WideVec.getOperand(0)) ||
+ isConcatenatedNot(WideVec.getOperand(1))) {
+ // extract (and v4i64 X, (not (concat Y1, Y2))), n -> andnp v2i64 X(n), Y(n)
+ SDValue Concat = split256IntArith(WideVec, DAG);
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N), VT,
+ DAG.getBitcast(WideVecVT, Concat), N->getOperand(1));
+ }
+ }
+
if (DCI.isBeforeLegalizeOps())
return SDValue();
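
The rewrite described in the comment above amounts to computing an independent 128-bit andnot per half, which is exactly what ANDNP does after the split, instead of building the 256-bit value and extracting from it. Here is a minimal sketch of that arithmetic, assuming plain std::array stand-ins for the vector types (V256, V128, lowHalf, highHalf, and andnp128 are hypothetical helpers; the real code calls split256IntArith() and lets the generic combiner fold the EXTRACT_SUBVECTOR and the 'not'):

#include <array>
#include <cstdint>
#include <cstdio>

// Plain-array stand-ins for the vector types; lowHalf/highHalf/andnp128
// model what the split achieves, lane by lane.
using V256 = std::array<uint64_t, 4>; // four i64 lanes = one 256-bit vector
using V128 = std::array<uint64_t, 2>; // two i64 lanes = one 128-bit vector

static V128 lowHalf(const V256 &V)  { return {V[0], V[1]}; }
static V128 highHalf(const V256 &V) { return {V[2], V[3]}; }

// The per-half operation ANDNP performs: ~Mask & X on each lane.
static V128 andnp128(const V128 &Mask, const V128 &X) {
  return {~Mask[0] & X[0], ~Mask[1] & X[1]};
}

int main() {
  V256 X    = {0x1111, 0x2222, 0x3333, 0x4444};
  V256 Mask = {0x00FF, 0x00FF, 0x00FF, 0x00FF};

  // extract (and X, (not Mask)), 0 == andnp128 of the low halves, and
  // likewise for subvector 1 -- no 256-bit concat or extract needed.
  V128 Lo = andnp128(lowHalf(Mask), lowHalf(X));
  V128 Hi = andnp128(highHalf(Mask), highHalf(X));

  std::printf("%llx %llx\n", (unsigned long long)Lo[0],
              (unsigned long long)Hi[0]); // 1100 3300
  return 0;
}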
diff --git a/llvm/test/CodeGen/X86/avx-logic.ll b/llvm/test/CodeGen/X86/avx-logic.ll
index 379a9751c2f..0fe5cbacc84 100644
--- a/llvm/test/CodeGen/X86/avx-logic.ll
+++ b/llvm/test/CodeGen/X86/avx-logic.ll
@@ -342,9 +342,9 @@ define <8 x i32> @andn_disguised_i8_elts(<8 x i32> %x, <8 x i32> %y, <8 x i32> %
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
; AVX1-NEXT: vpaddd %xmm3, %xmm4, %xmm3
; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
-; AVX1-NEXT: vandnps {{.*}}(%rip), %ymm0, %ymm0
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1095216660735,1095216660735]
+; AVX1-NEXT: vpandn %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpandn %xmm1, %xmm3, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
; AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm0
@@ -364,6 +364,8 @@ define <8 x i32> @andn_disguised_i8_elts(<8 x i32> %x, <8 x i32> %y, <8 x i32> %
ret <8 x i32> %add1
}
+; Negative test - if we don't have a leading concat_vectors, the transform won't be profitable.
+
define <8 x i32> @andn_variable_mask_operand_no_concat(<8 x i32> %x, <8 x i32> %y, <8 x i32> %z) {
; AVX1-LABEL: andn_variable_mask_operand_no_concat:
; AVX1: # %bb.0:
@@ -386,6 +388,8 @@ define <8 x i32> @andn_variable_mask_operand_no_concat(<8 x i32> %x, <8 x i32> %
ret <8 x i32> %add
}
+; Negative test - if we don't have a leading concat_vectors, the transform won't be profitable (even if the mask is a constant).
+
define <8 x i32> @andn_constant_mask_operand_no_concat(<8 x i32> %x, <8 x i32> %y) {
; AVX1-LABEL: andn_constant_mask_operand_no_concat:
; AVX1: # %bb.0:
@@ -408,6 +412,8 @@ define <8 x i32> @andn_constant_mask_operand_no_concat(<8 x i32> %x, <8 x i32> %
ret <8 x i32> %r
}
+; This is a close call, but we split the 'andn' to reduce the insert/extract.
+
define <8 x i32> @andn_variable_mask_operand_concat(<8 x i32> %x, <8 x i32> %y, <8 x i32> %z, <8 x i32> %w) {
; AVX1-LABEL: andn_variable_mask_operand_concat:
; AVX1: # %bb.0:
@@ -415,9 +421,9 @@ define <8 x i32> @andn_variable_mask_operand_concat(<8 x i32> %x, <8 x i32> %y,
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
; AVX1-NEXT: vpaddd %xmm4, %xmm5, %xmm4
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
-; AVX1-NEXT: vandnps %ymm2, %ymm0, %ymm0
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vpandn %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm1
+; AVX1-NEXT: vpandn %xmm1, %xmm4, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm2
; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpaddd %xmm3, %xmm0, %xmm0
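
A quick sanity check on the new CHECK lines above: the xmm constant 1095216660735 loaded by vmovdqa is two adjacent i32 elements of value 255 (the disguised-i8 mask) reinterpreted as one i64 lane, i.e. 0x000000FF000000FF. A short standalone verification:

#include <cassert>
#include <cstdint>
#include <cstdio>

int main() {
  // Two adjacent i32 elements of value 255, viewed as a single i64 lane.
  const uint64_t Lane = (255ULL << 32) | 255ULL;
  assert(Lane == 1095216660735ULL);  // the constant in the vmovdqa CHECK line
  std::printf("0x%016llx\n", (unsigned long long)Lane); // 0x000000ff000000ff
  return 0;
}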