author     Simon Pilgrim <llvm-dev@redking.me.uk>   2018-06-20 14:26:28 +0000
committer  Simon Pilgrim <llvm-dev@redking.me.uk>   2018-06-20 14:26:28 +0000
commit     2e2f20a94926606269626a405c0fba54cf0f7ed1 (patch)
tree       ae8a718565e8238f499cc1ab3326d9545d33f63d
parent     0c57de4c214a2b61811e247365a830cb70683a82 (diff)
[SLPVectorizer] Relax "alternate" opcode vectorisation to work with any SK_Select shuffle pattern
D47985 saw the old SK_Alternate 'alternating' shuffle mask replaced with the SK_Select mask, which accepts either input operand for each lane, equivalent to a vector select with a constant condition operand.

This patch updates SLPVectorizer to make full use of this SK_Select shuffle pattern by removing the 'isOdd()' limitation.

The AArch64 regression will be fixed by D48172.

Differential Revision: https://reviews.llvm.org/D48174

llvm-svn: 335130
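For illustration only (this IR is not part of the patch or its tests, and the function name @sk_select_example is made up): an SK_Select shuffle may take each lane from either input in any per-lane pattern, so a mask such as <0, 5, 6, 3> below qualifies, whereas the old SK_Alternate pattern only matched the strict even/odd mask <0, 5, 2, 7>.

; Hypothetical example: lane-wise select between %a and %b via a constant mask,
; equivalent to a vector select with condition <true, false, false, true>.
define <4 x float> @sk_select_example(<4 x float> %a, <4 x float> %b) {
  %r = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 6, i32 3>
  ret <4 x float> %r
}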
-rw-r--r--   llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp           22
-rw-r--r--   llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll   44
-rw-r--r--   llvm/test/Transforms/SLPVectorizer/X86/addsub.ll          26
3 files changed, 47 insertions, 45 deletions
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index dfa881aaa10..86e94f6d202 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -316,10 +316,6 @@ static unsigned getAltOpcode(unsigned Op) {
}
}
-static bool isOdd(unsigned Value) {
- return Value & 1;
-}
-
static bool sameOpcodeOrAlt(unsigned Opcode, unsigned AltOpcode,
unsigned CheckedOpcode) {
return Opcode == CheckedOpcode || AltOpcode == CheckedOpcode;
@@ -378,7 +374,7 @@ static InstructionsState getSameOpcode(ArrayRef<Value *> VL) {
unsigned AltOpcode = getAltOpcode(Opcode);
for (int Cnt = 0, E = VL.size(); Cnt < E; Cnt++) {
unsigned InstOpcode = cast<Instruction>(VL[Cnt])->getOpcode();
- if (InstOpcode != (isOdd(Cnt) ? AltOpcode : Opcode))
+ if (!sameOpcodeOrAlt(Opcode, AltOpcode, InstOpcode))
return InstructionsState(VL[0], 0, false);
}
}
@@ -3510,22 +3506,26 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
// Create shuffle to take alternate operations from the vector.
// Also, gather up odd and even scalar ops to propagate IR flags to
// each vector operation.
- ValueList OddScalars, EvenScalars;
+ ValueList OpScalars, AltScalars;
unsigned e = E->Scalars.size();
SmallVector<Constant *, 8> Mask(e);
for (unsigned i = 0; i < e; ++i) {
- if (isOdd(i)) {
+ auto *OpInst = cast<Instruction>(E->Scalars[i]);
+ unsigned InstOpcode = OpInst->getOpcode();
+ assert(sameOpcodeOrAlt(S.Opcode, AltOpcode, InstOpcode) &&
+ "Unexpected main/alternate opcode");
+ if (InstOpcode == AltOpcode) {
Mask[i] = Builder.getInt32(e + i);
- OddScalars.push_back(E->Scalars[i]);
+ AltScalars.push_back(E->Scalars[i]);
} else {
Mask[i] = Builder.getInt32(i);
- EvenScalars.push_back(E->Scalars[i]);
+ OpScalars.push_back(E->Scalars[i]);
}
}
Value *ShuffleMask = ConstantVector::get(Mask);
- propagateIRFlags(V0, EvenScalars);
- propagateIRFlags(V1, OddScalars);
+ propagateIRFlags(V0, OpScalars);
+ propagateIRFlags(V1, AltScalars);
Value *V = Builder.CreateShuffleVector(V0, V1, ShuffleMask);
if (Instruction *I = dyn_cast<Instruction>(V))
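A hand-written sketch of the kind of output the relaxed mask construction now permits (not taken from the test suite; it assumes the SLP cost model accepts the bundle, and @nonalternating_addsub is a made-up name). With the scalar opcodes fadd, fadd, fsub, fadd, lane i of the mask is i for the main opcode and e + i for the alternate opcode, giving <0, 1, 6, 3>, a pattern the old isOdd() check could never produce:

; Hypothetical vectorized form of a non-alternating fadd/fadd/fsub/fadd group:
; lanes 0, 1 and 3 come from the fadd vector, lane 2 (index 4 + 2 = 6) from fsub.
define <4 x float> @nonalternating_addsub(<4 x float> %a, <4 x float> %b) {
  %add = fadd <4 x float> %a, %b
  %sub = fsub <4 x float> %a, %b
  %r = shufflevector <4 x float> %add, <4 x float> %sub, <4 x i32> <i32 0, i32 1, i32 6, i32 3>
  ret <4 x float> %r
}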
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll
index 0a6efb075cb..b78ae25c2f3 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll
@@ -270,22 +270,34 @@ define i32 @reduction_v4i32(<4 x i32> %v0, <4 x i32> %v1) {
; CHECK-NEXT: [[TMP1_1:%.*]] = sub i32 [[V0_1]], [[V1_1]]
; CHECK-NEXT: [[TMP1_2:%.*]] = sub i32 [[V0_2]], [[V1_2]]
; CHECK-NEXT: [[TMP1_3:%.*]] = sub i32 [[V0_3]], [[V1_3]]
-; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> undef, i32 [[TMP1_0]], i32 0
-; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[TMP0_0]], i32 1
-; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[TMP0_2]], i32 2
-; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[TMP1_2]], i32 3
-; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> undef, i32 [[TMP1_1]], i32 0
-; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[TMP0_1]], i32 1
-; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[TMP0_3]], i32 2
-; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[TMP1_3]], i32 3
-; CHECK-NEXT: [[TMP9:%.*]] = add <4 x i32> [[TMP4]], [[TMP8]]
-; CHECK-NEXT: [[TMP10:%.*]] = lshr <4 x i32> [[TMP9]], <i32 15, i32 15, i32 15, i32 15>
-; CHECK-NEXT: [[TMP11:%.*]] = and <4 x i32> [[TMP10]], <i32 65537, i32 65537, i32 65537, i32 65537>
-; CHECK-NEXT: [[TMP12:%.*]] = mul nuw <4 x i32> [[TMP11]], <i32 65535, i32 65535, i32 65535, i32 65535>
-; CHECK-NEXT: [[TMP13:%.*]] = add <4 x i32> [[TMP12]], [[TMP9]]
-; CHECK-NEXT: [[TMP14:%.*]] = xor <4 x i32> [[TMP13]], [[TMP12]]
-; CHECK-NEXT: [[TMP15:%.*]] = call i32 @llvm.experimental.vector.reduce.add.i32.v4i32(<4 x i32> [[TMP14]])
-; CHECK-NEXT: ret i32 [[TMP15]]
+; CHECK-NEXT: [[TMP2_0:%.*]] = add i32 [[TMP0_0]], [[TMP0_1]]
+; CHECK-NEXT: [[TMP2_1:%.*]] = add i32 [[TMP1_0]], [[TMP1_1]]
+; CHECK-NEXT: [[TMP2_2:%.*]] = add i32 [[TMP0_2]], [[TMP0_3]]
+; CHECK-NEXT: [[TMP2_3:%.*]] = add i32 [[TMP1_2]], [[TMP1_3]]
+; CHECK-NEXT: [[TMP3_0:%.*]] = lshr i32 [[TMP2_0]], 15
+; CHECK-NEXT: [[TMP3_1:%.*]] = lshr i32 [[TMP2_1]], 15
+; CHECK-NEXT: [[TMP3_2:%.*]] = lshr i32 [[TMP2_2]], 15
+; CHECK-NEXT: [[TMP3_3:%.*]] = lshr i32 [[TMP2_3]], 15
+; CHECK-NEXT: [[TMP4_0:%.*]] = and i32 [[TMP3_0]], 65537
+; CHECK-NEXT: [[TMP4_1:%.*]] = and i32 [[TMP3_1]], 65537
+; CHECK-NEXT: [[TMP4_2:%.*]] = and i32 [[TMP3_2]], 65537
+; CHECK-NEXT: [[TMP4_3:%.*]] = and i32 [[TMP3_3]], 65537
+; CHECK-NEXT: [[TMP5_0:%.*]] = mul nuw i32 [[TMP4_0]], 65535
+; CHECK-NEXT: [[TMP5_1:%.*]] = mul nuw i32 [[TMP4_1]], 65535
+; CHECK-NEXT: [[TMP5_2:%.*]] = mul nuw i32 [[TMP4_2]], 65535
+; CHECK-NEXT: [[TMP5_3:%.*]] = mul nuw i32 [[TMP4_3]], 65535
+; CHECK-NEXT: [[TMP6_0:%.*]] = add i32 [[TMP5_0]], [[TMP2_0]]
+; CHECK-NEXT: [[TMP6_1:%.*]] = add i32 [[TMP5_1]], [[TMP2_1]]
+; CHECK-NEXT: [[TMP6_2:%.*]] = add i32 [[TMP5_2]], [[TMP2_2]]
+; CHECK-NEXT: [[TMP6_3:%.*]] = add i32 [[TMP5_3]], [[TMP2_3]]
+; CHECK-NEXT: [[TMP7_0:%.*]] = xor i32 [[TMP6_0]], [[TMP5_0]]
+; CHECK-NEXT: [[TMP7_1:%.*]] = xor i32 [[TMP6_1]], [[TMP5_1]]
+; CHECK-NEXT: [[TMP7_2:%.*]] = xor i32 [[TMP6_2]], [[TMP5_2]]
+; CHECK-NEXT: [[TMP7_3:%.*]] = xor i32 [[TMP6_3]], [[TMP5_3]]
+; CHECK-NEXT: [[REDUCE_0:%.*]] = add i32 [[TMP7_1]], [[TMP7_0]]
+; CHECK-NEXT: [[REDUCE_1:%.*]] = add i32 [[REDUCE_0]], [[TMP7_2]]
+; CHECK-NEXT: [[REDUCE_2:%.*]] = add i32 [[REDUCE_1]], [[TMP7_3]]
+; CHECK-NEXT: ret i32 [[REDUCE_2]]
;
%v0.0 = extractelement <4 x i32> %v0, i32 0
%v0.1 = extractelement <4 x i32> %v0, i32 1
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/addsub.ll b/llvm/test/Transforms/SLPVectorizer/X86/addsub.ll
index 3ef4bdea7f3..510bc9e5b20 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/addsub.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/addsub.ll
@@ -182,25 +182,15 @@ entry:
}
; Function Attrs: nounwind uwtable
-define void @No_faddfsub() #0 {
-; CHECK-LABEL: @No_faddfsub(
+define void @faddfsub_select() #0 {
+; CHECK-LABEL: @faddfsub_select(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fb, i32 0, i64 0), align 4
-; CHECK-NEXT: [[TMP1:%.*]] = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fc, i32 0, i64 0), align 4
-; CHECK-NEXT: [[ADD:%.*]] = fadd float [[TMP0]], [[TMP1]]
-; CHECK-NEXT: store float [[ADD]], float* getelementptr inbounds ([4 x float], [4 x float]* @fa, i32 0, i64 0), align 4
-; CHECK-NEXT: [[TMP2:%.*]] = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fb, i32 0, i64 1), align 4
-; CHECK-NEXT: [[TMP3:%.*]] = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fc, i32 0, i64 1), align 4
-; CHECK-NEXT: [[ADD1:%.*]] = fadd float [[TMP2]], [[TMP3]]
-; CHECK-NEXT: store float [[ADD1]], float* getelementptr inbounds ([4 x float], [4 x float]* @fa, i32 0, i64 1), align 4
-; CHECK-NEXT: [[TMP4:%.*]] = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fb, i32 0, i64 2), align 4
-; CHECK-NEXT: [[TMP5:%.*]] = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fc, i32 0, i64 2), align 4
-; CHECK-NEXT: [[ADD2:%.*]] = fadd float [[TMP4]], [[TMP5]]
-; CHECK-NEXT: store float [[ADD2]], float* getelementptr inbounds ([4 x float], [4 x float]* @fa, i32 0, i64 2), align 4
-; CHECK-NEXT: [[TMP6:%.*]] = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fb, i32 0, i64 3), align 4
-; CHECK-NEXT: [[TMP7:%.*]] = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fc, i32 0, i64 3), align 4
-; CHECK-NEXT: [[SUB:%.*]] = fsub float [[TMP6]], [[TMP7]]
-; CHECK-NEXT: store float [[SUB]], float* getelementptr inbounds ([4 x float], [4 x float]* @fa, i32 0, i64 3), align 4
+; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, <4 x float>* bitcast ([4 x float]* @fb to <4 x float>*), align 4
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([4 x float]* @fc to <4 x float>*), align 4
+; CHECK-NEXT: [[TMP2:%.*]] = fadd <4 x float> [[TMP0]], [[TMP1]]
+; CHECK-NEXT: [[TMP3:%.*]] = fsub <4 x float> [[TMP0]], [[TMP1]]
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 2, i32 7>
+; CHECK-NEXT: store <4 x float> [[TMP4]], <4 x float>* bitcast ([4 x float]* @fa to <4 x float>*), align 4
; CHECK-NEXT: ret void
;
entry: