author     Eric Christopher <echristo@gmail.com>   2019-05-07 19:25:34 +0000
committer  Eric Christopher <echristo@gmail.com>   2019-05-07 19:25:34 +0000
commit     4727221734403b86d5bb6385fee7e7fec6fa52ff (patch)
tree       75956f59c6eb8f857f2bd63ffcc5f4d225aeecbd
parent     f3e81aee0b36d9b00e46d6b30d50672702bbd631 (diff)
download   bcm5719-llvm-4727221734403b86d5bb6385fee7e7fec6fa52ff.tar.gz
           bcm5719-llvm-4727221734403b86d5bb6385fee7e7fec6fa52ff.zip
Make sure that the DAG combiner doesn't merge stores that we explicitly
asked not to be greater than the preferred vector width for the vectorizer.
Test for both 128 and 256 with a Skylake architecture.

llvm-svn: 360183
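For context: the width cap originates in the "prefer-vector-width" function attribute, which Clang attaches when code is compiled with -mprefer-vector-width=<N>. A minimal sketch of source that would exercise the new behavior (the function name and exact build line are illustrative, not part of this commit):

    // Hypothetical example. Built with something like:
    //   clang++ -O2 -march=skylake-avx512 -mprefer-vector-width=128 copy.cpp
    // Clang tags copy32 with "prefer-vector-width"="128"; with this patch the
    // store merger honors that cap, so the 32-byte move is expected to lower
    // to two 16-byte (xmm) stores instead of one 256-bit (ymm) store.
    #include <cstring>

    void copy32(char *dst, const char *src) {
      std::memmove(dst, src, 32);
    }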
-rw-r--r--  llvm/lib/Target/X86/X86ISelLowering.cpp            | 24
-rw-r--r--  llvm/test/CodeGen/X86/vector-width-store-merge.ll   | 53
2 files changed, 68 insertions(+), 9 deletions(-)
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index dca81e576ad..56f02c412ae 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -2058,18 +2058,19 @@ unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty,
/// source is constant so it does not need to be loaded.
/// It returns EVT::Other if the type should be determined using generic
/// target-independent logic.
-EVT
-X86TargetLowering::getOptimalMemOpType(
+/// For vector ops we check that the overall size isn't larger than our
+/// preferred vector width.
+EVT X86TargetLowering::getOptimalMemOpType(
uint64_t Size, unsigned DstAlign, unsigned SrcAlign, bool IsMemset,
bool ZeroMemset, bool MemcpyStrSrc,
const AttributeList &FuncAttributes) const {
if (!FuncAttributes.hasFnAttribute(Attribute::NoImplicitFloat)) {
- if (Size >= 16 &&
- (!Subtarget.isUnalignedMem16Slow() ||
- ((DstAlign == 0 || DstAlign >= 16) &&
- (SrcAlign == 0 || SrcAlign >= 16)))) {
+ if (Size >= 16 && (!Subtarget.isUnalignedMem16Slow() ||
+ ((DstAlign == 0 || DstAlign >= 16) &&
+ (SrcAlign == 0 || SrcAlign >= 16)))) {
// FIXME: Check if unaligned 32-byte accesses are slow.
- if (Size >= 32 && Subtarget.hasAVX()) {
+ if (Size >= 32 && Subtarget.hasAVX() &&
+ (Subtarget.getPreferVectorWidth() >= 256)) {
// Although this isn't a well-supported type for AVX1, we'll let
// legalization and shuffle lowering produce the optimal codegen. If we
// choose an optimal type with a vector element larger than a byte,
@@ -2077,11 +2078,12 @@ X86TargetLowering::getOptimalMemOpType(
// multiply) before we splat as a vector.
return MVT::v32i8;
}
- if (Subtarget.hasSSE2())
+ if (Subtarget.hasSSE2() && (Subtarget.getPreferVectorWidth() >= 128))
return MVT::v16i8;
// TODO: Can SSE1 handle a byte vector?
// If we have SSE1 registers we should be able to use them.
- if (Subtarget.hasSSE1() && (Subtarget.is64Bit() || Subtarget.hasX87()))
+ if (Subtarget.hasSSE1() && (Subtarget.is64Bit() || Subtarget.hasX87()) &&
+ (Subtarget.getPreferVectorWidth() >= 128))
return MVT::v4f32;
} else if ((!IsMemset || ZeroMemset) && !MemcpyStrSrc && Size >= 8 &&
!Subtarget.is64Bit() && Subtarget.hasSSE2()) {
@@ -4963,6 +4965,10 @@ bool X86TargetLowering::canMergeStoresTo(unsigned AddressSpace, EVT MemVT,
unsigned MaxIntSize = Subtarget.is64Bit() ? 64 : 32;
return (MemVT.getSizeInBits() <= MaxIntSize);
}
+ // Make sure we don't merge greater than our preferred vector
+ // width.
+ if (MemVT.getSizeInBits() > Subtarget.getPreferVectorWidth())
+ return false;
return true;
}
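Distilled from the hunks above: getOptimalMemOpType and canMergeStoresTo now apply the same width gate. A standalone sketch of that logic (SubtargetInfo and the helper names below are stand-ins for illustration, not the real LLVM interfaces):

    #include <cstdint>

    // Stand-in for the X86Subtarget queries used in the patch.
    struct SubtargetInfo {
      bool HasAVX = false;
      bool HasSSE2 = false;
      unsigned PreferVectorWidth = 128; // from "prefer-vector-width"
    };

    // Mirrors the size/feature/width checks added to getOptimalMemOpType:
    // the widest vector memory op (in bits) lowering may now pick.
    unsigned widestMemOpBits(const SubtargetInfo &ST, std::uint64_t Size) {
      if (Size >= 32 && ST.HasAVX && ST.PreferVectorWidth >= 256)
        return 256; // MVT::v32i8
      if (Size >= 16 && ST.HasSSE2 && ST.PreferVectorWidth >= 128)
        return 128; // MVT::v16i8
      return 64;    // fall back to scalar-width operations
    }

    // Mirrors the new early-out in canMergeStoresTo: refuse to merge stores
    // into a type wider than the preferred vector width.
    bool canMergeStoresTo(const SubtargetInfo &ST, unsigned MemVTBits) {
      return MemVTBits <= ST.PreferVectorWidth;
    }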
diff --git a/llvm/test/CodeGen/X86/vector-width-store-merge.ll b/llvm/test/CodeGen/X86/vector-width-store-merge.ll
new file mode 100644
index 00000000000..e316e08ebe1
--- /dev/null
+++ b/llvm/test/CodeGen/X86/vector-width-store-merge.ll
@@ -0,0 +1,53 @@
+; RUN: llc < %s -mtriple=x86_64-- | FileCheck %s
+
+; This tests whether or not we generate vectors larger than the preferred vector width when
+; lowering memmove.
+
+; Function Attrs: nounwind uwtable
+define weak_odr dso_local void @A(i8* %src, i8* %dst) local_unnamed_addr #0 {
+entry:
+; CHECK: A
+; CHECK-NOT: vmovups %ymm
+; CHECK: vmovups %xmm
+ call void @llvm.memmove.p0i8.p0i8.i64(i8* align 1 %dst, i8* align 1 %src, i64 32, i1 false)
+ ret void
+}
+
+; Function Attrs: nounwind uwtable
+define weak_odr dso_local void @B(i8* %src, i8* %dst) local_unnamed_addr #0 {
+entry:
+; CHECK: B
+; CHECK-NOT: vmovups %zmm
+; CHECK: vmovups %xmm
+ call void @llvm.memmove.p0i8.p0i8.i64(i8* align 1 %dst, i8* align 1 %src, i64 64, i1 false)
+ ret void
+}
+
+; Function Attrs: nounwind uwtable
+define weak_odr dso_local void @C(i8* %src, i8* %dst) local_unnamed_addr #2 {
+entry:
+; CHECK: C
+; CHECK-NOT: vmovups %zmm
+; CHECK: vmovups %ymm
+ call void @llvm.memmove.p0i8.p0i8.i64(i8* align 1 %dst, i8* align 1 %src, i64 32, i1 false)
+ ret void
+}
+
+; Function Attrs: nounwind uwtable
+define weak_odr dso_local void @D(i8* %src, i8* %dst) local_unnamed_addr #2 {
+entry:
+; CHECK: D
+; CHECK-NOT: vmovups %zmm
+; CHECK: vmovups %ymm
+ call void @llvm.memmove.p0i8.p0i8.i64(i8* align 1 %dst, i8* align 1 %src, i64 64, i1 false)
+ ret void
+}
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.memmove.p0i8.p0i8.i64(i8* nocapture, i8* nocapture readonly, i64, i1 immarg) #1
+
+attributes #0 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "prefer-vector-width"="128" "stack-protector-buffer-size"="8" "target-cpu"="skylake-avx512" "target-features"="+adx,+aes,+avx,+avx2,+avx512bw,+avx512cd,+avx512dq,+avx512f,+avx512vl,+bmi,+bmi2,+clflushopt,+clwb,+cx16,+cx8,+f16c,+fma,+fsgsbase,+fxsr,+invpcid,+lzcnt,+mmx,+movbe,+mpx,+pclmul,+pku,+popcnt,+prfchw,+rdrnd,+rdseed,+sahf,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsavec,+xsaveopt,+xsaves" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { argmemonly nounwind }
+attributes #2 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "prefer-vector-width"="256" "stack-protector-buffer-size"="8" "target-cpu"="skylake-avx512" "target-features"="+adx,+aes,+avx,+avx2,+avx512bw,+avx512cd,+avx512dq,+avx512f,+avx512vl,+bmi,+bmi2,+clflushopt,+clwb,+cx16,+cx8,+f16c,+fma,+fsgsbase,+fxsr,+invpcid,+lzcnt,+mmx,+movbe,+mpx,+pclmul,+pku,+popcnt,+prfchw,+rdrnd,+rdseed,+sahf,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsavec,+xsaveopt,+xsaves" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
+!0 = !{i32 1, !"wchar_size", i32 4}
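As the RUN line shows, the test pipes the file through llc for x86_64 and FileChecks the assembly: with "prefer-vector-width"="128" (attribute set #0), @A and @B must stay at xmm-width vmovups, while with "prefer-vector-width"="256" (attribute set #2), @C and @D may use ymm registers but must never produce zmm moves.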