diff options
author | Clement Courbet <courbet@google.com> | 2017-04-21 09:20:39 +0000 |
---|---|---|
committer | Clement Courbet <courbet@google.com> | 2017-04-21 09:20:39 +0000 |
commit | 1ce3b82dea8eb35e77974fc9d97f9a08c690c53d (patch) | |
tree | 10718c7c21b90322462a789d671de895ccb18d54 | |
parent | f8a964252643c4e65d0c091105cc9d4cbe813690 (diff) | |
download | bcm5719-llvm-1ce3b82dea8eb35e77974fc9d97f9a08c690c53d.tar.gz bcm5719-llvm-1ce3b82dea8eb35e77974fc9d97f9a08c690c53d.zip |
X86 memcpy: use REPMOVSB instead of REPMOVS{Q,D,W} for inline copies
when the subtarget has fast strings.
This has two advantages:
- Speed is improved. For example, on Haswell, throughput improvements increase
linearly with size from 256 to 512 bytes, after which they plateau
(e.g. 1% for 260 bytes, 25% for 400 bytes, 40% for 508 bytes).
- Code is much smaller (no need to handle boundaries).
llvm-svn: 300957
-rw-r--r-- | llvm/lib/Target/X86/X86.td | 8 | ||||
-rw-r--r-- | llvm/lib/Target/X86/X86InstrInfo.td | 1 | ||||
-rw-r--r-- | llvm/lib/Target/X86/X86SelectionDAGInfo.cpp | 7 | ||||
-rw-r--r-- | llvm/lib/Target/X86/X86Subtarget.cpp | 1 | ||||
-rw-r--r-- | llvm/lib/Target/X86/X86Subtarget.h | 4 | ||||
-rw-r--r-- | llvm/test/CodeGen/X86/memcpy-struct-by-value.ll | 15 |
6 files changed, 35 insertions, 1 deletions
diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td index 8fcc8e31d5d..99d6c6932ae 100644 --- a/llvm/lib/Target/X86/X86.td +++ b/llvm/lib/Target/X86/X86.td @@ -273,6 +273,13 @@ def FeatureFastSHLDRotate "fast-shld-rotate", "HasFastSHLDRotate", "true", "SHLD can be used as a faster rotate">; +// String operations (e.g. REP MOVS) are fast. See "REP String Enhancement" in +// the Intel Software Development Manual. +def FeatureFastString + : SubtargetFeature< + "fast-string", "HasFastString", "true", + "REP MOVS/STOS are fast">; + //===----------------------------------------------------------------------===// // X86 processors supported. //===----------------------------------------------------------------------===// @@ -498,6 +505,7 @@ def HSWFeatures : ProcessorFeatures<IVBFeatures.Value, [ FeatureAVX2, FeatureBMI, FeatureBMI2, + FeatureFastString, FeatureFMA, FeatureLZCNT, FeatureMOVBE, diff --git a/llvm/lib/Target/X86/X86InstrInfo.td b/llvm/lib/Target/X86/X86InstrInfo.td index e31d2769047..456a204bafa 100644 --- a/llvm/lib/Target/X86/X86InstrInfo.td +++ b/llvm/lib/Target/X86/X86InstrInfo.td @@ -897,6 +897,7 @@ def NotSlowIncDec : Predicate<"!Subtarget->slowIncDec()">; def HasFastMem32 : Predicate<"!Subtarget->isUnalignedMem32Slow()">; def HasFastLZCNT : Predicate<"Subtarget->hasFastLZCNT()">; def HasFastSHLDRotate : Predicate<"Subtarget->hasFastSHLDRotate()">; +def HasFastString : Predicate<"Subtarget->hasFastString()">; def HasMFence : Predicate<"Subtarget->hasMFence()">; //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/X86/X86SelectionDAGInfo.cpp b/llvm/lib/Target/X86/X86SelectionDAGInfo.cpp index 2ab4ecbc052..d893738d068 100644 --- a/llvm/lib/Target/X86/X86SelectionDAGInfo.cpp +++ b/llvm/lib/Target/X86/X86SelectionDAGInfo.cpp @@ -215,7 +215,12 @@ SDValue X86SelectionDAGInfo::EmitTargetCodeForMemcpy( return SDValue(); MVT AVT; - if (Align & 1) + if (Subtarget.hasFastString()) + // If 
the target has fast strings, then it's at least as fast to use + // REP MOVSB instead of REP MOVS{W,D,Q}, and it avoids having to handle + // BytesLeft. + AVT = MVT::i8; + else if (Align & 1) AVT = MVT::i8; else if (Align & 2) AVT = MVT::i16; diff --git a/llvm/lib/Target/X86/X86Subtarget.cpp b/llvm/lib/Target/X86/X86Subtarget.cpp index 92a68759195..0bd29a5a27d 100644 --- a/llvm/lib/Target/X86/X86Subtarget.cpp +++ b/llvm/lib/Target/X86/X86Subtarget.cpp @@ -303,6 +303,7 @@ void X86Subtarget::initializeEnvironment() { HasFastVectorFSQRT = false; HasFastLZCNT = false; HasFastSHLDRotate = false; + HasFastString = false; HasSlowDivide32 = false; HasSlowDivide64 = false; PadShortFunctions = false; diff --git a/llvm/lib/Target/X86/X86Subtarget.h b/llvm/lib/Target/X86/X86Subtarget.h index d0d88d32694..2b858c28e04 100644 --- a/llvm/lib/Target/X86/X86Subtarget.h +++ b/llvm/lib/Target/X86/X86Subtarget.h @@ -232,6 +232,9 @@ protected: /// True if SHLD based rotate is fast. bool HasFastSHLDRotate; + /// True if the processor has fast REP MOVS. + bool HasFastString; + /// True if the short functions should be padded to prevent /// a stall when returning too early. 
bool PadShortFunctions; @@ -472,6 +475,7 @@ public: bool hasFastVectorFSQRT() const { return HasFastVectorFSQRT; } bool hasFastLZCNT() const { return HasFastLZCNT; } bool hasFastSHLDRotate() const { return HasFastSHLDRotate; } + bool hasFastString() const { return HasFastString; } bool hasSlowDivide32() const { return HasSlowDivide32; } bool hasSlowDivide64() const { return HasSlowDivide64; } bool padShortFunctions() const { return PadShortFunctions; } diff --git a/llvm/test/CodeGen/X86/memcpy-struct-by-value.ll b/llvm/test/CodeGen/X86/memcpy-struct-by-value.ll new file mode 100644 index 00000000000..4bb022e9332 --- /dev/null +++ b/llvm/test/CodeGen/X86/memcpy-struct-by-value.ll @@ -0,0 +1,15 @@ +; RUN: llc -mtriple=x86_64-linux-gnu -mattr=-fast-string < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=NOFAST +; RUN: llc -mtriple=x86_64-linux-gnu -mattr=+fast-string < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=FAST + +%struct.large = type { [4096 x i8] } + +declare void @foo(%struct.large* align 8 byval) nounwind + +define void @test1(%struct.large* nocapture %x) nounwind { + call void @foo(%struct.large* align 8 byval %x) + ret void + +; ALL-LABEL: test1: +; NOFAST: rep;movsq +; FAST: rep;movsb +} |