summaryrefslogtreecommitdiffstats
path: root/llvm/lib
diff options
context:
space:
mode:
authorSimon Pilgrim <llvm-dev@redking.me.uk>2017-05-13 13:42:35 +0000
committerSimon Pilgrim <llvm-dev@redking.me.uk>2017-05-13 13:42:35 +0000
commitef46c2762acc8f0af01b42b2151dba0d6089aa4e (patch)
tree66e412db18b482a0223bd48fbf9dbf3de5c92de5 /llvm/lib
parent7d62e4b455cf181d1657cf8e3be18fd5c0d2eb27 (diff)
downloadbcm5719-llvm-ef46c2762acc8f0af01b42b2151dba0d6089aa4e.tar.gz
bcm5719-llvm-ef46c2762acc8f0af01b42b2151dba0d6089aa4e.zip
[x86, SSE] AVX1 PR28129 (256-bit all-ones rematerialization)
Further perf tests on Jaguar indicate that: vxorps %ymm0, %ymm0, %ymm0 vcmpps $15, %ymm0, %ymm0, %ymm0 is consistently faster (by about 9%) than: vpcmpeqd %xmm0, %xmm0, %xmm0 vinsertf128 $1, %xmm0, %ymm0, %ymm0 Testing equivalent code on a SandyBridge (E5-2640) puts it slightly (~3%) faster as well. Committed on behalf of @dtemirbulatov Differential Revision: https://reviews.llvm.org/D32416 llvm-svn: 302989
Diffstat (limited to 'llvm/lib')
-rw-r--r--llvm/lib/Target/X86/X86InstrInfo.cpp14
-rw-r--r--llvm/lib/Target/X86/X86InstrSSE.td18
2 files changed, 22 insertions, 10 deletions
diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp
index 33c9fcf839e..092ceb207ad 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.cpp
+++ b/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -7627,6 +7627,13 @@ bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
return Expand2AddrUndef(MIB, get(HasAVX ? X86::VPCMPEQDrr : X86::PCMPEQDrr));
case X86::AVX2_SETALLONES:
return Expand2AddrUndef(MIB, get(X86::VPCMPEQDYrr));
+ case X86::AVX1_SETALLONES: {
+ unsigned Reg = MIB->getOperand(0).getReg();
+ // VCMPPSYrri with an immediate 0xf should produce VCMPTRUEPS.
+ MIB->setDesc(get(X86::VCMPPSYrri));
+ MIB.addReg(Reg, RegState::Undef).addReg(Reg, RegState::Undef).addImm(0xf);
+ return true;
+ }
case X86::AVX512_512_SETALLONES: {
unsigned Reg = MIB->getOperand(0).getReg();
MIB->setDesc(get(X86::VPTERNLOGDZrri));
@@ -8515,6 +8522,7 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
Alignment = 64;
break;
case X86::AVX2_SETALLONES:
+ case X86::AVX1_SETALLONES:
case X86::AVX_SET0:
case X86::AVX512_256_SET0:
Alignment = 32;
@@ -8560,6 +8568,7 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
case X86::V_SET0:
case X86::V_SETALLONES:
case X86::AVX2_SETALLONES:
+ case X86::AVX1_SETALLONES:
case X86::AVX_SET0:
case X86::AVX512_128_SET0:
case X86::AVX512_256_SET0:
@@ -8601,13 +8610,14 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
else if (Opc == X86::AVX512_512_SET0 || Opc == X86::AVX512_512_SETALLONES)
Ty = VectorType::get(Type::getInt32Ty(MF.getFunction()->getContext()),16);
else if (Opc == X86::AVX2_SETALLONES || Opc == X86::AVX_SET0 ||
- Opc == X86::AVX512_256_SET0)
+ Opc == X86::AVX512_256_SET0 || Opc == X86::AVX1_SETALLONES)
Ty = VectorType::get(Type::getInt32Ty(MF.getFunction()->getContext()), 8);
else
Ty = VectorType::get(Type::getInt32Ty(MF.getFunction()->getContext()), 4);
bool IsAllOnes = (Opc == X86::V_SETALLONES || Opc == X86::AVX2_SETALLONES ||
- Opc == X86::AVX512_512_SETALLONES);
+ Opc == X86::AVX512_512_SETALLONES ||
+ Opc == X86::AVX1_SETALLONES);
const Constant *C = IsAllOnes ? Constant::getAllOnesValue(Ty) :
Constant::getNullValue(Ty);
unsigned CPI = MCP.getConstantPoolIndex(C, Alignment);
diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td
index 48da2fa607a..f73d85e7e01 100644
--- a/llvm/lib/Target/X86/X86InstrSSE.td
+++ b/llvm/lib/Target/X86/X86InstrSSE.td
@@ -486,6 +486,10 @@ let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
isPseudo = 1, SchedRW = [WriteZero] in {
def V_SETALLONES : I<0, Pseudo, (outs VR128:$dst), (ins), "",
[(set VR128:$dst, (v4i32 immAllOnesV))]>;
+ let Predicates = [HasAVX1Only, OptForMinSize] in {
+ def AVX1_SETALLONES: I<0, Pseudo, (outs VR256:$dst), (ins), "",
+ [(set VR256:$dst, (v8i32 immAllOnesV))]>;
+ }
let Predicates = [HasAVX2] in
def AVX2_SETALLONES : I<0, Pseudo, (outs VR256:$dst), (ins), "",
[(set VR256:$dst, (v8i32 immAllOnesV))]>;
@@ -7755,14 +7759,12 @@ def VINSERTF128rm : AVXAIi8<0x18, MRMSrcMem, (outs VR256:$dst),
[]>, Sched<[WriteFShuffleLd, ReadAfterLd]>, VEX_4V, VEX_L;
}
-
-// Without AVX2 we need to concat two v4i32 V_SETALLONES to create a 256-bit
-// all ones value.
-let Predicates = [HasAVX1Only] in
-def : Pat<(v8i32 immAllOnesV),
- (VINSERTF128rr
- (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), (V_SETALLONES), sub_xmm),
- (V_SETALLONES), 1)>;
+// To create a 256-bit all ones value, we should produce VCMPTRUEPS
+// with YMM register containing zero.
+// FIXME: Avoid producing vxorps to clear the fake inputs.
+let Predicates = [HasAVX1Only] in {
+def : Pat<(v8i32 immAllOnesV), (VCMPPSYrri (AVX_SET0), (AVX_SET0), 0xf)>;
+}
multiclass vinsert_lowering<string InstrStr, ValueType From, ValueType To,
PatFrag memop_frag> {
OpenPOWER on IntegriCloud