diff options
author | Simon Pilgrim <llvm-dev@redking.me.uk> | 2017-05-13 13:42:35 +0000 |
---|---|---|
committer | Simon Pilgrim <llvm-dev@redking.me.uk> | 2017-05-13 13:42:35 +0000 |
commit | ef46c2762acc8f0af01b42b2151dba0d6089aa4e (patch) | |
tree | 66e412db18b482a0223bd48fbf9dbf3de5c92de5 /llvm/lib | |
parent | 7d62e4b455cf181d1657cf8e3be18fd5c0d2eb27 (diff) | |
download | bcm5719-llvm-ef46c2762acc8f0af01b42b2151dba0d6089aa4e.tar.gz bcm5719-llvm-ef46c2762acc8f0af01b42b2151dba0d6089aa4e.zip |
[x86, SSE] AVX1 PR28129 (256-bit all-ones rematerialization)
Further perf tests on Jaguar indicate that:
vxorps %ymm0, %ymm0, %ymm0
vcmpps $15, %ymm0, %ymm0, %ymm0
is consistently faster (by about 9%) than:
vpcmpeqd %xmm0, %xmm0, %xmm0
vinsertf128 $1, %xmm0, %ymm0, %ymm0
Testing equivalent code on a SandyBridge (E5-2640) puts it slightly (~3%) faster as well.
Committed on behalf of @dtemirbulatov
Differential Revision: https://reviews.llvm.org/D32416
llvm-svn: 302989
Diffstat (limited to 'llvm/lib')
-rw-r--r-- | llvm/lib/Target/X86/X86InstrInfo.cpp | 14 | ||||
-rw-r--r-- | llvm/lib/Target/X86/X86InstrSSE.td | 18 |
2 files changed, 22 insertions, 10 deletions
diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp index 33c9fcf839e..092ceb207ad 100644 --- a/llvm/lib/Target/X86/X86InstrInfo.cpp +++ b/llvm/lib/Target/X86/X86InstrInfo.cpp @@ -7627,6 +7627,13 @@ bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const { return Expand2AddrUndef(MIB, get(HasAVX ? X86::VPCMPEQDrr : X86::PCMPEQDrr)); case X86::AVX2_SETALLONES: return Expand2AddrUndef(MIB, get(X86::VPCMPEQDYrr)); + case X86::AVX1_SETALLONES: { + unsigned Reg = MIB->getOperand(0).getReg(); + // VCMPPSYrri with an immediate 0xf should produce VCMPTRUEPS. + MIB->setDesc(get(X86::VCMPPSYrri)); + MIB.addReg(Reg, RegState::Undef).addReg(Reg, RegState::Undef).addImm(0xf); + return true; + } case X86::AVX512_512_SETALLONES: { unsigned Reg = MIB->getOperand(0).getReg(); MIB->setDesc(get(X86::VPTERNLOGDZrri)); @@ -8515,6 +8522,7 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl( Alignment = 64; break; case X86::AVX2_SETALLONES: + case X86::AVX1_SETALLONES: case X86::AVX_SET0: case X86::AVX512_256_SET0: Alignment = 32; @@ -8560,6 +8568,7 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl( case X86::V_SET0: case X86::V_SETALLONES: case X86::AVX2_SETALLONES: + case X86::AVX1_SETALLONES: case X86::AVX_SET0: case X86::AVX512_128_SET0: case X86::AVX512_256_SET0: @@ -8601,13 +8610,14 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl( else if (Opc == X86::AVX512_512_SET0 || Opc == X86::AVX512_512_SETALLONES) Ty = VectorType::get(Type::getInt32Ty(MF.getFunction()->getContext()),16); else if (Opc == X86::AVX2_SETALLONES || Opc == X86::AVX_SET0 || - Opc == X86::AVX512_256_SET0) + Opc == X86::AVX512_256_SET0 || Opc == X86::AVX1_SETALLONES) Ty = VectorType::get(Type::getInt32Ty(MF.getFunction()->getContext()), 8); else Ty = VectorType::get(Type::getInt32Ty(MF.getFunction()->getContext()), 4); bool IsAllOnes = (Opc == X86::V_SETALLONES || Opc == X86::AVX2_SETALLONES || - Opc == X86::AVX512_512_SETALLONES); + Opc == X86::AVX512_512_SETALLONES || + Opc == X86::AVX1_SETALLONES); const Constant *C = IsAllOnes ? Constant::getAllOnesValue(Ty) : Constant::getNullValue(Ty); unsigned CPI = MCP.getConstantPoolIndex(C, Alignment); diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td index 48da2fa607a..f73d85e7e01 100644 --- a/llvm/lib/Target/X86/X86InstrSSE.td +++ b/llvm/lib/Target/X86/X86InstrSSE.td @@ -486,6 +486,10 @@ let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1, isPseudo = 1, SchedRW = [WriteZero] in { def V_SETALLONES : I<0, Pseudo, (outs VR128:$dst), (ins), "", [(set VR128:$dst, (v4i32 immAllOnesV))]>; + let Predicates = [HasAVX1Only, OptForMinSize] in { + def AVX1_SETALLONES: I<0, Pseudo, (outs VR256:$dst), (ins), "", + [(set VR256:$dst, (v8i32 immAllOnesV))]>; + } let Predicates = [HasAVX2] in def AVX2_SETALLONES : I<0, Pseudo, (outs VR256:$dst), (ins), "", [(set VR256:$dst, (v8i32 immAllOnesV))]>; @@ -7755,14 +7759,12 @@ def VINSERTF128rm : AVXAIi8<0x18, MRMSrcMem, (outs VR256:$dst), []>, Sched<[WriteFShuffleLd, ReadAfterLd]>, VEX_4V, VEX_L; } - -// Without AVX2 we need to concat two v4i32 V_SETALLONES to create a 256-bit -// all ones value. -let Predicates = [HasAVX1Only] in -def : Pat<(v8i32 immAllOnesV), - (VINSERTF128rr - (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), (V_SETALLONES), sub_xmm), - (V_SETALLONES), 1)>; +// To create a 256-bit all ones value, we should produce VCMPTRUEPS +// with YMM register containing zero. +// FIXME: Avoid producing vxorps to clear the fake inputs. +let Predicates = [HasAVX1Only] in { +def : Pat<(v8i32 immAllOnesV), (VCMPPSYrri (AVX_SET0), (AVX_SET0), 0xf)>; +} multiclass vinsert_lowering<string InstrStr, ValueType From, ValueType To, PatFrag memop_frag> { |