diff options
| author | Andrew V. Tischenko <andrew.v.tischenko@gmail.com> | 2017-06-08 16:44:13 +0000 |
|---|---|---|
| committer | Andrew V. Tischenko <andrew.v.tischenko@gmail.com> | 2017-06-08 16:44:13 +0000 |
| commit | 8cb1d0931f2749a9731e1a59f869a509baa83640 (patch) | |
| tree | 28e081c49239cbe3910dcc6dddeb79527f074d10 | |
| parent | c41b67cc52b0059edcdf6133f929fab02b775d9a (diff) | |
| download | bcm5719-llvm-8cb1d0931f2749a9731e1a59f869a509baa83640.tar.gz bcm5719-llvm-8cb1d0931f2749a9731e1a59f869a509baa83640.zip | |
Add scheduler classes to integer/float horizontal operations.
This patch will close PR32801.
Differential Revision: https://reviews.llvm.org/D33203
llvm-svn: 304986
| -rw-r--r-- | llvm/lib/Target/X86/X86InstrSSE.td | 10 | ||||
| -rw-r--r-- | llvm/lib/Target/X86/X86SchedHaswell.td | 33 | ||||
| -rw-r--r-- | llvm/lib/Target/X86/X86SchedSandyBridge.td | 25 | ||||
| -rw-r--r-- | llvm/lib/Target/X86/X86Schedule.td | 4 | ||||
| -rw-r--r-- | llvm/lib/Target/X86/X86ScheduleBtVer2.td | 32 | ||||
| -rw-r--r-- | llvm/lib/Target/X86/X86ScheduleSLM.td | 27 | ||||
| -rw-r--r-- | llvm/test/CodeGen/X86/avx-schedule.ll | 32 |
7 files changed, 142 insertions, 21 deletions
diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td index a3e67720930..8490b972eb5 100644 --- a/llvm/lib/Target/X86/X86InstrSSE.td +++ b/llvm/lib/Target/X86/X86InstrSSE.td @@ -5183,14 +5183,14 @@ multiclass S3D_Int<bits<8> o, string OpcodeStr, ValueType vt, RegisterClass RC, !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))], IIC_SSE_HADDSUB_RR>, - Sched<[WriteFAdd]>; + Sched<[WriteFHAdd]>; def rm : S3DI<o, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2), !if(Is2Addr, !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), [(set RC:$dst, (vt (OpNode RC:$src1, (ld_frag addr:$src2))))], - IIC_SSE_HADDSUB_RM>, Sched<[WriteFAddLd, ReadAfterLd]>; + IIC_SSE_HADDSUB_RM>, Sched<[WriteFHAddLd, ReadAfterLd]>; } multiclass S3_Int<bits<8> o, string OpcodeStr, ValueType vt, RegisterClass RC, X86MemOperand x86memop, SDNode OpNode, PatFrag ld_frag, @@ -5200,14 +5200,14 @@ multiclass S3_Int<bits<8> o, string OpcodeStr, ValueType vt, RegisterClass RC, !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))], IIC_SSE_HADDSUB_RR>, - Sched<[WriteFAdd]>; + Sched<[WriteFHAdd]>; def rm : S3I<o, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2), !if(Is2Addr, !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), [(set RC:$dst, (vt (OpNode RC:$src1, (ld_frag addr:$src2))))], - IIC_SSE_HADDSUB_RM>, Sched<[WriteFAddLd, ReadAfterLd]>; + IIC_SSE_HADDSUB_RM>, Sched<[WriteFHAddLd, ReadAfterLd]>; } let Predicates = [HasAVX] in { @@ -5310,7 +5310,7 @@ defm PABSD : SS3I_unop_rm<0x1E, "pabsd", v4i32, abs, memopv2i64>; // SSSE3 - Packed Binary Operator Instructions //===---------------------------------------------------------------------===// -let Sched = WriteVecALU in { +let Sched = WritePHAdd in { def SSE_PHADDSUBD : OpndItins< IIC_SSE_PHADDSUBD_RR, IIC_SSE_PHADDSUBD_RM >; diff --git a/llvm/lib/Target/X86/X86SchedHaswell.td b/llvm/lib/Target/X86/X86SchedHaswell.td index 677e8245976..03c8ccb53af 100644 --- a/llvm/lib/Target/X86/X86SchedHaswell.td +++ b/llvm/lib/Target/X86/X86SchedHaswell.td @@ -1488,6 +1488,39 @@ def : InstRW<[WriteVPGATHERQQ256, ReadAfterLd], (instregex "VPGATHERQQYrm")>; //-- Arithmetic instructions --// +//////////////////////////////////////////////////////////////////////////////// +// Horizontal add/sub instructions. +//////////////////////////////////////////////////////////////////////////////// + +// HADD, HSUB PS/PD +// x,x / v,v,v. +def : WriteRes<WriteFHAdd, [HWPort1, HWPort5]> { + let Latency = 5; + let NumMicroOps = 3; + let ResourceCycles = [1, 2]; +} + +// x,m / v,v,m. +def : WriteRes<WriteFHAddLd, [HWPort1, HWPort5, HWPort23]> { + let Latency = 9; + let NumMicroOps = 4; + let ResourceCycles = [1, 2, 1]; +} + +// PHADD|PHSUB (S) W/D. +// v <- v,v. +def : WriteRes<WritePHAdd, [HWPort1, HWPort5]> { + let Latency = 3; + let NumMicroOps = 3; + let ResourceCycles = [1, 2]; +} +// v <- v,m. +def : WriteRes<WritePHAddLd, [HWPort1, HWPort5, HWPort23]> { + let Latency = 6; + let NumMicroOps = 3; + let ResourceCycles = [1, 2, 1]; +} + // PHADD|PHSUB (S) W/D. // v <- v,v. def WritePHADDSUBr : SchedWriteRes<[HWPort1, HWPort5]> { diff --git a/llvm/lib/Target/X86/X86SchedSandyBridge.td b/llvm/lib/Target/X86/X86SchedSandyBridge.td index eca65c2892b..b8ec5883152 100644 --- a/llvm/lib/Target/X86/X86SchedSandyBridge.td +++ b/llvm/lib/Target/X86/X86SchedSandyBridge.td @@ -157,6 +157,31 @@ def : WriteRes<WriteMPSADLd, [SBPort0, SBPort1, SBPort5, SBPort23]> { let ResourceCycles = [1, 1, 1, 1]; } +//////////////////////////////////////////////////////////////////////////////// +// Horizontal add/sub instructions. +//////////////////////////////////////////////////////////////////////////////// +// HADD, HSUB PS/PD +// x,x / v,v,v. +def : WriteRes<WriteFHAdd, [SBPort1]> { + let Latency = 3; +} + +// x,m / v,v,m. +def : WriteRes<WriteFHAddLd, [SBPort1, SBPort23]> { + let Latency = 7; + let ResourceCycles = [1, 1]; +} + +// PHADD|PHSUB (S) W/D. +// v <- v,v. +def : WriteRes<WritePHAdd, [SBPort15]>; + +// v <- v,m. +def : WriteRes<WritePHAddLd, [SBPort15, SBPort23]> { + let Latency = 5; + let ResourceCycles = [1, 1]; +} + // String instructions. // Packed Compare Implicit Length Strings, Return Mask def : WriteRes<WritePCmpIStrM, [SBPort015]> { diff --git a/llvm/lib/Target/X86/X86Schedule.td b/llvm/lib/Target/X86/X86Schedule.td index 4eae6ca7abe..a12fa68faf4 100644 --- a/llvm/lib/Target/X86/X86Schedule.td +++ b/llvm/lib/Target/X86/X86Schedule.td @@ -77,6 +77,10 @@ defm WriteFVarBlend : X86SchedWritePair; // Fp vector variable blends. // FMA Scheduling helper class. class FMASC { X86FoldableSchedWrite Sched = WriteFAdd; } +// Horizontal Add/Sub (float and integer) +defm WriteFHAdd : X86SchedWritePair; +defm WritePHAdd : X86SchedWritePair; + // Vector integer operations. defm WriteVecALU : X86SchedWritePair; // Vector integer ALU op, no logicals. defm WriteVecShift : X86SchedWritePair; // Vector integer shifts. diff --git a/llvm/lib/Target/X86/X86ScheduleBtVer2.td b/llvm/lib/Target/X86/X86ScheduleBtVer2.td index ce1ece34e43..6cb2a3694d9 100644 --- a/llvm/lib/Target/X86/X86ScheduleBtVer2.td +++ b/llvm/lib/Target/X86/X86ScheduleBtVer2.td @@ -320,6 +320,38 @@ def : WriteRes<WriteAESKeyGenLd, [JLAGU, JVIMUL]> { } //////////////////////////////////////////////////////////////////////////////// +// Horizontal add/sub instructions. +//////////////////////////////////////////////////////////////////////////////// + +def : WriteRes<WriteFHAdd, [JFPU0]> { + let Latency = 3; +} + +def : WriteRes<WriteFHAddLd, [JLAGU, JFPU0]> { + let Latency = 8; +} + +def : WriteRes<WritePHAdd, [JFPU01]> { + let ResourceCycles = [1]; +} +def : WriteRes<WritePHAddLd, [JLAGU, JFPU01 ]> { + let Latency = 6; + let ResourceCycles = [1, 1]; +} + +def WriteFHAddY: SchedWriteRes<[JFPU0]> { + let Latency = 3; + let ResourceCycles = [2]; +} +def : InstRW<[WriteFHAddY], (instregex "VH(ADD|SUB)P(S|D)Yrr")>; + +def WriteFHAddYLd: SchedWriteRes<[JLAGU, JFPU0]> { + let Latency = 8; + let ResourceCycles = [1, 2]; +} +def : InstRW<[WriteFHAddYLd], (instregex "VH(ADD|SUB)P(S|D)Yrm")>; + +//////////////////////////////////////////////////////////////////////////////// // Carry-less multiplication instructions. //////////////////////////////////////////////////////////////////////////////// diff --git a/llvm/lib/Target/X86/X86ScheduleSLM.td b/llvm/lib/Target/X86/X86ScheduleSLM.td index f95d4fa0417..03ed2db2350 100644 --- a/llvm/lib/Target/X86/X86ScheduleSLM.td +++ b/llvm/lib/Target/X86/X86ScheduleSLM.td @@ -137,6 +137,33 @@ defm : SMWriteResPair<WriteShuffle, FPC_RSV0, 1>; defm : SMWriteResPair<WriteBlend, FPC_RSV0, 1>; defm : SMWriteResPair<WriteMPSAD, FPC_RSV0, 7>; +//////////////////////////////////////////////////////////////////////////////// +// Horizontal add/sub instructions. +//////////////////////////////////////////////////////////////////////////////// + +// HADD, HSUB PS/PD + +def : WriteRes<WriteFHAdd, [FPC_RSV01]> { + let Latency = 3; + let ResourceCycles = [2]; +} + +def : WriteRes<WriteFHAddLd, [FPC_RSV01, MEC_RSV]> { + let Latency = 6; + let ResourceCycles = [2, 1]; +} + +// PHADD|PHSUB (S) W/D. +def : WriteRes<WritePHAdd, [FPC_RSV01]> { + let Latency = 1; + let ResourceCycles = [1]; +} + +def : WriteRes<WritePHAddLd, [FPC_RSV01, MEC_RSV]> { + let Latency = 4; + let ResourceCycles = [1, 1]; +} + // String instructions. // Packed Compare Implicit Length Strings, Return Mask def : WriteRes<WritePCmpIStrM, [FPC_RSV0]> { diff --git a/llvm/test/CodeGen/X86/avx-schedule.ll b/llvm/test/CodeGen/X86/avx-schedule.ll index bb05481e313..47e95fe31bd 100644 --- a/llvm/test/CodeGen/X86/avx-schedule.ll +++ b/llvm/test/CodeGen/X86/avx-schedule.ll @@ -910,14 +910,14 @@ define <4 x double> @test_haddpd(<4 x double> %a0, <4 x double> %a1, <4 x double ; ; BTVER2-LABEL: test_haddpd: ; BTVER2: # BB#0: -; BTVER2-NEXT: vhaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] -; BTVER2-NEXT: vhaddpd (%rdi), %ymm0, %ymm0 # sched: [8:1.00] +; BTVER2-NEXT: vhaddpd %ymm1, %ymm0, %ymm0 # sched: [3:2.00] +; BTVER2-NEXT: vhaddpd (%rdi), %ymm0, %ymm0 # sched: [8:2.00] ; BTVER2-NEXT: retq # sched: [4:1.00] ; ; ZNVER1-LABEL: test_haddpd: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vhaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] -; ZNVER1-NEXT: vhaddpd (%rdi), %ymm0, %ymm0 # sched: [8:1.00] +; ZNVER1-NEXT: vhaddpd %ymm1, %ymm0, %ymm0 # sched: [3:2.00] +; ZNVER1-NEXT: vhaddpd (%rdi), %ymm0, %ymm0 # sched: [8:2.00] ; ZNVER1-NEXT: retq # sched: [4:1.00] %1 = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %a0, <4 x double> %a1) %2 = load <4 x double>, <4 x double> *%a2, align 32 @@ -941,14 +941,14 @@ define <8 x float> @test_haddps(<8 x float> %a0, <8 x float> %a1, <8 x float> *% ; ; BTVER2-LABEL: test_haddps: ; BTVER2: # BB#0: -; BTVER2-NEXT: vhaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] -; BTVER2-NEXT: vhaddps (%rdi), %ymm0, %ymm0 # sched: [8:1.00] +; BTVER2-NEXT: vhaddps %ymm1, %ymm0, %ymm0 # sched: [3:2.00] +; BTVER2-NEXT: vhaddps (%rdi), %ymm0, %ymm0 # sched: [8:2.00] ; BTVER2-NEXT: retq # sched: [4:1.00] ; ; ZNVER1-LABEL: test_haddps: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vhaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] -; ZNVER1-NEXT: vhaddps (%rdi), %ymm0, %ymm0 # sched: [8:1.00] +; ZNVER1-NEXT: vhaddps %ymm1, %ymm0, %ymm0 # sched: [3:2.00] +; ZNVER1-NEXT: vhaddps (%rdi), %ymm0, %ymm0 # sched: [8:2.00] ; ZNVER1-NEXT: retq # sched: [4:1.00] %1 = call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> %a0, <8 x float> %a1) %2 = load <8 x float>, <8 x float> *%a2, align 32 @@ -972,14 +972,14 @@ define <4 x double> @test_hsubpd(<4 x double> %a0, <4 x double> %a1, <4 x double ; ; BTVER2-LABEL: test_hsubpd: ; BTVER2: # BB#0: -; BTVER2-NEXT: vhsubpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] -; BTVER2-NEXT: vhsubpd (%rdi), %ymm0, %ymm0 # sched: [8:1.00] +; BTVER2-NEXT: vhsubpd %ymm1, %ymm0, %ymm0 # sched: [3:2.00] +; BTVER2-NEXT: vhsubpd (%rdi), %ymm0, %ymm0 # sched: [8:2.00] ; BTVER2-NEXT: retq # sched: [4:1.00] ; ; ZNVER1-LABEL: test_hsubpd: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vhsubpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] -; ZNVER1-NEXT: vhsubpd (%rdi), %ymm0, %ymm0 # sched: [8:1.00] +; ZNVER1-NEXT: vhsubpd %ymm1, %ymm0, %ymm0 # sched: [3:2.00] +; ZNVER1-NEXT: vhsubpd (%rdi), %ymm0, %ymm0 # sched: [8:2.00] ; ZNVER1-NEXT: retq # sched: [4:1.00] %1 = call <4 x double> @llvm.x86.avx.hsub.pd.256(<4 x double> %a0, <4 x double> %a1) %2 = load <4 x double>, <4 x double> *%a2, align 32 @@ -1003,14 +1003,14 @@ define <8 x float> @test_hsubps(<8 x float> %a0, <8 x float> %a1, <8 x float> *% ; ; BTVER2-LABEL: test_hsubps: ; BTVER2: # BB#0: -; BTVER2-NEXT: vhsubps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] -; BTVER2-NEXT: vhsubps (%rdi), %ymm0, %ymm0 # sched: [8:1.00] +; BTVER2-NEXT: vhsubps %ymm1, %ymm0, %ymm0 # sched: [3:2.00] +; BTVER2-NEXT: vhsubps (%rdi), %ymm0, %ymm0 # sched: [8:2.00] ; BTVER2-NEXT: retq # sched: [4:1.00] ; ; ZNVER1-LABEL: test_hsubps: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vhsubps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] -; ZNVER1-NEXT: vhsubps (%rdi), %ymm0, %ymm0 # sched: [8:1.00] +; ZNVER1-NEXT: vhsubps %ymm1, %ymm0, %ymm0 # sched: [3:2.00] +; ZNVER1-NEXT: vhsubps (%rdi), %ymm0, %ymm0 # sched: [8:2.00] ; ZNVER1-NEXT: retq # sched: [4:1.00] %1 = call <8 x float> @llvm.x86.avx.hsub.ps.256(<8 x float> %a0, <8 x float> %a1) %2 = load <8 x float>, <8 x float> *%a2, align 32 |

