summaryrefslogtreecommitdiffstats
path: root/llvm/lib
diff options
context:
space:
mode:
authorAndrea Di Biagio <Andrea_DiBiagio@sn.scee.net>2019-08-22 11:32:47 +0000
committerAndrea Di Biagio <Andrea_DiBiagio@sn.scee.net>2019-08-22 11:32:47 +0000
commitc6744055adf950b8deaa0291cb475d70ca292740 (patch)
treed4de84d20c42ebc0821c3bfe9c60e8bd21980c25 /llvm/lib
parentaf478e240baa4377158a516ad28039ad0e778f69 (diff)
downloadbcm5719-llvm-c6744055adf950b8deaa0291cb475d70ca292740.tar.gz
bcm5719-llvm-c6744055adf950b8deaa0291cb475d70ca292740.zip
[X86][BtVer2] Fix latency and throughput of XCHG and XADD.
On Jaguar, XCHG has a latency of 1cy and decodes to 2 macro-opcodes. Maximum throughput for XCHG is 1 IPC. The byte exchange has worse latency and decodes to 1 extra uOP; maximum observed throughput is 0.5 IPC. ``` xchgb %cl, %dl # Latency: 2cy - uOPs: 3 - 2 ALU xchgw %cx, %dx # Latency: 1cy - uOPs: 2 - 2 ALU xchgl %ecx, %edx # Latency: 1cy - uOPs: 2 - 2 ALU xchgq %rcx, %rdx # Latency: 1cy - uOPs: 2 - 2 ALU ``` The reg-mem forms of XCHG are atomic operations with an observed latency of 16cy. The resource usage is similar to the XCHGrr variants. The biggest difference is obviously the bus-locking, which prevents the LS to issue other memory uOPs in parallel until the unlocking store uOP is executed. ``` xchgb %cl, (%rsp) # Latency: 16cy - uOPs: 3 - ECX latency: 11cy xchgw %cx, (%rsp) # Latency: 16cy - uOPs: 3 - ECX latency: 11cy xchgl %ecx, (%rsp) # Latency: 16cy - uOPs: 3 - ECX latency: 11cy xchgq %rcx, (%rsp) # Latency: 16cy - uOPs: 3 - ECX latency: 11cy ``` The exchanged in/out register operand becomes available after 11cy from the start of execution. Added test xchg.s to verify that we correctly see that register write committed in 11cy (and not 16cy). Reg-reg XADD instructions have the same latency/throughput than the byte exchange (register-register variant). ``` xaddb %cl, %dl # latency: 2cy - uOPs: 3 - 3 ALU xaddw %cx, %dx # latency: 2cy - uOPs: 3 - 3 ALU xaddl %ecx, %edx # latency: 2cy - uOPs: 3 - 3 ALU xaddq %rcx, %rdx # latency: 2cy - uOPs: 3 - 3 ALU ``` The non-atomic RM variants have a latency of 11cy, and decode to 4 macro-opcodes. They still consume 2 ALU pipes, and the exchange in/out register operand becomes available in 3cy (it matches the 'load-to-use latency'). ``` xaddb %cl, (%rsp) # latency: 11cy - uOPs: 4 - 3 ALU xaddw %cx, (%rsp) # latency: 11cy - uOPs: 4 - 3 ALU xaddl %ecx, (%rsp) # latency: 11cy - uOPs: 4 - 3 ALU xaddq %rcx, (%rsp) # latency: 11cy - uOPs: 4 - 3 ALU ``` The atomic XADD variants execute in 16cy. The in/out register operand is available after 11cy from the start of execution. ``` lock xaddb %cl, (%rsp) # latency: 16cy - uOPs: 4 - 3 ALU -- ECX latency: 11cy lock xaddw %cx, (%rsp) # latency: 16cy - uOPs: 4 - 3 ALU -- ECX latency: 11cy lock xaddl %ecx, (%rsp) # latency: 16cy - uOPs: 4 - 3 ALU -- ECX latency: 11cy lock xaddq %rcx, (%rsp) # latency: 16cy - uOPs: 4 - 3 ALU -- ECX latency: 11cy ``` Added test xadd.s to verify those latencies as well as read-advance values. Differential Revision: https://reviews.llvm.org/D66535 llvm-svn: 369642
Diffstat (limited to 'llvm/lib')
-rw-r--r--llvm/lib/Target/X86/X86ScheduleBtVer2.td92
1 files changed, 91 insertions, 1 deletions
diff --git a/llvm/lib/Target/X86/X86ScheduleBtVer2.td b/llvm/lib/Target/X86/X86ScheduleBtVer2.td
index 6849b258f8e..b5dce16b5e3 100644
--- a/llvm/lib/Target/X86/X86ScheduleBtVer2.td
+++ b/llvm/lib/Target/X86/X86ScheduleBtVer2.td
@@ -195,7 +195,7 @@ defm : X86WriteRes<WriteBSWAP32, [JALU01], 1, [1], 1>;
defm : X86WriteRes<WriteBSWAP64, [JALU01], 1, [1], 1>;
defm : X86WriteRes<WriteCMPXCHG, [JALU01], 3, [3], 5>;
defm : X86WriteRes<WriteCMPXCHGRMW, [JALU01, JSAGU, JLAGU], 11, [3, 1, 1], 6>;
-defm : X86WriteRes<WriteXCHG, [JALU01], 1, [1], 1>;
+defm : X86WriteRes<WriteXCHG, [JALU01], 1, [2], 2>;
defm : JWriteResIntPair<WriteIMul8, [JALU1, JMul], 3, [1, 1], 2>;
defm : JWriteResIntPair<WriteIMul16, [JALU1, JMul], 3, [1, 1], 2>;
@@ -395,6 +395,96 @@ def : InstRW<[JWriteLOCK_ALURMWVariant], (instrs INC8m, INC16m, INC32m, INC64m,
NOT8m, NOT16m, NOT32m, NOT64m,
NEG8m, NEG16m, NEG32m, NEG64m)>;
+def JWriteXCHG8rr_XADDrr : SchedWriteRes<[JALU01]> {
+ let Latency = 2;
+ let ResourceCycles = [3];
+ let NumMicroOps = 3;
+}
+def : InstRW<[JWriteXCHG8rr_XADDrr], (instrs XCHG8rr, XADD8rr, XADD16rr,
+ XADD32rr, XADD64rr)>;
+
+// This write defines the latency of the in/out register operand of a non-atomic
+// XADDrm. This is the first of a pair of writes that model non-atomic
+// XADDrm instructions (the second write definition is JWriteXADDrm_LdSt_Part).
+//
+// We need two writes because the instruction latency differs from the output
+// register operand latency. In particular, the first write describes the first
+// (and only) output register operand of the instruction. However, the
+// instruction latency is set to the MAX of all the write latencies. That's why
+// a second write is needed in this case (see example below).
+//
+// Example:
+// XADD %ecx, (%rsp) ## Instruction latency: 11cy
+// ## ECX write Latency: 3cy
+//
+// Register ECX becomes available in 3 cycles. That is because the value of ECX
+// is exchanged with the value read from the stack pointer, and the load-to-use
+// latency is assumed to be 3cy.
+def JWriteXADDrm_XCHG_Part : SchedWriteRes<[JALU01]> {
+ let Latency = 3; // load-to-use latency
+ let ResourceCycles = [3];
+ let NumMicroOps = 3;
+}
+
+// This write defines the latency of the in/out register operand of an atomic
+// XADDrm. This is the first of a sequence of two writes used to model atomic
+// XADD instructions. The second write of the sequence is JWriteXCHGrm_LdSt_Part.
+//
+//
+// Example:
+// LOCK XADD %ecx, (%rsp) ## Instruction Latency: 16cy
+// ## ECX write Latency: 11cy
+//
+// The value of ECX becomes available only after 11cy from the start of
+// execution. This write is used to specifically set that operand latency.
+def JWriteLOCK_XADDrm_XCHG_Part : SchedWriteRes<[JALU01]> {
+ let Latency = 11;
+ let ResourceCycles = [3];
+ let NumMicroOps = 3;
+}
+
+// This write defines the latency of the in/out register operand of an atomic
+// XCHGrm. This write is the first of a sequence of two writes that describe
+// atomic XCHG operations. We need two writes because the instruction latency
+// differs from the output register write latency. We want to make sure that
+// the output register operand becomes visible after 11cy. However, we want to
+// set the instruction latency to 16cy.
+def JWriteXCHGrm_XCHG_Part : SchedWriteRes<[JALU01]> {
+ let Latency = 11;
+ let ResourceCycles = [2];
+ let NumMicroOps = 2;
+}
+
+def JWriteXADDrm_LdSt_Part : SchedWriteRes<[JLAGU, JSAGU]> {
+ let Latency = 11;
+ let ResourceCycles = [1, 1];
+ let NumMicroOps = 1;
+}
+
+def JWriteXCHGrm_LdSt_Part : SchedWriteRes<[JLAGU, JSAGU]> {
+ let Latency = 16;
+ let ResourceCycles = [16, 16];
+ let NumMicroOps = 1;
+}
+
+def JWriteXADDrm_Part1 : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<CheckLockPrefix>, [JWriteLOCK_XADDrm_XCHG_Part]>,
+ SchedVar<NoSchedPred, [JWriteXADDrm_XCHG_Part]>
+]>;
+
+def JWriteXADDrm_Part2 : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<CheckLockPrefix>, [JWriteXCHGrm_LdSt_Part]>,
+ SchedVar<NoSchedPred, [JWriteXADDrm_LdSt_Part]>
+]>;
+
+def : InstRW<[JWriteXADDrm_Part1, JWriteXADDrm_Part2, ReadAfterLd],
+ (instrs XADD8rm, XADD16rm, XADD32rm, XADD64rm,
+ LXADD8, LXADD16, LXADD32, LXADD64)>;
+
+def : InstRW<[JWriteXCHGrm_XCHG_Part, JWriteXCHGrm_LdSt_Part, ReadAfterLd],
+ (instrs XCHG8rm, XCHG16rm, XCHG32rm, XCHG64rm)>;
+
+
////////////////////////////////////////////////////////////////////////////////
// Floating point. This covers both scalar and vector operations.
////////////////////////////////////////////////////////////////////////////////
OpenPOWER on IntegriCloud