summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--llvm/test/tools/llvm-mca/X86/BtVer2/bottleneck-hints-1.s16
-rw-r--r--llvm/test/tools/llvm-mca/X86/BtVer2/bottleneck-hints-2.s13
-rw-r--r--llvm/test/tools/llvm-mca/X86/BtVer2/bottleneck-hints-3.s20
-rw-r--r--llvm/test/tools/llvm-mca/X86/BtVer2/bottleneck-hints-4.s79
-rw-r--r--llvm/test/tools/llvm-mca/X86/SkylakeClient/bottleneck-analysis.s154
-rw-r--r--llvm/test/tools/llvm-mca/X86/option-all-views-1.s13
-rw-r--r--llvm/test/tools/llvm-mca/X86/option-all-views-2.s13
-rw-r--r--llvm/test/tools/llvm-mca/X86/option-no-stats-1.s13
-rw-r--r--llvm/tools/llvm-mca/Views/BottleneckAnalysis.cpp262
-rw-r--r--llvm/tools/llvm-mca/Views/BottleneckAnalysis.h132
10 files changed, 676 insertions, 39 deletions
diff --git a/llvm/test/tools/llvm-mca/X86/BtVer2/bottleneck-hints-1.s b/llvm/test/tools/llvm-mca/X86/BtVer2/bottleneck-hints-1.s
index 16577cf8b39..4091ad8d715 100644
--- a/llvm/test/tools/llvm-mca/X86/BtVer2/bottleneck-hints-1.s
+++ b/llvm/test/tools/llvm-mca/X86/BtVer2/bottleneck-hints-1.s
@@ -23,6 +23,22 @@ add %edx, %eax
# CHECK-NEXT: - Register Dependencies [ 94.04% ]
# CHECK-NEXT: - Memory Dependencies [ 0.00% ]
+# CHECK: Critical sequence based on the simulation:
+
+# CHECK: Instruction Dependency Information
+# CHECK-NEXT: +----< 3. addl %edx, %eax
+# CHECK-NEXT: |
+# CHECK-NEXT: | < loop carried >
+# CHECK-NEXT: |
+# CHECK-NEXT: +----> 0. addl %eax, %ebx ## REGISTER dependency: %eax
+# CHECK-NEXT: +----> 1. addl %ebx, %ecx ## REGISTER dependency: %ebx
+# CHECK-NEXT: +----> 2. addl %ecx, %edx ## REGISTER dependency: %ecx
+# CHECK-NEXT: +----> 3. addl %edx, %eax ## REGISTER dependency: %edx
+# CHECK-NEXT: |
+# CHECK-NEXT: | < loop carried >
+# CHECK-NEXT: |
+# CHECK-NEXT: +----> 0. addl %eax, %ebx ## REGISTER dependency: %eax
+
# CHECK: Instruction Info:
# CHECK-NEXT: [1]: #uOps
# CHECK-NEXT: [2]: Latency
diff --git a/llvm/test/tools/llvm-mca/X86/BtVer2/bottleneck-hints-2.s b/llvm/test/tools/llvm-mca/X86/BtVer2/bottleneck-hints-2.s
index 83444d422ad..4ff19360c92 100644
--- a/llvm/test/tools/llvm-mca/X86/BtVer2/bottleneck-hints-2.s
+++ b/llvm/test/tools/llvm-mca/X86/BtVer2/bottleneck-hints-2.s
@@ -22,6 +22,19 @@ vhaddps %xmm0, %xmm0, %xmm1
# CHECK-NEXT: - Register Dependencies [ 0.00% ]
# CHECK-NEXT: - Memory Dependencies [ 0.00% ]
+# CHECK: Critical sequence based on the simulation:
+
+# CHECK: Instruction Dependency Information
+# CHECK-NEXT: +----< 0. vhaddps %xmm0, %xmm0, %xmm1
+# CHECK-NEXT: |
+# CHECK-NEXT: | < loop carried >
+# CHECK-NEXT: |
+# CHECK-NEXT: +----> 0. vhaddps %xmm0, %xmm0, %xmm1 ## RESOURCE interference: JFPA [ probability: 99% ]
+# CHECK-NEXT: |
+# CHECK-NEXT: | < loop carried >
+# CHECK-NEXT: |
+# CHECK-NEXT: +----> 0. vhaddps %xmm0, %xmm0, %xmm1 ## RESOURCE interference: JFPA [ probability: 99% ]
+
# CHECK: Instruction Info:
# CHECK-NEXT: [1]: #uOps
# CHECK-NEXT: [2]: Latency
diff --git a/llvm/test/tools/llvm-mca/X86/BtVer2/bottleneck-hints-3.s b/llvm/test/tools/llvm-mca/X86/BtVer2/bottleneck-hints-3.s
index bedfef1d95f..3b0639a0c5a 100644
--- a/llvm/test/tools/llvm-mca/X86/BtVer2/bottleneck-hints-3.s
+++ b/llvm/test/tools/llvm-mca/X86/BtVer2/bottleneck-hints-3.s
@@ -27,6 +27,26 @@ vmovaps %xmm0, 48(%rdi)
# CHECK-NEXT: - Register Dependencies [ 83.24% ]
# CHECK-NEXT: - Memory Dependencies [ 99.89% ]
+# CHECK: Critical sequence based on the simulation:
+
+# CHECK: Instruction Dependency Information
+# CHECK-NEXT: +----< 7. vmovaps %xmm0, 48(%rdi)
+# CHECK-NEXT: |
+# CHECK-NEXT: | < loop carried >
+# CHECK-NEXT: |
+# CHECK-NEXT: +----> 0. vmovaps (%rsi), %xmm0 ## MEMORY dependency.
+# CHECK-NEXT: +----> 1. vmovaps %xmm0, (%rdi) ## REGISTER dependency: %xmm0
+# CHECK-NEXT: +----> 2. vmovaps 16(%rsi), %xmm0 ## MEMORY dependency.
+# CHECK-NEXT: +----> 3. vmovaps %xmm0, 16(%rdi) ## REGISTER dependency: %xmm0
+# CHECK-NEXT: +----> 4. vmovaps 32(%rsi), %xmm0 ## MEMORY dependency.
+# CHECK-NEXT: +----> 5. vmovaps %xmm0, 32(%rdi) ## REGISTER dependency: %xmm0
+# CHECK-NEXT: +----> 6. vmovaps 48(%rsi), %xmm0 ## MEMORY dependency.
+# CHECK-NEXT: +----> 7. vmovaps %xmm0, 48(%rdi) ## REGISTER dependency: %xmm0
+# CHECK-NEXT: |
+# CHECK-NEXT: | < loop carried >
+# CHECK-NEXT: |
+# CHECK-NEXT: +----> 0. vmovaps (%rsi), %xmm0 ## MEMORY dependency.
+
# CHECK: Instruction Info:
# CHECK-NEXT: [1]: #uOps
# CHECK-NEXT: [2]: Latency
diff --git a/llvm/test/tools/llvm-mca/X86/BtVer2/bottleneck-hints-4.s b/llvm/test/tools/llvm-mca/X86/BtVer2/bottleneck-hints-4.s
new file mode 100644
index 00000000000..45f01aaef2c
--- /dev/null
+++ b/llvm/test/tools/llvm-mca/X86/BtVer2/bottleneck-hints-4.s
@@ -0,0 +1,79 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=btver2 -bottleneck-analysis < %s | FileCheck %s
+
+vmulps %xmm0, %xmm1, %xmm2
+vhaddps %xmm2, %xmm2, %xmm3
+vhaddps %xmm3, %xmm3, %xmm4
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 300
+# CHECK-NEXT: Total Cycles: 211
+# CHECK-NEXT: Total uOps: 300
+
+# CHECK: Dispatch Width: 2
+# CHECK-NEXT: uOps Per Cycle: 1.42
+# CHECK-NEXT: IPC: 1.42
+# CHECK-NEXT: Block RThroughput: 2.0
+
+# CHECK: Cycles with backend pressure increase [ 40.76% ]
+# CHECK-NEXT: Throughput Bottlenecks:
+# CHECK-NEXT: Resource Pressure [ 39.34% ]
+# CHECK-NEXT: - JFPA [ 39.34% ]
+# CHECK-NEXT: - JFPU0 [ 39.34% ]
+# CHECK-NEXT: Data Dependencies: [ 1.42% ]
+# CHECK-NEXT: - Register Dependencies [ 1.42% ]
+# CHECK-NEXT: - Memory Dependencies [ 0.00% ]
+
+# CHECK: Critical sequence based on the simulation:
+
+# CHECK: Instruction Dependency Information
+# CHECK-NEXT: +----< 2. vhaddps %xmm3, %xmm3, %xmm4
+# CHECK-NEXT: |
+# CHECK-NEXT: | < loop carried >
+# CHECK-NEXT: |
+# CHECK-NEXT: | 0. vmulps %xmm0, %xmm1, %xmm2
+# CHECK-NEXT: +----> 1. vhaddps %xmm2, %xmm2, %xmm3 ## RESOURCE interference: JFPA [ probability: 73% ]
+# CHECK-NEXT: +----> 2. vhaddps %xmm3, %xmm3, %xmm4 ## REGISTER dependency: %xmm3
+# CHECK-NEXT: |
+# CHECK-NEXT: | < loop carried >
+# CHECK-NEXT: |
+# CHECK-NEXT: +----> 1. vhaddps %xmm2, %xmm2, %xmm3 ## RESOURCE interference: JFPA [ probability: 73% ]
+
+# CHECK: Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
+# CHECK-NEXT: 1 2 1.00 vmulps %xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 1 4 1.00 vhaddps %xmm2, %xmm2, %xmm3
+# CHECK-NEXT: 1 4 1.00 vhaddps %xmm3, %xmm3, %xmm4
+
+# CHECK: Resources:
+# CHECK-NEXT: [0] - JALU0
+# CHECK-NEXT: [1] - JALU1
+# CHECK-NEXT: [2] - JDiv
+# CHECK-NEXT: [3] - JFPA
+# CHECK-NEXT: [4] - JFPM
+# CHECK-NEXT: [5] - JFPU0
+# CHECK-NEXT: [6] - JFPU1
+# CHECK-NEXT: [7] - JLAGU
+# CHECK-NEXT: [8] - JMul
+# CHECK-NEXT: [9] - JSAGU
+# CHECK-NEXT: [10] - JSTC
+# CHECK-NEXT: [11] - JVALU0
+# CHECK-NEXT: [12] - JVALU1
+# CHECK-NEXT: [13] - JVIMUL
+
+# CHECK: Resource pressure per iteration:
+# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13]
+# CHECK-NEXT: - - - 2.00 1.00 2.00 1.00 - - - - - - -
+
+# CHECK: Resource pressure by instruction:
+# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] Instructions:
+# CHECK-NEXT: - - - - 1.00 - 1.00 - - - - - - - vmulps %xmm0, %xmm1, %xmm2
+# CHECK-NEXT: - - - 1.00 - 1.00 - - - - - - - - vhaddps %xmm2, %xmm2, %xmm3
+# CHECK-NEXT: - - - 1.00 - 1.00 - - - - - - - - vhaddps %xmm3, %xmm3, %xmm4
diff --git a/llvm/test/tools/llvm-mca/X86/SkylakeClient/bottleneck-analysis.s b/llvm/test/tools/llvm-mca/X86/SkylakeClient/bottleneck-analysis.s
new file mode 100644
index 00000000000..6c144005c20
--- /dev/null
+++ b/llvm/test/tools/llvm-mca/X86/SkylakeClient/bottleneck-analysis.s
@@ -0,0 +1,154 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mcpu=skylake -bottleneck-analysis < %s | FileCheck %s
+
+.LBB0_4:
+ vmovups (%rsi,%rax,2), %xmm0
+ vpermilps $255, %xmm0, %xmm7
+ vmulps -24(%rsp), %xmm7, %xmm8
+ vpermilps $170, %xmm0, %xmm6
+ vpermilps $85, %xmm0, %xmm5
+ vbroadcastss %xmm0, %xmm0
+ vfmadd231ps %xmm9, %xmm6, %xmm8
+ vfmadd213ps %xmm8, %xmm10, %xmm5
+ vfmadd213ps %xmm5, %xmm11, %xmm0
+ vfmadd213ps %xmm0, %xmm12, %xmm4
+ vfmadd213ps %xmm4, %xmm13, %xmm1
+ vmovaps %xmm7, %xmm4
+ vfmadd213ps %xmm1, %xmm14, %xmm2
+ vmovaps %xmm6, %xmm1
+ vfmadd213ps %xmm2, %xmm15, %xmm3
+ vpermilps $170, %xmm3, %xmm0
+ vmovups %xmm3, (%rdx,%rax)
+ vpermilps $255, %xmm3, %xmm2
+ addq $16, %rax
+ decl %ecx
+ vmovaps %xmm0, %xmm3
+ jne .LBB0_4
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 2200
+# CHECK-NEXT: Total Cycles: 1039
+# CHECK-NEXT: Total uOps: 2400
+
+# CHECK: Dispatch Width: 6
+# CHECK-NEXT: uOps Per Cycle: 2.31
+# CHECK-NEXT: IPC: 2.12
+# CHECK-NEXT: Block RThroughput: 6.0
+
+# CHECK: Cycles with backend pressure increase [ 92.69% ]
+# CHECK-NEXT: Throughput Bottlenecks:
+# CHECK-NEXT: Resource Pressure [ 46.78% ]
+# CHECK-NEXT: - SKLPort0 [ 14.24% ]
+# CHECK-NEXT: - SKLPort1 [ 14.24% ]
+# CHECK-NEXT: - SKLPort5 [ 46.49% ]
+# CHECK-NEXT: - SKLPort6 [ 8.66% ]
+# CHECK-NEXT: Data Dependencies: [ 64.97% ]
+# CHECK-NEXT: - Register Dependencies [ 64.97% ]
+# CHECK-NEXT: - Memory Dependencies [ 0.00% ]
+
+# CHECK: Critical sequence based on the simulation:
+
+# CHECK: Instruction Dependency Information
+# CHECK-NEXT: +----< 18. addq $16, %rax
+# CHECK-NEXT: |
+# CHECK-NEXT: | < loop carried >
+# CHECK-NEXT: |
+# CHECK-NEXT: +----> 0. vmovups (%rsi,%rax,2), %xmm0 ## REGISTER dependency: %rax
+# CHECK-NEXT: | 1. vpermilps $255, %xmm0, %xmm7
+# CHECK-NEXT: | 2. vmulps -24(%rsp), %xmm7, %xmm8
+# CHECK-NEXT: +----> 3. vpermilps $170, %xmm0, %xmm6 ## REGISTER dependency: %xmm0
+# CHECK-NEXT: | 4. vpermilps $85, %xmm0, %xmm5
+# CHECK-NEXT: | 5. vbroadcastss %xmm0, %xmm0
+# CHECK-NEXT: +----> 6. vfmadd231ps %xmm9, %xmm6, %xmm8 ## REGISTER dependency: %xmm6
+# CHECK-NEXT: +----> 7. vfmadd213ps %xmm8, %xmm10, %xmm5 ## REGISTER dependency: %xmm8
+# CHECK-NEXT: +----> 8. vfmadd213ps %xmm5, %xmm11, %xmm0 ## REGISTER dependency: %xmm5
+# CHECK-NEXT: +----> 9. vfmadd213ps %xmm0, %xmm12, %xmm4 ## REGISTER dependency: %xmm0
+# CHECK-NEXT: +----> 10. vfmadd213ps %xmm4, %xmm13, %xmm1 ## REGISTER dependency: %xmm4
+# CHECK-NEXT: | 11. vmovaps %xmm7, %xmm4
+# CHECK-NEXT: +----> 12. vfmadd213ps %xmm1, %xmm14, %xmm2 ## REGISTER dependency: %xmm1
+# CHECK-NEXT: | 13. vmovaps %xmm6, %xmm1
+# CHECK-NEXT: | 14. vfmadd213ps %xmm2, %xmm15, %xmm3
+# CHECK-NEXT: | 15. vpermilps $170, %xmm3, %xmm0
+# CHECK-NEXT: | 16. vmovups %xmm3, (%rdx,%rax)
+# CHECK-NEXT: | 17. vpermilps $255, %xmm3, %xmm2
+# CHECK-NEXT: | 18. addq $16, %rax
+# CHECK-NEXT: | 19. decl %ecx
+# CHECK-NEXT: | 20. vmovaps %xmm0, %xmm3
+# CHECK-NEXT: | 21. jne .LBB0_4
+# CHECK-NEXT: |
+# CHECK-NEXT: | < loop carried >
+# CHECK-NEXT: |
+# CHECK-NEXT: +----> 2. vmulps -24(%rsp), %xmm7, %xmm8 ## RESOURCE interference: SKLPort1 [ probability: 45% ]
+
+# CHECK: Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
+# CHECK-NEXT: 1 6 0.50 * vmovups (%rsi,%rax,2), %xmm0
+# CHECK-NEXT: 1 1 1.00 vpermilps $255, %xmm0, %xmm7
+# CHECK-NEXT: 2 10 0.50 * vmulps -24(%rsp), %xmm7, %xmm8
+# CHECK-NEXT: 1 1 1.00 vpermilps $170, %xmm0, %xmm6
+# CHECK-NEXT: 1 1 1.00 vpermilps $85, %xmm0, %xmm5
+# CHECK-NEXT: 1 1 1.00 vbroadcastss %xmm0, %xmm0
+# CHECK-NEXT: 1 4 0.50 vfmadd231ps %xmm9, %xmm6, %xmm8
+# CHECK-NEXT: 1 4 0.50 vfmadd213ps %xmm8, %xmm10, %xmm5
+# CHECK-NEXT: 1 4 0.50 vfmadd213ps %xmm5, %xmm11, %xmm0
+# CHECK-NEXT: 1 4 0.50 vfmadd213ps %xmm0, %xmm12, %xmm4
+# CHECK-NEXT: 1 4 0.50 vfmadd213ps %xmm4, %xmm13, %xmm1
+# CHECK-NEXT: 1 1 0.33 vmovaps %xmm7, %xmm4
+# CHECK-NEXT: 1 4 0.50 vfmadd213ps %xmm1, %xmm14, %xmm2
+# CHECK-NEXT: 1 1 0.33 vmovaps %xmm6, %xmm1
+# CHECK-NEXT: 1 4 0.50 vfmadd213ps %xmm2, %xmm15, %xmm3
+# CHECK-NEXT: 1 1 1.00 vpermilps $170, %xmm3, %xmm0
+# CHECK-NEXT: 2 1 1.00 * vmovups %xmm3, (%rdx,%rax)
+# CHECK-NEXT: 1 1 1.00 vpermilps $255, %xmm3, %xmm2
+# CHECK-NEXT: 1 1 0.25 addq $16, %rax
+# CHECK-NEXT: 1 1 0.25 decl %ecx
+# CHECK-NEXT: 1 1 0.33 vmovaps %xmm0, %xmm3
+# CHECK-NEXT: 1 1 0.50 jne .LBB0_4
+
+# CHECK: Resources:
+# CHECK-NEXT: [0] - SKLDivider
+# CHECK-NEXT: [1] - SKLFPDivider
+# CHECK-NEXT: [2] - SKLPort0
+# CHECK-NEXT: [3] - SKLPort1
+# CHECK-NEXT: [4] - SKLPort2
+# CHECK-NEXT: [5] - SKLPort3
+# CHECK-NEXT: [6] - SKLPort4
+# CHECK-NEXT: [7] - SKLPort5
+# CHECK-NEXT: [8] - SKLPort6
+# CHECK-NEXT: [9] - SKLPort7
+
+# CHECK: Resource pressure per iteration:
+# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9]
+# CHECK-NEXT: - - 5.52 5.53 1.01 1.03 1.00 6.02 2.93 0.96
+
+# CHECK: Resource pressure by instruction:
+# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] Instructions:
+# CHECK-NEXT: - - - - 0.04 0.96 - - - - vmovups (%rsi,%rax,2), %xmm0
+# CHECK-NEXT: - - - - - - - 1.00 - - vpermilps $255, %xmm0, %xmm7
+# CHECK-NEXT: - - 0.03 0.97 0.96 0.04 - - - - vmulps -24(%rsp), %xmm7, %xmm8
+# CHECK-NEXT: - - - - - - - 1.00 - - vpermilps $170, %xmm0, %xmm6
+# CHECK-NEXT: - - - - - - - 1.00 - - vpermilps $85, %xmm0, %xmm5
+# CHECK-NEXT: - - - - - - - 1.00 - - vbroadcastss %xmm0, %xmm0
+# CHECK-NEXT: - - 0.95 0.05 - - - - - - vfmadd231ps %xmm9, %xmm6, %xmm8
+# CHECK-NEXT: - - 0.50 0.50 - - - - - - vfmadd213ps %xmm8, %xmm10, %xmm5
+# CHECK-NEXT: - - 0.92 0.08 - - - - - - vfmadd213ps %xmm5, %xmm11, %xmm0
+# CHECK-NEXT: - - 0.95 0.05 - - - - - - vfmadd213ps %xmm0, %xmm12, %xmm4
+# CHECK-NEXT: - - 0.51 0.49 - - - - - - vfmadd213ps %xmm4, %xmm13, %xmm1
+# CHECK-NEXT: - - 0.52 0.48 - - - - - - vmovaps %xmm7, %xmm4
+# CHECK-NEXT: - - 0.49 0.51 - - - - - - vfmadd213ps %xmm1, %xmm14, %xmm2
+# CHECK-NEXT: - - 0.04 0.95 - - - 0.01 - - vmovaps %xmm6, %xmm1
+# CHECK-NEXT: - - 0.51 0.49 - - - - - - vfmadd213ps %xmm2, %xmm15, %xmm3
+# CHECK-NEXT: - - - - - - - 1.00 - - vpermilps $170, %xmm3, %xmm0
+# CHECK-NEXT: - - - - 0.01 0.03 1.00 - - 0.96 vmovups %xmm3, (%rdx,%rax)
+# CHECK-NEXT: - - - - - - - 1.00 - - vpermilps $255, %xmm3, %xmm2
+# CHECK-NEXT: - - - - - - - - 1.00 - addq $16, %rax
+# CHECK-NEXT: - - 0.04 0.01 - - - 0.01 0.94 - decl %ecx
+# CHECK-NEXT: - - 0.05 0.95 - - - - - - vmovaps %xmm0, %xmm3
+# CHECK-NEXT: - - 0.01 - - - - - 0.99 - jne .LBB0_4
diff --git a/llvm/test/tools/llvm-mca/X86/option-all-views-1.s b/llvm/test/tools/llvm-mca/X86/option-all-views-1.s
index ed097e70734..27e3a67374c 100644
--- a/llvm/test/tools/llvm-mca/X86/option-all-views-1.s
+++ b/llvm/test/tools/llvm-mca/X86/option-all-views-1.s
@@ -25,6 +25,19 @@ add %eax, %eax
# FULLREPORT-NEXT: - Register Dependencies [ 76.70% ]
# FULLREPORT-NEXT: - Memory Dependencies [ 0.00% ]
+# FULLREPORT: Critical sequence based on the simulation:
+
+# FULLREPORT: Instruction Dependency Information
+# FULLREPORT-NEXT: +----< 0. addl %eax, %eax
+# FULLREPORT-NEXT: |
+# FULLREPORT-NEXT: | < loop carried >
+# FULLREPORT-NEXT: |
+# FULLREPORT-NEXT: +----> 0. addl %eax, %eax ## REGISTER dependency: %eax
+# FULLREPORT-NEXT: |
+# FULLREPORT-NEXT: | < loop carried >
+# FULLREPORT-NEXT: |
+# FULLREPORT-NEXT: +----> 0. addl %eax, %eax ## REGISTER dependency: %eax
+
# DEFAULTREPORT: Instruction Info:
# DEFAULTREPORT-NEXT: [1]: #uOps
# DEFAULTREPORT-NEXT: [2]: Latency
diff --git a/llvm/test/tools/llvm-mca/X86/option-all-views-2.s b/llvm/test/tools/llvm-mca/X86/option-all-views-2.s
index efa66be1fb0..a19da0addba 100644
--- a/llvm/test/tools/llvm-mca/X86/option-all-views-2.s
+++ b/llvm/test/tools/llvm-mca/X86/option-all-views-2.s
@@ -24,6 +24,19 @@ add %eax, %eax
# ALL-NEXT: - Register Dependencies [ 76.70% ]
# ALL-NEXT: - Memory Dependencies [ 0.00% ]
+# ALL: Critical sequence based on the simulation:
+
+# ALL: Instruction Dependency Information
+# ALL-NEXT: +----< 0. addl %eax, %eax
+# ALL-NEXT: |
+# ALL-NEXT: | < loop carried >
+# ALL-NEXT: |
+# ALL-NEXT: +----> 0. addl %eax, %eax ## REGISTER dependency: %eax
+# ALL-NEXT: |
+# ALL-NEXT: | < loop carried >
+# ALL-NEXT: |
+# ALL-NEXT: +----> 0. addl %eax, %eax ## REGISTER dependency: %eax
+
# ALL: Instruction Info:
# ALL-NEXT: [1]: #uOps
# ALL-NEXT: [2]: Latency
diff --git a/llvm/test/tools/llvm-mca/X86/option-no-stats-1.s b/llvm/test/tools/llvm-mca/X86/option-no-stats-1.s
index dc843bb2761..d505dd9bc3e 100644
--- a/llvm/test/tools/llvm-mca/X86/option-no-stats-1.s
+++ b/llvm/test/tools/llvm-mca/X86/option-no-stats-1.s
@@ -22,6 +22,19 @@ add %edi, %eax
# CHECK-NEXT: - Register Dependencies [ 76.70% ]
# CHECK-NEXT: - Memory Dependencies [ 0.00% ]
+# CHECK: Critical sequence based on the simulation:
+
+# CHECK: Instruction Dependency Information
+# CHECK-NEXT: +----< 0. addl %edi, %eax
+# CHECK-NEXT: |
+# CHECK-NEXT: | < loop carried >
+# CHECK-NEXT: |
+# CHECK-NEXT: +----> 0. addl %edi, %eax ## REGISTER dependency: %eax
+# CHECK-NEXT: |
+# CHECK-NEXT: | < loop carried >
+# CHECK-NEXT: |
+# CHECK-NEXT: +----> 0. addl %edi, %eax ## REGISTER dependency: %eax
+
# CHECK: Instruction Info:
# CHECK-NEXT: [1]: #uOps
# CHECK-NEXT: [2]: Latency
diff --git a/llvm/tools/llvm-mca/Views/BottleneckAnalysis.cpp b/llvm/tools/llvm-mca/Views/BottleneckAnalysis.cpp
index 8c825271e4f..560c6c6e8a3 100644
--- a/llvm/tools/llvm-mca/Views/BottleneckAnalysis.cpp
+++ b/llvm/tools/llvm-mca/Views/BottleneckAnalysis.cpp
@@ -16,6 +16,7 @@
#include "llvm/MC/MCInst.h"
#include "llvm/MCA/Support.h"
#include "llvm/Support/Format.h"
+#include "llvm/Support/FormattedStream.h"
namespace llvm {
namespace mca {
@@ -161,12 +162,219 @@ void DependencyGraph::dumpDependencyEdge(raw_ostream &OS,
OS << " - MEMORY";
} else {
assert(DE.Type == DependencyEdge::DT_RESOURCE &&
- "Unexpected unsupported dependency type!");
+ "Unsupported dependency type!");
OS << " - RESOURCE MASK: " << DE.ResourceOrRegID;
}
OS << " - CYCLES: " << DE.Cost << '\n';
}
+#endif // NDEBUG
+
+void DependencyGraph::initializeRootSet(
+ SmallVectorImpl<unsigned> &RootSet) const {
+ for (unsigned I = 0, E = Nodes.size(); I < E; ++I) {
+ const DGNode &N = Nodes[I];
+ if (N.NumPredecessors == 0 && !N.OutgoingEdges.empty())
+ RootSet.emplace_back(I);
+ }
+}
+
+void DependencyGraph::propagateThroughEdges(
+ SmallVectorImpl<unsigned> &RootSet) {
+ SmallVector<unsigned, 8> ToVisit;
+
+ // A critical sequence is computed as the longest path from a node of the
+ // RootSet to a leaf node (i.e. a node with no successors). The RootSet is
+ // composed of nodes with at least one successor, and no predecessors.
+ //
+ // Each node of the graph starts with an initial default cost of zero. The
+ // cost of a node is a measure of criticality: the higher the cost, the bigger
+ // is the performance impact.
+ //
+ // This algorithm is very similar to a (reverse) Dijkstra. Every iteration of
+ // the inner loop selects (i.e. visits) a node N from a set of `unvisited
+ // nodes`, and then propagates the cost of N to all its neighbors.
+ //
+ // The `unvisited nodes` set initially contains all the nodes from the
+ // RootSet. A node N is added to the `unvisited nodes` if all its
+ // predecessors have been visited already.
+ //
+ // For simplicity, every node tracks the number of unvisited incoming edges in
+ // field `NumVisitedPredecessors`. When the value of that field drops to
+ // zero, then the corresponding node is added to a `ToVisit` set.
+ //
+ // At the end of every iteration of the outer loop, set `ToVisit` becomes our
+ // new `unvisited nodes` set.
+ //
+ // The algorithm terminates when the set of unvisited nodes (i.e. our RootSet)
+ // is empty. This algorithm works under the assumption that the graph is
+ // acyclic.
+ do {
+ for (unsigned IID : RootSet) {
+ const DGNode &N = Nodes[IID];
+ for (const DependencyEdge &DepEdge : N.OutgoingEdges) {
+ unsigned ToIID = DepEdge.ToIID;
+ DGNode &To = Nodes[ToIID];
+ uint64_t Cost = N.Cost + DepEdge.Dep.Cost;
+ // Check if this is the most expensive incoming edge seen so far. In
+ // case, update the total cost of the destination node (ToIID), as well
+ // its field `CriticalPredecessor`.
+ if (Cost > To.Cost) {
+ To.CriticalPredecessor = DepEdge;
+ To.Cost = Cost;
+ To.Depth = N.Depth + 1;
+ }
+ To.NumVisitedPredecessors++;
+ if (To.NumVisitedPredecessors == To.NumPredecessors)
+ ToVisit.emplace_back(ToIID);
+ }
+ }
+
+ std::swap(RootSet, ToVisit);
+ ToVisit.clear();
+ } while (!RootSet.empty());
+}
+
+void DependencyGraph::getCriticalSequence(
+ SmallVectorImpl<const DependencyEdge *> &Seq) const {
+ // At this stage, nodes of the graph have been already visited, and costs have
+ // been propagated through the edges (see method `propagateThroughEdges()`).
+
+ // Identify the node N with the highest cost in the graph. By construction,
+ // that node is the last instruction of our critical sequence.
+ // Field N.Depth would tell us the total length of the sequence.
+ //
+ // To obtain the sequence of critical edges, we simply follow the chain of critical
+ // predecessors starting from node N (field DGNode::CriticalPredecessor).
+ const auto It = std::max_element(
+ Nodes.begin(), Nodes.end(),
+ [](const DGNode &Lhs, const DGNode &Rhs) { return Lhs.Cost < Rhs.Cost; });
+ unsigned IID = std::distance(Nodes.begin(), It);
+ Seq.resize(Nodes[IID].Depth);
+ for (unsigned I = Seq.size(), E = 0; I > E; --I) {
+ const DGNode &N = Nodes[IID];
+ Seq[I - 1] = &N.CriticalPredecessor;
+ IID = N.CriticalPredecessor.FromIID;
+ }
+}
+
+static void printInstruction(formatted_raw_ostream &FOS,
+ const MCSubtargetInfo &STI, MCInstPrinter &MCIP,
+ const MCInst &MCI,
+ bool UseDifferentColor = false) {
+ std::string Instruction;
+ raw_string_ostream InstrStream(Instruction);
+
+ FOS.PadToColumn(14);
+
+ MCIP.printInst(&MCI, InstrStream, "", STI);
+ InstrStream.flush();
+
+ if (UseDifferentColor)
+ FOS.changeColor(raw_ostream::CYAN, true, false);
+ FOS << StringRef(Instruction).ltrim();
+ if (UseDifferentColor)
+ FOS.resetColor();
+}
+
+void BottleneckAnalysis::printCriticalSequence(raw_ostream &OS) const {
+ SmallVector<const DependencyEdge *, 16> Seq;
+ DG.getCriticalSequence(Seq);
+ if (Seq.empty())
+ return;
+
+ OS << "\nCritical sequence based on the simulation:\n\n";
+
+ const DependencyEdge &FirstEdge = *Seq[0];
+ unsigned FromIID = FirstEdge.FromIID % Source.size();
+ unsigned ToIID = FirstEdge.ToIID % Source.size();
+ bool IsLoopCarried = FromIID >= ToIID;
+
+ formatted_raw_ostream FOS(OS);
+ FOS.PadToColumn(14);
+ FOS << "Instruction";
+ FOS.PadToColumn(58);
+ FOS << "Dependency Information";
+
+ bool HasColors = FOS.has_colors();
+ unsigned CurrentIID = 0;
+ if (IsLoopCarried) {
+ FOS << "\n +----< " << FromIID << ".";
+ printInstruction(FOS, STI, MCIP, Source[FromIID], HasColors);
+ FOS << "\n |\n | < loop carried > \n |";
+ } else {
+ while (CurrentIID < FromIID) {
+ FOS << "\n " << CurrentIID << ".";
+ printInstruction(FOS, STI, MCIP, Source[CurrentIID]);
+ CurrentIID++;
+ }
+
+ FOS << "\n +----< " << CurrentIID << ".";
+ printInstruction(FOS, STI, MCIP, Source[CurrentIID], HasColors);
+ CurrentIID++;
+ }
+
+ for (const DependencyEdge *&DE : Seq) {
+ ToIID = DE->ToIID % Source.size();
+ unsigned LastIID = CurrentIID > ToIID ? Source.size() : ToIID;
+
+ while (CurrentIID < LastIID) {
+ FOS << "\n | " << CurrentIID << ".";
+ printInstruction(FOS, STI, MCIP, Source[CurrentIID]);
+ CurrentIID++;
+ }
+
+ if (CurrentIID == ToIID) {
+ FOS << "\n +----> " << ToIID << ".";
+ printInstruction(FOS, STI, MCIP, Source[CurrentIID], HasColors);
+ } else {
+ FOS << "\n |\n | < loop carried > \n |"
+ << "\n +----> " << ToIID << ".";
+ printInstruction(FOS, STI, MCIP, Source[ToIID], HasColors);
+ }
+ FOS.PadToColumn(58);
+
+ const DependencyEdge::Dependency &Dep = DE->Dep;
+ if (HasColors)
+ FOS.changeColor(raw_ostream::SAVEDCOLOR, true, false);
+
+ if (Dep.Type == DependencyEdge::DT_REGISTER) {
+ FOS << "## REGISTER dependency: ";
+ if (HasColors)
+ FOS.changeColor(raw_ostream::MAGENTA, true, false);
+ MCIP.printRegName(FOS, Dep.ResourceOrRegID);
+ } else if (Dep.Type == DependencyEdge::DT_MEMORY) {
+ FOS << "## MEMORY dependency.";
+ } else {
+ assert(Dep.Type == DependencyEdge::DT_RESOURCE &&
+ "Unsupported dependency type!");
+ FOS << "## RESOURCE interference: ";
+ if (HasColors)
+ FOS.changeColor(raw_ostream::MAGENTA, true, false);
+ FOS << Tracker.resolveResourceName(Dep.ResourceOrRegID);
+ if (HasColors) {
+ FOS.resetColor();
+ FOS.changeColor(raw_ostream::SAVEDCOLOR, true, false);
+ }
+ FOS << " [ probability: " << ((DE->Frequency * 100) / Iterations)
+ << "% ]";
+ }
+ if (HasColors)
+ FOS.resetColor();
+ ++CurrentIID;
+ }
+
+ while (CurrentIID < Source.size()) {
+ FOS << "\n " << CurrentIID << ".";
+ printInstruction(FOS, STI, MCIP, Source[CurrentIID]);
+ CurrentIID++;
+ }
+
+ FOS << '\n';
+ FOS.flush();
+}
+
+#ifndef NDEBUG
void DependencyGraph::dump(raw_ostream &OS, MCInstPrinter &MCIP) const {
OS << "\nREG DEPS\n";
for (const DGNode &Node : Nodes)
@@ -200,10 +408,11 @@ void DependencyGraph::addDependency(unsigned From, unsigned To,
if (It != Vec.end()) {
It->Dep.Cost += Dep.Cost;
+ It->Frequency++;
return;
}
- DependencyEdge DE = {Dep, From, To};
+ DependencyEdge DE = {Dep, From, To, 1};
Vec.emplace_back(DE);
NodeTo.NumPredecessors++;
}
@@ -211,7 +420,7 @@ void DependencyGraph::addDependency(unsigned From, unsigned To,
BottleneckAnalysis::BottleneckAnalysis(const MCSubtargetInfo &sti,
MCInstPrinter &Printer,
ArrayRef<MCInst> S, unsigned NumIter)
- : STI(sti), Tracker(STI.getSchedModel()), DG(S.size() * 3),
+ : STI(sti), MCIP(Printer), Tracker(STI.getSchedModel()), DG(S.size() * 3),
Source(S), Iterations(NumIter), TotalCycles(0),
PressureIncreasedBecauseOfResources(false),
PressureIncreasedBecauseOfRegisterDependencies(false),
@@ -274,38 +483,38 @@ void BottleneckAnalysis::onEvent(const HWInstructionEvent &Event) {
const Instruction &IS = *Event.IR.getInstruction();
unsigned To = IID % Source.size();
- unsigned Cycles = Tracker.getResourcePressureCycles(IID);
- if (Cycles) {
- uint64_t ResourceMask = IS.getCriticalResourceMask();
- SmallVector<std::pair<unsigned, unsigned>, 4> Users;
- while (ResourceMask) {
- uint64_t Current = ResourceMask & (-ResourceMask);
- Tracker.getResourceUsers(Current, Users);
- for (const std::pair<unsigned, unsigned> &U : Users) {
- unsigned Cost = std::min(U.second, Cycles);
- addResourceDep(U.first % Source.size(), To, Current, Cost);
- }
- Users.clear();
- ResourceMask ^= Current;
- }
+ unsigned Cycles = 2 * Tracker.getResourcePressureCycles(IID);
+ uint64_t ResourceMask = IS.getCriticalResourceMask();
+ SmallVector<std::pair<unsigned, unsigned>, 4> Users;
+ while (ResourceMask) {
+ uint64_t Current = ResourceMask & (-ResourceMask);
+ Tracker.getResourceUsers(Current, Users);
+ for (const std::pair<unsigned, unsigned> &U : Users)
+ addResourceDep(U.first % Source.size(), To, Current, U.second + Cycles);
+ Users.clear();
+ ResourceMask ^= Current;
}
- Cycles = Tracker.getRegisterPressureCycles(IID);
- if (Cycles) {
- const CriticalDependency &RegDep = IS.getCriticalRegDep();
+ const CriticalDependency &RegDep = IS.getCriticalRegDep();
+ if (RegDep.Cycles) {
+ Cycles = RegDep.Cycles + 2 * Tracker.getRegisterPressureCycles(IID);
unsigned From = RegDep.IID % Source.size();
addRegisterDep(From, To, RegDep.RegID, Cycles);
}
- Cycles = Tracker.getMemoryPressureCycles(IID);
- if (Cycles) {
- const CriticalDependency &MemDep = IS.getCriticalMemDep();
+ const CriticalDependency &MemDep = IS.getCriticalMemDep();
+ if (MemDep.Cycles) {
+ Cycles = MemDep.Cycles + 2 * Tracker.getMemoryPressureCycles(IID);
unsigned From = MemDep.IID % Source.size();
addMemoryDep(From, To, Cycles);
}
Tracker.handleInstructionIssuedEvent(
static_cast<const HWInstructionIssuedEvent &>(Event));
+
+ // Check if this is the last simulated instruction.
+ if (IID == ((Iterations * Source.size()) - 1))
+ DG.finalizeGraph();
}
void BottleneckAnalysis::onEvent(const HWPressureEvent &Event) {
@@ -356,7 +565,7 @@ void BottleneckAnalysis::onCycleEnd() {
void BottleneckAnalysis::printBottleneckHints(raw_ostream &OS) const {
if (!SeenStallCycles || !BPI.PressureIncreaseCycles) {
- OS << "\nNo resource or data dependency bottlenecks discovered.\n";
+ OS << "\n\nNo resource or data dependency bottlenecks discovered.\n";
return;
}
@@ -370,7 +579,7 @@ void BottleneckAnalysis::printBottleneckHints(raw_ostream &OS) const {
double MemDepPressurePerCycle =
(double)BPI.MemoryDependencyCycles * 100 / TotalCycles;
- OS << "\nCycles with backend pressure increase [ "
+ OS << "\n\nCycles with backend pressure increase [ "
<< format("%.2f", floor((PressurePerCycle * 100) + 0.5) / 100) << "% ]";
OS << "\nThroughput Bottlenecks: "
@@ -399,7 +608,7 @@ void BottleneckAnalysis::printBottleneckHints(raw_ostream &OS) const {
<< "% ]";
OS << "\n - Memory Dependencies [ "
<< format("%.2f", floor((MemDepPressurePerCycle * 100) + 0.5) / 100)
- << "% ]\n\n";
+ << "% ]\n";
}
void BottleneckAnalysis::printView(raw_ostream &OS) const {
@@ -408,6 +617,7 @@ void BottleneckAnalysis::printView(raw_ostream &OS) const {
printBottleneckHints(TempStream);
TempStream.flush();
OS << Buffer;
+ printCriticalSequence(OS);
}
} // namespace mca.
diff --git a/llvm/tools/llvm-mca/Views/BottleneckAnalysis.h b/llvm/tools/llvm-mca/Views/BottleneckAnalysis.h
index f8302496cef..7564b1a4820 100644
--- a/llvm/tools/llvm-mca/Views/BottleneckAnalysis.h
+++ b/llvm/tools/llvm-mca/Views/BottleneckAnalysis.h
@@ -10,18 +10,71 @@
/// This file implements the bottleneck analysis view.
///
/// This view internally observes backend pressure increase events in order to
-/// identify potential sources of bottlenecks.
+/// identify problematic data dependencies and processor resource interferences.
///
-/// Example of bottleneck analysis report:
+/// Example of bottleneck analysis report for a dot-product on X86 btver2:
///
-/// Cycles with backend pressure increase [ 33.40% ]
-/// Throughput Bottlenecks:
-/// Resource Pressure [ 0.52% ]
-/// - JLAGU [ 0.52% ]
-/// Data Dependencies: [ 32.88% ]
-/// - Register Dependencies [ 32.88% ]
-/// - Memory Dependencies [ 0.00% ]
+/// Cycles with backend pressure increase [ 40.76% ]
+/// Throughput Bottlenecks:
+/// Resource Pressure [ 39.34% ]
+/// - JFPA [ 39.34% ]
+/// - JFPU0 [ 39.34% ]
+/// Data Dependencies: [ 1.42% ]
+/// - Register Dependencies [ 1.42% ]
+/// - Memory Dependencies [ 0.00% ]
///
+/// According to the example, backend pressure increased during the 40.76% of
+/// the simulated cycles. In particular, the major cause of backend pressure
+/// increases was the contention on floating point adder JFPA accessible from
+/// pipeline resource JFPU0.
+///
+/// At the end of each cycle, if pressure on the simulated out-of-order buffers
+/// has increased, a backend pressure event is reported.
+/// In particular, this occurs when there is a delta between the number of uOps
+/// dispatched and the number of uOps issued to the underlying pipelines.
+///
+/// The bottleneck analysis view is also responsible for identifying and printing
+/// the most "critical" sequence of dependent instructions according to the
+/// simulated run.
+///
+/// Below is the critical sequence computed for the dot-product example on
+/// btver2:
+///
+/// Instruction Dependency Information
+/// +----< 2. vhaddps %xmm3, %xmm3, %xmm4
+/// |
+/// | < loop carried >
+/// |
+/// | 0. vmulps %xmm0, %xmm0, %xmm2
+/// +----> 1. vhaddps %xmm2, %xmm2, %xmm3 ## RESOURCE interference: JFPA [ probability: 73% ]
+/// +----> 2. vhaddps %xmm3, %xmm3, %xmm4 ## REGISTER dependency: %xmm3
+/// |
+/// | < loop carried >
+/// |
+/// +----> 1. vhaddps %xmm2, %xmm2, %xmm3 ## RESOURCE interference: JFPA [ probability: 73% ]
+///
+///
+/// The algorithm that computes the critical sequence is very similar to a
+/// critical path analysis.
+///
+/// A dependency graph is used internally to track dependencies between nodes.
+/// Nodes of the graph represent instructions from the input assembly sequence,
+/// and edges of the graph represent data dependencies or processor resource
+/// interferences.
+///
+/// Edges are dynamically 'discovered' by observing instruction state transitions
+/// and backend pressure increase events. Edges are internally ranked based on
+/// their "criticality". A dependency is considered to be critical if it takes a
+/// long time to execute, and if it contributes to backend pressure increases.
+/// Criticality is internally measured in terms of cycles; it is computed for
+/// every edge in the graph as a function of the edge latency and the number of
+/// backend pressure increase cycles contributed by that edge.
+///
+/// At the end of simulation, costs are propagated to nodes through the edges of
+/// the graph, and the most expensive path connecting the root-set (a
+/// set of nodes with no predecessors) to a leaf node is reported as critical
+/// sequence.
+//
//===----------------------------------------------------------------------===//
#ifndef LLVM_TOOLS_LLVM_MCA_BOTTLENECK_ANALYSIS_H
@@ -108,6 +161,13 @@ public:
return Info.ResourcePressureCycles;
}
+ const char *resolveResourceName(uint64_t ResourceMask) const {
+ unsigned Index = getResourceStateIndex(ResourceMask);
+ unsigned ProcResID = ResIdx2ProcResID[Index];
+ const MCProcResourceDesc &PRDesc = *SM.getProcResource(ProcResID);
+ return PRDesc.Name;
+ }
+
void onInstructionDispatched(unsigned IID);
void onInstructionExecuted(unsigned IID);
@@ -115,14 +175,13 @@ public:
void handleInstructionIssuedEvent(const HWInstructionIssuedEvent &Event);
};
-// An edge of a dependency graph.
-// Vertices of the graph are instructions identified by their ID.
+// A dependency edge.
struct DependencyEdge {
enum DependencyType { DT_INVALID, DT_REGISTER, DT_MEMORY, DT_RESOURCE };
// Dependency edge descriptor.
//
- // It describe the dependency reason, as well as the edge cost in cycles.
+ // It specifies the dependency type, as well as the edge cost in cycles.
struct Dependency {
DependencyType Type;
uint64_t ResourceOrRegID;
@@ -130,14 +189,43 @@ struct DependencyEdge {
};
Dependency Dep;
- // Pair of vertices connected by this edge.
unsigned FromIID;
unsigned ToIID;
+
+ // Used by the bottleneck analysis to compute the interference
+ // probability for processor resources.
+ unsigned Frequency;
};
+// A dependency graph used by the bottleneck analysis to describe data
+// dependencies and processor resource interferences between instructions.
+//
+// There is a node (an instance of struct DGNode) for every instruction in the
+// input assembly sequence. Edges of the graph represent dependencies between
+// instructions.
+//
+// Each edge of the graph is associated with a cost value which is used
+// internally to rank dependency based on their impact on the runtime
+// performance (see field DependencyEdge::Dependency::Cost). In general, the
+// higher the cost of an edge, the higher the impact on performance.
+//
+// The cost of a dependency is a function of both the latency and the number of
+// cycles where the dependency has been seen as critical (i.e. contributing to
+// back-pressure increases).
+//
+// Loop carried dependencies are carefully expanded by the bottleneck analysis
+// to guarantee that the graph stays acyclic. To this end, extra nodes are
+// pre-allocated at construction time to describe instructions from "past and
+// future" iterations. The graph is kept acyclic mainly because it simplifies the
+// complexity of the algorithm that computes the critical sequence.
class DependencyGraph {
struct DGNode {
unsigned NumPredecessors;
+ unsigned NumVisitedPredecessors;
+ uint64_t Cost;
+ unsigned Depth;
+
+ DependencyEdge CriticalPredecessor;
SmallVector<DependencyEdge, 8> OutgoingEdges;
};
SmallVector<DGNode, 16> Nodes;
@@ -148,6 +236,9 @@ class DependencyGraph {
void addDependency(unsigned From, unsigned To,
DependencyEdge::Dependency &&DE);
+ void initializeRootSet(SmallVectorImpl<unsigned> &RootSet) const;
+ void propagateThroughEdges(SmallVectorImpl<unsigned> &RootSet);
+
#ifndef NDEBUG
void dumpDependencyEdge(raw_ostream &OS, const DependencyEdge &DE,
MCInstPrinter &MCIP) const;
@@ -170,6 +261,19 @@ public:
addDependency(From, To, {DependencyEdge::DT_RESOURCE, Mask, Cost});
}
+ // Called by the bottleneck analysis at the end of simulation to propagate
+ // costs through the edges of the graph, and compute a critical path.
+ void finalizeGraph() {
+ SmallVector<unsigned, 16> RootSet;
+ initializeRootSet(RootSet);
+ propagateThroughEdges(RootSet);
+ }
+
+ // Returns a sequence of edges representing the critical sequence based on the
+ // simulated run. It assumes that the graph has already been finalized (i.e.
+ // method `finalizeGraph()` has already been called on this graph).
+ void getCriticalSequence(SmallVectorImpl<const DependencyEdge *> &Seq) const;
+
#ifndef NDEBUG
void dump(raw_ostream &OS, MCInstPrinter &MCIP) const;
#endif
@@ -178,6 +282,7 @@ public:
/// A view that collects and prints a few performance numbers.
class BottleneckAnalysis : public View {
const MCSubtargetInfo &STI;
+ MCInstPrinter &MCIP;
PressureTracker Tracker;
DependencyGraph DG;
@@ -212,6 +317,7 @@ class BottleneckAnalysis : public View {
// Prints a bottleneck message to OS.
void printBottleneckHints(raw_ostream &OS) const;
+ void printCriticalSequence(raw_ostream &OS) const;
public:
BottleneckAnalysis(const MCSubtargetInfo &STI, MCInstPrinter &MCIP,
OpenPOWER on IntegriCloud