summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
-rw-r--r-- llvm/lib/Target/X86/X86SchedPredicates.td 7
-rw-r--r-- llvm/lib/Target/X86/X86ScheduleBtVer2.td 10
-rw-r--r-- llvm/test/tools/llvm-mca/X86/BtVer2/zero-idioms-avx-256.s 30
3 files changed, 31 insertions, 16 deletions
diff --git a/llvm/lib/Target/X86/X86SchedPredicates.td b/llvm/lib/Target/X86/X86SchedPredicates.td
index 11b567c18cf..1c7f24375f6 100644
--- a/llvm/lib/Target/X86/X86SchedPredicates.td
+++ b/llvm/lib/Target/X86/X86SchedPredicates.td
@@ -19,6 +19,13 @@
// different zero-idioms.
def ZeroIdiomPredicate : CheckSameRegOperand<1, 2>;
+// Identifies VPERM whose operands 1 and 2 match and whose mask is exactly 0x88
+// (bits 3 and 7 set). On some processors, these VPERM are zero-idioms.
+def ZeroIdiomVPERMPredicate : CheckAll<[
+ ZeroIdiomPredicate,
+ CheckImmOperand<3, 0x88>
+]>;
+
// A predicate used to check if a LEA instruction uses all three source
// operands: base, index, and offset.
def IsThreeOperandsLEAPredicate: CheckAll<[
diff --git a/llvm/lib/Target/X86/X86ScheduleBtVer2.td b/llvm/lib/Target/X86/X86ScheduleBtVer2.td
index 2729e7f8e4e..9df0c779264 100644
--- a/llvm/lib/Target/X86/X86ScheduleBtVer2.td
+++ b/llvm/lib/Target/X86/X86ScheduleBtVer2.td
@@ -688,6 +688,12 @@ def : InstRW<[JWriteVZeroIdiomALUX], (instrs PSUBBrr, VPSUBBrr,
PCMPGTQrr, VPCMPGTQrr,
PCMPGTWrr, VPCMPGTWrr)>;
+def JWriteVPERM2F128 : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<ZeroIdiomVPERMPredicate>, [JWriteZeroIdiomYmm]>,
+ SchedVar<NoSchedPred, [WriteFShuffle256]>
+]>;
+def : InstRW<[JWriteVPERM2F128], (instrs VPERM2F128rr)>;
+
// This write is used for slow LEA instructions.
def JWrite3OpsLEA : SchedWriteRes<[JALU1, JSAGU]> {
let Latency = 2;
@@ -762,7 +768,9 @@ def : IsZeroIdiomFunction<[
// ymm variants.
VXORPSYrr, VXORPDYrr, VANDNPSYrr, VANDNPDYrr
- ], ZeroIdiomPredicate>
+ ], ZeroIdiomPredicate>,
+
+ DepBreakingClass<[ VPERM2F128rr ], ZeroIdiomVPERMPredicate>
]>;
def : IsDepBreakingFunction<[
diff --git a/llvm/test/tools/llvm-mca/X86/BtVer2/zero-idioms-avx-256.s b/llvm/test/tools/llvm-mca/X86/BtVer2/zero-idioms-avx-256.s
index 7600368c7c0..b1669f33405 100644
--- a/llvm/test/tools/llvm-mca/X86/BtVer2/zero-idioms-avx-256.s
+++ b/llvm/test/tools/llvm-mca/X86/BtVer2/zero-idioms-avx-256.s
@@ -330,12 +330,12 @@ vaddps %ymm1, %ymm1, %ymm0
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 200
-# CHECK-NEXT: Total Cycles: 403
+# CHECK-NEXT: Total Cycles: 205
# CHECK-NEXT: Total uOps: 400
# CHECK: Dispatch Width: 2
-# CHECK-NEXT: uOps Per Cycle: 0.99
-# CHECK-NEXT: IPC: 0.50
+# CHECK-NEXT: uOps Per Cycle: 1.95
+# CHECK-NEXT: IPC: 0.98
# CHECK-NEXT: Block RThroughput: 2.0
# CHECK: Instruction Info:
@@ -347,7 +347,7 @@ vaddps %ymm1, %ymm1, %ymm0
# CHECK-NEXT: [6]: HasSideEffects (U)
# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
-# CHECK-NEXT: 2 1 1.00 vperm2f128 $136, %ymm0, %ymm0, %ymm1
+# CHECK-NEXT: 2 1 0.50 vperm2f128 $136, %ymm0, %ymm0, %ymm1
# CHECK-NEXT: 2 3 2.00 vaddps %ymm1, %ymm1, %ymm0
# CHECK: Resources:
@@ -368,23 +368,23 @@ vaddps %ymm1, %ymm1, %ymm0
# CHECK: Resource pressure per iteration:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13]
-# CHECK-NEXT: - - - 2.00 2.00 2.00 2.00 - - - - - - -
+# CHECK-NEXT: - - - 2.00 1.00 2.00 1.00 - - - - - - -
# CHECK: Resource pressure by instruction:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] Instructions:
-# CHECK-NEXT: - - - - 2.00 - 2.00 - - - - - - - vperm2f128 $136, %ymm0, %ymm0, %ymm1
+# CHECK-NEXT: - - - - 1.00 - 1.00 - - - - - - - vperm2f128 $136, %ymm0, %ymm0, %ymm1
# CHECK-NEXT: - - - 2.00 - 2.00 - - - - - - - - vaddps %ymm1, %ymm1, %ymm0
# CHECK: Timeline view:
-# CHECK-NEXT: 01234
+# CHECK-NEXT: 0
# CHECK-NEXT: Index 0123456789
-# CHECK: [0,0] DeER . . . vperm2f128 $136, %ymm0, %ymm0, %ymm1
-# CHECK-NEXT: [0,1] .DeeeER . . vaddps %ymm1, %ymm1, %ymm0
-# CHECK-NEXT: [1,0] . D==eER . . vperm2f128 $136, %ymm0, %ymm0, %ymm1
-# CHECK-NEXT: [1,1] . D==eeeER . vaddps %ymm1, %ymm1, %ymm0
-# CHECK-NEXT: [2,0] . D====eER . vperm2f128 $136, %ymm0, %ymm0, %ymm1
-# CHECK-NEXT: [2,1] . D====eeeER vaddps %ymm1, %ymm1, %ymm0
+# CHECK: [0,0] DeER . . vperm2f128 $136, %ymm0, %ymm0, %ymm1
+# CHECK-NEXT: [0,1] .DeeeER . vaddps %ymm1, %ymm1, %ymm0
+# CHECK-NEXT: [1,0] . DeE-R . vperm2f128 $136, %ymm0, %ymm0, %ymm1
+# CHECK-NEXT: [1,1] . DeeeER . vaddps %ymm1, %ymm1, %ymm0
+# CHECK-NEXT: [2,0] . DeE-R . vperm2f128 $136, %ymm0, %ymm0, %ymm1
+# CHECK-NEXT: [2,1] . DeeeER vaddps %ymm1, %ymm1, %ymm0
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -393,5 +393,5 @@ vaddps %ymm1, %ymm1, %ymm0
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
# CHECK: [0] [1] [2] [3]
-# CHECK-NEXT: 0. 3 3.0 0.3 0.0 vperm2f128 $136, %ymm0, %ymm0, %ymm1
-# CHECK-NEXT: 1. 3 3.0 0.0 0.0 vaddps %ymm1, %ymm1, %ymm0
+# CHECK-NEXT: 0. 3 1.0 1.0 0.7 vperm2f128 $136, %ymm0, %ymm0, %ymm1
+# CHECK-NEXT: 1. 3 1.0 0.0 0.0 vaddps %ymm1, %ymm1, %ymm0
OpenPOWER on IntegriCloud