[AMDGPU][CodeGen] To improve CGEMM performance: combine LDS reads.

This change exploits the fact that LDS reads may be reordered even if they access the same location. Prior to this change, the algorithm stopped as soon as any memory access was encountered between the loads that were expected to be merged together. However, a Read-After-Read conflict cannot affect execution correctness. This improves the performance of hcBLAS CGEMM manually loop-unrolled kernels by 44%. An improvement is also expected on any long sequence of reads from LDS. Differential Revision: https://reviews.llvm.org/D25944 llvm-svn: 285919
author: Alexander Timofeev <Alexander.Timofeev@amd.com> 2016-11-03 14:37:13 +0000
committer: Alexander Timofeev <Alexander.Timofeev@amd.com> 2016-11-03 14:37:13 +0000
commit: f867a40bf60ad813560fe4cc3d2cc100472ffef4 (patch)
tree: e888ef6d503dc980fc536452f72a71ab5182b7af /llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
parent: 73aba6229f7f6cdc1aa5b107518684a95da4851e (diff)
download: bcm5719-llvm-f867a40bf60ad813560fe4cc3d2cc100472ffef4.tar.gz
bcm5719-llvm-f867a40bf60ad813560fe4cc3d2cc100472ffef4.zip
1 files changed, 19 insertions, 5 deletions
diff --git a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
index 6915191665f..99fe96c0be2 100644
--- a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
@@ -141,6 +141,18 @@ static void addDefsToList(const MachineInstr &MI,
   }
 }
 
+static bool memAccessesCanBeReordered(
+  MachineBasicBlock::iterator A,
+  MachineBasicBlock::iterator B,
+  const SIInstrInfo *TII,
+  llvm::AliasAnalysis * AA) {
+  return (TII->areMemAccessesTriviallyDisjoint(*A, *B, AA) ||
+    // RAW or WAR - cannot reorder
+    // WAW - cannot reorder
+    // RAR - safe to reorder
+    !(A->mayStore() || B->mayStore()));
+}
+
 // Add MI and its defs to the lists if MI reads one of the defs that are
 // already in the list. Returns true in that case.
 static bool
@@ -173,8 +185,8 @@ canMoveInstsAcrossMemOp(MachineInstr &MemOp,
   for (MachineInstr *InstToMove : InstsToMove) {
     if (!InstToMove->mayLoadOrStore())
       continue;
-    if (!TII->areMemAccessesTriviallyDisjoint(MemOp, *InstToMove, AA))
-      return false;
+    if (!memAccessesCanBeReordered(MemOp, *InstToMove, TII, AA))
+        return false;
   }
   return true;
 }
@@ -233,7 +245,7 @@ SILoadStoreOptimizer::findMatchingDSInst(MachineBasicBlock::iterator I,
         return E;
 
       if (MBBI->mayLoadOrStore() &&
-          !TII->areMemAccessesTriviallyDisjoint(*I, *MBBI, AA)) {
+        !memAccessesCanBeReordered(*I, *MBBI, TII, AA)) {
         // We fail condition #1, but we may still be able to satisfy condition
         // #2.  Add this instruction to the move list and then we will check
         // if condition #2 holds once we have selected the matching instruction.
@@ -288,8 +300,10 @@ SILoadStoreOptimizer::findMatchingDSInst(MachineBasicBlock::iterator I,
     // We could potentially keep looking, but we'd need to make sure that
     // it was safe to move I and also all the instruction in InstsToMove
     // down past this instruction.
-    // FIXME: This is too conservative.
-    break;
+    if (!memAccessesCanBeReordered(*I, *MBBI, TII, AA) ||   // check if we can move I across MBBI
+      !canMoveInstsAcrossMemOp(*MBBI, InstsToMove, TII, AA) // check if we can move all I's users
+     )
+      break;
   }
   return E;
 }
author	Alexander Timofeev <Alexander.Timofeev@amd.com>	2016-11-03 14:37:13 +0000
committer	Alexander Timofeev <Alexander.Timofeev@amd.com>	2016-11-03 14:37:13 +0000
commit	f867a40bf60ad813560fe4cc3d2cc100472ffef4 (patch)
tree	e888ef6d503dc980fc536452f72a71ab5182b7af /llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
parent	73aba6229f7f6cdc1aa5b107518684a95da4851e (diff)
download	bcm5719-llvm-f867a40bf60ad813560fe4cc3d2cc100472ffef4.tar.gz bcm5719-llvm-f867a40bf60ad813560fe4cc3d2cc100472ffef4.zip