Diffstat (limited to 'compiler-rt')
-rw-r--r--  compiler-rt/lib/xray/CMakeLists.txt                           29
-rw-r--r--  compiler-rt/lib/xray/tests/CMakeLists.txt                      2
-rw-r--r--  compiler-rt/lib/xray/tests/unit/CMakeLists.txt                 2
-rw-r--r--  compiler-rt/lib/xray/tests/unit/function_call_trie_test.cc   253
-rw-r--r--  compiler-rt/lib/xray/xray_function_call_trie.h               446
-rw-r--r--  compiler-rt/lib/xray/xray_profiler_flags.cc                   40
-rw-r--r--  compiler-rt/lib/xray/xray_profiler_flags.h                    39
-rw-r--r--  compiler-rt/lib/xray/xray_profiler_flags.inc                  26
8 files changed, 837 insertions, 0 deletions
diff --git a/compiler-rt/lib/xray/CMakeLists.txt b/compiler-rt/lib/xray/CMakeLists.txt
index 795441122cb..e368b75903e 100644
--- a/compiler-rt/lib/xray/CMakeLists.txt
+++ b/compiler-rt/lib/xray/CMakeLists.txt
@@ -18,6 +18,8 @@ set(XRAY_BASIC_MODE_SOURCES
xray_basic_flags.cc
xray_basic_logging.cc)
+set(XRAY_PROFILER_MODE_SOURCES
+ xray_profiler_flags.cc)
# Implementation files for all XRay architectures.
set(x86_64_SOURCES
@@ -98,6 +100,12 @@ if (APPLE)
SOURCES ${XRAY_BASIC_MODE_SOURCES}
CFLAGS ${XRAY_CFLAGS}
DEFS ${XRAY_COMMON_DEFINITIONS})
+ add_compiler_rt_object_libraries(RTXrayPROFILER
+ OS ${XRAY_SUPPORTED_OS}
+ ARCHS ${XRAY_SUPPORTED_ARCH}
+ SOURCES ${XRAY_PROFILER_MODE_SOURCES}
+ CFLAGS ${XRAY_CFLAGS}
+ DEFS ${XRAY_COMMON_DEFINITIONS})
# We only support running on osx for now.
add_compiler_rt_runtime(clang_rt.xray
@@ -132,6 +140,16 @@ if (APPLE)
LINK_FLAGS ${SANITIZER_COMMON_LINK_FLAGS} ${WEAK_SYMBOL_LINK_FLAGS}
LINK_LIBS ${XRAY_LINK_LIBS}
PARENT_TARGET xray)
+ add_compiler_rt_runtime(clang_rt.xray-profiler
+ STATIC
+ OS ${XRAY_SUPPORTED_OS}
+ ARCHS ${XRAY_SUPPORTED_ARCH}
+ OBJECT_LIBS RTXrayPROFILER
+ CFLAGS ${XRAY_CFLAGS}
+ DEFS ${XRAY_COMMON_DEFINITIONS}
+ LINK_FLAGS ${SANITIZER_COMMON_LINK_FLAGS} ${WEAK_SYMBOL_LINK_FLAGS}
+ LINK_LIBS ${XRAY_LINK_LIBS}
+ PARENT_TARGET xray)
else() # not Apple
foreach(arch ${XRAY_SUPPORTED_ARCH})
if(NOT CAN_TARGET_${arch})
@@ -149,6 +167,10 @@ else() # not Apple
ARCHS ${arch}
SOURCES ${XRAY_BASIC_MODE_SOURCES} CFLAGS ${XRAY_CFLAGS}
DEFS ${XRAY_COMMON_DEFINITIONS})
+ add_compiler_rt_object_libraries(RTXrayPROFILER
+ ARCHS ${arch}
+ SOURCES ${XRAY_PROFILER_MODE_SOURCES} CFLAGS ${XRAY_CFLAGS}
+ DEFS ${XRAY_COMMON_DEFINITIONS})
# Common XRay archive for instrumented binaries.
add_compiler_rt_runtime(clang_rt.xray
@@ -174,6 +196,13 @@ else() # not Apple
DEFS ${XRAY_COMMON_DEFINITIONS}
OBJECT_LIBS RTXrayBASIC
PARENT_TARGET xray)
+ add_compiler_rt_runtime(clang_rt.xray-profiler
+ STATIC
+ ARCHS ${arch}
+ CFLAGS ${XRAY_CFLAGS}
+ DEFS ${XRAY_COMMON_DEFINITIONS}
+ OBJECT_LIBS RTXrayPROFILER
+ PARENT_TARGET xray)
endforeach()
endif() # not Apple
diff --git a/compiler-rt/lib/xray/tests/CMakeLists.txt b/compiler-rt/lib/xray/tests/CMakeLists.txt
index f3cc85fc79f..eeca10dc099 100644
--- a/compiler-rt/lib/xray/tests/CMakeLists.txt
+++ b/compiler-rt/lib/xray/tests/CMakeLists.txt
@@ -72,6 +72,7 @@ if(COMPILER_RT_CAN_EXECUTE_TESTS)
add_xray_lib("RTXRay.test.osx"
$<TARGET_OBJECTS:RTXray.osx>
$<TARGET_OBJECTS:RTXrayFDR.osx>
+ $<TARGET_OBJECTS:RTXrayPROFILER.osx>
$<TARGET_OBJECTS:RTSanitizerCommon.osx>
$<TARGET_OBJECTS:RTSanitizerCommonLibc.osx>)
else()
@@ -79,6 +80,7 @@ if(COMPILER_RT_CAN_EXECUTE_TESTS)
add_xray_lib("RTXRay.test.${arch}"
$<TARGET_OBJECTS:RTXray.${arch}>
$<TARGET_OBJECTS:RTXrayFDR.${arch}>
+ $<TARGET_OBJECTS:RTXrayPROFILER.${arch}>
$<TARGET_OBJECTS:RTSanitizerCommon.${arch}>
$<TARGET_OBJECTS:RTSanitizerCommonLibc.${arch}>)
endforeach()
diff --git a/compiler-rt/lib/xray/tests/unit/CMakeLists.txt b/compiler-rt/lib/xray/tests/unit/CMakeLists.txt
index 02fd6f1aee7..abe43202fe3 100644
--- a/compiler-rt/lib/xray/tests/unit/CMakeLists.txt
+++ b/compiler-rt/lib/xray/tests/unit/CMakeLists.txt
@@ -6,3 +6,5 @@ add_xray_unittest(XRayAllocatorTest SOURCES
allocator_test.cc xray_unit_test_main.cc)
add_xray_unittest(XRaySegmentedArrayTest SOURCES
segmented_array_test.cc xray_unit_test_main.cc)
+add_xray_unittest(XRayFunctionCallTrieTest SOURCES
+ function_call_trie_test.cc xray_unit_test_main.cc)
diff --git a/compiler-rt/lib/xray/tests/unit/function_call_trie_test.cc b/compiler-rt/lib/xray/tests/unit/function_call_trie_test.cc
new file mode 100644
index 00000000000..3aaa9880490
--- /dev/null
+++ b/compiler-rt/lib/xray/tests/unit/function_call_trie_test.cc
@@ -0,0 +1,253 @@
+//===-- function_call_trie_test.cc ----------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a part of XRay, a function call tracing system.
+//
+//===----------------------------------------------------------------------===//
+#include "gtest/gtest.h"
+
+#include "xray_function_call_trie.h"
+
+namespace __xray {
+
+namespace {
+
+TEST(FunctionCallTrieTest, Construction) {
+ // We want to make sure that we can create one of these without explicitly
+ // providing the set of allocators it needs. This will by default use the
+ // global allocators.
+ FunctionCallTrie Trie;
+}
+
+TEST(FunctionCallTrieTest, ConstructWithTLSAllocators) {
+ // FIXME: Support passing in configuration for allocators in the allocator
+ // constructors.
+ profilerFlags()->setDefaults();
+ FunctionCallTrie::Allocators Allocators = FunctionCallTrie::InitAllocators();
+ FunctionCallTrie Trie(Allocators);
+}
+
+TEST(FunctionCallTrieTest, EnterAndExitFunction) {
+ profilerFlags()->setDefaults();
+ auto A = FunctionCallTrie::InitAllocators();
+ FunctionCallTrie Trie(A);
+
+ Trie.enterFunction(1, 1);
+ Trie.exitFunction(1, 2);
+
+ // We need a way to pull the data out. At this point, until we get a data
+ // collection service implemented, we're going to export the data as a list of
+ // roots, and manually walk through the structure ourselves.
+
+ const auto &R = Trie.getRoots();
+
+ ASSERT_EQ(R.size(), 1u);
+ ASSERT_EQ(R.front()->FId, 1);
+ ASSERT_EQ(R.front()->CallCount, 1);
+ ASSERT_EQ(R.front()->CumulativeLocalTime, 1u);
+}
+
+TEST(FunctionCallTrieTest, MissingFunctionEntry) {
+ auto A = FunctionCallTrie::InitAllocators();
+ FunctionCallTrie Trie(A);
+ Trie.exitFunction(1, 1);
+ const auto &R = Trie.getRoots();
+
+ ASSERT_TRUE(R.empty());
+}
+
+TEST(FunctionCallTrieTest, MissingFunctionExit) {
+ auto A = FunctionCallTrie::InitAllocators();
+ FunctionCallTrie Trie(A);
+ Trie.enterFunction(1, 1);
+ const auto &R = Trie.getRoots();
+
+ ASSERT_TRUE(R.empty());
+}
+
+TEST(FunctionCallTrieTest, MultipleRoots) {
+ profilerFlags()->setDefaults();
+ auto A = FunctionCallTrie::InitAllocators();
+ FunctionCallTrie Trie(A);
+
+ // Enter and exit FId = 1.
+ Trie.enterFunction(1, 1);
+ Trie.exitFunction(1, 2);
+
+ // Enter and exit FId = 2.
+ Trie.enterFunction(2, 3);
+ Trie.exitFunction(2, 4);
+
+ const auto &R = Trie.getRoots();
+ ASSERT_FALSE(R.empty());
+ ASSERT_EQ(R.size(), 2u);
+
+ // Make sure the roots have different IDs.
+ const auto R0 = R[0];
+ const auto R1 = R[1];
+ ASSERT_NE(R0->FId, R1->FId);
+
+ // Inspect the roots to make sure they have the right data.
+ ASSERT_NE(R0, nullptr);
+ EXPECT_EQ(R0->CallCount, 1u);
+ EXPECT_EQ(R0->CumulativeLocalTime, 1u);
+
+ ASSERT_NE(R1, nullptr);
+ EXPECT_EQ(R1->CallCount, 1u);
+ EXPECT_EQ(R1->CumulativeLocalTime, 1u);
+}
+
+// While missing an intermediary exit may be rare in practice, we still ensure
+// that we can handle the case where we've somehow missed an exit event in
+// between call entries/exits. To illustrate, imagine the following shadow call
+// stack:
+//
+//   f0@t0 -> f1@t1 -> f2@t2
+//
+// If for whatever reason we see an exit for `f2` @ t3, followed by an exit for
+// `f0` @ t4 (i.e. no `f1` exit in between), then we need to account local time
+// to `f2` as d = (t3 - t2), then local time to `f1` as d' = (t4 - t1), and then
+// local time to `f0` as d'' = (t4 - t0) - d'.
+TEST(FunctionCallTrieTest, MissingIntermediaryExit) {
+ profilerFlags()->setDefaults();
+ auto A = FunctionCallTrie::InitAllocators();
+ FunctionCallTrie Trie(A);
+
+ Trie.enterFunction(1, 0);
+ Trie.enterFunction(2, 100);
+ Trie.enterFunction(3, 200);
+ Trie.exitFunction(3, 300);
+ Trie.exitFunction(1, 400);
+
+ // What we should see at this point is all the functions in the trie in a
+ // specific order (1 -> 2 -> 3) with the appropriate count(s) and local
+ // latencies.
+ const auto &R = Trie.getRoots();
+ ASSERT_FALSE(R.empty());
+ ASSERT_EQ(R.size(), 1u);
+
+ const auto &F1 = *R[0];
+ ASSERT_EQ(F1.FId, 1);
+ ASSERT_FALSE(F1.Callees.empty());
+
+ const auto &F2 = *F1.Callees[0].NodePtr;
+ ASSERT_EQ(F2.FId, 2);
+ ASSERT_FALSE(F2.Callees.empty());
+
+ const auto &F3 = *F2.Callees[0].NodePtr;
+ ASSERT_EQ(F3.FId, 3);
+ ASSERT_TRUE(F3.Callees.empty());
+
+ // Now that we've established the preconditions, we check for specific aspects
+ // of the nodes.
+ EXPECT_EQ(F3.CallCount, 1);
+ EXPECT_EQ(F2.CallCount, 1);
+ EXPECT_EQ(F1.CallCount, 1);
+ EXPECT_EQ(F3.CumulativeLocalTime, 100);
+ EXPECT_EQ(F2.CumulativeLocalTime, 300);
+ EXPECT_EQ(F1.CumulativeLocalTime, 100);
+}
+
+// TODO: Test that we can handle cross-CPU migrations, where TSCs are not
+// guaranteed to be synchronised.
+TEST(FunctionCallTrieTest, DeepCopy) {
+ profilerFlags()->setDefaults();
+ auto A = FunctionCallTrie::InitAllocators();
+ FunctionCallTrie Trie(A);
+
+ Trie.enterFunction(1, 0);
+ Trie.enterFunction(2, 1);
+ Trie.exitFunction(2, 2);
+ Trie.enterFunction(3, 3);
+ Trie.exitFunction(3, 4);
+ Trie.exitFunction(1, 5);
+
+ // We want to make a deep copy and compare notes.
+ auto B = FunctionCallTrie::InitAllocators();
+ FunctionCallTrie Copy(B);
+ Trie.deepCopyInto(Copy);
+
+ ASSERT_NE(Trie.getRoots().size(), 0u);
+ ASSERT_EQ(Trie.getRoots().size(), Copy.getRoots().size());
+ const auto &R0Orig = *Trie.getRoots()[0];
+ const auto &R0Copy = *Copy.getRoots()[0];
+ EXPECT_EQ(R0Orig.FId, 1);
+ EXPECT_EQ(R0Orig.FId, R0Copy.FId);
+
+ ASSERT_EQ(R0Orig.Callees.size(), 2u);
+ ASSERT_EQ(R0Copy.Callees.size(), 2u);
+
+ const auto &F1Orig =
+ *R0Orig.Callees
+ .find_element(
+ [](const FunctionCallTrie::NodeIdPair &R) { return R.FId == 2; })
+ ->NodePtr;
+ const auto &F1Copy =
+ *R0Copy.Callees
+ .find_element(
+ [](const FunctionCallTrie::NodeIdPair &R) { return R.FId == 2; })
+ ->NodePtr;
+ EXPECT_EQ(&R0Orig, F1Orig.Parent);
+ EXPECT_EQ(&R0Copy, F1Copy.Parent);
+}
+
+TEST(FunctionCallTrieTest, MergeInto) {
+ profilerFlags()->setDefaults();
+ auto A = FunctionCallTrie::InitAllocators();
+ FunctionCallTrie T0(A);
+ FunctionCallTrie T1(A);
+
+ // 1 -> 2 -> 3
+ T0.enterFunction(1, 0);
+ T0.enterFunction(2, 1);
+ T0.enterFunction(3, 2);
+ T0.exitFunction(3, 3);
+ T0.exitFunction(2, 4);
+ T0.exitFunction(1, 5);
+
+ // 1 -> 2 -> 3
+ T1.enterFunction(1, 0);
+ T1.enterFunction(2, 1);
+ T1.enterFunction(3, 2);
+ T1.exitFunction(3, 3);
+ T1.exitFunction(2, 4);
+ T1.exitFunction(1, 5);
+
+ // We use a different set of allocators here to make sure that we're able to
+ // transfer data into a FunctionCallTrie which uses a different allocator.
+ // This reflects the intended usage scenario when we're collecting profiles
+ // that aggregate across threads.
+ auto B = FunctionCallTrie::InitAllocators();
+ FunctionCallTrie Merged(B);
+
+ T0.mergeInto(Merged);
+ T1.mergeInto(Merged);
+
+ ASSERT_EQ(Merged.getRoots().size(), 1u);
+ const auto &R0 = *Merged.getRoots()[0];
+ EXPECT_EQ(R0.FId, 1);
+ EXPECT_EQ(R0.CallCount, 2);
+ EXPECT_EQ(R0.CumulativeLocalTime, 10);
+ EXPECT_EQ(R0.Callees.size(), 1u);
+
+ const auto &F1 = *R0.Callees[0].NodePtr;
+ EXPECT_EQ(F1.FId, 2);
+ EXPECT_EQ(F1.CallCount, 2);
+ EXPECT_EQ(F1.CumulativeLocalTime, 6);
+ EXPECT_EQ(F1.Callees.size(), 1u);
+
+ const auto &F2 = *F1.Callees[0].NodePtr;
+ EXPECT_EQ(F2.FId, 3);
+ EXPECT_EQ(F2.CallCount, 2);
+ EXPECT_EQ(F2.CumulativeLocalTime, 2);
+ EXPECT_EQ(F2.Callees.size(), 0u);
+}
+
+} // namespace
+
+} // namespace __xray
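
To make the time accounting exercised by these tests concrete, here is a minimal standalone model of the exit-time bookkeeping (a sketch only; the real implementation in xray_function_call_trie.h below avoids the standard library and uses XRay's segmented arrays):

// Simplified model of FunctionCallTrie's exit accounting. The shadow stack
// records (function id, entry TSC); on exit we pop frames until we find the
// matching entry, charging each popped frame only the elapsed time not
// already charged to callees popped in the same exit call.
#include <cstdint>
#include <cstdio>
#include <map>
#include <vector>

int main() {
  struct Frame { int32_t FId; uint64_t EntryTSC; };
  std::vector<Frame> ShadowStack;
  std::map<int32_t, uint64_t> LocalTime; // FId -> cumulative local time.

  auto enterFunction = [&](int32_t FId, uint64_t TSC) {
    ShadowStack.push_back({FId, TSC});
  };
  auto exitFunction = [&](int32_t FId, uint64_t TSC) {
    uint64_t CumulativeTreeTime = 0;
    while (!ShadowStack.empty()) {
      Frame Top = ShadowStack.back();
      ShadowStack.pop_back();
      uint64_t Elapsed = TSC - Top.EntryTSC;
      LocalTime[Top.FId] += Elapsed - CumulativeTreeTime;
      CumulativeTreeTime += Elapsed;
      if (Top.FId == FId)
        break;
    }
  };

  // Same sequence as the MissingIntermediaryExit test above: the exit for
  // function 2 never arrives.
  enterFunction(1, 0);
  enterFunction(2, 100);
  enterFunction(3, 200);
  exitFunction(3, 300); // Function 3 is charged 100.
  exitFunction(1, 400); // Function 2 is charged 300, function 1 is charged 100.

  for (const auto &Entry : LocalTime)
    std::printf("f%d: %llu\n", (int)Entry.first,
                (unsigned long long)Entry.second);
  return 0;
}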
diff --git a/compiler-rt/lib/xray/xray_function_call_trie.h b/compiler-rt/lib/xray/xray_function_call_trie.h
new file mode 100644
index 00000000000..af262f847df
--- /dev/null
+++ b/compiler-rt/lib/xray/xray_function_call_trie.h
@@ -0,0 +1,446 @@
+//===-- xray_function_call_trie.h ------------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a part of XRay, a dynamic runtime instrumentation system.
+//
+// This file defines the interface for a function call trie.
+//
+//===----------------------------------------------------------------------===//
+#ifndef XRAY_FUNCTION_CALL_TRIE_H
+#define XRAY_FUNCTION_CALL_TRIE_H
+
+#include "xray_profiler_flags.h"
+#include "xray_segmented_array.h"
+#include <utility>
+
+namespace __xray {
+
+/// A FunctionCallTrie represents the stack traces of XRay instrumented
+/// functions that we've encountered, where a node corresponds to a function and
+/// the path from the root to the node represents its stack trace. Each node in
+/// the trie will contain some useful values, including:
+///
+/// * The cumulative amount of time spent in this particular node/stack.
+/// * The number of times this stack has appeared.
+/// * A histogram of latencies for that particular node.
+///
+/// Each node in the trie will also contain a list of callees, represented using
+/// an Array<NodeIdPair> -- each NodeIdPair instance will contain the function
+/// ID of the callee, and a pointer to the node.
+///
+/// If we visualise this data structure, we'll find the following potential
+/// representation:
+///
+/// [function id node] -> [callees] [cumulative time]
+/// [call counter] [latency histogram]
+///
+/// As an example, when we have a function in this pseudocode:
+///
+/// func f(N) {
+/// g()
+/// h()
+/// for i := 1..N { j() }
+/// }
+///
+/// We may end up with a trie of the following form:
+///
+/// f -> [ g, h, j ] [...] [1] [...]
+/// g -> [ ... ] [...] [1] [...]
+/// h -> [ ... ] [...] [1] [...]
+/// j -> [ ... ] [...] [N] [...]
+///
+/// If for instance the function g() called j() like so:
+///
+/// func g() {
+/// for i := 1..10 { j() }
+/// }
+///
+/// We'll find the following updated trie:
+///
+/// f -> [ g, h, j ] [...] [1] [...]
+/// g -> [ j' ] [...] [1] [...]
+/// h -> [ ... ] [...] [1] [...]
+/// j -> [ ... ] [...] [N] [...]
+/// j' -> [ ... ] [...] [10] [...]
+///
+/// Note that we'll have a new node representing the path `f -> g -> j'` with
+/// isolated data. This isolation gives us a means of representing the stack
+/// traces as a path, as opposed to a key in a table. The alternative
+/// implementation here would be to use a separate table for the path, and use
+/// hashes of the path as an identifier to accumulate the information. We've
+/// moved away from this approach because computing the hash every time we need
+/// to update a function's call information while handling the entry and exit
+/// events takes a lot of time.
+///
+/// This approach allows us to maintain a shadow stack, which represents the
+/// currently executing path, and on function exits quickly compute the amount
+/// of time elapsed from the entry, then update the counters for the node
+/// already represented in the trie. This necessitates an efficient
+/// representation of the various data structures (the list of callees must be
+/// cache-aware and efficient to look up, and the histogram must be compact and
+/// quick to update) to enable us to keep the overheads of this implementation
+/// to a minimum.
+class FunctionCallTrie {
+public:
+ struct Node;
+
+ // We use a NodeIdPair type instead of a std::pair<...> to not rely on the
+ // standard library types in this header.
+ struct NodeIdPair {
+ Node *NodePtr;
+ int32_t FId;
+
+ // Constructor for inplace-construction.
+ NodeIdPair(Node *N, int32_t F) : NodePtr(N), FId(F) {}
+ };
+
+ using NodeIdPairArray = Array<NodeIdPair>;
+ using NodeIdPairAllocatorType = NodeIdPairArray::AllocatorType;
+
+ // A Node in the FunctionCallTrie gives us a list of callees, the cumulative
+ // number of times this node actually appeared, the cumulative amount of time
+ // for this particular node including its children's call times, and just the
+ // local time spent on this node. Each Node will have the ID of the XRay
+ // instrumented function that it is associated with.
+ struct Node {
+ Node *Parent;
+ NodeIdPairArray Callees;
+ int64_t CallCount;
+ int64_t CumulativeLocalTime; // Typically in TSC deltas, not wall-time.
+ int32_t FId;
+
+ // We add a constructor here to allow us to inplace-construct through
+ // Array<...>'s AppendEmplace.
+ Node(Node *P, NodeIdPairAllocatorType &A, int64_t CC, int64_t CLT,
+ int32_t F)
+ : Parent(P), Callees(A), CallCount(CC), CumulativeLocalTime(CLT),
+ FId(F) {}
+
+ // TODO: Include the compact histogram.
+ };
+
+private:
+ struct ShadowStackEntry {
+ int32_t FId; // We're copying the function ID into the stack to avoid having
+ // to reach into the node just to get the function ID.
+ uint64_t EntryTSC;
+ Node *NodePtr;
+
+ // We add a constructor here to allow us to inplace-construct through
+ // Array<...>'s AppendEmplace.
+ ShadowStackEntry(int32_t F, uint64_t T, Node *N)
+ : FId(F), EntryTSC(T), NodePtr(N) {}
+ };
+
+ using NodeArray = Array<Node>;
+ using RootArray = Array<Node *>;
+ using ShadowStackArray = Array<ShadowStackEntry>;
+
+public:
+ // We collate the allocators we need into a single struct, as a convenience to
+ // allow us to initialize these as a group.
+ struct Allocators {
+ using NodeAllocatorType = NodeArray::AllocatorType;
+ using RootAllocatorType = RootArray::AllocatorType;
+ using ShadowStackAllocatorType = ShadowStackArray::AllocatorType;
+ using NodeIdPairAllocatorType = NodeIdPairAllocatorType;
+
+ NodeAllocatorType *NodeAllocator = nullptr;
+ RootAllocatorType *RootAllocator = nullptr;
+ ShadowStackAllocatorType *ShadowStackAllocator = nullptr;
+ NodeIdPairAllocatorType *NodeIdPairAllocator = nullptr;
+
+ Allocators() {}
+ Allocators(const Allocators &) = delete;
+ Allocators &operator=(const Allocators &) = delete;
+
+ Allocators(Allocators &&O)
+ : NodeAllocator(O.NodeAllocator), RootAllocator(O.RootAllocator),
+ ShadowStackAllocator(O.ShadowStackAllocator),
+ NodeIdPairAllocator(O.NodeIdPairAllocator) {
+ O.NodeAllocator = nullptr;
+ O.RootAllocator = nullptr;
+ O.ShadowStackAllocator = nullptr;
+ O.NodeIdPairAllocator = nullptr;
+ }
+
+ Allocators &operator=(Allocators &&O) {
+ {
+ auto Tmp = O.NodeAllocator;
+ O.NodeAllocator = this->NodeAllocator;
+ this->NodeAllocator = Tmp;
+ }
+ {
+ auto Tmp = O.RootAllocator;
+ O.RootAllocator = this->RootAllocator;
+ this->RootAllocator = Tmp;
+ }
+ {
+ auto Tmp = O.ShadowStackAllocator;
+ O.ShadowStackAllocator = this->ShadowStackAllocator;
+ this->ShadowStackAllocator = Tmp;
+ }
+ {
+ auto Tmp = O.NodeIdPairAllocator;
+ O.NodeIdPairAllocator = this->NodeIdPairAllocator;
+ this->NodeIdPairAllocator = Tmp;
+ }
+ return *this;
+ }
+
+ ~Allocators() {
+ // Note that we cannot use delete on these pointers, as they need to be
+ // returned to the sanitizer_common library's internal memory tracking
+ // system.
+ if (NodeAllocator != nullptr) {
+ NodeAllocator->~NodeAllocatorType();
+ InternalFree(NodeAllocator);
+ }
+ if (RootAllocator != nullptr) {
+ RootAllocator->~RootAllocatorType();
+ InternalFree(RootAllocator);
+ }
+ if (ShadowStackAllocator != nullptr) {
+ ShadowStackAllocator->~ShadowStackAllocatorType();
+ InternalFree(ShadowStackAllocator);
+ }
+ if (NodeIdPairAllocator != nullptr) {
+ NodeIdPairAllocator->~NodeIdPairAllocatorType();
+ InternalFree(NodeIdPairAllocator);
+ }
+ }
+ };
+
+ // TODO: Support configuration of options through the arguments.
+ static Allocators InitAllocators() {
+ Allocators A;
+ auto NodeAllocator = reinterpret_cast<Allocators::NodeAllocatorType *>(
+ InternalAlloc(sizeof(Allocators::NodeAllocatorType)));
+ new (NodeAllocator) Allocators::NodeAllocatorType(
+ profilerFlags()->per_thread_allocator_max, 0);
+ A.NodeAllocator = NodeAllocator;
+
+ auto RootAllocator = reinterpret_cast<Allocators::RootAllocatorType *>(
+ InternalAlloc(sizeof(Allocators::RootAllocatorType)));
+ new (RootAllocator) Allocators::RootAllocatorType(
+ profilerFlags()->per_thread_allocator_max, 0);
+ A.RootAllocator = RootAllocator;
+
+ auto ShadowStackAllocator =
+ reinterpret_cast<Allocators::ShadowStackAllocatorType *>(
+ InternalAlloc(sizeof(Allocators::ShadowStackAllocatorType)));
+ new (ShadowStackAllocator) Allocators::ShadowStackAllocatorType(
+ profilerFlags()->per_thread_allocator_max, 0);
+ A.ShadowStackAllocator = ShadowStackAllocator;
+
+ auto NodeIdPairAllocator =
+ reinterpret_cast<Allocators::NodeIdPairAllocatorType *>(
+ InternalAlloc(sizeof(Allocators::NodeIdPairAllocatorType)));
+ new (NodeIdPairAllocator) Allocators::NodeIdPairAllocatorType(
+ profilerFlags()->per_thread_allocator_max, 0);
+ A.NodeIdPairAllocator = NodeIdPairAllocator;
+ return A;
+ }
+
+private:
+ NodeArray Nodes;
+ RootArray Roots;
+ ShadowStackArray ShadowStack;
+ NodeIdPairAllocatorType *NodeIdPairAllocator = nullptr;
+
+ const Allocators &GetGlobalAllocators() {
+ static const Allocators A = [] { return InitAllocators(); }();
+ return A;
+ }
+
+public:
+ explicit FunctionCallTrie(const Allocators &A)
+ : Nodes(*A.NodeAllocator), Roots(*A.RootAllocator),
+ ShadowStack(*A.ShadowStackAllocator),
+ NodeIdPairAllocator(A.NodeIdPairAllocator) {}
+
+ FunctionCallTrie() : FunctionCallTrie(GetGlobalAllocators()) {}
+
+ void enterFunction(int32_t FId, uint64_t TSC) {
+ // This function primarily deals with ensuring that the ShadowStack is
+ // consistent and ready for when an exit event is encountered.
+ if (UNLIKELY(ShadowStack.empty())) {
+ auto NewRoot =
+ Nodes.AppendEmplace(nullptr, *NodeIdPairAllocator, 0, 0, FId);
+ if (UNLIKELY(NewRoot == nullptr))
+ return;
+ Roots.Append(NewRoot);
+ ShadowStack.AppendEmplace(FId, TSC, NewRoot);
+ return;
+ }
+
+ auto &Top = ShadowStack.back();
+ auto TopNode = Top.NodePtr;
+
+ // If we've seen this callee before, then we just access that node and place
+ // that on the top of the stack.
+ auto Callee = TopNode->Callees.find_element(
+ [FId](const NodeIdPair &NR) { return NR.FId == FId; });
+ if (Callee != nullptr) {
+ CHECK_NE(Callee->NodePtr, nullptr);
+ ShadowStack.AppendEmplace(FId, TSC, Callee->NodePtr);
+ return;
+ }
+
+ // This means we've never seen this stack before, so we create a new node
+ // here.
+ auto NewNode =
+ Nodes.AppendEmplace(TopNode, *NodeIdPairAllocator, 0, 0, FId);
+ if (UNLIKELY(NewNode == nullptr))
+ return;
+ TopNode->Callees.AppendEmplace(NewNode, FId);
+ ShadowStack.AppendEmplace(FId, TSC, NewNode);
+ return;
+ }
+
+ void exitFunction(int32_t FId, uint64_t TSC) {
+ // When we exit a function, we look up the ShadowStack to see whether we've
+ // entered this function before. We do as little processing here as we can,
+ // since most of the hard work would have already been done at function
+ // entry.
+ if (UNLIKELY(ShadowStack.empty()))
+ return;
+
+ uint64_t CumulativeTreeTime = 0;
+ while (!ShadowStack.empty()) {
+ auto &Top = ShadowStack.back();
+ auto TopNode = Top.NodePtr;
+ auto TopFId = TopNode->FId;
+ auto LocalTime = TSC - Top.EntryTSC;
+ TopNode->CallCount++;
+ TopNode->CumulativeLocalTime += LocalTime - CumulativeTreeTime;
+ CumulativeTreeTime += LocalTime;
+ ShadowStack.trim(1);
+
+ // TODO: Update the histogram for the node.
+ if (TopFId == FId)
+ break;
+ }
+ }
+
+ const RootArray &getRoots() const { return Roots; }
+
+ // The deepCopyInto operation will update the provided FunctionCallTrie by
+ // re-creating the contents of this particular FunctionCallTrie in the other
+ // FunctionCallTrie. It does this with a depth-first traversal from the roots,
+ // recreating each visited node in the provided FunctionCallTrie.
+ //
+ // This operation will *not* destroy the state in `O`, and thus may cause some
+ // duplicate entries in `O` if it is not empty.
+ //
+ // This function is *not* thread-safe, and may require external
+ // synchronisation of both "this" and `O`.
+ //
+ // This function must *not* be called with a non-empty FunctionCallTrie `O`.
+ void deepCopyInto(FunctionCallTrie &O) const {
+ DCHECK(O.getRoots().empty());
+ for (const auto Root : getRoots()) {
+ // Add a node in O for this root.
+ auto NewRoot = O.Nodes.AppendEmplace(
+ nullptr, *O.NodeIdPairAllocator, Root->CallCount,
+ Root->CumulativeLocalTime, Root->FId);
+ O.Roots.Append(NewRoot);
+
+ // We then push the root into a stack, to use as the parent marker for new
+ // nodes we push in as we're traversing depth-first down the call tree.
+ struct NodeAndParent {
+ FunctionCallTrie::Node *Node;
+ FunctionCallTrie::Node *NewNode;
+ };
+ using Stack = Array<NodeAndParent>;
+
+ typename Stack::AllocatorType StackAllocator(
+ profilerFlags()->stack_allocator_max, 0);
+ Stack DFSStack(StackAllocator);
+
+ // TODO: Figure out what to do if we fail to allocate any more stack
+ // space. Maybe warn or report once?
+ DFSStack.Append(NodeAndParent{Root, NewRoot});
+ while (!DFSStack.empty()) {
+ NodeAndParent NP = DFSStack.back();
+ DCHECK_NE(NP.Node, nullptr);
+ DCHECK_NE(NP.NewNode, nullptr);
+ DFSStack.trim(1);
+ for (const auto Callee : NP.Node->Callees) {
+ auto NewNode = O.Nodes.AppendEmplace(
+ NP.NewNode, *O.NodeIdPairAllocator, Callee.NodePtr->CallCount,
+ Callee.NodePtr->CumulativeLocalTime, Callee.FId);
+ DCHECK_NE(NewNode, nullptr);
+ NP.NewNode->Callees.AppendEmplace(NewNode, Callee.FId);
+ DFSStack.Append(NodeAndParent{Callee.NodePtr, NewNode});
+ }
+ }
+ }
+ }
+
+ // The mergeInto operation will update the provided FunctionCallTrie by
+ // traversing the current trie's roots and updating (i.e. merging) the data in
+ // the nodes with the data in the target's nodes. If the node doesn't exist in
+ // the provided trie, we add a new one in the right position, and inherit the
+ // data from the original (current) trie, along with all its callees.
+ //
+ // This function is *not* thread-safe, and may require external
+ // synchronisation of both "this" and `O`.
+ void mergeInto(FunctionCallTrie &O) const {
+ struct NodeAndTarget {
+ FunctionCallTrie::Node *OrigNode;
+ FunctionCallTrie::Node *TargetNode;
+ };
+ using Stack = Array<NodeAndTarget>;
+ typename Stack::AllocatorType StackAllocator(
+ profilerFlags()->stack_allocator_max, 0);
+ Stack DFSStack(StackAllocator);
+
+ for (const auto Root : getRoots()) {
+ Node *TargetRoot = nullptr;
+ auto R = O.Roots.find_element(
+ [&](const Node *Node) { return Node->FId == Root->FId; });
+ if (R == nullptr) {
+ TargetRoot = O.Nodes.AppendEmplace(nullptr, *O.NodeIdPairAllocator, 0,
+ 0, Root->FId);
+ O.Roots.Append(TargetRoot);
+ } else {
+ TargetRoot = *R;
+ }
+
+ DFSStack.Append(NodeAndTarget{Root, TargetRoot});
+ while (!DFSStack.empty()) {
+ NodeAndTarget NT = DFSStack.back();
+ DCHECK_NE(NT.OrigNode, nullptr);
+ DCHECK_NE(NT.TargetNode, nullptr);
+ DFSStack.trim(1);
+ // TODO: Update the histogram as well when we have it ready.
+ NT.TargetNode->CallCount += NT.OrigNode->CallCount;
+ NT.TargetNode->CumulativeLocalTime += NT.OrigNode->CumulativeLocalTime;
+ for (const auto Callee : NT.OrigNode->Callees) {
+ auto TargetCallee = NT.TargetNode->Callees.find_element(
+ [&](const FunctionCallTrie::NodeIdPair &C) {
+ return C.FId == Callee.FId;
+ });
+ if (TargetCallee == nullptr) {
+ auto NewTargetNode = O.Nodes.AppendEmplace(
+ NT.TargetNode, *O.NodeIdPairAllocator, 0, 0, Callee.FId);
+ TargetCallee =
+ NT.TargetNode->Callees.AppendEmplace(NewTargetNode, Callee.FId);
+ }
+ DFSStack.Append(NodeAndTarget{Callee.NodePtr, TargetCallee->NodePtr});
+ }
+ }
+ }
+ }
+};
+
+} // namespace __xray
+
+#endif // XRAY_FUNCTION_CALL_TRIE_H
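
The comments above describe the intended flow: each thread records into its own FunctionCallTrie backed by its own allocators, and the results are later aggregated with mergeInto. A rough sketch of that flow, using only the interface defined in this header (the function name and the literal FIds/TSCs are illustrative assumptions, not part of this patch):

// Illustrative only: per-thread recording followed by aggregation. Real code
// would read TSC values from the hardware counter and serialize the mergeInto
// calls, since mergeInto is documented above as not thread-safe.
#include "xray_function_call_trie.h"

namespace __xray {

void exampleProfileAggregation() {
  profilerFlags()->setDefaults(); // Allocator sizes come from profiler flags.

  // Each thread owns a set of allocators and a trie.
  auto A0 = FunctionCallTrie::InitAllocators();
  auto A1 = FunctionCallTrie::InitAllocators();
  FunctionCallTrie T0(A0);
  FunctionCallTrie T1(A1);

  // Entry/exit events would normally come from the XRay handlers.
  T0.enterFunction(/*FId=*/1, /*TSC=*/0);
  T0.exitFunction(/*FId=*/1, /*TSC=*/100);
  T1.enterFunction(/*FId=*/1, /*TSC=*/0);
  T1.exitFunction(/*FId=*/1, /*TSC=*/50);

  // Aggregate into a trie backed by a separate set of allocators.
  auto AggregateAllocators = FunctionCallTrie::InitAllocators();
  FunctionCallTrie Aggregate(AggregateAllocators);
  T0.mergeInto(Aggregate);
  T1.mergeInto(Aggregate);

  // Aggregate.getRoots() now holds a single root for FId 1 with
  // CallCount == 2 and CumulativeLocalTime == 150.
}

} // namespace __xray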
diff --git a/compiler-rt/lib/xray/xray_profiler_flags.cc b/compiler-rt/lib/xray/xray_profiler_flags.cc
new file mode 100644
index 00000000000..7dc20cedf6f
--- /dev/null
+++ b/compiler-rt/lib/xray/xray_profiler_flags.cc
@@ -0,0 +1,40 @@
+//===-- xray_profiler_flags.cc ----------------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a part of XRay, a dynamic runtime instrumentation system.
+//
+// XRay profiler runtime flags.
+//===----------------------------------------------------------------------===//
+
+#include "xray_profiler_flags.h"
+#include "sanitizer_common/sanitizer_common.h"
+#include "sanitizer_common/sanitizer_flag_parser.h"
+#include "sanitizer_common/sanitizer_libc.h"
+#include "xray_defs.h"
+
+namespace __xray {
+
+// Storage for the profiler flags.
+ProfilerFlags xray_profiler_flags_dont_use_directly;
+
+void ProfilerFlags::setDefaults() XRAY_NEVER_INSTRUMENT {
+#define XRAY_FLAG(Type, Name, DefaultValue, Description) Name = DefaultValue;
+#include "xray_profiler_flags.inc"
+#undef XRAY_FLAG
+}
+
+void registerProfilerFlags(FlagParser *P,
+ ProfilerFlags *F) XRAY_NEVER_INSTRUMENT {
+#define XRAY_FLAG(Type, Name, DefaultValue, Description) \
+ RegisterFlag(P, #Name, Description, &F->Name);
+#include "xray_profiler_flags.inc"
+#undef XRAY_FLAG
+}
+
+} // namespace __xray
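
A later change would presumably hook these flags into runtime initialization. A hedged sketch of what that might look like with the sanitizer_common FlagParser (the initialization function and the XRAY_PROFILER_OPTIONS environment variable are assumptions for illustration, not defined by this patch):

// Hypothetical initialization of the profiler flags: set defaults, register
// each flag with a FlagParser, then apply overrides from the environment.
#include "sanitizer_common/sanitizer_common.h"
#include "sanitizer_common/sanitizer_flag_parser.h"
#include "xray_profiler_flags.h"

using namespace __sanitizer;

namespace __xray {

static void initializeProfilerFlags() {
  ProfilerFlags *F = profilerFlags();
  F->setDefaults();

  FlagParser Parser;
  registerProfilerFlags(&Parser, F);

  // The variable name below is an assumption; use whatever the runtime
  // eventually standardizes on.
  if (const char *Env = GetEnv("XRAY_PROFILER_OPTIONS"))
    Parser.ParseString(Env);
}

} // namespace __xray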
diff --git a/compiler-rt/lib/xray/xray_profiler_flags.h b/compiler-rt/lib/xray/xray_profiler_flags.h
new file mode 100644
index 00000000000..9a57abb26ab
--- /dev/null
+++ b/compiler-rt/lib/xray/xray_profiler_flags.h
@@ -0,0 +1,39 @@
+//===-- xray_profiler_flags.h ----------------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a part of XRay, a dynamic runtime instrumentation system.
+//
+// XRay profiler runtime flags.
+//===----------------------------------------------------------------------===//
+
+#ifndef XRAY_PROFILER_FLAGS_H
+#define XRAY_PROFILER_FLAGS_H
+
+#include "sanitizer_common/sanitizer_flag_parser.h"
+#include "sanitizer_common/sanitizer_internal_defs.h"
+
+namespace __xray {
+
+struct ProfilerFlags {
+#define XRAY_FLAG(Type, Name, DefaultValue, Description) Type Name;
+#include "xray_profiler_flags.inc"
+#undef XRAY_FLAG
+
+ void setDefaults();
+};
+
+extern ProfilerFlags xray_profiler_flags_dont_use_directly;
+inline ProfilerFlags *profilerFlags() {
+ return &xray_profiler_flags_dont_use_directly;
+}
+void registerProfilerFlags(FlagParser *P, ProfilerFlags *F);
+
+} // namespace __xray
+
+#endif // XRAY_PROFILER_FLAGS_H
diff --git a/compiler-rt/lib/xray/xray_profiler_flags.inc b/compiler-rt/lib/xray/xray_profiler_flags.inc
new file mode 100644
index 00000000000..9266598a19c
--- /dev/null
+++ b/compiler-rt/lib/xray/xray_profiler_flags.inc
@@ -0,0 +1,26 @@
+//===-- xray_profiler_flags.inc ---------------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// XRay profiling runtime flags.
+//
+//===----------------------------------------------------------------------===//
+#ifndef XRAY_FLAG
+#error "Define XRAY_FLAG prior to including this file!"
+#endif
+
+XRAY_FLAG(uptr, per_thread_allocator_max, 2 << 20,
+ "Maximum size of any single per-thread allocator.")
+XRAY_FLAG(uptr, global_allocator_max, 2 << 24,
+ "Maximum size of the global allocator for profile storage.")
+XRAY_FLAG(uptr, stack_allocator_max, 2 << 24,
+ "Maximum size of the traversal stack allocator.")
+XRAY_FLAG(int, grace_period_ms, 100,
+ "Profile collection will wait this many milliseconds before resetting "
+ "the global state. This gives threads a chance to notice that the "
+ "profiler has been finalized and to clean up.")