author     Dean Michael Berris <dberris@google.com>    2018-12-07 06:23:06 +0000
committer  Dean Michael Berris <dberris@google.com>    2018-12-07 06:23:06 +0000
commit     25d505953a34b526d9985f2e631128069a9c22d7 (patch)
tree       6e0d03fe0c2d18b21eae96c4145ef6feb1f08159 /compiler-rt/lib/xray
parent     a523a211754514ef1c5c84778e38e0a8da335500 (diff)
[XRay] Use preallocated memory for XRay profiling
Summary:
This change builds on D54989, which removes memory allocation from the
critical path of the profiling implementation. It also changes the API for
the profile collection service to take ownership of the memory and the
associated data structures per thread.

Consolidating the memory allocation allows us to do two things:

- Limit the amount of memory used by the profiling implementation, by
  associating preallocated buffers with each thread instead of allocating
  memory on demand.
- Consolidate memory initialisation and cleanup by relying on the buffer
  queue's reference-counting implementation.

We also fix a number of places which showed problematic behaviour,
including:

- An off-by-factor bug in the allocator implementation.
- Missing "unrolling" of partially completed operations in memory-exhausted
  situations, when managing the state of the function call trie.

We add a few test cases which verify our understanding of the behaviour of
the system, with important edge cases (especially memory-exhausted cases)
covered in the segmented array and profile collector unit tests.

Depends on D54989.

Reviewers: mboerger

Subscribers: dschuff, mgorny, dmgreen, jfb, llvm-commits

Differential Revision: https://reviews.llvm.org/D55249

llvm-svn: 348568
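To make the central idea concrete — per-thread allocators carving fixed-size blocks out of preallocated, caller-owned buffers instead of allocating on demand — here is a minimal standalone C++ sketch. The names and layout are illustrative only; the real implementation lives in xray_allocator.h and xray_buffer_queue.h.

// Minimal sketch of a bump allocator over caller-owned, preallocated
// memory, assuming the owner (e.g. a buffer queue) reclaims the region
// later; the allocator itself never frees.
#include <cstddef>
#include <cstdio>

template <size_t BlockSize> class PreallocatedAllocator {
  unsigned char *Next;  // next block to hand out
  size_t Remaining;     // bytes left in the caller-owned region

public:
  PreallocatedAllocator(void *Buffer, size_t Size)
      : Next(static_cast<unsigned char *>(Buffer)), Remaining(Size) {}

  void *Allocate() {
    if (Remaining < BlockSize)  // exhausted: report failure, don't grow
      return nullptr;
    void *Result = Next;
    Next += BlockSize;
    Remaining -= BlockSize;
    return Result;
  }
};

int main() {
  alignas(64) unsigned char Storage[256];
  PreallocatedAllocator<64> A(Storage, sizeof(Storage));
  int Blocks = 0;
  while (A.Allocate() != nullptr)
    ++Blocks;
  std::printf("handed out %d blocks\n", Blocks);  // prints 4
}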
Diffstat (limited to 'compiler-rt/lib/xray')
-rw-r--r--  compiler-rt/lib/xray/CMakeLists.txt                        |   2
-rw-r--r--  compiler-rt/lib/xray/tests/unit/allocator_test.cc          |  22
-rw-r--r--  compiler-rt/lib/xray/tests/unit/profile_collector_test.cc  |  49
-rw-r--r--  compiler-rt/lib/xray/tests/unit/segmented_array_test.cc    |  38
-rw-r--r--  compiler-rt/lib/xray/xray_allocator.h                      |  22
-rw-r--r--  compiler-rt/lib/xray/xray_function_call_trie.h             |  85
-rw-r--r--  compiler-rt/lib/xray/xray_profile_collector.cc             | 225
-rw-r--r--  compiler-rt/lib/xray/xray_profile_collector.h              |  26
-rw-r--r--  compiler-rt/lib/xray/xray_profiling.cc                     | 134
-rw-r--r--  compiler-rt/lib/xray/xray_profiling_flags.inc              |   5
-rw-r--r--  compiler-rt/lib/xray/xray_segmented_array.h                |   2
11 files changed, 460 insertions, 150 deletions
diff --git a/compiler-rt/lib/xray/CMakeLists.txt b/compiler-rt/lib/xray/CMakeLists.txt
index 541e181afbd..0a86c52e620 100644
--- a/compiler-rt/lib/xray/CMakeLists.txt
+++ b/compiler-rt/lib/xray/CMakeLists.txt
@@ -2,6 +2,7 @@
# XRay runtime library implementation files.
set(XRAY_SOURCES
+ xray_buffer_queue.cc
xray_init.cc
xray_flags.cc
xray_interface.cc
@@ -11,7 +12,6 @@ set(XRAY_SOURCES
# Implementation files for all XRay modes.
set(XRAY_FDR_MODE_SOURCES
xray_fdr_flags.cc
- xray_buffer_queue.cc
xray_fdr_logging.cc)
set(XRAY_BASIC_MODE_SOURCES
diff --git a/compiler-rt/lib/xray/tests/unit/allocator_test.cc b/compiler-rt/lib/xray/tests/unit/allocator_test.cc
index 0177798b069..1170741623c 100644
--- a/compiler-rt/lib/xray/tests/unit/allocator_test.cc
+++ b/compiler-rt/lib/xray/tests/unit/allocator_test.cc
@@ -12,6 +12,7 @@
//===----------------------------------------------------------------------===//
#include "xray_allocator.h"
+#include "xray_buffer_queue.h"
#include "gtest/gtest.h"
namespace __xray {
@@ -56,5 +57,26 @@ TEST(AllocatorTest, AllocateBoundaries) {
ASSERT_EQ(C, Expected);
}
+TEST(AllocatorTest, AllocateFromNonOwned) {
+ bool Success = false;
+ BufferQueue BQ(GetPageSizeCached(), 10, Success);
+ ASSERT_TRUE(Success);
+ BufferQueue::Buffer B;
+ ASSERT_EQ(BQ.getBuffer(B), BufferQueue::ErrorCode::Ok);
+ {
+ Allocator<sizeof(OddSizedData)> A(B.Data, B.Size);
+
+ // Keep allocating until we hit a nullptr block.
+ unsigned C = 0;
+ auto Expected =
+ GetPageSizeCached() / RoundUpTo(sizeof(OddSizedData), kCacheLineSize);
+ for (auto B = A.Allocate(); B.Data != nullptr; B = A.Allocate(), ++C)
+ ;
+
+ ASSERT_EQ(C, Expected);
+ }
+ ASSERT_EQ(BQ.releaseBuffer(B), BufferQueue::ErrorCode::Ok);
+}
+
} // namespace
} // namespace __xray
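The Expected count in the new test comes from simple arithmetic: each allocation is rounded up to a cache line, so a page-sized buffer yields PageSize / RoundUpTo(sizeof(OddSizedData), kCacheLineSize) blocks. Below is a sketch of that computation with assumed sizes; the real values come from the runtime, and RoundUpTo here is an illustrative stand-in.

// Assumed values: 4096-byte pages and 64-byte cache lines; OddSize stands
// in for sizeof(OddSizedData), any size not cache-line aligned.
#include <cstdio>

constexpr unsigned long RoundUpTo(unsigned long X, unsigned long Align) {
  return (X + Align - 1) & ~(Align - 1);  // Align must be a power of two
}

int main() {
  constexpr unsigned long PageSize = 4096;
  constexpr unsigned long CacheLine = 64;
  constexpr unsigned long OddSize = 10;                            // assumed
  constexpr unsigned long Block = RoundUpTo(OddSize, CacheLine);   // 64
  std::printf("expected allocations: %lu\n", PageSize / Block);    // 64
}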
diff --git a/compiler-rt/lib/xray/tests/unit/profile_collector_test.cc b/compiler-rt/lib/xray/tests/unit/profile_collector_test.cc
index f06b7027ee1..df786d46b9d 100644
--- a/compiler-rt/lib/xray/tests/unit/profile_collector_test.cc
+++ b/compiler-rt/lib/xray/tests/unit/profile_collector_test.cc
@@ -110,24 +110,31 @@ std::tuple<Profile, const char *> ParseProfile(const char *P) {
TEST(profileCollectorServiceTest, PostSerializeCollect) {
profilingFlags()->setDefaults();
- // The most basic use-case (the one we actually only care about) is the one
- // where we ensure that we can post FunctionCallTrie instances, which are then
- // destroyed but serialized properly.
- //
- // First, we initialise a set of allocators in the local scope. This ensures
- // that we're able to copy the contents of the FunctionCallTrie that uses
- // the local allocators.
- auto Allocators = FunctionCallTrie::InitAllocators();
+ bool Success = false;
+ BufferQueue BQ(profilingFlags()->per_thread_allocator_max,
+ profilingFlags()->buffers_max, Success);
+ ASSERT_EQ(Success, true);
+ FunctionCallTrie::Allocators::Buffers Buffers;
+ ASSERT_EQ(BQ.getBuffer(Buffers.NodeBuffer), BufferQueue::ErrorCode::Ok);
+ ASSERT_EQ(BQ.getBuffer(Buffers.RootsBuffer), BufferQueue::ErrorCode::Ok);
+ ASSERT_EQ(BQ.getBuffer(Buffers.ShadowStackBuffer),
+ BufferQueue::ErrorCode::Ok);
+ ASSERT_EQ(BQ.getBuffer(Buffers.NodeIdPairBuffer), BufferQueue::ErrorCode::Ok);
+ auto Allocators = FunctionCallTrie::InitAllocatorsFromBuffers(Buffers);
FunctionCallTrie T(Allocators);
- // Then, we populate the trie with some data.
+ // Populate the trie with some data.
T.enterFunction(1, 1, 0);
T.enterFunction(2, 2, 0);
T.exitFunction(2, 3, 0);
T.exitFunction(1, 4, 0);
+ // Reset the collector data structures.
+ profileCollectorService::reset();
+
// Then we post the data to the global profile collector service.
- profileCollectorService::post(T, 1);
+ profileCollectorService::post(&BQ, std::move(T), std::move(Allocators),
+ std::move(Buffers), 1);
// Then we serialize the data.
profileCollectorService::serialize();
@@ -174,7 +181,21 @@ TEST(profileCollectorServiceTest, PostSerializeCollect) {
// profileCollectorService. This simulates what the threads being profiled would
// be doing anyway, but through the XRay logging implementation.
void threadProcessing() {
- thread_local auto Allocators = FunctionCallTrie::InitAllocators();
+ static bool Success = false;
+ static BufferQueue BQ(profilingFlags()->per_thread_allocator_max,
+ profilingFlags()->buffers_max, Success);
+ thread_local FunctionCallTrie::Allocators::Buffers Buffers = [] {
+ FunctionCallTrie::Allocators::Buffers B;
+ BQ.getBuffer(B.NodeBuffer);
+ BQ.getBuffer(B.RootsBuffer);
+ BQ.getBuffer(B.ShadowStackBuffer);
+ BQ.getBuffer(B.NodeIdPairBuffer);
+ return B;
+ }();
+
+ thread_local auto Allocators =
+ FunctionCallTrie::InitAllocatorsFromBuffers(Buffers);
+
FunctionCallTrie T(Allocators);
T.enterFunction(1, 1, 0);
@@ -182,11 +203,15 @@ void threadProcessing() {
T.exitFunction(2, 3, 0);
T.exitFunction(1, 4, 0);
- profileCollectorService::post(T, GetTid());
+ profileCollectorService::post(&BQ, std::move(T), std::move(Allocators),
+ std::move(Buffers), GetTid());
}
TEST(profileCollectorServiceTest, PostSerializeCollectMultipleThread) {
profilingFlags()->setDefaults();
+
+ profileCollectorService::reset();
+
std::thread t1(threadProcessing);
std::thread t2(threadProcessing);
diff --git a/compiler-rt/lib/xray/tests/unit/segmented_array_test.cc b/compiler-rt/lib/xray/tests/unit/segmented_array_test.cc
index 73120aafc8e..46aeb88f71b 100644
--- a/compiler-rt/lib/xray/tests/unit/segmented_array_test.cc
+++ b/compiler-rt/lib/xray/tests/unit/segmented_array_test.cc
@@ -2,6 +2,9 @@
#include "xray_segmented_array.h"
#include "gmock/gmock.h"
#include "gtest/gtest.h"
+#include <algorithm>
+#include <numeric>
+#include <vector>
namespace __xray {
namespace {
@@ -307,5 +310,40 @@ TEST(SegmentedArrayTest, PlacementNewOnAlignedStorage) {
}
}
+TEST(SegmentedArrayTest, ArrayOfPointersIteratorAccess) {
+ using PtrArray = Array<int *>;
+ PtrArray::AllocatorType Alloc(16384);
+ Array<int *> A(Alloc);
+ static constexpr size_t Count = 100;
+ std::vector<int> Integers(Count);
+ std::iota(Integers.begin(), Integers.end(), 0);
+ for (auto &I : Integers)
+ ASSERT_NE(A.Append(&I), nullptr);
+ int V = 0;
+ ASSERT_EQ(A.size(), Count);
+ for (auto P : A) {
+ ASSERT_NE(P, nullptr);
+ ASSERT_EQ(*P, V++);
+ }
+}
+
+TEST(SegmentedArrayTest, ArrayOfPointersIteratorAccessExhaustion) {
+ using PtrArray = Array<int *>;
+ PtrArray::AllocatorType Alloc(4096);
+ Array<int *> A(Alloc);
+ static constexpr size_t Count = 1000;
+ std::vector<int> Integers(Count);
+ std::iota(Integers.begin(), Integers.end(), 0);
+ for (auto &I : Integers)
+ if (A.Append(&I) == nullptr)
+ break;
+ int V = 0;
+ ASSERT_LT(A.size(), Count);
+ for (auto P : A) {
+ ASSERT_NE(P, nullptr);
+ ASSERT_EQ(*P, V++);
+ }
+}
+
} // namespace
} // namespace __xray
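Both new tests lean on the same contract: when the backing allocator is exhausted, Append returns nullptr, the container remains valid, and iteration covers exactly the successfully appended prefix. A minimal sketch of that fail-soft contract follows; the container is illustrative, not the real segmented array.

// A fixed-capacity container whose Append fails soft: on exhaustion it
// returns nullptr and leaves the already-stored elements intact.
#include <cstdio>

template <typename T, int N> class FixedArray {
  T Data[N];
  int Count = 0;

public:
  T *Append(const T &V) {
    if (Count == N)  // exhausted: report failure, keep state intact
      return nullptr;
    Data[Count] = V;
    return &Data[Count++];
  }
  int size() const { return Count; }
  const T *begin() const { return Data; }
  const T *end() const { return Data + Count; }
};

int main() {
  FixedArray<int, 8> A;
  for (int I = 0; I < 1000; ++I)
    if (A.Append(I) == nullptr)
      break;                 // stop at exhaustion, as in the test
  for (int V : A)            // the stored prefix is still iterable
    std::printf("%d ", V);
  std::printf("\nsize=%d\n", A.size());  // size=8
}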
diff --git a/compiler-rt/lib/xray/xray_allocator.h b/compiler-rt/lib/xray/xray_allocator.h
index 2ba937b4324..907c54542a5 100644
--- a/compiler-rt/lib/xray/xray_allocator.h
+++ b/compiler-rt/lib/xray/xray_allocator.h
@@ -175,6 +175,7 @@ private:
unsigned char *BackingStore = nullptr;
unsigned char *AlignedNextBlock = nullptr;
size_t AllocatedBlocks = 0;
+ bool Owned;
SpinMutex Mutex{};
void *Alloc() XRAY_NEVER_INSTRUMENT {
@@ -209,14 +210,14 @@ private:
0);
}
- if ((AllocatedBlocks * Block::Size) >= MaxMemory)
+ if (((AllocatedBlocks + 1) * Block::Size) > MaxMemory)
return nullptr;
// Align the pointer we'd like to return to an appropriate alignment, then
// advance the pointer from where to start allocations.
void *Result = AlignedNextBlock;
- AlignedNextBlock = reinterpret_cast<unsigned char *>(
- reinterpret_cast<unsigned char *>(AlignedNextBlock) + N);
+ AlignedNextBlock =
+ reinterpret_cast<unsigned char *>(AlignedNextBlock) + Block::Size;
++AllocatedBlocks;
return Result;
}
@@ -227,6 +228,15 @@ public:
BackingStore(nullptr),
AlignedNextBlock(nullptr),
AllocatedBlocks(0),
+ Owned(true),
+ Mutex() {}
+
+ explicit Allocator(void *P, size_t M) XRAY_NEVER_INSTRUMENT
+ : MaxMemory(M),
+ BackingStore(reinterpret_cast<unsigned char *>(P)),
+ AlignedNextBlock(reinterpret_cast<unsigned char *>(P)),
+ AllocatedBlocks(0),
+ Owned(false),
Mutex() {}
Allocator(const Allocator &) = delete;
@@ -243,6 +253,8 @@ public:
O.AlignedNextBlock = nullptr;
AllocatedBlocks = O.AllocatedBlocks;
O.AllocatedBlocks = 0;
+ Owned = O.Owned;
+ O.Owned = false;
}
Allocator &operator=(Allocator &&O) XRAY_NEVER_INSTRUMENT {
@@ -258,13 +270,15 @@ public:
O.AlignedNextBlock = nullptr;
AllocatedBlocks = O.AllocatedBlocks;
O.AllocatedBlocks = 0;
+ Owned = O.Owned;
+ O.Owned = false;
return *this;
}
Block Allocate() XRAY_NEVER_INSTRUMENT { return {Alloc()}; }
~Allocator() NOEXCEPT XRAY_NEVER_INSTRUMENT {
- if (BackingStore != nullptr) {
+ if (Owned && BackingStore != nullptr) {
deallocateBuffer(BackingStore, MaxMemory);
}
}
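This hunk fixes the off-by-factor bug called out in the summary: the capacity check must account for the block about to be handed out, not just past usage, and the cursor must advance by Block::Size rather than the raw requested size. A small sketch of the two checks with made-up numbers:

// Old check: looks only at memory already consumed, so the final block can
// overrun the budget. New check: includes the block about to be handed out.
#include <cstdio>

constexpr bool OldCheckAllows(unsigned long Allocated, unsigned long Block,
                              unsigned long Max) {
  return !(Allocated * Block >= Max);
}
constexpr bool NewCheckAllows(unsigned long Allocated, unsigned long Block,
                              unsigned long Max) {
  return !((Allocated + 1) * Block > Max);
}

int main() {
  // Three 64-byte blocks already carved out of a 200-byte budget.
  constexpr unsigned long Allocated = 3, Block = 64, Max = 200;
  // Old: 192 < 200, so a fourth block is granted, ending at byte 256 > 200.
  std::printf("old allows: %d\n", OldCheckAllows(Allocated, Block, Max));
  // New: 4 * 64 = 256 > 200, so the allocation is correctly refused.
  std::printf("new allows: %d\n", NewCheckAllows(Allocated, Block, Max));
}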
diff --git a/compiler-rt/lib/xray/xray_function_call_trie.h b/compiler-rt/lib/xray/xray_function_call_trie.h
index d70667b5a7f..d01ad20e3d7 100644
--- a/compiler-rt/lib/xray/xray_function_call_trie.h
+++ b/compiler-rt/lib/xray/xray_function_call_trie.h
@@ -15,6 +15,7 @@
#ifndef XRAY_FUNCTION_CALL_TRIE_H
#define XRAY_FUNCTION_CALL_TRIE_H
+#include "xray_buffer_queue.h"
#include "xray_defs.h"
#include "xray_profiling_flags.h"
#include "xray_segmented_array.h"
@@ -161,6 +162,35 @@ public:
Allocators(const Allocators &) = delete;
Allocators &operator=(const Allocators &) = delete;
+ struct Buffers {
+ BufferQueue::Buffer NodeBuffer;
+ BufferQueue::Buffer RootsBuffer;
+ BufferQueue::Buffer ShadowStackBuffer;
+ BufferQueue::Buffer NodeIdPairBuffer;
+ };
+
+ explicit Allocators(Buffers &B) XRAY_NEVER_INSTRUMENT {
+ new (&NodeAllocatorStorage)
+ NodeAllocatorType(B.NodeBuffer.Data, B.NodeBuffer.Size);
+ NodeAllocator =
+ reinterpret_cast<NodeAllocatorType *>(&NodeAllocatorStorage);
+
+ new (&RootAllocatorStorage)
+ RootAllocatorType(B.RootsBuffer.Data, B.RootsBuffer.Size);
+ RootAllocator =
+ reinterpret_cast<RootAllocatorType *>(&RootAllocatorStorage);
+
+ new (&ShadowStackAllocatorStorage) ShadowStackAllocatorType(
+ B.ShadowStackBuffer.Data, B.ShadowStackBuffer.Size);
+ ShadowStackAllocator = reinterpret_cast<ShadowStackAllocatorType *>(
+ &ShadowStackAllocatorStorage);
+
+ new (&NodeIdPairAllocatorStorage) NodeIdPairAllocatorType(
+ B.NodeIdPairBuffer.Data, B.NodeIdPairBuffer.Size);
+ NodeIdPairAllocator = reinterpret_cast<NodeIdPairAllocatorType *>(
+ &NodeIdPairAllocatorStorage);
+ }
+
explicit Allocators(uptr Max) XRAY_NEVER_INSTRUMENT {
new (&NodeAllocatorStorage) NodeAllocatorType(Max);
NodeAllocator =
@@ -283,6 +313,12 @@ public:
return A;
}
+ static Allocators
+ InitAllocatorsFromBuffers(Allocators::Buffers &Bufs) XRAY_NEVER_INSTRUMENT {
+ Allocators A(Bufs);
+ return A;
+ }
+
private:
NodeArray Nodes;
RootArray Roots;
@@ -323,16 +359,27 @@ public:
void enterFunction(const int32_t FId, uint64_t TSC,
uint16_t CPU) XRAY_NEVER_INSTRUMENT {
DCHECK_NE(FId, 0);
- // This function primarily deals with ensuring that the ShadowStack is
- // consistent and ready for when an exit event is encountered.
+
+ // If we're already overflowed the function call stack, do not bother
+ // attempting to record any more function entries.
+ if (UNLIKELY(OverflowedFunctions)) {
+ ++OverflowedFunctions;
+ return;
+ }
+
+ // If this is the first function we've encountered, we want to set up the
+ // node(s) and treat it as a root.
if (UNLIKELY(ShadowStack.empty())) {
- auto NewRoot = Nodes.AppendEmplace(
- nullptr, NodeIdPairArray{*NodeIdPairAllocator}, 0u, 0u, FId);
+ auto *NewRoot = Nodes.AppendEmplace(
+ nullptr, NodeIdPairArray(*NodeIdPairAllocator), 0u, 0u, FId);
if (UNLIKELY(NewRoot == nullptr))
return;
- if (Roots.Append(NewRoot) == nullptr)
+ if (Roots.AppendEmplace(NewRoot) == nullptr) {
+ Nodes.trim(1);
return;
+ }
if (ShadowStack.AppendEmplace(TSC, NewRoot, CPU) == nullptr) {
+ Nodes.trim(1);
Roots.trim(1);
++OverflowedFunctions;
return;
@@ -340,13 +387,14 @@ public:
return;
}
- auto &Top = ShadowStack.back();
- auto TopNode = Top.NodePtr;
+ // From this point on, we require that the stack is not empty.
+ DCHECK(!ShadowStack.empty());
+ auto TopNode = ShadowStack.back().NodePtr;
DCHECK_NE(TopNode, nullptr);
- // If we've seen this callee before, then we just access that node and place
- // that on the top of the stack.
- auto Callee = TopNode->Callees.find_element(
+ // If we've seen this callee before, then we access that node and place that
+ // on the top of the stack.
+ auto* Callee = TopNode->Callees.find_element(
[FId](const NodeIdPair &NR) { return NR.FId == FId; });
if (Callee != nullptr) {
CHECK_NE(Callee->NodePtr, nullptr);
@@ -356,7 +404,7 @@ public:
}
// This means we've never seen this stack before, create a new node here.
- auto NewNode = Nodes.AppendEmplace(
+ auto* NewNode = Nodes.AppendEmplace(
TopNode, NodeIdPairArray(*NodeIdPairAllocator), 0u, 0u, FId);
if (UNLIKELY(NewNode == nullptr))
return;
@@ -364,7 +412,6 @@ public:
TopNode->Callees.AppendEmplace(NewNode, FId);
if (ShadowStack.AppendEmplace(TSC, NewNode, CPU) == nullptr)
++OverflowedFunctions;
- DCHECK_NE(ShadowStack.back().NodePtr, nullptr);
return;
}
@@ -456,11 +503,13 @@ public:
if (UNLIKELY(NewRoot == nullptr))
return;
- O.Roots.Append(NewRoot);
+ if (UNLIKELY(O.Roots.Append(NewRoot) == nullptr))
+ return;
// TODO: Figure out what to do if we fail to allocate any more stack
// space. Maybe warn or report once?
- DFSStack.AppendEmplace(Root, NewRoot);
+ if (DFSStack.AppendEmplace(Root, NewRoot) == nullptr)
+ return;
while (!DFSStack.empty()) {
NodeAndParent NP = DFSStack.back();
DCHECK_NE(NP.Node, nullptr);
@@ -473,8 +522,12 @@ public:
Callee.FId);
if (UNLIKELY(NewNode == nullptr))
return;
- NP.NewNode->Callees.AppendEmplace(NewNode, Callee.FId);
- DFSStack.AppendEmplace(Callee.NodePtr, NewNode);
+ if (UNLIKELY(NP.NewNode->Callees.AppendEmplace(NewNode, Callee.FId) ==
+ nullptr))
+ return;
+ if (UNLIKELY(DFSStack.AppendEmplace(Callee.NodePtr, NewNode) ==
+ nullptr))
+ return;
}
}
}
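The enterFunction changes implement the "unrolling" the summary refers to: a multi-step insertion that fails partway undoes its earlier appends with trim(1), so the trie never retains a half-linked root. A standalone sketch of that rollback discipline, with std::vector standing in for the segmented arrays:

// Three-step insertion with explicit rollback: if a later step fails, the
// earlier appends are popped (mirroring Nodes.trim(1) / Roots.trim(1)).
#include <cstdio>
#include <vector>

struct Node { int FId; };

bool enterRoot(std::vector<Node> &Nodes, std::vector<Node *> &Roots,
               std::vector<Node *> &Stack, int FId, bool FailAt3) {
  Nodes.push_back(Node{FId});    // step 1: create the node
  Node *NewRoot = &Nodes.back();
  Roots.push_back(NewRoot);      // step 2: register it as a root
  if (FailAt3) {                 // step 3 (the shadow-stack push) "fails"
    Roots.pop_back();            // unroll step 2
    Nodes.pop_back();            // unroll step 1
    return false;
  }
  Stack.push_back(NewRoot);      // step 3 succeeded
  return true;
}

int main() {
  std::vector<Node> Nodes;
  std::vector<Node *> Roots, Stack;
  enterRoot(Nodes, Roots, Stack, 1, /*FailAt3=*/true);
  // All three containers are back to their pre-call state.
  std::printf("nodes=%zu roots=%zu stack=%zu\n", Nodes.size(), Roots.size(),
              Stack.size());  // all zero
}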
diff --git a/compiler-rt/lib/xray/xray_profile_collector.cc b/compiler-rt/lib/xray/xray_profile_collector.cc
index 2ef3ebd940c..dc3a8206984 100644
--- a/compiler-rt/lib/xray/xray_profile_collector.cc
+++ b/compiler-rt/lib/xray/xray_profile_collector.cc
@@ -57,52 +57,90 @@ struct BlockHeader {
u64 ThreadId;
};
-using ThreadTriesArray = Array<ThreadTrie>;
+struct ThreadData {
+ BufferQueue *BQ;
+ FunctionCallTrie::Allocators::Buffers Buffers;
+ FunctionCallTrie::Allocators Allocators;
+ FunctionCallTrie FCT;
+ tid_t TId;
+};
+
+using ThreadDataArray = Array<ThreadData>;
+using ThreadDataAllocator = ThreadDataArray::AllocatorType;
+
+// We use a separate buffer queue for the backing store for the allocator used
+// by the ThreadData array. This lets us host the buffers, allocators, and tries
+// associated with a thread by moving the data into the array instead of
+// attempting to copy the data to a separately backed set of tries.
+static typename std::aligned_storage<
+ sizeof(BufferQueue), alignof(BufferQueue)>::type BufferQueueStorage;
+static BufferQueue *BQ = nullptr;
+static BufferQueue::Buffer Buffer;
+static typename std::aligned_storage<sizeof(ThreadDataAllocator),
+ alignof(ThreadDataAllocator)>::type
+ ThreadDataAllocatorStorage;
+static typename std::aligned_storage<sizeof(ThreadDataArray),
+ alignof(ThreadDataArray)>::type
+ ThreadDataArrayStorage;
+
+static ThreadDataAllocator *TDAllocator = nullptr;
+static ThreadDataArray *TDArray = nullptr;
+
using ProfileBufferArray = Array<ProfileBuffer>;
-using ThreadTriesArrayAllocator = typename ThreadTriesArray::AllocatorType;
using ProfileBufferArrayAllocator = typename ProfileBufferArray::AllocatorType;
// These need to be global aligned storage to avoid dynamic initialization. We
// need these to be aligned to allow us to placement new objects into the
// storage, and have pointers to those objects be appropriately aligned.
-static typename std::aligned_storage<sizeof(FunctionCallTrie::Allocators)>::type
- AllocatorStorage;
-static typename std::aligned_storage<sizeof(ThreadTriesArray)>::type
- ThreadTriesStorage;
static typename std::aligned_storage<sizeof(ProfileBufferArray)>::type
ProfileBuffersStorage;
-static typename std::aligned_storage<sizeof(ThreadTriesArrayAllocator)>::type
- ThreadTriesArrayAllocatorStorage;
static typename std::aligned_storage<sizeof(ProfileBufferArrayAllocator)>::type
ProfileBufferArrayAllocatorStorage;
-static ThreadTriesArray *ThreadTries = nullptr;
-static ThreadTriesArrayAllocator *ThreadTriesAllocator = nullptr;
-static ProfileBufferArray *ProfileBuffers = nullptr;
static ProfileBufferArrayAllocator *ProfileBuffersAllocator = nullptr;
-static FunctionCallTrie::Allocators *GlobalAllocators = nullptr;
+static ProfileBufferArray *ProfileBuffers = nullptr;
+
+// Use a global flag to determine whether the collector implementation has been
+// initialized.
+static atomic_uint8_t CollectorInitialized{0};
} // namespace
-void post(const FunctionCallTrie &T, tid_t TId) XRAY_NEVER_INSTRUMENT {
- static pthread_once_t Once = PTHREAD_ONCE_INIT;
- pthread_once(
- &Once, +[]() XRAY_NEVER_INSTRUMENT { reset(); });
+void post(BufferQueue *Q, FunctionCallTrie &&T,
+ FunctionCallTrie::Allocators &&A,
+ FunctionCallTrie::Allocators::Buffers &&B,
+ tid_t TId) XRAY_NEVER_INSTRUMENT {
+ DCHECK_NE(Q, nullptr);
+
+ // Bail out early if the collector has not been initialized.
+ if (!atomic_load(&CollectorInitialized, memory_order_acquire)) {
+ T.~FunctionCallTrie();
+ A.~Allocators();
+ Q->releaseBuffer(B.NodeBuffer);
+ Q->releaseBuffer(B.RootsBuffer);
+ Q->releaseBuffer(B.ShadowStackBuffer);
+ Q->releaseBuffer(B.NodeIdPairBuffer);
+ B.~Buffers();
+ return;
+ }
- ThreadTrie *Item = nullptr;
{
SpinMutexLock Lock(&GlobalMutex);
- if (GlobalAllocators == nullptr || ThreadTries == nullptr)
- return;
-
- Item = ThreadTries->Append({});
- if (Item == nullptr)
- return;
-
- Item->TId = TId;
- auto Trie = reinterpret_cast<FunctionCallTrie *>(&Item->TrieStorage);
- new (Trie) FunctionCallTrie(*GlobalAllocators);
- T.deepCopyInto(*Trie);
+ DCHECK_NE(TDAllocator, nullptr);
+ DCHECK_NE(TDArray, nullptr);
+
+ if (TDArray->AppendEmplace(Q, std::move(B), std::move(A), std::move(T),
+ TId) == nullptr) {
+ // If we fail to add the data to the array, we should destroy the objects
+ // handed us.
+ T.~FunctionCallTrie();
+ A.~Allocators();
+ Q->releaseBuffer(B.NodeBuffer);
+ Q->releaseBuffer(B.RootsBuffer);
+ Q->releaseBuffer(B.ShadowStackBuffer);
+ Q->releaseBuffer(B.NodeIdPairBuffer);
+ B.~Buffers();
+ }
}
}
@@ -133,11 +171,13 @@ populateRecords(ProfileRecordArray &PRs, ProfileRecord::PathAllocator &PA,
using StackAllocator = typename StackArray::AllocatorType;
StackAllocator StackAlloc(profilingFlags()->stack_allocator_max);
StackArray DFSStack(StackAlloc);
- for (const auto R : Trie.getRoots()) {
+ for (const auto *R : Trie.getRoots()) {
DFSStack.Append(R);
while (!DFSStack.empty()) {
- auto Node = DFSStack.back();
+ auto *Node = DFSStack.back();
DFSStack.trim(1);
+ if (Node == nullptr)
+ continue;
auto Record = PRs.AppendEmplace(PathArray{PA}, Node);
if (Record == nullptr)
return;
@@ -191,40 +231,54 @@ static void serializeRecords(ProfileBuffer *Buffer, const BlockHeader &Header,
} // namespace
void serialize() XRAY_NEVER_INSTRUMENT {
- SpinMutexLock Lock(&GlobalMutex);
-
- if (GlobalAllocators == nullptr || ThreadTries == nullptr ||
- ProfileBuffers == nullptr)
+ if (!atomic_load(&CollectorInitialized, memory_order_acquire))
return;
+ SpinMutexLock Lock(&GlobalMutex);
+
// Clear out the global ProfileBuffers, if it's not empty.
for (auto &B : *ProfileBuffers)
deallocateBuffer(reinterpret_cast<unsigned char *>(B.Data), B.Size);
ProfileBuffers->trim(ProfileBuffers->size());
- if (ThreadTries->empty())
+ DCHECK_NE(TDArray, nullptr);
+ if (TDArray->empty())
return;
// Then repopulate the global ProfileBuffers.
u32 I = 0;
- for (const auto &ThreadTrie : *ThreadTries) {
+ auto MaxSize = profilingFlags()->global_allocator_max;
+ auto ProfileArena = allocateBuffer(MaxSize);
+ if (ProfileArena == nullptr)
+ return;
+
+ auto ProfileArenaCleanup = at_scope_exit(
+ [&]() XRAY_NEVER_INSTRUMENT { deallocateBuffer(ProfileArena, MaxSize); });
+
+ auto PathArena = allocateBuffer(profilingFlags()->global_allocator_max);
+ if (PathArena == nullptr)
+ return;
+
+ auto PathArenaCleanup = at_scope_exit(
+ [&]() XRAY_NEVER_INSTRUMENT { deallocateBuffer(PathArena, MaxSize); });
+
+ for (const auto &ThreadTrie : *TDArray) {
using ProfileRecordAllocator = typename ProfileRecordArray::AllocatorType;
- ProfileRecordAllocator PRAlloc(profilingFlags()->global_allocator_max);
+ ProfileRecordAllocator PRAlloc(ProfileArena,
+ profilingFlags()->global_allocator_max);
ProfileRecord::PathAllocator PathAlloc(
- profilingFlags()->global_allocator_max);
+ PathArena, profilingFlags()->global_allocator_max);
ProfileRecordArray ProfileRecords(PRAlloc);
// First, we want to compute the amount of space we're going to need. We'll
// use a local allocator and an __xray::Array<...> to store the intermediary
// data, then compute the size as we're going along. Then we'll allocate the
// contiguous space to contain the thread buffer data.
- const auto &Trie =
- *reinterpret_cast<const FunctionCallTrie *>(&(ThreadTrie.TrieStorage));
- if (Trie.getRoots().empty())
+ if (ThreadTrie.FCT.getRoots().empty())
continue;
- populateRecords(ProfileRecords, PathAlloc, Trie);
- DCHECK(!Trie.getRoots().empty());
+ populateRecords(ProfileRecords, PathAlloc, ThreadTrie.FCT);
+ DCHECK(!ThreadTrie.FCT.getRoots().empty());
DCHECK(!ProfileRecords.empty());
// Go through each record, to compute the sizes.
@@ -241,15 +295,16 @@ void serialize() XRAY_NEVER_INSTRUMENT {
CumulativeSizes += 20 + (4 * Record.Path.size());
BlockHeader Header{16 + CumulativeSizes, I++, ThreadTrie.TId};
- auto Buffer = ProfileBuffers->Append({});
- Buffer->Size = sizeof(Header) + CumulativeSizes;
- Buffer->Data = allocateBuffer(Buffer->Size);
- DCHECK_NE(Buffer->Data, nullptr);
- serializeRecords(Buffer, Header, ProfileRecords);
+ auto B = ProfileBuffers->Append({});
+ B->Size = sizeof(Header) + CumulativeSizes;
+ B->Data = allocateBuffer(B->Size);
+ DCHECK_NE(B->Data, nullptr);
+ serializeRecords(B, Header, ProfileRecords);
}
}
void reset() XRAY_NEVER_INSTRUMENT {
+ atomic_store(&CollectorInitialized, 0, memory_order_release);
SpinMutexLock Lock(&GlobalMutex);
if (ProfileBuffers != nullptr) {
@@ -257,46 +312,68 @@ void reset() XRAY_NEVER_INSTRUMENT {
for (auto &B : *ProfileBuffers)
deallocateBuffer(reinterpret_cast<uint8_t *>(B.Data), B.Size);
ProfileBuffers->trim(ProfileBuffers->size());
+ ProfileBuffers = nullptr;
}
- if (ThreadTries != nullptr) {
- // Clear out the function call tries per thread.
- for (auto &T : *ThreadTries) {
- auto Trie = reinterpret_cast<FunctionCallTrie *>(&T.TrieStorage);
- Trie->~FunctionCallTrie();
+ if (TDArray != nullptr) {
+ // Release the resources as required.
+ for (auto &TD : *TDArray) {
+ TD.BQ->releaseBuffer(TD.Buffers.NodeBuffer);
+ TD.BQ->releaseBuffer(TD.Buffers.RootsBuffer);
+ TD.BQ->releaseBuffer(TD.Buffers.ShadowStackBuffer);
+ TD.BQ->releaseBuffer(TD.Buffers.NodeIdPairBuffer);
}
- ThreadTries->trim(ThreadTries->size());
+ // We don't bother destroying the array here because we've already
+ // potentially freed the backing store for the array. Instead we're going to
+ // reset the pointer to nullptr, and re-use the storage later instead
+ // (placement-new'ing into the storage as-is).
+ TDArray = nullptr;
}
- // Reset the global allocators.
- if (GlobalAllocators != nullptr)
- GlobalAllocators->~Allocators();
+ if (TDAllocator != nullptr) {
+ TDAllocator->~Allocator();
+ TDAllocator = nullptr;
+ }
- GlobalAllocators =
- reinterpret_cast<FunctionCallTrie::Allocators *>(&AllocatorStorage);
- new (GlobalAllocators)
- FunctionCallTrie::Allocators(FunctionCallTrie::InitAllocators());
+ if (Buffer.Data != nullptr) {
+ BQ->releaseBuffer(Buffer);
+ }
- if (ThreadTriesAllocator != nullptr)
- ThreadTriesAllocator->~ThreadTriesArrayAllocator();
+ if (BQ == nullptr) {
+ bool Success = false;
+ new (&BufferQueueStorage)
+ BufferQueue(profilingFlags()->global_allocator_max, 1, Success);
+ if (!Success)
+ return;
+ BQ = reinterpret_cast<BufferQueue *>(&BufferQueueStorage);
+ } else {
+ BQ->finalize();
- ThreadTriesAllocator = reinterpret_cast<ThreadTriesArrayAllocator *>(
- &ThreadTriesArrayAllocatorStorage);
- new (ThreadTriesAllocator)
- ThreadTriesArrayAllocator(profilingFlags()->global_allocator_max);
- ThreadTries = reinterpret_cast<ThreadTriesArray *>(&ThreadTriesStorage);
- new (ThreadTries) ThreadTriesArray(*ThreadTriesAllocator);
+ if (BQ->init(profilingFlags()->global_allocator_max, 1) !=
+ BufferQueue::ErrorCode::Ok)
+ return;
+ }
- if (ProfileBuffersAllocator != nullptr)
- ProfileBuffersAllocator->~ProfileBufferArrayAllocator();
+ if (BQ->getBuffer(Buffer) != BufferQueue::ErrorCode::Ok)
+ return;
+ new (&ProfileBufferArrayAllocatorStorage)
+ ProfileBufferArrayAllocator(profilingFlags()->global_allocator_max);
ProfileBuffersAllocator = reinterpret_cast<ProfileBufferArrayAllocator *>(
&ProfileBufferArrayAllocatorStorage);
- new (ProfileBuffersAllocator)
- ProfileBufferArrayAllocator(profilingFlags()->global_allocator_max);
+
+ new (&ProfileBuffersStorage) ProfileBufferArray(*ProfileBuffersAllocator);
ProfileBuffers =
reinterpret_cast<ProfileBufferArray *>(&ProfileBuffersStorage);
- new (ProfileBuffers) ProfileBufferArray(*ProfileBuffersAllocator);
+
+ new (&ThreadDataAllocatorStorage)
+ ThreadDataAllocator(Buffer.Data, Buffer.Size);
+ TDAllocator =
+ reinterpret_cast<ThreadDataAllocator *>(&ThreadDataAllocatorStorage);
+ new (&ThreadDataArrayStorage) ThreadDataArray(*TDAllocator);
+ TDArray = reinterpret_cast<ThreadDataArray *>(&ThreadDataArrayStorage);
+
+ atomic_store(&CollectorInitialized, 1, memory_order_release);
}
XRayBuffer nextBuffer(XRayBuffer B) XRAY_NEVER_INSTRUMENT {
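The collector drops its pthread_once-based lazy initialisation in favour of an explicit flag: reset() constructs all the global state and then publishes it with a release store, while post() and serialize() bail out unless an acquire load has observed the flag. A minimal sketch of that publish/observe pattern using std::atomic (illustrative state only):

// reset() publishes fully constructed state; readers that do not observe
// the flag drop their work instead of touching uninitialised globals.
#include <atomic>
#include <cstdio>

static std::atomic<bool> Initialized{false};
static int GlobalState = 0;  // stands in for the collector's arrays

void reset() {
  GlobalState = 42;                                     // construct state...
  Initialized.store(true, std::memory_order_release);   // ...then publish
}

void post(int Data) {
  if (!Initialized.load(std::memory_order_acquire))  // not ready: bail out
    return;
  GlobalState += Data;  // safe: the acquire pairs with reset()'s release
}

int main() {
  post(1);  // dropped: collector not yet initialised
  reset();
  post(1);  // accepted
  std::printf("state=%d\n", GlobalState);  // 43
}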
diff --git a/compiler-rt/lib/xray/xray_profile_collector.h b/compiler-rt/lib/xray/xray_profile_collector.h
index 335043db952..86c4ce85379 100644
--- a/compiler-rt/lib/xray/xray_profile_collector.h
+++ b/compiler-rt/lib/xray/xray_profile_collector.h
@@ -33,27 +33,13 @@ namespace profileCollectorService {
/// Posts the FunctionCallTrie associated with a specific Thread ID. This
/// will:
///
-/// - Make a copy of the FunctionCallTrie and store that against the Thread
-/// ID. This will use the global allocator for the service-managed
-/// FunctionCallTrie instances.
-/// - Queue up a pointer to the FunctionCallTrie.
-/// - If the queue is long enough (longer than some arbitrary threshold) we
-/// then pre-calculate a single FunctionCallTrie for the whole process.
+/// Moves the collection of FunctionCallTrie, Allocators, and Buffers associated
+/// with a thread's data to the queue. This takes ownership of the memory
+/// associated with a thread, and manages those exclusively.
///
-///
-/// We are making a copy of the FunctionCallTrie because the intent is to have
-/// this function be called at thread exit, or soon after the profiling
-/// handler is finalized through the XRay APIs. By letting threads each
-/// process their own thread-local FunctionCallTrie instances, we're removing
-/// the need for synchronisation across threads while we're profiling.
-/// However, once we're done profiling, we can then collect copies of these
-/// FunctionCallTrie instances and pay the cost of the copy.
-///
-/// NOTE: In the future, if this turns out to be more costly than "moving" the
-/// FunctionCallTrie instances from the owning thread to the collector
-/// service, then we can change the implementation to do it this way (moving)
-/// instead.
-void post(const FunctionCallTrie &T, tid_t TId);
+void post(BufferQueue *Q, FunctionCallTrie &&T,
+ FunctionCallTrie::Allocators &&A,
+ FunctionCallTrie::Allocators::Buffers &&B, tid_t TId);
/// The serialize will process all FunctionCallTrie instances in memory, and
/// turn those into specifically formatted blocks, each describing the
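The new signature expresses the ownership transfer in the type system: the trie, its allocators, and the buffers arrive by rvalue reference and are moved into collector-owned storage, so no deep copy of the trie is made. A small sketch of that move-based hand-off with illustrative types (not the real XRay ones):

// The callee sinks ownership: after post(), the caller's trie has been
// moved from and its buffers belong to the collector.
#include <cstdio>
#include <utility>
#include <vector>

struct Trie { std::vector<int> Nodes; };

struct Collector {
  std::vector<Trie> Tries;
  void post(Trie &&T) { Tries.push_back(std::move(T)); }
};

int main() {
  Collector C;
  Trie T{{1, 2, 3}};
  C.post(std::move(T));  // move, not copy: the buffers change hands
  std::printf("collector tries=%zu, caller nodes=%zu\n", C.Tries.size(),
              T.Nodes.size());  // caller's trie is moved-from (typically 0)
}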
diff --git a/compiler-rt/lib/xray/xray_profiling.cc b/compiler-rt/lib/xray/xray_profiling.cc
index 6db4b6ff9a0..4323170cd1b 100644
--- a/compiler-rt/lib/xray/xray_profiling.cc
+++ b/compiler-rt/lib/xray/xray_profiling.cc
@@ -19,6 +19,7 @@
#include "sanitizer_common/sanitizer_flags.h"
#include "xray/xray_interface.h"
#include "xray/xray_log_interface.h"
+#include "xray_buffer_queue.h"
#include "xray_flags.h"
#include "xray_profile_collector.h"
#include "xray_profiling_flags.h"
@@ -46,6 +47,13 @@ struct ProfilingData {
static pthread_key_t ProfilingKey;
+// We use a global buffer queue, which gets initialized once at initialisation
+// time, and gets reset when profiling is "done".
+static std::aligned_storage<sizeof(BufferQueue), alignof(BufferQueue)>::type
+ BufferQueueStorage;
+static BufferQueue *BQ = nullptr;
+
+thread_local FunctionCallTrie::Allocators::Buffers ThreadBuffers;
thread_local std::aligned_storage<sizeof(FunctionCallTrie::Allocators),
alignof(FunctionCallTrie::Allocators)>::type
AllocatorsStorage;
@@ -81,17 +89,58 @@ static ProfilingData *getThreadLocalData() XRAY_NEVER_INSTRUMENT {
uptr Allocators = 0;
if (atomic_compare_exchange_strong(&TLD.Allocators, &Allocators, 1,
memory_order_acq_rel)) {
- new (&AllocatorsStorage)
- FunctionCallTrie::Allocators(FunctionCallTrie::InitAllocators());
+ bool Success = false;
+ auto AllocatorsUndo = at_scope_exit([&]() XRAY_NEVER_INSTRUMENT {
+ if (!Success)
+ atomic_store(&TLD.Allocators, 0, memory_order_release);
+ });
+
+ // Acquire a set of buffers for this thread.
+ if (BQ == nullptr)
+ return nullptr;
+
+ if (BQ->getBuffer(ThreadBuffers.NodeBuffer) != BufferQueue::ErrorCode::Ok)
+ return nullptr;
+ auto NodeBufferUndo = at_scope_exit([&]() XRAY_NEVER_INSTRUMENT {
+ if (!Success)
+ BQ->releaseBuffer(ThreadBuffers.NodeBuffer);
+ });
+
+ if (BQ->getBuffer(ThreadBuffers.RootsBuffer) != BufferQueue::ErrorCode::Ok)
+ return nullptr;
+ auto RootsBufferUndo = at_scope_exit([&]() XRAY_NEVER_INSTRUMENT {
+ if (!Success)
+ BQ->releaseBuffer(ThreadBuffers.RootsBuffer);
+ });
+
+ if (BQ->getBuffer(ThreadBuffers.ShadowStackBuffer) !=
+ BufferQueue::ErrorCode::Ok)
+ return nullptr;
+ auto ShadowStackBufferUndo = at_scope_exit([&]() XRAY_NEVER_INSTRUMENT {
+ if (!Success)
+ BQ->releaseBuffer(ThreadBuffers.ShadowStackBuffer);
+ });
+
+ if (BQ->getBuffer(ThreadBuffers.NodeIdPairBuffer) !=
+ BufferQueue::ErrorCode::Ok)
+ return nullptr;
+
+ Success = true;
+ new (&AllocatorsStorage) FunctionCallTrie::Allocators(
+ FunctionCallTrie::InitAllocatorsFromBuffers(ThreadBuffers));
Allocators = reinterpret_cast<uptr>(
reinterpret_cast<FunctionCallTrie::Allocators *>(&AllocatorsStorage));
atomic_store(&TLD.Allocators, Allocators, memory_order_release);
}
+ if (Allocators == 1)
+ return nullptr;
+
uptr FCT = 0;
if (atomic_compare_exchange_strong(&TLD.FCT, &FCT, 1, memory_order_acq_rel)) {
- new (&FunctionCallTrieStorage) FunctionCallTrie(
- *reinterpret_cast<FunctionCallTrie::Allocators *>(Allocators));
+ new (&FunctionCallTrieStorage)
+ FunctionCallTrie(*reinterpret_cast<FunctionCallTrie::Allocators *>(
+ atomic_load_relaxed(&TLD.Allocators)));
FCT = reinterpret_cast<uptr>(
reinterpret_cast<FunctionCallTrie *>(&FunctionCallTrieStorage));
atomic_store(&TLD.FCT, FCT, memory_order_release);
@@ -104,10 +153,6 @@ static ProfilingData *getThreadLocalData() XRAY_NEVER_INSTRUMENT {
}
static void cleanupTLD() XRAY_NEVER_INSTRUMENT {
- RecursionGuard TLDInit(TLDInitGuard);
- if (!TLDInit)
- return;
-
auto FCT = atomic_exchange(&TLD.FCT, 0, memory_order_acq_rel);
if (FCT == reinterpret_cast<uptr>(reinterpret_cast<FunctionCallTrie *>(
&FunctionCallTrieStorage)))
@@ -125,7 +170,7 @@ static void postCurrentThreadFCT(ProfilingData &T) XRAY_NEVER_INSTRUMENT {
if (!TLDInit)
return;
- uptr P = atomic_load(&T.FCT, memory_order_acquire);
+ uptr P = atomic_exchange(&T.FCT, 0, memory_order_acq_rel);
if (P != reinterpret_cast<uptr>(
reinterpret_cast<FunctionCallTrie *>(&FunctionCallTrieStorage)))
return;
@@ -133,10 +178,21 @@ static void postCurrentThreadFCT(ProfilingData &T) XRAY_NEVER_INSTRUMENT {
auto FCT = reinterpret_cast<FunctionCallTrie *>(P);
DCHECK_NE(FCT, nullptr);
- if (!FCT->getRoots().empty())
- profileCollectorService::post(*FCT, GetTid());
+ uptr A = atomic_exchange(&T.Allocators, 0, memory_order_acq_rel);
+ if (A !=
+ reinterpret_cast<uptr>(
+ reinterpret_cast<FunctionCallTrie::Allocators *>(&AllocatorsStorage)))
+ return;
- cleanupTLD();
+ auto Allocators = reinterpret_cast<FunctionCallTrie::Allocators *>(A);
+ DCHECK_NE(Allocators, nullptr);
+
+ // Always move the data into the profile collector.
+ profileCollectorService::post(BQ, std::move(*FCT), std::move(*Allocators),
+ std::move(ThreadBuffers), GetTid());
+
+ // Re-initialize the ThreadBuffers object to a known "default" state.
+ ThreadBuffers = FunctionCallTrie::Allocators::Buffers{};
}
} // namespace
@@ -176,8 +232,6 @@ XRayLogFlushStatus profilingFlush() XRAY_NEVER_INSTRUMENT {
return XRayLogFlushStatus::XRAY_LOG_FLUSHING;
}
- postCurrentThreadFCT(TLD);
-
// At this point, we'll create the file that will contain the profile, but
// only if the options say so.
if (!profilingFlags()->no_flush) {
@@ -205,14 +259,11 @@ XRayLogFlushStatus profilingFlush() XRAY_NEVER_INSTRUMENT {
}
}
- // Clean up the current thread's TLD information as well.
- cleanupTLD();
-
profileCollectorService::reset();
atomic_store(&ProfilerLogFlushStatus, XRayLogFlushStatus::XRAY_LOG_FLUSHED,
memory_order_release);
- atomic_store(&ProfilerLogStatus, XRayLogFlushStatus::XRAY_LOG_FLUSHED,
+ atomic_store(&ProfilerLogStatus, XRayLogInitStatus::XRAY_LOG_UNINITIALIZED,
memory_order_release);
return XRayLogFlushStatus::XRAY_LOG_FLUSHED;
@@ -272,6 +323,12 @@ XRayLogInitStatus profilingFinalize() XRAY_NEVER_INSTRUMENT {
return static_cast<XRayLogInitStatus>(CurrentStatus);
}
+ // Mark then finalize the current generation of buffers. This allows us to let
+ // the threads currently holding onto new buffers still use them, but let the
+ // last reference do the memory cleanup.
+ DCHECK_NE(BQ, nullptr);
+ BQ->finalize();
+
// Wait a grace period to allow threads to see that we're finalizing.
SleepForMillis(profilingFlags()->grace_period_ms);
@@ -293,8 +350,8 @@ XRayLogInitStatus profilingFinalize() XRAY_NEVER_INSTRUMENT {
}
XRayLogInitStatus
-profilingLoggingInit(UNUSED size_t BufferSize, UNUSED size_t BufferMax,
- void *Options, size_t OptionsSize) XRAY_NEVER_INSTRUMENT {
+profilingLoggingInit(size_t, size_t, void *Options,
+ size_t OptionsSize) XRAY_NEVER_INSTRUMENT {
RecursionGuard G(ReentranceGuard);
if (!G)
return XRayLogInitStatus::XRAY_LOG_UNINITIALIZED;
@@ -302,7 +359,7 @@ profilingLoggingInit(UNUSED size_t BufferSize, UNUSED size_t BufferMax,
s32 CurrentStatus = XRayLogInitStatus::XRAY_LOG_UNINITIALIZED;
if (!atomic_compare_exchange_strong(&ProfilerLogStatus, &CurrentStatus,
XRayLogInitStatus::XRAY_LOG_INITIALIZING,
- memory_order_release)) {
+ memory_order_acq_rel)) {
if (Verbosity())
Report("Cannot initialize already initialised profiling "
"implementation.\n");
@@ -331,6 +388,41 @@ profilingLoggingInit(UNUSED size_t BufferSize, UNUSED size_t BufferMax,
// We need to reset the profile data collection implementation now.
profileCollectorService::reset();
+ // Then also reset the buffer queue implementation.
+ if (BQ == nullptr) {
+ bool Success = false;
+ new (&BufferQueueStorage)
+ BufferQueue(profilingFlags()->per_thread_allocator_max,
+ profilingFlags()->buffers_max, Success);
+ if (!Success) {
+ if (Verbosity())
+ Report("Failed to initialize preallocated memory buffers!");
+ atomic_store(&ProfilerLogStatus,
+ XRayLogInitStatus::XRAY_LOG_UNINITIALIZED,
+ memory_order_release);
+ return XRayLogInitStatus::XRAY_LOG_UNINITIALIZED;
+ }
+
+ // If we've succeeded, set the global pointer to the initialised storage.
+ BQ = reinterpret_cast<BufferQueue *>(&BufferQueueStorage);
+ } else {
+ BQ->finalize();
+ auto InitStatus = BQ->init(profilingFlags()->per_thread_allocator_max,
+ profilingFlags()->buffers_max);
+
+ if (InitStatus != BufferQueue::ErrorCode::Ok) {
+ if (Verbosity())
+ Report("Failed to initialize preallocated memory buffers; error: %s",
+ BufferQueue::getErrorString(InitStatus));
+ atomic_store(&ProfilerLogStatus,
+ XRayLogInitStatus::XRAY_LOG_UNINITIALIZED,
+ memory_order_release);
+ return XRayLogInitStatus::XRAY_LOG_UNINITIALIZED;
+ }
+
+ DCHECK(!BQ->finalizing());
+ }
+
// We need to set up the exit handlers.
static pthread_once_t Once = PTHREAD_ONCE_INIT;
pthread_once(
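getThreadLocalData() now acquires four buffers in sequence and registers an undo action after each step; if any later step fails, the guards release everything acquired so far, and once every step succeeds the Success flag turns the guards into no-ops. A compact sketch of that pattern, modelling at_scope_exit with a small RAII helper (C++17; counts and names are illustrative):

// Scope-guard rollback: each acquisition arms an undo that fires only if
// the overall sequence never reaches the Success = true commit point.
#include <cstdio>

template <typename F> struct ScopeExit {
  F Fn;
  ~ScopeExit() { Fn(); }
};
template <typename F> ScopeExit<F> at_scope_exit(F Fn) { return {Fn}; }

static int BuffersHeld = 0;
bool getBuffer() {
  if (BuffersHeld >= 2)  // the third acquisition fails in this sketch
    return false;
  ++BuffersHeld;
  return true;
}
void releaseBuffer() { --BuffersHeld; }

bool acquireAll() {
  bool Success = false;
  if (!getBuffer())
    return false;
  auto Undo1 = at_scope_exit([&] { if (!Success) releaseBuffer(); });
  if (!getBuffer())
    return false;  // Undo1 fires, releasing the first buffer
  auto Undo2 = at_scope_exit([&] { if (!Success) releaseBuffer(); });
  if (!getBuffer())
    return false;  // Undo2 then Undo1 fire
  Success = true;  // commit: the guards become no-ops
  return true;
}

int main() {
  bool OK = acquireAll();
  std::printf("ok=%d held=%d\n", OK, BuffersHeld);  // ok=0 held=0
}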
diff --git a/compiler-rt/lib/xray/xray_profiling_flags.inc b/compiler-rt/lib/xray/xray_profiling_flags.inc
index e9230ae6418..ccd70860bf6 100644
--- a/compiler-rt/lib/xray/xray_profiling_flags.inc
+++ b/compiler-rt/lib/xray/xray_profiling_flags.inc
@@ -14,7 +14,7 @@
#error "Define XRAY_FLAG prior to including this file!"
#endif
-XRAY_FLAG(uptr, per_thread_allocator_max, 2 << 20,
+XRAY_FLAG(uptr, per_thread_allocator_max, 16384,
"Maximum size of any single per-thread allocator.")
XRAY_FLAG(uptr, global_allocator_max, 2 << 24,
"Maximum size of the global allocator for profile storage.")
@@ -27,3 +27,6 @@ XRAY_FLAG(int, grace_period_ms, 1,
XRAY_FLAG(bool, no_flush, false,
"Set to true if we want the profiling implementation to not write "
"out files.")
+XRAY_FLAG(int, buffers_max, 128,
+ "The number of buffers to pre-allocate used by the profiling "
+ "implementation.")
diff --git a/compiler-rt/lib/xray/xray_segmented_array.h b/compiler-rt/lib/xray/xray_segmented_array.h
index d4feace381c..bc7e9379f63 100644
--- a/compiler-rt/lib/xray/xray_segmented_array.h
+++ b/compiler-rt/lib/xray/xray_segmented_array.h
@@ -372,7 +372,7 @@ public:
auto Base = &Tail->Data;
auto AlignedOffset = Base + (Offset * AlignedElementStorageSize);
DCHECK_LE(AlignedOffset + sizeof(T),
- reinterpret_cast<unsigned char *>(Tail) + SegmentSize);
+ reinterpret_cast<unsigned char *>(Base) + SegmentSize);
// In-place construct at Position.
new (AlignedOffset) T{std::forward<Args>(args)...};
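The one-line DCHECK fix reflects the segment layout: element offsets are computed from the payload start (Base, i.e. &Tail->Data), which sits after the segment's bookkeeping pointers, so the upper bound must be measured from Base as well; measuring it from the segment header (Tail) was off by the header size. A layout sketch with an assumed header (illustrative, not the real Segment type):

// Elements live in Data, which starts after the header links, so bounds
// on element placement must be computed from the payload start.
#include <cstdio>

struct Segment {
  Segment *Prev;           // bookkeeping links precede the payload
  Segment *Next;
  unsigned char Data[64];  // payload: elements are placed from here
};

int main() {
  Segment S{};
  auto *Tail = reinterpret_cast<unsigned char *>(&S);
  auto *Base = S.Data;
  // Offsets are taken from Base, so the upper bound must be too.
  std::printf("header bytes before payload: %td\n",
              Base - Tail);  // 16 on a typical 64-bit target
}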