diff options
-rw-r--r-- | compiler-rt/lib/xray/CMakeLists.txt | 2 | ||||
-rw-r--r-- | compiler-rt/lib/xray/tests/unit/allocator_test.cc | 22 | ||||
-rw-r--r-- | compiler-rt/lib/xray/tests/unit/profile_collector_test.cc | 49 | ||||
-rw-r--r-- | compiler-rt/lib/xray/tests/unit/segmented_array_test.cc | 38 | ||||
-rw-r--r-- | compiler-rt/lib/xray/xray_allocator.h | 22 | ||||
-rw-r--r-- | compiler-rt/lib/xray/xray_function_call_trie.h | 85 | ||||
-rw-r--r-- | compiler-rt/lib/xray/xray_profile_collector.cc | 225 | ||||
-rw-r--r-- | compiler-rt/lib/xray/xray_profile_collector.h | 26 | ||||
-rw-r--r-- | compiler-rt/lib/xray/xray_profiling.cc | 134 | ||||
-rw-r--r-- | compiler-rt/lib/xray/xray_profiling_flags.inc | 5 | ||||
-rw-r--r-- | compiler-rt/lib/xray/xray_segmented_array.h | 2 |
11 files changed, 460 insertions, 150 deletions
diff --git a/compiler-rt/lib/xray/CMakeLists.txt b/compiler-rt/lib/xray/CMakeLists.txt index 541e181afbd..0a86c52e620 100644 --- a/compiler-rt/lib/xray/CMakeLists.txt +++ b/compiler-rt/lib/xray/CMakeLists.txt @@ -2,6 +2,7 @@ # XRay runtime library implementation files. set(XRAY_SOURCES + xray_buffer_queue.cc xray_init.cc xray_flags.cc xray_interface.cc @@ -11,7 +12,6 @@ set(XRAY_SOURCES # Implementation files for all XRay modes. set(XRAY_FDR_MODE_SOURCES xray_fdr_flags.cc - xray_buffer_queue.cc xray_fdr_logging.cc) set(XRAY_BASIC_MODE_SOURCES diff --git a/compiler-rt/lib/xray/tests/unit/allocator_test.cc b/compiler-rt/lib/xray/tests/unit/allocator_test.cc index 0177798b069..1170741623c 100644 --- a/compiler-rt/lib/xray/tests/unit/allocator_test.cc +++ b/compiler-rt/lib/xray/tests/unit/allocator_test.cc @@ -12,6 +12,7 @@ //===----------------------------------------------------------------------===// #include "xray_allocator.h" +#include "xray_buffer_queue.h" #include "gtest/gtest.h" namespace __xray { @@ -56,5 +57,26 @@ TEST(AllocatorTest, AllocateBoundaries) { ASSERT_EQ(C, Expected); } +TEST(AllocatorTest, AllocateFromNonOwned) { + bool Success = false; + BufferQueue BQ(GetPageSizeCached(), 10, Success); + ASSERT_TRUE(Success); + BufferQueue::Buffer B; + ASSERT_EQ(BQ.getBuffer(B), BufferQueue::ErrorCode::Ok); + { + Allocator<sizeof(OddSizedData)> A(B.Data, B.Size); + + // Keep allocating until we hit a nullptr block. + unsigned C = 0; + auto Expected = + GetPageSizeCached() / RoundUpTo(sizeof(OddSizedData), kCacheLineSize); + for (auto B = A.Allocate(); B.Data != nullptr; B = A.Allocate(), ++C) + ; + + ASSERT_EQ(C, Expected); + } + ASSERT_EQ(BQ.releaseBuffer(B), BufferQueue::ErrorCode::Ok); +} + } // namespace } // namespace __xray diff --git a/compiler-rt/lib/xray/tests/unit/profile_collector_test.cc b/compiler-rt/lib/xray/tests/unit/profile_collector_test.cc index f06b7027ee1..df786d46b9d 100644 --- a/compiler-rt/lib/xray/tests/unit/profile_collector_test.cc +++ b/compiler-rt/lib/xray/tests/unit/profile_collector_test.cc @@ -110,24 +110,31 @@ std::tuple<Profile, const char *> ParseProfile(const char *P) { TEST(profileCollectorServiceTest, PostSerializeCollect) { profilingFlags()->setDefaults(); - // The most basic use-case (the one we actually only care about) is the one - // where we ensure that we can post FunctionCallTrie instances, which are then - // destroyed but serialized properly. - // - // First, we initialise a set of allocators in the local scope. This ensures - // that we're able to copy the contents of the FunctionCallTrie that uses - // the local allocators. - auto Allocators = FunctionCallTrie::InitAllocators(); + bool Success = false; + BufferQueue BQ(profilingFlags()->per_thread_allocator_max, + profilingFlags()->buffers_max, Success); + ASSERT_EQ(Success, true); + FunctionCallTrie::Allocators::Buffers Buffers; + ASSERT_EQ(BQ.getBuffer(Buffers.NodeBuffer), BufferQueue::ErrorCode::Ok); + ASSERT_EQ(BQ.getBuffer(Buffers.RootsBuffer), BufferQueue::ErrorCode::Ok); + ASSERT_EQ(BQ.getBuffer(Buffers.ShadowStackBuffer), + BufferQueue::ErrorCode::Ok); + ASSERT_EQ(BQ.getBuffer(Buffers.NodeIdPairBuffer), BufferQueue::ErrorCode::Ok); + auto Allocators = FunctionCallTrie::InitAllocatorsFromBuffers(Buffers); FunctionCallTrie T(Allocators); - // Then, we populate the trie with some data. + // Populate the trie with some data. T.enterFunction(1, 1, 0); T.enterFunction(2, 2, 0); T.exitFunction(2, 3, 0); T.exitFunction(1, 4, 0); + // Reset the collector data structures. + profileCollectorService::reset(); + // Then we post the data to the global profile collector service. - profileCollectorService::post(T, 1); + profileCollectorService::post(&BQ, std::move(T), std::move(Allocators), + std::move(Buffers), 1); // Then we serialize the data. profileCollectorService::serialize(); @@ -174,7 +181,21 @@ TEST(profileCollectorServiceTest, PostSerializeCollect) { // profileCollectorService. This simulates what the threads being profiled would // be doing anyway, but through the XRay logging implementation. void threadProcessing() { - thread_local auto Allocators = FunctionCallTrie::InitAllocators(); + static bool Success = false; + static BufferQueue BQ(profilingFlags()->per_thread_allocator_max, + profilingFlags()->buffers_max, Success); + thread_local FunctionCallTrie::Allocators::Buffers Buffers = [] { + FunctionCallTrie::Allocators::Buffers B; + BQ.getBuffer(B.NodeBuffer); + BQ.getBuffer(B.RootsBuffer); + BQ.getBuffer(B.ShadowStackBuffer); + BQ.getBuffer(B.NodeIdPairBuffer); + return B; + }(); + + thread_local auto Allocators = + FunctionCallTrie::InitAllocatorsFromBuffers(Buffers); + FunctionCallTrie T(Allocators); T.enterFunction(1, 1, 0); @@ -182,11 +203,15 @@ void threadProcessing() { T.exitFunction(2, 3, 0); T.exitFunction(1, 4, 0); - profileCollectorService::post(T, GetTid()); + profileCollectorService::post(&BQ, std::move(T), std::move(Allocators), + std::move(Buffers), GetTid()); } TEST(profileCollectorServiceTest, PostSerializeCollectMultipleThread) { profilingFlags()->setDefaults(); + + profileCollectorService::reset(); + std::thread t1(threadProcessing); std::thread t2(threadProcessing); diff --git a/compiler-rt/lib/xray/tests/unit/segmented_array_test.cc b/compiler-rt/lib/xray/tests/unit/segmented_array_test.cc index 73120aafc8e..46aeb88f71b 100644 --- a/compiler-rt/lib/xray/tests/unit/segmented_array_test.cc +++ b/compiler-rt/lib/xray/tests/unit/segmented_array_test.cc @@ -2,6 +2,9 @@ #include "xray_segmented_array.h" #include "gmock/gmock.h" #include "gtest/gtest.h" +#include <algorithm> +#include <numeric> +#include <vector> namespace __xray { namespace { @@ -307,5 +310,40 @@ TEST(SegmentedArrayTest, PlacementNewOnAlignedStorage) { } } +TEST(SegmentedArrayTest, ArrayOfPointersIteratorAccess) { + using PtrArray = Array<int *>; + PtrArray::AllocatorType Alloc(16384); + Array<int *> A(Alloc); + static constexpr size_t Count = 100; + std::vector<int> Integers(Count); + std::iota(Integers.begin(), Integers.end(), 0); + for (auto &I : Integers) + ASSERT_NE(A.Append(&I), nullptr); + int V = 0; + ASSERT_EQ(A.size(), Count); + for (auto P : A) { + ASSERT_NE(P, nullptr); + ASSERT_EQ(*P, V++); + } +} + +TEST(SegmentedArrayTest, ArrayOfPointersIteratorAccessExhaustion) { + using PtrArray = Array<int *>; + PtrArray::AllocatorType Alloc(4096); + Array<int *> A(Alloc); + static constexpr size_t Count = 1000; + std::vector<int> Integers(Count); + std::iota(Integers.begin(), Integers.end(), 0); + for (auto &I : Integers) + if (A.Append(&I) == nullptr) + break; + int V = 0; + ASSERT_LT(A.size(), Count); + for (auto P : A) { + ASSERT_NE(P, nullptr); + ASSERT_EQ(*P, V++); + } +} + } // namespace } // namespace __xray diff --git a/compiler-rt/lib/xray/xray_allocator.h b/compiler-rt/lib/xray/xray_allocator.h index 2ba937b4324..907c54542a5 100644 --- a/compiler-rt/lib/xray/xray_allocator.h +++ b/compiler-rt/lib/xray/xray_allocator.h @@ -175,6 +175,7 @@ private: unsigned char *BackingStore = nullptr; unsigned char *AlignedNextBlock = nullptr; size_t AllocatedBlocks = 0; + bool Owned; SpinMutex Mutex{}; void *Alloc() XRAY_NEVER_INSTRUMENT { @@ -209,14 +210,14 @@ private: 0); } - if ((AllocatedBlocks * Block::Size) >= MaxMemory) + if (((AllocatedBlocks + 1) * Block::Size) > MaxMemory) return nullptr; // Align the pointer we'd like to return to an appropriate alignment, then // advance the pointer from where to start allocations. void *Result = AlignedNextBlock; - AlignedNextBlock = reinterpret_cast<unsigned char *>( - reinterpret_cast<unsigned char *>(AlignedNextBlock) + N); + AlignedNextBlock = + reinterpret_cast<unsigned char *>(AlignedNextBlock) + Block::Size; ++AllocatedBlocks; return Result; } @@ -227,6 +228,15 @@ public: BackingStore(nullptr), AlignedNextBlock(nullptr), AllocatedBlocks(0), + Owned(true), + Mutex() {} + + explicit Allocator(void *P, size_t M) XRAY_NEVER_INSTRUMENT + : MaxMemory(M), + BackingStore(reinterpret_cast<unsigned char *>(P)), + AlignedNextBlock(reinterpret_cast<unsigned char *>(P)), + AllocatedBlocks(0), + Owned(false), Mutex() {} Allocator(const Allocator &) = delete; @@ -243,6 +253,8 @@ public: O.AlignedNextBlock = nullptr; AllocatedBlocks = O.AllocatedBlocks; O.AllocatedBlocks = 0; + Owned = O.Owned; + O.Owned = false; } Allocator &operator=(Allocator &&O) XRAY_NEVER_INSTRUMENT { @@ -258,13 +270,15 @@ public: O.AlignedNextBlock = nullptr; AllocatedBlocks = O.AllocatedBlocks; O.AllocatedBlocks = 0; + Owned = O.Owned; + O.Owned = false; return *this; } Block Allocate() XRAY_NEVER_INSTRUMENT { return {Alloc()}; } ~Allocator() NOEXCEPT XRAY_NEVER_INSTRUMENT { - if (BackingStore != nullptr) { + if (Owned && BackingStore != nullptr) { deallocateBuffer(BackingStore, MaxMemory); } } diff --git a/compiler-rt/lib/xray/xray_function_call_trie.h b/compiler-rt/lib/xray/xray_function_call_trie.h index d70667b5a7f..d01ad20e3d7 100644 --- a/compiler-rt/lib/xray/xray_function_call_trie.h +++ b/compiler-rt/lib/xray/xray_function_call_trie.h @@ -15,6 +15,7 @@ #ifndef XRAY_FUNCTION_CALL_TRIE_H #define XRAY_FUNCTION_CALL_TRIE_H +#include "xray_buffer_queue.h" #include "xray_defs.h" #include "xray_profiling_flags.h" #include "xray_segmented_array.h" @@ -161,6 +162,35 @@ public: Allocators(const Allocators &) = delete; Allocators &operator=(const Allocators &) = delete; + struct Buffers { + BufferQueue::Buffer NodeBuffer; + BufferQueue::Buffer RootsBuffer; + BufferQueue::Buffer ShadowStackBuffer; + BufferQueue::Buffer NodeIdPairBuffer; + }; + + explicit Allocators(Buffers &B) XRAY_NEVER_INSTRUMENT { + new (&NodeAllocatorStorage) + NodeAllocatorType(B.NodeBuffer.Data, B.NodeBuffer.Size); + NodeAllocator = + reinterpret_cast<NodeAllocatorType *>(&NodeAllocatorStorage); + + new (&RootAllocatorStorage) + RootAllocatorType(B.RootsBuffer.Data, B.RootsBuffer.Size); + RootAllocator = + reinterpret_cast<RootAllocatorType *>(&RootAllocatorStorage); + + new (&ShadowStackAllocatorStorage) ShadowStackAllocatorType( + B.ShadowStackBuffer.Data, B.ShadowStackBuffer.Size); + ShadowStackAllocator = reinterpret_cast<ShadowStackAllocatorType *>( + &ShadowStackAllocatorStorage); + + new (&NodeIdPairAllocatorStorage) NodeIdPairAllocatorType( + B.NodeIdPairBuffer.Data, B.NodeIdPairBuffer.Size); + NodeIdPairAllocator = reinterpret_cast<NodeIdPairAllocatorType *>( + &NodeIdPairAllocatorStorage); + } + explicit Allocators(uptr Max) XRAY_NEVER_INSTRUMENT { new (&NodeAllocatorStorage) NodeAllocatorType(Max); NodeAllocator = @@ -283,6 +313,12 @@ public: return A; } + static Allocators + InitAllocatorsFromBuffers(Allocators::Buffers &Bufs) XRAY_NEVER_INSTRUMENT { + Allocators A(Bufs); + return A; + } + private: NodeArray Nodes; RootArray Roots; @@ -323,16 +359,27 @@ public: void enterFunction(const int32_t FId, uint64_t TSC, uint16_t CPU) XRAY_NEVER_INSTRUMENT { DCHECK_NE(FId, 0); - // This function primarily deals with ensuring that the ShadowStack is - // consistent and ready for when an exit event is encountered. + + // If we're already overflowed the function call stack, do not bother + // attempting to record any more function entries. + if (UNLIKELY(OverflowedFunctions)) { + ++OverflowedFunctions; + return; + } + + // If this is the first function we've encountered, we want to set up the + // node(s) and treat it as a root. if (UNLIKELY(ShadowStack.empty())) { - auto NewRoot = Nodes.AppendEmplace( - nullptr, NodeIdPairArray{*NodeIdPairAllocator}, 0u, 0u, FId); + auto *NewRoot = Nodes.AppendEmplace( + nullptr, NodeIdPairArray(*NodeIdPairAllocator), 0u, 0u, FId); if (UNLIKELY(NewRoot == nullptr)) return; - if (Roots.Append(NewRoot) == nullptr) + if (Roots.AppendEmplace(NewRoot) == nullptr) { + Nodes.trim(1); return; + } if (ShadowStack.AppendEmplace(TSC, NewRoot, CPU) == nullptr) { + Nodes.trim(1); Roots.trim(1); ++OverflowedFunctions; return; @@ -340,13 +387,14 @@ public: return; } - auto &Top = ShadowStack.back(); - auto TopNode = Top.NodePtr; + // From this point on, we require that the stack is not empty. + DCHECK(!ShadowStack.empty()); + auto TopNode = ShadowStack.back().NodePtr; DCHECK_NE(TopNode, nullptr); - // If we've seen this callee before, then we just access that node and place - // that on the top of the stack. - auto Callee = TopNode->Callees.find_element( + // If we've seen this callee before, then we access that node and place that + // on the top of the stack. + auto* Callee = TopNode->Callees.find_element( [FId](const NodeIdPair &NR) { return NR.FId == FId; }); if (Callee != nullptr) { CHECK_NE(Callee->NodePtr, nullptr); @@ -356,7 +404,7 @@ public: } // This means we've never seen this stack before, create a new node here. - auto NewNode = Nodes.AppendEmplace( + auto* NewNode = Nodes.AppendEmplace( TopNode, NodeIdPairArray(*NodeIdPairAllocator), 0u, 0u, FId); if (UNLIKELY(NewNode == nullptr)) return; @@ -364,7 +412,6 @@ public: TopNode->Callees.AppendEmplace(NewNode, FId); if (ShadowStack.AppendEmplace(TSC, NewNode, CPU) == nullptr) ++OverflowedFunctions; - DCHECK_NE(ShadowStack.back().NodePtr, nullptr); return; } @@ -456,11 +503,13 @@ public: if (UNLIKELY(NewRoot == nullptr)) return; - O.Roots.Append(NewRoot); + if (UNLIKELY(O.Roots.Append(NewRoot) == nullptr)) + return; // TODO: Figure out what to do if we fail to allocate any more stack // space. Maybe warn or report once? - DFSStack.AppendEmplace(Root, NewRoot); + if (DFSStack.AppendEmplace(Root, NewRoot) == nullptr) + return; while (!DFSStack.empty()) { NodeAndParent NP = DFSStack.back(); DCHECK_NE(NP.Node, nullptr); @@ -473,8 +522,12 @@ public: Callee.FId); if (UNLIKELY(NewNode == nullptr)) return; - NP.NewNode->Callees.AppendEmplace(NewNode, Callee.FId); - DFSStack.AppendEmplace(Callee.NodePtr, NewNode); + if (UNLIKELY(NP.NewNode->Callees.AppendEmplace(NewNode, Callee.FId) == + nullptr)) + return; + if (UNLIKELY(DFSStack.AppendEmplace(Callee.NodePtr, NewNode) == + nullptr)) + return; } } } diff --git a/compiler-rt/lib/xray/xray_profile_collector.cc b/compiler-rt/lib/xray/xray_profile_collector.cc index 2ef3ebd940c..dc3a8206984 100644 --- a/compiler-rt/lib/xray/xray_profile_collector.cc +++ b/compiler-rt/lib/xray/xray_profile_collector.cc @@ -57,52 +57,90 @@ struct BlockHeader { u64 ThreadId; }; -using ThreadTriesArray = Array<ThreadTrie>; +struct ThreadData { + BufferQueue *BQ; + FunctionCallTrie::Allocators::Buffers Buffers; + FunctionCallTrie::Allocators Allocators; + FunctionCallTrie FCT; + tid_t TId; +}; + +using ThreadDataArray = Array<ThreadData>; +using ThreadDataAllocator = ThreadDataArray::AllocatorType; + +// We use a separate buffer queue for the backing store for the allocator used +// by the ThreadData array. This lets us host the buffers, allocators, and tries +// associated with a thread by moving the data into the array instead of +// attempting to copy the data to a separately backed set of tries. +static typename std::aligned_storage< + sizeof(BufferQueue), alignof(BufferQueue)>::type BufferQueueStorage; +static BufferQueue *BQ = nullptr; +static BufferQueue::Buffer Buffer; +static typename std::aligned_storage<sizeof(ThreadDataAllocator), + alignof(ThreadDataAllocator)>::type + ThreadDataAllocatorStorage; +static typename std::aligned_storage<sizeof(ThreadDataArray), + alignof(ThreadDataArray)>::type + ThreadDataArrayStorage; + +static ThreadDataAllocator *TDAllocator = nullptr; +static ThreadDataArray *TDArray = nullptr; + using ProfileBufferArray = Array<ProfileBuffer>; -using ThreadTriesArrayAllocator = typename ThreadTriesArray::AllocatorType; using ProfileBufferArrayAllocator = typename ProfileBufferArray::AllocatorType; // These need to be global aligned storage to avoid dynamic initialization. We // need these to be aligned to allow us to placement new objects into the // storage, and have pointers to those objects be appropriately aligned. -static typename std::aligned_storage<sizeof(FunctionCallTrie::Allocators)>::type - AllocatorStorage; -static typename std::aligned_storage<sizeof(ThreadTriesArray)>::type - ThreadTriesStorage; static typename std::aligned_storage<sizeof(ProfileBufferArray)>::type ProfileBuffersStorage; -static typename std::aligned_storage<sizeof(ThreadTriesArrayAllocator)>::type - ThreadTriesArrayAllocatorStorage; static typename std::aligned_storage<sizeof(ProfileBufferArrayAllocator)>::type ProfileBufferArrayAllocatorStorage; -static ThreadTriesArray *ThreadTries = nullptr; -static ThreadTriesArrayAllocator *ThreadTriesAllocator = nullptr; -static ProfileBufferArray *ProfileBuffers = nullptr; static ProfileBufferArrayAllocator *ProfileBuffersAllocator = nullptr; -static FunctionCallTrie::Allocators *GlobalAllocators = nullptr; +static ProfileBufferArray *ProfileBuffers = nullptr; + +// Use a global flag to determine whether the collector implementation has been +// initialized. +static atomic_uint8_t CollectorInitialized{0}; } // namespace -void post(const FunctionCallTrie &T, tid_t TId) XRAY_NEVER_INSTRUMENT { - static pthread_once_t Once = PTHREAD_ONCE_INIT; - pthread_once( - &Once, +[]() XRAY_NEVER_INSTRUMENT { reset(); }); +void post(BufferQueue *Q, FunctionCallTrie &&T, + FunctionCallTrie::Allocators &&A, + FunctionCallTrie::Allocators::Buffers &&B, + tid_t TId) XRAY_NEVER_INSTRUMENT { + DCHECK_NE(Q, nullptr); + + // Bail out early if the collector has not been initialized. + if (!atomic_load(&CollectorInitialized, memory_order_acquire)) { + T.~FunctionCallTrie(); + A.~Allocators(); + Q->releaseBuffer(B.NodeBuffer); + Q->releaseBuffer(B.RootsBuffer); + Q->releaseBuffer(B.ShadowStackBuffer); + Q->releaseBuffer(B.NodeIdPairBuffer); + B.~Buffers(); + return; + } - ThreadTrie *Item = nullptr; { SpinMutexLock Lock(&GlobalMutex); - if (GlobalAllocators == nullptr || ThreadTries == nullptr) - return; - - Item = ThreadTries->Append({}); - if (Item == nullptr) - return; - - Item->TId = TId; - auto Trie = reinterpret_cast<FunctionCallTrie *>(&Item->TrieStorage); - new (Trie) FunctionCallTrie(*GlobalAllocators); - T.deepCopyInto(*Trie); + DCHECK_NE(TDAllocator, nullptr); + DCHECK_NE(TDArray, nullptr); + + if (TDArray->AppendEmplace(Q, std::move(B), std::move(A), std::move(T), + TId) == nullptr) { + // If we fail to add the data to the array, we should destroy the objects + // handed us. + T.~FunctionCallTrie(); + A.~Allocators(); + Q->releaseBuffer(B.NodeBuffer); + Q->releaseBuffer(B.RootsBuffer); + Q->releaseBuffer(B.ShadowStackBuffer); + Q->releaseBuffer(B.NodeIdPairBuffer); + B.~Buffers(); + } } } @@ -133,11 +171,13 @@ populateRecords(ProfileRecordArray &PRs, ProfileRecord::PathAllocator &PA, using StackAllocator = typename StackArray::AllocatorType; StackAllocator StackAlloc(profilingFlags()->stack_allocator_max); StackArray DFSStack(StackAlloc); - for (const auto R : Trie.getRoots()) { + for (const auto *R : Trie.getRoots()) { DFSStack.Append(R); while (!DFSStack.empty()) { - auto Node = DFSStack.back(); + auto *Node = DFSStack.back(); DFSStack.trim(1); + if (Node == nullptr) + continue; auto Record = PRs.AppendEmplace(PathArray{PA}, Node); if (Record == nullptr) return; @@ -191,40 +231,54 @@ static void serializeRecords(ProfileBuffer *Buffer, const BlockHeader &Header, } // namespace void serialize() XRAY_NEVER_INSTRUMENT { - SpinMutexLock Lock(&GlobalMutex); - - if (GlobalAllocators == nullptr || ThreadTries == nullptr || - ProfileBuffers == nullptr) + if (!atomic_load(&CollectorInitialized, memory_order_acquire)) return; + SpinMutexLock Lock(&GlobalMutex); + // Clear out the global ProfileBuffers, if it's not empty. for (auto &B : *ProfileBuffers) deallocateBuffer(reinterpret_cast<unsigned char *>(B.Data), B.Size); ProfileBuffers->trim(ProfileBuffers->size()); - if (ThreadTries->empty()) + DCHECK_NE(TDArray, nullptr); + if (TDArray->empty()) return; // Then repopulate the global ProfileBuffers. u32 I = 0; - for (const auto &ThreadTrie : *ThreadTries) { + auto MaxSize = profilingFlags()->global_allocator_max; + auto ProfileArena = allocateBuffer(MaxSize); + if (ProfileArena == nullptr) + return; + + auto ProfileArenaCleanup = at_scope_exit( + [&]() XRAY_NEVER_INSTRUMENT { deallocateBuffer(ProfileArena, MaxSize); }); + + auto PathArena = allocateBuffer(profilingFlags()->global_allocator_max); + if (PathArena == nullptr) + return; + + auto PathArenaCleanup = at_scope_exit( + [&]() XRAY_NEVER_INSTRUMENT { deallocateBuffer(PathArena, MaxSize); }); + + for (const auto &ThreadTrie : *TDArray) { using ProfileRecordAllocator = typename ProfileRecordArray::AllocatorType; - ProfileRecordAllocator PRAlloc(profilingFlags()->global_allocator_max); + ProfileRecordAllocator PRAlloc(ProfileArena, + profilingFlags()->global_allocator_max); ProfileRecord::PathAllocator PathAlloc( - profilingFlags()->global_allocator_max); + PathArena, profilingFlags()->global_allocator_max); ProfileRecordArray ProfileRecords(PRAlloc); // First, we want to compute the amount of space we're going to need. We'll // use a local allocator and an __xray::Array<...> to store the intermediary // data, then compute the size as we're going along. Then we'll allocate the // contiguous space to contain the thread buffer data. - const auto &Trie = - *reinterpret_cast<const FunctionCallTrie *>(&(ThreadTrie.TrieStorage)); - if (Trie.getRoots().empty()) + if (ThreadTrie.FCT.getRoots().empty()) continue; - populateRecords(ProfileRecords, PathAlloc, Trie); - DCHECK(!Trie.getRoots().empty()); + populateRecords(ProfileRecords, PathAlloc, ThreadTrie.FCT); + DCHECK(!ThreadTrie.FCT.getRoots().empty()); DCHECK(!ProfileRecords.empty()); // Go through each record, to compute the sizes. @@ -241,15 +295,16 @@ void serialize() XRAY_NEVER_INSTRUMENT { CumulativeSizes += 20 + (4 * Record.Path.size()); BlockHeader Header{16 + CumulativeSizes, I++, ThreadTrie.TId}; - auto Buffer = ProfileBuffers->Append({}); - Buffer->Size = sizeof(Header) + CumulativeSizes; - Buffer->Data = allocateBuffer(Buffer->Size); - DCHECK_NE(Buffer->Data, nullptr); - serializeRecords(Buffer, Header, ProfileRecords); + auto B = ProfileBuffers->Append({}); + B->Size = sizeof(Header) + CumulativeSizes; + B->Data = allocateBuffer(B->Size); + DCHECK_NE(B->Data, nullptr); + serializeRecords(B, Header, ProfileRecords); } } void reset() XRAY_NEVER_INSTRUMENT { + atomic_store(&CollectorInitialized, 0, memory_order_release); SpinMutexLock Lock(&GlobalMutex); if (ProfileBuffers != nullptr) { @@ -257,46 +312,68 @@ void reset() XRAY_NEVER_INSTRUMENT { for (auto &B : *ProfileBuffers) deallocateBuffer(reinterpret_cast<uint8_t *>(B.Data), B.Size); ProfileBuffers->trim(ProfileBuffers->size()); + ProfileBuffers = nullptr; } - if (ThreadTries != nullptr) { - // Clear out the function call tries per thread. - for (auto &T : *ThreadTries) { - auto Trie = reinterpret_cast<FunctionCallTrie *>(&T.TrieStorage); - Trie->~FunctionCallTrie(); + if (TDArray != nullptr) { + // Release the resources as required. + for (auto &TD : *TDArray) { + TD.BQ->releaseBuffer(TD.Buffers.NodeBuffer); + TD.BQ->releaseBuffer(TD.Buffers.RootsBuffer); + TD.BQ->releaseBuffer(TD.Buffers.ShadowStackBuffer); + TD.BQ->releaseBuffer(TD.Buffers.NodeIdPairBuffer); } - ThreadTries->trim(ThreadTries->size()); + // We don't bother destroying the array here because we've already + // potentially freed the backing store for the array. Instead we're going to + // reset the pointer to nullptr, and re-use the storage later instead + // (placement-new'ing into the storage as-is). + TDArray = nullptr; } - // Reset the global allocators. - if (GlobalAllocators != nullptr) - GlobalAllocators->~Allocators(); + if (TDAllocator != nullptr) { + TDAllocator->~Allocator(); + TDAllocator = nullptr; + } - GlobalAllocators = - reinterpret_cast<FunctionCallTrie::Allocators *>(&AllocatorStorage); - new (GlobalAllocators) - FunctionCallTrie::Allocators(FunctionCallTrie::InitAllocators()); + if (Buffer.Data != nullptr) { + BQ->releaseBuffer(Buffer); + } - if (ThreadTriesAllocator != nullptr) - ThreadTriesAllocator->~ThreadTriesArrayAllocator(); + if (BQ == nullptr) { + bool Success = false; + new (&BufferQueueStorage) + BufferQueue(profilingFlags()->global_allocator_max, 1, Success); + if (!Success) + return; + BQ = reinterpret_cast<BufferQueue *>(&BufferQueueStorage); + } else { + BQ->finalize(); - ThreadTriesAllocator = reinterpret_cast<ThreadTriesArrayAllocator *>( - &ThreadTriesArrayAllocatorStorage); - new (ThreadTriesAllocator) - ThreadTriesArrayAllocator(profilingFlags()->global_allocator_max); - ThreadTries = reinterpret_cast<ThreadTriesArray *>(&ThreadTriesStorage); - new (ThreadTries) ThreadTriesArray(*ThreadTriesAllocator); + if (BQ->init(profilingFlags()->global_allocator_max, 1) != + BufferQueue::ErrorCode::Ok) + return; + } - if (ProfileBuffersAllocator != nullptr) - ProfileBuffersAllocator->~ProfileBufferArrayAllocator(); + if (BQ->getBuffer(Buffer) != BufferQueue::ErrorCode::Ok) + return; + new (&ProfileBufferArrayAllocatorStorage) + ProfileBufferArrayAllocator(profilingFlags()->global_allocator_max); ProfileBuffersAllocator = reinterpret_cast<ProfileBufferArrayAllocator *>( &ProfileBufferArrayAllocatorStorage); - new (ProfileBuffersAllocator) - ProfileBufferArrayAllocator(profilingFlags()->global_allocator_max); + + new (&ProfileBuffersStorage) ProfileBufferArray(*ProfileBuffersAllocator); ProfileBuffers = reinterpret_cast<ProfileBufferArray *>(&ProfileBuffersStorage); - new (ProfileBuffers) ProfileBufferArray(*ProfileBuffersAllocator); + + new (&ThreadDataAllocatorStorage) + ThreadDataAllocator(Buffer.Data, Buffer.Size); + TDAllocator = + reinterpret_cast<ThreadDataAllocator *>(&ThreadDataAllocatorStorage); + new (&ThreadDataArrayStorage) ThreadDataArray(*TDAllocator); + TDArray = reinterpret_cast<ThreadDataArray *>(&ThreadDataArrayStorage); + + atomic_store(&CollectorInitialized, 1, memory_order_release); } XRayBuffer nextBuffer(XRayBuffer B) XRAY_NEVER_INSTRUMENT { diff --git a/compiler-rt/lib/xray/xray_profile_collector.h b/compiler-rt/lib/xray/xray_profile_collector.h index 335043db952..86c4ce85379 100644 --- a/compiler-rt/lib/xray/xray_profile_collector.h +++ b/compiler-rt/lib/xray/xray_profile_collector.h @@ -33,27 +33,13 @@ namespace profileCollectorService { /// Posts the FunctionCallTrie associated with a specific Thread ID. This /// will: /// -/// - Make a copy of the FunctionCallTrie and store that against the Thread -/// ID. This will use the global allocator for the service-managed -/// FunctionCallTrie instances. -/// - Queue up a pointer to the FunctionCallTrie. -/// - If the queue is long enough (longer than some arbitrary threshold) we -/// then pre-calculate a single FunctionCallTrie for the whole process. +/// Moves the collection of FunctionCallTrie, Allocators, and Buffers associated +/// with a thread's data to the queue. This takes ownership of the memory +/// associated with a thread, and manages those exclusively. /// -/// -/// We are making a copy of the FunctionCallTrie because the intent is to have -/// this function be called at thread exit, or soon after the profiling -/// handler is finalized through the XRay APIs. By letting threads each -/// process their own thread-local FunctionCallTrie instances, we're removing -/// the need for synchronisation across threads while we're profiling. -/// However, once we're done profiling, we can then collect copies of these -/// FunctionCallTrie instances and pay the cost of the copy. -/// -/// NOTE: In the future, if this turns out to be more costly than "moving" the -/// FunctionCallTrie instances from the owning thread to the collector -/// service, then we can change the implementation to do it this way (moving) -/// instead. -void post(const FunctionCallTrie &T, tid_t TId); +void post(BufferQueue *Q, FunctionCallTrie &&T, + FunctionCallTrie::Allocators &&A, + FunctionCallTrie::Allocators::Buffers &&B, tid_t TId); /// The serialize will process all FunctionCallTrie instances in memory, and /// turn those into specifically formatted blocks, each describing the diff --git a/compiler-rt/lib/xray/xray_profiling.cc b/compiler-rt/lib/xray/xray_profiling.cc index 6db4b6ff9a0..4323170cd1b 100644 --- a/compiler-rt/lib/xray/xray_profiling.cc +++ b/compiler-rt/lib/xray/xray_profiling.cc @@ -19,6 +19,7 @@ #include "sanitizer_common/sanitizer_flags.h" #include "xray/xray_interface.h" #include "xray/xray_log_interface.h" +#include "xray_buffer_queue.h" #include "xray_flags.h" #include "xray_profile_collector.h" #include "xray_profiling_flags.h" @@ -46,6 +47,13 @@ struct ProfilingData { static pthread_key_t ProfilingKey; +// We use a global buffer queue, which gets initialized once at initialisation +// time, and gets reset when profiling is "done". +static std::aligned_storage<sizeof(BufferQueue), alignof(BufferQueue)>::type + BufferQueueStorage; +static BufferQueue *BQ = nullptr; + +thread_local FunctionCallTrie::Allocators::Buffers ThreadBuffers; thread_local std::aligned_storage<sizeof(FunctionCallTrie::Allocators), alignof(FunctionCallTrie::Allocators)>::type AllocatorsStorage; @@ -81,17 +89,58 @@ static ProfilingData *getThreadLocalData() XRAY_NEVER_INSTRUMENT { uptr Allocators = 0; if (atomic_compare_exchange_strong(&TLD.Allocators, &Allocators, 1, memory_order_acq_rel)) { - new (&AllocatorsStorage) - FunctionCallTrie::Allocators(FunctionCallTrie::InitAllocators()); + bool Success = false; + auto AllocatorsUndo = at_scope_exit([&]() XRAY_NEVER_INSTRUMENT { + if (!Success) + atomic_store(&TLD.Allocators, 0, memory_order_release); + }); + + // Acquire a set of buffers for this thread. + if (BQ == nullptr) + return nullptr; + + if (BQ->getBuffer(ThreadBuffers.NodeBuffer) != BufferQueue::ErrorCode::Ok) + return nullptr; + auto NodeBufferUndo = at_scope_exit([&]() XRAY_NEVER_INSTRUMENT { + if (!Success) + BQ->releaseBuffer(ThreadBuffers.NodeBuffer); + }); + + if (BQ->getBuffer(ThreadBuffers.RootsBuffer) != BufferQueue::ErrorCode::Ok) + return nullptr; + auto RootsBufferUndo = at_scope_exit([&]() XRAY_NEVER_INSTRUMENT { + if (!Success) + BQ->releaseBuffer(ThreadBuffers.RootsBuffer); + }); + + if (BQ->getBuffer(ThreadBuffers.ShadowStackBuffer) != + BufferQueue::ErrorCode::Ok) + return nullptr; + auto ShadowStackBufferUndo = at_scope_exit([&]() XRAY_NEVER_INSTRUMENT { + if (!Success) + BQ->releaseBuffer(ThreadBuffers.ShadowStackBuffer); + }); + + if (BQ->getBuffer(ThreadBuffers.NodeIdPairBuffer) != + BufferQueue::ErrorCode::Ok) + return nullptr; + + Success = true; + new (&AllocatorsStorage) FunctionCallTrie::Allocators( + FunctionCallTrie::InitAllocatorsFromBuffers(ThreadBuffers)); Allocators = reinterpret_cast<uptr>( reinterpret_cast<FunctionCallTrie::Allocators *>(&AllocatorsStorage)); atomic_store(&TLD.Allocators, Allocators, memory_order_release); } + if (Allocators == 1) + return nullptr; + uptr FCT = 0; if (atomic_compare_exchange_strong(&TLD.FCT, &FCT, 1, memory_order_acq_rel)) { - new (&FunctionCallTrieStorage) FunctionCallTrie( - *reinterpret_cast<FunctionCallTrie::Allocators *>(Allocators)); + new (&FunctionCallTrieStorage) + FunctionCallTrie(*reinterpret_cast<FunctionCallTrie::Allocators *>( + atomic_load_relaxed(&TLD.Allocators))); FCT = reinterpret_cast<uptr>( reinterpret_cast<FunctionCallTrie *>(&FunctionCallTrieStorage)); atomic_store(&TLD.FCT, FCT, memory_order_release); @@ -104,10 +153,6 @@ static ProfilingData *getThreadLocalData() XRAY_NEVER_INSTRUMENT { } static void cleanupTLD() XRAY_NEVER_INSTRUMENT { - RecursionGuard TLDInit(TLDInitGuard); - if (!TLDInit) - return; - auto FCT = atomic_exchange(&TLD.FCT, 0, memory_order_acq_rel); if (FCT == reinterpret_cast<uptr>(reinterpret_cast<FunctionCallTrie *>( &FunctionCallTrieStorage))) @@ -125,7 +170,7 @@ static void postCurrentThreadFCT(ProfilingData &T) XRAY_NEVER_INSTRUMENT { if (!TLDInit) return; - uptr P = atomic_load(&T.FCT, memory_order_acquire); + uptr P = atomic_exchange(&T.FCT, 0, memory_order_acq_rel); if (P != reinterpret_cast<uptr>( reinterpret_cast<FunctionCallTrie *>(&FunctionCallTrieStorage))) return; @@ -133,10 +178,21 @@ static void postCurrentThreadFCT(ProfilingData &T) XRAY_NEVER_INSTRUMENT { auto FCT = reinterpret_cast<FunctionCallTrie *>(P); DCHECK_NE(FCT, nullptr); - if (!FCT->getRoots().empty()) - profileCollectorService::post(*FCT, GetTid()); + uptr A = atomic_exchange(&T.Allocators, 0, memory_order_acq_rel); + if (A != + reinterpret_cast<uptr>( + reinterpret_cast<FunctionCallTrie::Allocators *>(&AllocatorsStorage))) + return; - cleanupTLD(); + auto Allocators = reinterpret_cast<FunctionCallTrie::Allocators *>(A); + DCHECK_NE(Allocators, nullptr); + + // Always move the data into the profile collector. + profileCollectorService::post(BQ, std::move(*FCT), std::move(*Allocators), + std::move(ThreadBuffers), GetTid()); + + // Re-initialize the ThreadBuffers object to a known "default" state. + ThreadBuffers = FunctionCallTrie::Allocators::Buffers{}; } } // namespace @@ -176,8 +232,6 @@ XRayLogFlushStatus profilingFlush() XRAY_NEVER_INSTRUMENT { return XRayLogFlushStatus::XRAY_LOG_FLUSHING; } - postCurrentThreadFCT(TLD); - // At this point, we'll create the file that will contain the profile, but // only if the options say so. if (!profilingFlags()->no_flush) { @@ -205,14 +259,11 @@ XRayLogFlushStatus profilingFlush() XRAY_NEVER_INSTRUMENT { } } - // Clean up the current thread's TLD information as well. - cleanupTLD(); - profileCollectorService::reset(); atomic_store(&ProfilerLogFlushStatus, XRayLogFlushStatus::XRAY_LOG_FLUSHED, memory_order_release); - atomic_store(&ProfilerLogStatus, XRayLogFlushStatus::XRAY_LOG_FLUSHED, + atomic_store(&ProfilerLogStatus, XRayLogInitStatus::XRAY_LOG_UNINITIALIZED, memory_order_release); return XRayLogFlushStatus::XRAY_LOG_FLUSHED; @@ -272,6 +323,12 @@ XRayLogInitStatus profilingFinalize() XRAY_NEVER_INSTRUMENT { return static_cast<XRayLogInitStatus>(CurrentStatus); } + // Mark then finalize the current generation of buffers. This allows us to let + // the threads currently holding onto new buffers still use them, but let the + // last reference do the memory cleanup. + DCHECK_NE(BQ, nullptr); + BQ->finalize(); + // Wait a grace period to allow threads to see that we're finalizing. SleepForMillis(profilingFlags()->grace_period_ms); @@ -293,8 +350,8 @@ XRayLogInitStatus profilingFinalize() XRAY_NEVER_INSTRUMENT { } XRayLogInitStatus -profilingLoggingInit(UNUSED size_t BufferSize, UNUSED size_t BufferMax, - void *Options, size_t OptionsSize) XRAY_NEVER_INSTRUMENT { +profilingLoggingInit(size_t, size_t, void *Options, + size_t OptionsSize) XRAY_NEVER_INSTRUMENT { RecursionGuard G(ReentranceGuard); if (!G) return XRayLogInitStatus::XRAY_LOG_UNINITIALIZED; @@ -302,7 +359,7 @@ profilingLoggingInit(UNUSED size_t BufferSize, UNUSED size_t BufferMax, s32 CurrentStatus = XRayLogInitStatus::XRAY_LOG_UNINITIALIZED; if (!atomic_compare_exchange_strong(&ProfilerLogStatus, &CurrentStatus, XRayLogInitStatus::XRAY_LOG_INITIALIZING, - memory_order_release)) { + memory_order_acq_rel)) { if (Verbosity()) Report("Cannot initialize already initialised profiling " "implementation.\n"); @@ -331,6 +388,41 @@ profilingLoggingInit(UNUSED size_t BufferSize, UNUSED size_t BufferMax, // We need to reset the profile data collection implementation now. profileCollectorService::reset(); + // Then also reset the buffer queue implementation. + if (BQ == nullptr) { + bool Success = false; + new (&BufferQueueStorage) + BufferQueue(profilingFlags()->per_thread_allocator_max, + profilingFlags()->buffers_max, Success); + if (!Success) { + if (Verbosity()) + Report("Failed to initialize preallocated memory buffers!"); + atomic_store(&ProfilerLogStatus, + XRayLogInitStatus::XRAY_LOG_UNINITIALIZED, + memory_order_release); + return XRayLogInitStatus::XRAY_LOG_UNINITIALIZED; + } + + // If we've succeded, set the global pointer to the initialised storage. + BQ = reinterpret_cast<BufferQueue *>(&BufferQueueStorage); + } else { + BQ->finalize(); + auto InitStatus = BQ->init(profilingFlags()->per_thread_allocator_max, + profilingFlags()->buffers_max); + + if (InitStatus != BufferQueue::ErrorCode::Ok) { + if (Verbosity()) + Report("Failed to initialize preallocated memory buffers; error: %s", + BufferQueue::getErrorString(InitStatus)); + atomic_store(&ProfilerLogStatus, + XRayLogInitStatus::XRAY_LOG_UNINITIALIZED, + memory_order_release); + return XRayLogInitStatus::XRAY_LOG_UNINITIALIZED; + } + + DCHECK(!BQ->finalizing()); + } + // We need to set up the exit handlers. static pthread_once_t Once = PTHREAD_ONCE_INIT; pthread_once( diff --git a/compiler-rt/lib/xray/xray_profiling_flags.inc b/compiler-rt/lib/xray/xray_profiling_flags.inc index e9230ae6418..ccd70860bf6 100644 --- a/compiler-rt/lib/xray/xray_profiling_flags.inc +++ b/compiler-rt/lib/xray/xray_profiling_flags.inc @@ -14,7 +14,7 @@ #error "Define XRAY_FLAG prior to including this file!" #endif -XRAY_FLAG(uptr, per_thread_allocator_max, 2 << 20, +XRAY_FLAG(uptr, per_thread_allocator_max, 16384, "Maximum size of any single per-thread allocator.") XRAY_FLAG(uptr, global_allocator_max, 2 << 24, "Maximum size of the global allocator for profile storage.") @@ -27,3 +27,6 @@ XRAY_FLAG(int, grace_period_ms, 1, XRAY_FLAG(bool, no_flush, false, "Set to true if we want the profiling implementation to not write " "out files.") +XRAY_FLAG(int, buffers_max, 128, + "The number of buffers to pre-allocate used by the profiling " + "implementation.") diff --git a/compiler-rt/lib/xray/xray_segmented_array.h b/compiler-rt/lib/xray/xray_segmented_array.h index d4feace381c..bc7e9379f63 100644 --- a/compiler-rt/lib/xray/xray_segmented_array.h +++ b/compiler-rt/lib/xray/xray_segmented_array.h @@ -372,7 +372,7 @@ public: auto Base = &Tail->Data; auto AlignedOffset = Base + (Offset * AlignedElementStorageSize); DCHECK_LE(AlignedOffset + sizeof(T), - reinterpret_cast<unsigned char *>(Tail) + SegmentSize); + reinterpret_cast<unsigned char *>(Base) + SegmentSize); // In-place construct at Position. new (AlignedOffset) T{std::forward<Args>(args)...}; |