author     Dean Michael Berris <dberris@google.com>    2018-12-07 03:19:13 +0000
committer  Dean Michael Berris <dberris@google.com>    2018-12-07 03:19:13 +0000
commit     190c49bc8fd73dab0412dce3b85f50949184d5b5
tree       48d96905bb1adae78bab4a37f0e28946d242769a /compiler-rt/lib
parent     b2a6f8e505b105c733d931e1ed0513d7ea94f6c8
Re-land "[XRay] Move-only Allocator, FunctionCallTrie, and Array"
This reverts commit r348455, with some additional changes:
- Work around a deficiency of gcc-4.8 by duplicating the implementation of
  `AppendEmplace` in `Append`; instead of using brace-init for the copy
  construction, use a placement new that explicitly calls the copy
  constructor.
llvm-svn: 348563
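
The gcc-4.8 workaround described above comes down to how an element is copy-constructed into a segment's raw storage. The following is a minimal, self-contained sketch of the two placement-new forms, not the actual XRay code; the `Entry` type and the names used here are stand-ins for `ShadowStackEntry` and friends.

#include <new>
#include <type_traits>

// Illustrative stand-in element type; only the copy into raw storage matters.
struct Entry {
  unsigned long TSC;
  unsigned long *Node;
};

int main() {
  std::aligned_storage<sizeof(Entry), alignof(Entry)>::type Storage;
  unsigned long Dummy = 0;
  Entry E{42, &Dummy};

  // Brace-init copy construction, the form AppendEmplace's forwarding
  // produces; this is what gcc-4.8 mishandled in the original patch.
  Entry *P = new (&Storage) Entry{E};
  P->~Entry();

  // Placement new that explicitly invokes the copy constructor, the form
  // the re-landed Append now uses.
  P = new (&Storage) Entry(E);
  int Ok = (P->TSC == 42 && P->Node == &Dummy);
  P->~Entry();
  return Ok ? 0 : 1;
}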
Diffstat (limited to 'compiler-rt/lib')
-rw-r--r--  compiler-rt/lib/xray/tests/unit/function_call_trie_test.cc |  30
-rw-r--r--  compiler-rt/lib/xray/tests/unit/segmented_array_test.cc    |  86
-rw-r--r--  compiler-rt/lib/xray/xray_allocator.h                      |  74
-rw-r--r--  compiler-rt/lib/xray/xray_function_call_trie.h             | 262
-rw-r--r--  compiler-rt/lib/xray/xray_profile_collector.cc             |  32
-rw-r--r--  compiler-rt/lib/xray/xray_profiling.cc                     | 293
-rw-r--r--  compiler-rt/lib/xray/xray_segmented_array.h                | 531
7 files changed, 924 insertions, 384 deletions
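
The "move-only" in the commit title refers to the ownership discipline introduced in the diff below: `Allocator`, `FunctionCallTrie::Allocators`, and `Array` delete their copy operations and implement moves that transfer the backing storage and reset the source. A reduced sketch of that discipline follows; `MoveOnlyBuffer` is illustrative only, and the real classes guard the transfer with a SpinMutex and allocate through the XRay buffer allocator rather than `new[]`.

#include <cstddef>
#include <utility>

class MoveOnlyBuffer {
  unsigned char *Store = nullptr;
  std::size_t Size = 0;

public:
  explicit MoveOnlyBuffer(std::size_t N)
      : Store(new unsigned char[N]), Size(N) {}

  // Copying is deleted: there is exactly one owner of the backing store.
  MoveOnlyBuffer(const MoveOnlyBuffer &) = delete;
  MoveOnlyBuffer &operator=(const MoveOnlyBuffer &) = delete;

  // Moving transfers the pointer and leaves the source empty, so the
  // source's destructor becomes a no-op.
  MoveOnlyBuffer(MoveOnlyBuffer &&O) noexcept : Store(O.Store), Size(O.Size) {
    O.Store = nullptr;
    O.Size = 0;
  }

  MoveOnlyBuffer &operator=(MoveOnlyBuffer &&O) noexcept {
    if (this != &O) {
      delete[] Store; // release whatever we currently own
      Store = O.Store;
      Size = O.Size;
      O.Store = nullptr;
      O.Size = 0;
    }
    return *this;
  }

  ~MoveOnlyBuffer() { delete[] Store; }

  std::size_t size() const { return Size; }
};

int main() {
  MoveOnlyBuffer A(1 << 10);
  MoveOnlyBuffer B(std::move(A)); // A now owns nothing; B owns the storage
  MoveOnlyBuffer C(64);
  C = std::move(B);               // C frees its old storage and takes over B's
  return C.size() == (1 << 10) ? 0 : 1;
}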
diff --git a/compiler-rt/lib/xray/tests/unit/function_call_trie_test.cc b/compiler-rt/lib/xray/tests/unit/function_call_trie_test.cc index 9b0f21090fb..01be691228f 100644 --- a/compiler-rt/lib/xray/tests/unit/function_call_trie_test.cc +++ b/compiler-rt/lib/xray/tests/unit/function_call_trie_test.cc @@ -309,6 +309,36 @@ TEST(FunctionCallTrieTest, MergeInto) { EXPECT_EQ(F2.Callees.size(), 0u); } +TEST(FunctionCallTrieTest, PlacementNewOnAlignedStorage) { + profilingFlags()->setDefaults(); + typename std::aligned_storage<sizeof(FunctionCallTrie::Allocators), + alignof(FunctionCallTrie::Allocators)>::type + AllocatorsStorage; + new (&AllocatorsStorage) + FunctionCallTrie::Allocators(FunctionCallTrie::InitAllocators()); + auto *A = + reinterpret_cast<FunctionCallTrie::Allocators *>(&AllocatorsStorage); + + typename std::aligned_storage<sizeof(FunctionCallTrie), + alignof(FunctionCallTrie)>::type FCTStorage; + new (&FCTStorage) FunctionCallTrie(*A); + auto *T = reinterpret_cast<FunctionCallTrie *>(&FCTStorage); + + // Put some data into it. + T->enterFunction(1, 0, 0); + T->exitFunction(1, 1, 0); + + // Re-initialize the objects in storage. + T->~FunctionCallTrie(); + A->~Allocators(); + new (A) FunctionCallTrie::Allocators(FunctionCallTrie::InitAllocators()); + new (T) FunctionCallTrie(*A); + + // Then put some data into it again. + T->enterFunction(1, 0, 0); + T->exitFunction(1, 1, 0); +} + } // namespace } // namespace __xray diff --git a/compiler-rt/lib/xray/tests/unit/segmented_array_test.cc b/compiler-rt/lib/xray/tests/unit/segmented_array_test.cc index 80991b1b97a..73120aafc8e 100644 --- a/compiler-rt/lib/xray/tests/unit/segmented_array_test.cc +++ b/compiler-rt/lib/xray/tests/unit/segmented_array_test.cc @@ -221,5 +221,91 @@ TEST(SegmentedArrayTest, SimulateStackBehaviour) { } } +TEST(SegmentedArrayTest, PlacementNewOnAlignedStorage) { + using AllocatorType = typename Array<ShadowStackEntry>::AllocatorType; + typename std::aligned_storage<sizeof(AllocatorType), + alignof(AllocatorType)>::type AllocatorStorage; + new (&AllocatorStorage) AllocatorType(1 << 10); + auto *A = reinterpret_cast<AllocatorType *>(&AllocatorStorage); + typename std::aligned_storage<sizeof(Array<ShadowStackEntry>), + alignof(Array<ShadowStackEntry>)>::type + ArrayStorage; + new (&ArrayStorage) Array<ShadowStackEntry>(*A); + auto *Data = reinterpret_cast<Array<ShadowStackEntry> *>(&ArrayStorage); + + static uint64_t Dummy = 0; + constexpr uint64_t Max = 9; + + for (uint64_t i = 0; i < Max; ++i) { + auto P = Data->Append({i, &Dummy}); + ASSERT_NE(P, nullptr); + ASSERT_EQ(P->NodePtr, &Dummy); + auto &Back = Data->back(); + ASSERT_EQ(Back.NodePtr, &Dummy); + ASSERT_EQ(Back.EntryTSC, i); + } + + // Simulate a stack by checking the data from the end as we're trimming. + auto Counter = Max; + ASSERT_EQ(Data->size(), size_t(Max)); + while (!Data->empty()) { + const auto &Top = Data->back(); + uint64_t *TopNode = Top.NodePtr; + EXPECT_EQ(TopNode, &Dummy) << "Counter = " << Counter; + Data->trim(1); + --Counter; + ASSERT_EQ(Data->size(), size_t(Counter)); + } + + // Once the stack is exhausted, we re-use the storage. + for (uint64_t i = 0; i < Max; ++i) { + auto P = Data->Append({i, &Dummy}); + ASSERT_NE(P, nullptr); + ASSERT_EQ(P->NodePtr, &Dummy); + auto &Back = Data->back(); + ASSERT_EQ(Back.NodePtr, &Dummy); + ASSERT_EQ(Back.EntryTSC, i); + } + + // We re-initialize the storage, by calling the destructor and + // placement-new'ing again. 
+ Data->~Array(); + A->~AllocatorType(); + new (A) AllocatorType(1 << 10); + new (Data) Array<ShadowStackEntry>(*A); + + // Then re-do the test. + for (uint64_t i = 0; i < Max; ++i) { + auto P = Data->Append({i, &Dummy}); + ASSERT_NE(P, nullptr); + ASSERT_EQ(P->NodePtr, &Dummy); + auto &Back = Data->back(); + ASSERT_EQ(Back.NodePtr, &Dummy); + ASSERT_EQ(Back.EntryTSC, i); + } + + // Simulate a stack by checking the data from the end as we're trimming. + Counter = Max; + ASSERT_EQ(Data->size(), size_t(Max)); + while (!Data->empty()) { + const auto &Top = Data->back(); + uint64_t *TopNode = Top.NodePtr; + EXPECT_EQ(TopNode, &Dummy) << "Counter = " << Counter; + Data->trim(1); + --Counter; + ASSERT_EQ(Data->size(), size_t(Counter)); + } + + // Once the stack is exhausted, we re-use the storage. + for (uint64_t i = 0; i < Max; ++i) { + auto P = Data->Append({i, &Dummy}); + ASSERT_NE(P, nullptr); + ASSERT_EQ(P->NodePtr, &Dummy); + auto &Back = Data->back(); + ASSERT_EQ(Back.NodePtr, &Dummy); + ASSERT_EQ(Back.EntryTSC, i); + } +} + } // namespace } // namespace __xray diff --git a/compiler-rt/lib/xray/xray_allocator.h b/compiler-rt/lib/xray/xray_allocator.h index 0abce3c6da0..2ba937b4324 100644 --- a/compiler-rt/lib/xray/xray_allocator.h +++ b/compiler-rt/lib/xray/xray_allocator.h @@ -21,8 +21,8 @@ #include "sanitizer_common/sanitizer_mutex.h" #if SANITIZER_FUCHSIA #include <zircon/process.h> -#include <zircon/syscalls.h> #include <zircon/status.h> +#include <zircon/syscalls.h> #else #include "sanitizer_common/sanitizer_posix.h" #endif @@ -50,20 +50,20 @@ template <class T> T *allocate() XRAY_NEVER_INSTRUMENT { } uintptr_t B; Status = - _zx_vmar_map(_zx_vmar_root_self(), ZX_VM_PERM_READ | ZX_VM_PERM_WRITE, 0, - Vmo, 0, sizeof(T), &B); + _zx_vmar_map(_zx_vmar_root_self(), ZX_VM_PERM_READ | ZX_VM_PERM_WRITE, 0, + Vmo, 0, sizeof(T), &B); _zx_handle_close(Vmo); if (Status != ZX_OK) { if (Verbosity()) - Report("XRay Profiling: Failed to map VMAR of size %zu: %s\n", - sizeof(T), _zx_status_get_string(Status)); + Report("XRay Profiling: Failed to map VMAR of size %zu: %s\n", sizeof(T), + _zx_status_get_string(Status)); return nullptr; } return reinterpret_cast<T *>(B); #else uptr B = internal_mmap(NULL, RoundedSize, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); - int ErrNo; + int ErrNo = 0; if (UNLIKELY(internal_iserror(B, &ErrNo))) { if (Verbosity()) Report( @@ -80,8 +80,8 @@ template <class T> void deallocate(T *B) XRAY_NEVER_INSTRUMENT { return; uptr RoundedSize = RoundUpTo(sizeof(T), GetPageSizeCached()); #if SANITIZER_FUCHSIA - _zx_vmar_unmap(_zx_vmar_root_self(), - reinterpret_cast<uintptr_t>(B), RoundedSize); + _zx_vmar_unmap(_zx_vmar_root_self(), reinterpret_cast<uintptr_t>(B), + RoundedSize); #else internal_munmap(B, RoundedSize); #endif @@ -95,25 +95,24 @@ T *allocateBuffer(size_t S) XRAY_NEVER_INSTRUMENT { zx_status_t Status = _zx_vmo_create(RoundedSize, 0, &Vmo); if (Status != ZX_OK) { if (Verbosity()) - Report("XRay Profiling: Failed to create VMO of size %zu: %s\n", - S, _zx_status_get_string(Status)); + Report("XRay Profiling: Failed to create VMO of size %zu: %s\n", S, + _zx_status_get_string(Status)); return nullptr; } uintptr_t B; - Status = - _zx_vmar_map(_zx_vmar_root_self(), ZX_VM_PERM_READ | ZX_VM_PERM_WRITE, 0, - Vmo, 0, S, &B); + Status = _zx_vmar_map(_zx_vmar_root_self(), + ZX_VM_PERM_READ | ZX_VM_PERM_WRITE, 0, Vmo, 0, S, &B); _zx_handle_close(Vmo); if (Status != ZX_OK) { if (Verbosity()) - Report("XRay Profiling: Failed to map VMAR of size %zu: %s\n", - S, 
_zx_status_get_string(Status)); + Report("XRay Profiling: Failed to map VMAR of size %zu: %s\n", S, + _zx_status_get_string(Status)); return nullptr; } #else uptr B = internal_mmap(NULL, RoundedSize, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); - int ErrNo; + int ErrNo = 0; if (UNLIKELY(internal_iserror(B, &ErrNo))) { if (Verbosity()) Report( @@ -130,7 +129,8 @@ template <class T> void deallocateBuffer(T *B, size_t S) XRAY_NEVER_INSTRUMENT { return; uptr RoundedSize = RoundUpTo(S * sizeof(T), GetPageSizeCached()); #if SANITIZER_FUCHSIA - _zx_vmar_unmap(_zx_vmar_root_self(), reinterpret_cast<uintptr_t>(B), RoundedSize); + _zx_vmar_unmap(_zx_vmar_root_self(), reinterpret_cast<uintptr_t>(B), + RoundedSize); #else internal_munmap(B, RoundedSize); #endif @@ -171,7 +171,7 @@ template <size_t N> struct Allocator { }; private: - const size_t MaxMemory{0}; + size_t MaxMemory{0}; unsigned char *BackingStore = nullptr; unsigned char *AlignedNextBlock = nullptr; size_t AllocatedBlocks = 0; @@ -223,7 +223,43 @@ private: public: explicit Allocator(size_t M) XRAY_NEVER_INSTRUMENT - : MaxMemory(RoundUpTo(M, kCacheLineSize)) {} + : MaxMemory(RoundUpTo(M, kCacheLineSize)), + BackingStore(nullptr), + AlignedNextBlock(nullptr), + AllocatedBlocks(0), + Mutex() {} + + Allocator(const Allocator &) = delete; + Allocator &operator=(const Allocator &) = delete; + + Allocator(Allocator &&O) XRAY_NEVER_INSTRUMENT { + SpinMutexLock L0(&Mutex); + SpinMutexLock L1(&O.Mutex); + MaxMemory = O.MaxMemory; + O.MaxMemory = 0; + BackingStore = O.BackingStore; + O.BackingStore = nullptr; + AlignedNextBlock = O.AlignedNextBlock; + O.AlignedNextBlock = nullptr; + AllocatedBlocks = O.AllocatedBlocks; + O.AllocatedBlocks = 0; + } + + Allocator &operator=(Allocator &&O) XRAY_NEVER_INSTRUMENT { + SpinMutexLock L0(&Mutex); + SpinMutexLock L1(&O.Mutex); + MaxMemory = O.MaxMemory; + O.MaxMemory = 0; + if (BackingStore != nullptr) + deallocateBuffer(BackingStore, MaxMemory); + BackingStore = O.BackingStore; + O.BackingStore = nullptr; + AlignedNextBlock = O.AlignedNextBlock; + O.AlignedNextBlock = nullptr; + AllocatedBlocks = O.AllocatedBlocks; + O.AllocatedBlocks = 0; + return *this; + } Block Allocate() XRAY_NEVER_INSTRUMENT { return {Alloc()}; } diff --git a/compiler-rt/lib/xray/xray_function_call_trie.h b/compiler-rt/lib/xray/xray_function_call_trie.h index 4a6a94ca9ea..d70667b5a7f 100644 --- a/compiler-rt/lib/xray/xray_function_call_trie.h +++ b/compiler-rt/lib/xray/xray_function_call_trie.h @@ -98,9 +98,6 @@ public: struct NodeIdPair { Node *NodePtr; int32_t FId; - - // Constructor for inplace-construction. - NodeIdPair(Node *N, int32_t F) : NodePtr(N), FId(F) {} }; using NodeIdPairArray = Array<NodeIdPair>; @@ -118,15 +115,6 @@ public: uint64_t CumulativeLocalTime; // Typically in TSC deltas, not wall-time. int32_t FId; - // We add a constructor here to allow us to inplace-construct through - // Array<...>'s AppendEmplace. - Node(Node *P, NodeIdPairAllocatorType &A, uint64_t CC, uint64_t CLT, - int32_t F) XRAY_NEVER_INSTRUMENT : Parent(P), - Callees(A), - CallCount(CC), - CumulativeLocalTime(CLT), - FId(F) {} - // TODO: Include the compact histogram. }; @@ -135,13 +123,6 @@ private: uint64_t EntryTSC; Node *NodePtr; uint16_t EntryCPU; - - // We add a constructor here to allow us to inplace-construct through - // Array<...>'s AppendEmplace. 
- ShadowStackEntry(uint64_t T, Node *N, uint16_t C) XRAY_NEVER_INSTRUMENT - : EntryTSC{T}, - NodePtr{N}, - EntryCPU{C} {} }; using NodeArray = Array<Node>; @@ -156,20 +137,71 @@ public: using RootAllocatorType = RootArray::AllocatorType; using ShadowStackAllocatorType = ShadowStackArray::AllocatorType; + // Use hosted aligned storage members to allow for trivial move and init. + // This also allows us to sidestep the potential-failing allocation issue. + typename std::aligned_storage<sizeof(NodeAllocatorType), + alignof(NodeAllocatorType)>::type + NodeAllocatorStorage; + typename std::aligned_storage<sizeof(RootAllocatorType), + alignof(RootAllocatorType)>::type + RootAllocatorStorage; + typename std::aligned_storage<sizeof(ShadowStackAllocatorType), + alignof(ShadowStackAllocatorType)>::type + ShadowStackAllocatorStorage; + typename std::aligned_storage<sizeof(NodeIdPairAllocatorType), + alignof(NodeIdPairAllocatorType)>::type + NodeIdPairAllocatorStorage; + NodeAllocatorType *NodeAllocator = nullptr; RootAllocatorType *RootAllocator = nullptr; ShadowStackAllocatorType *ShadowStackAllocator = nullptr; NodeIdPairAllocatorType *NodeIdPairAllocator = nullptr; - Allocators() {} + Allocators() = default; Allocators(const Allocators &) = delete; Allocators &operator=(const Allocators &) = delete; - Allocators(Allocators &&O) XRAY_NEVER_INSTRUMENT - : NodeAllocator(O.NodeAllocator), - RootAllocator(O.RootAllocator), - ShadowStackAllocator(O.ShadowStackAllocator), - NodeIdPairAllocator(O.NodeIdPairAllocator) { + explicit Allocators(uptr Max) XRAY_NEVER_INSTRUMENT { + new (&NodeAllocatorStorage) NodeAllocatorType(Max); + NodeAllocator = + reinterpret_cast<NodeAllocatorType *>(&NodeAllocatorStorage); + + new (&RootAllocatorStorage) RootAllocatorType(Max); + RootAllocator = + reinterpret_cast<RootAllocatorType *>(&RootAllocatorStorage); + + new (&ShadowStackAllocatorStorage) ShadowStackAllocatorType(Max); + ShadowStackAllocator = reinterpret_cast<ShadowStackAllocatorType *>( + &ShadowStackAllocatorStorage); + + new (&NodeIdPairAllocatorStorage) NodeIdPairAllocatorType(Max); + NodeIdPairAllocator = reinterpret_cast<NodeIdPairAllocatorType *>( + &NodeIdPairAllocatorStorage); + } + + Allocators(Allocators &&O) XRAY_NEVER_INSTRUMENT { + // Here we rely on the safety of memcpy'ing contents of the storage + // members, and then pointing the source pointers to nullptr. 
+ internal_memcpy(&NodeAllocatorStorage, &O.NodeAllocatorStorage, + sizeof(NodeAllocatorType)); + internal_memcpy(&RootAllocatorStorage, &O.RootAllocatorStorage, + sizeof(RootAllocatorType)); + internal_memcpy(&ShadowStackAllocatorStorage, + &O.ShadowStackAllocatorStorage, + sizeof(ShadowStackAllocatorType)); + internal_memcpy(&NodeIdPairAllocatorStorage, + &O.NodeIdPairAllocatorStorage, + sizeof(NodeIdPairAllocatorType)); + + NodeAllocator = + reinterpret_cast<NodeAllocatorType *>(&NodeAllocatorStorage); + RootAllocator = + reinterpret_cast<RootAllocatorType *>(&RootAllocatorStorage); + ShadowStackAllocator = reinterpret_cast<ShadowStackAllocatorType *>( + &ShadowStackAllocatorStorage); + NodeIdPairAllocator = reinterpret_cast<NodeIdPairAllocatorType *>( + &NodeIdPairAllocatorStorage); + O.NodeAllocator = nullptr; O.RootAllocator = nullptr; O.ShadowStackAllocator = nullptr; @@ -177,79 +209,77 @@ public: } Allocators &operator=(Allocators &&O) XRAY_NEVER_INSTRUMENT { - { - auto Tmp = O.NodeAllocator; - O.NodeAllocator = this->NodeAllocator; - this->NodeAllocator = Tmp; - } - { - auto Tmp = O.RootAllocator; - O.RootAllocator = this->RootAllocator; - this->RootAllocator = Tmp; - } - { - auto Tmp = O.ShadowStackAllocator; - O.ShadowStackAllocator = this->ShadowStackAllocator; - this->ShadowStackAllocator = Tmp; - } - { - auto Tmp = O.NodeIdPairAllocator; - O.NodeIdPairAllocator = this->NodeIdPairAllocator; - this->NodeIdPairAllocator = Tmp; - } - return *this; - } - - ~Allocators() XRAY_NEVER_INSTRUMENT { - // Note that we cannot use delete on these pointers, as they need to be - // returned to the sanitizer_common library's internal memory tracking - // system. - if (NodeAllocator != nullptr) { + // When moving into an existing instance, we ensure that we clean up the + // current allocators. 
+ if (NodeAllocator) NodeAllocator->~NodeAllocatorType(); - deallocate(NodeAllocator); + if (O.NodeAllocator) { + new (&NodeAllocatorStorage) + NodeAllocatorType(std::move(*O.NodeAllocator)); + NodeAllocator = + reinterpret_cast<NodeAllocatorType *>(&NodeAllocatorStorage); + O.NodeAllocator = nullptr; + } else { NodeAllocator = nullptr; } - if (RootAllocator != nullptr) { + + if (RootAllocator) RootAllocator->~RootAllocatorType(); - deallocate(RootAllocator); + if (O.RootAllocator) { + new (&RootAllocatorStorage) + RootAllocatorType(std::move(*O.RootAllocator)); + RootAllocator = + reinterpret_cast<RootAllocatorType *>(&RootAllocatorStorage); + O.RootAllocator = nullptr; + } else { RootAllocator = nullptr; } - if (ShadowStackAllocator != nullptr) { + + if (ShadowStackAllocator) ShadowStackAllocator->~ShadowStackAllocatorType(); - deallocate(ShadowStackAllocator); + if (O.ShadowStackAllocator) { + new (&ShadowStackAllocatorStorage) + ShadowStackAllocatorType(std::move(*O.ShadowStackAllocator)); + ShadowStackAllocator = reinterpret_cast<ShadowStackAllocatorType *>( + &ShadowStackAllocatorStorage); + O.ShadowStackAllocator = nullptr; + } else { ShadowStackAllocator = nullptr; } - if (NodeIdPairAllocator != nullptr) { + + if (NodeIdPairAllocator) NodeIdPairAllocator->~NodeIdPairAllocatorType(); - deallocate(NodeIdPairAllocator); + if (O.NodeIdPairAllocator) { + new (&NodeIdPairAllocatorStorage) + NodeIdPairAllocatorType(std::move(*O.NodeIdPairAllocator)); + NodeIdPairAllocator = reinterpret_cast<NodeIdPairAllocatorType *>( + &NodeIdPairAllocatorStorage); + O.NodeIdPairAllocator = nullptr; + } else { NodeIdPairAllocator = nullptr; } + + return *this; + } + + ~Allocators() XRAY_NEVER_INSTRUMENT { + if (NodeAllocator != nullptr) + NodeAllocator->~NodeAllocatorType(); + if (RootAllocator != nullptr) + RootAllocator->~RootAllocatorType(); + if (ShadowStackAllocator != nullptr) + ShadowStackAllocator->~ShadowStackAllocatorType(); + if (NodeIdPairAllocator != nullptr) + NodeIdPairAllocator->~NodeIdPairAllocatorType(); } }; - // TODO: Support configuration of options through the arguments. 
static Allocators InitAllocators() XRAY_NEVER_INSTRUMENT { return InitAllocatorsCustom(profilingFlags()->per_thread_allocator_max); } static Allocators InitAllocatorsCustom(uptr Max) XRAY_NEVER_INSTRUMENT { - Allocators A; - auto NodeAllocator = allocate<Allocators::NodeAllocatorType>(); - new (NodeAllocator) Allocators::NodeAllocatorType(Max); - A.NodeAllocator = NodeAllocator; - - auto RootAllocator = allocate<Allocators::RootAllocatorType>(); - new (RootAllocator) Allocators::RootAllocatorType(Max); - A.RootAllocator = RootAllocator; - - auto ShadowStackAllocator = - allocate<Allocators::ShadowStackAllocatorType>(); - new (ShadowStackAllocator) Allocators::ShadowStackAllocatorType(Max); - A.ShadowStackAllocator = ShadowStackAllocator; - - auto NodeIdPairAllocator = allocate<NodeIdPairAllocatorType>(); - new (NodeIdPairAllocator) NodeIdPairAllocatorType(Max); - A.NodeIdPairAllocator = NodeIdPairAllocator; + Allocators A(Max); return A; } @@ -257,14 +287,38 @@ private: NodeArray Nodes; RootArray Roots; ShadowStackArray ShadowStack; - NodeIdPairAllocatorType *NodeIdPairAllocator = nullptr; + NodeIdPairAllocatorType *NodeIdPairAllocator; + uint32_t OverflowedFunctions; public: explicit FunctionCallTrie(const Allocators &A) XRAY_NEVER_INSTRUMENT : Nodes(*A.NodeAllocator), Roots(*A.RootAllocator), ShadowStack(*A.ShadowStackAllocator), - NodeIdPairAllocator(A.NodeIdPairAllocator) {} + NodeIdPairAllocator(A.NodeIdPairAllocator), + OverflowedFunctions(0) {} + + FunctionCallTrie() = delete; + FunctionCallTrie(const FunctionCallTrie &) = delete; + FunctionCallTrie &operator=(const FunctionCallTrie &) = delete; + + FunctionCallTrie(FunctionCallTrie &&O) XRAY_NEVER_INSTRUMENT + : Nodes(std::move(O.Nodes)), + Roots(std::move(O.Roots)), + ShadowStack(std::move(O.ShadowStack)), + NodeIdPairAllocator(O.NodeIdPairAllocator), + OverflowedFunctions(O.OverflowedFunctions) {} + + FunctionCallTrie &operator=(FunctionCallTrie &&O) XRAY_NEVER_INSTRUMENT { + Nodes = std::move(O.Nodes); + Roots = std::move(O.Roots); + ShadowStack = std::move(O.ShadowStack); + NodeIdPairAllocator = O.NodeIdPairAllocator; + OverflowedFunctions = O.OverflowedFunctions; + return *this; + } + + ~FunctionCallTrie() XRAY_NEVER_INSTRUMENT {} void enterFunction(const int32_t FId, uint64_t TSC, uint16_t CPU) XRAY_NEVER_INSTRUMENT { @@ -272,12 +326,17 @@ public: // This function primarily deals with ensuring that the ShadowStack is // consistent and ready for when an exit event is encountered. if (UNLIKELY(ShadowStack.empty())) { - auto NewRoot = - Nodes.AppendEmplace(nullptr, *NodeIdPairAllocator, 0u, 0u, FId); + auto NewRoot = Nodes.AppendEmplace( + nullptr, NodeIdPairArray{*NodeIdPairAllocator}, 0u, 0u, FId); if (UNLIKELY(NewRoot == nullptr)) return; - Roots.Append(NewRoot); - ShadowStack.AppendEmplace(TSC, NewRoot, CPU); + if (Roots.Append(NewRoot) == nullptr) + return; + if (ShadowStack.AppendEmplace(TSC, NewRoot, CPU) == nullptr) { + Roots.trim(1); + ++OverflowedFunctions; + return; + } return; } @@ -291,29 +350,39 @@ public: [FId](const NodeIdPair &NR) { return NR.FId == FId; }); if (Callee != nullptr) { CHECK_NE(Callee->NodePtr, nullptr); - ShadowStack.AppendEmplace(TSC, Callee->NodePtr, CPU); + if (ShadowStack.AppendEmplace(TSC, Callee->NodePtr, CPU) == nullptr) + ++OverflowedFunctions; return; } // This means we've never seen this stack before, create a new node here. 
- auto NewNode = - Nodes.AppendEmplace(TopNode, *NodeIdPairAllocator, 0u, 0u, FId); + auto NewNode = Nodes.AppendEmplace( + TopNode, NodeIdPairArray(*NodeIdPairAllocator), 0u, 0u, FId); if (UNLIKELY(NewNode == nullptr)) return; DCHECK_NE(NewNode, nullptr); TopNode->Callees.AppendEmplace(NewNode, FId); - ShadowStack.AppendEmplace(TSC, NewNode, CPU); + if (ShadowStack.AppendEmplace(TSC, NewNode, CPU) == nullptr) + ++OverflowedFunctions; DCHECK_NE(ShadowStack.back().NodePtr, nullptr); return; } void exitFunction(int32_t FId, uint64_t TSC, uint16_t CPU) XRAY_NEVER_INSTRUMENT { + // If we're exiting functions that have "overflowed" or don't fit into the + // stack due to allocator constraints, we then decrement that count first. + if (OverflowedFunctions) { + --OverflowedFunctions; + return; + } + // When we exit a function, we look up the ShadowStack to see whether we've // entered this function before. We do as little processing here as we can, // since most of the hard work would have already been done at function // entry. uint64_t CumulativeTreeTime = 0; + while (!ShadowStack.empty()) { const auto &Top = ShadowStack.back(); auto TopNode = Top.NodePtr; @@ -380,7 +449,7 @@ public: for (const auto Root : getRoots()) { // Add a node in O for this root. auto NewRoot = O.Nodes.AppendEmplace( - nullptr, *O.NodeIdPairAllocator, Root->CallCount, + nullptr, NodeIdPairArray(*O.NodeIdPairAllocator), Root->CallCount, Root->CumulativeLocalTime, Root->FId); // Because we cannot allocate more memory we should bail out right away. @@ -399,8 +468,9 @@ public: DFSStack.trim(1); for (const auto Callee : NP.Node->Callees) { auto NewNode = O.Nodes.AppendEmplace( - NP.NewNode, *O.NodeIdPairAllocator, Callee.NodePtr->CallCount, - Callee.NodePtr->CumulativeLocalTime, Callee.FId); + NP.NewNode, NodeIdPairArray(*O.NodeIdPairAllocator), + Callee.NodePtr->CallCount, Callee.NodePtr->CumulativeLocalTime, + Callee.FId); if (UNLIKELY(NewNode == nullptr)) return; NP.NewNode->Callees.AppendEmplace(NewNode, Callee.FId); @@ -433,8 +503,9 @@ public: auto R = O.Roots.find_element( [&](const Node *Node) { return Node->FId == Root->FId; }); if (R == nullptr) { - TargetRoot = O.Nodes.AppendEmplace(nullptr, *O.NodeIdPairAllocator, 0u, - 0u, Root->FId); + TargetRoot = O.Nodes.AppendEmplace( + nullptr, NodeIdPairArray(*O.NodeIdPairAllocator), 0u, 0u, + Root->FId); if (UNLIKELY(TargetRoot == nullptr)) return; @@ -443,7 +514,7 @@ public: TargetRoot = *R; } - DFSStack.Append(NodeAndTarget{Root, TargetRoot}); + DFSStack.AppendEmplace(Root, TargetRoot); while (!DFSStack.empty()) { NodeAndTarget NT = DFSStack.back(); DCHECK_NE(NT.OrigNode, nullptr); @@ -459,7 +530,8 @@ public: }); if (TargetCallee == nullptr) { auto NewTargetNode = O.Nodes.AppendEmplace( - NT.TargetNode, *O.NodeIdPairAllocator, 0u, 0u, Callee.FId); + NT.TargetNode, NodeIdPairArray(*O.NodeIdPairAllocator), 0u, 0u, + Callee.FId); if (UNLIKELY(NewTargetNode == nullptr)) return; diff --git a/compiler-rt/lib/xray/xray_profile_collector.cc b/compiler-rt/lib/xray/xray_profile_collector.cc index a2a8f1ffe6a..2ef3ebd940c 100644 --- a/compiler-rt/lib/xray/xray_profile_collector.cc +++ b/compiler-rt/lib/xray/xray_profile_collector.cc @@ -86,7 +86,8 @@ static FunctionCallTrie::Allocators *GlobalAllocators = nullptr; void post(const FunctionCallTrie &T, tid_t TId) XRAY_NEVER_INSTRUMENT { static pthread_once_t Once = PTHREAD_ONCE_INIT; - pthread_once(&Once, +[] { reset(); }); + pthread_once( + &Once, +[]() XRAY_NEVER_INSTRUMENT { reset(); }); ThreadTrie *Item = nullptr; { @@ -95,13 
+96,14 @@ void post(const FunctionCallTrie &T, tid_t TId) XRAY_NEVER_INSTRUMENT { return; Item = ThreadTries->Append({}); + if (Item == nullptr) + return; + Item->TId = TId; auto Trie = reinterpret_cast<FunctionCallTrie *>(&Item->TrieStorage); new (Trie) FunctionCallTrie(*GlobalAllocators); + T.deepCopyInto(*Trie); } - - auto Trie = reinterpret_cast<FunctionCallTrie *>(&Item->TrieStorage); - T.deepCopyInto(*Trie); } // A PathArray represents the function id's representing a stack trace. In this @@ -115,13 +117,7 @@ struct ProfileRecord { // The Path in this record is the function id's from the leaf to the root of // the function call stack as represented from a FunctionCallTrie. PathArray Path; - const FunctionCallTrie::Node *Node = nullptr; - - // Constructor for in-place construction. - ProfileRecord(PathAllocator &A, - const FunctionCallTrie::Node *N) XRAY_NEVER_INSTRUMENT - : Path(A), - Node(N) {} + const FunctionCallTrie::Node *Node; }; namespace { @@ -142,7 +138,7 @@ populateRecords(ProfileRecordArray &PRs, ProfileRecord::PathAllocator &PA, while (!DFSStack.empty()) { auto Node = DFSStack.back(); DFSStack.trim(1); - auto Record = PRs.AppendEmplace(PA, Node); + auto Record = PRs.AppendEmplace(PathArray{PA}, Node); if (Record == nullptr) return; DCHECK_NE(Record, nullptr); @@ -203,7 +199,7 @@ void serialize() XRAY_NEVER_INSTRUMENT { // Clear out the global ProfileBuffers, if it's not empty. for (auto &B : *ProfileBuffers) - deallocateBuffer(reinterpret_cast<uint8_t *>(B.Data), B.Size); + deallocateBuffer(reinterpret_cast<unsigned char *>(B.Data), B.Size); ProfileBuffers->trim(ProfileBuffers->size()); if (ThreadTries->empty()) @@ -278,8 +274,8 @@ void reset() XRAY_NEVER_INSTRUMENT { GlobalAllocators = reinterpret_cast<FunctionCallTrie::Allocators *>(&AllocatorStorage); - new (GlobalAllocators) FunctionCallTrie::Allocators(); - *GlobalAllocators = FunctionCallTrie::InitAllocators(); + new (GlobalAllocators) + FunctionCallTrie::Allocators(FunctionCallTrie::InitAllocators()); if (ThreadTriesAllocator != nullptr) ThreadTriesAllocator->~ThreadTriesArrayAllocator(); @@ -312,8 +308,10 @@ XRayBuffer nextBuffer(XRayBuffer B) XRAY_NEVER_INSTRUMENT { static pthread_once_t Once = PTHREAD_ONCE_INIT; static typename std::aligned_storage<sizeof(XRayProfilingFileHeader)>::type FileHeaderStorage; - pthread_once(&Once, - +[] { new (&FileHeaderStorage) XRayProfilingFileHeader{}; }); + pthread_once( + &Once, +[]() XRAY_NEVER_INSTRUMENT { + new (&FileHeaderStorage) XRayProfilingFileHeader{}; + }); if (UNLIKELY(B.Data == nullptr)) { // The first buffer should always contain the file header information. 
diff --git a/compiler-rt/lib/xray/xray_profiling.cc b/compiler-rt/lib/xray/xray_profiling.cc index 4b9222a13f6..6db4b6ff9a0 100644 --- a/compiler-rt/lib/xray/xray_profiling.cc +++ b/compiler-rt/lib/xray/xray_profiling.cc @@ -31,67 +31,112 @@ namespace __xray { namespace { -atomic_sint32_t ProfilerLogFlushStatus = { +static atomic_sint32_t ProfilerLogFlushStatus = { XRayLogFlushStatus::XRAY_LOG_NOT_FLUSHING}; -atomic_sint32_t ProfilerLogStatus = {XRayLogInitStatus::XRAY_LOG_UNINITIALIZED}; +static atomic_sint32_t ProfilerLogStatus = { + XRayLogInitStatus::XRAY_LOG_UNINITIALIZED}; -SpinMutex ProfilerOptionsMutex; +static SpinMutex ProfilerOptionsMutex; -struct alignas(64) ProfilingData { - FunctionCallTrie::Allocators *Allocators; - FunctionCallTrie *FCT; +struct ProfilingData { + atomic_uintptr_t Allocators; + atomic_uintptr_t FCT; }; static pthread_key_t ProfilingKey; -thread_local std::aligned_storage<sizeof(FunctionCallTrie::Allocators)>::type +thread_local std::aligned_storage<sizeof(FunctionCallTrie::Allocators), + alignof(FunctionCallTrie::Allocators)>::type AllocatorsStorage; -thread_local std::aligned_storage<sizeof(FunctionCallTrie)>::type +thread_local std::aligned_storage<sizeof(FunctionCallTrie), + alignof(FunctionCallTrie)>::type FunctionCallTrieStorage; -thread_local std::aligned_storage<sizeof(ProfilingData)>::type ThreadStorage{}; - -static ProfilingData &getThreadLocalData() XRAY_NEVER_INSTRUMENT { - thread_local auto ThreadOnce = [] { - new (&ThreadStorage) ProfilingData{}; - auto *Allocators = - reinterpret_cast<FunctionCallTrie::Allocators *>(&AllocatorsStorage); - new (Allocators) FunctionCallTrie::Allocators(); - *Allocators = FunctionCallTrie::InitAllocators(); - auto *FCT = reinterpret_cast<FunctionCallTrie *>(&FunctionCallTrieStorage); - new (FCT) FunctionCallTrie(*Allocators); - auto &TLD = *reinterpret_cast<ProfilingData *>(&ThreadStorage); - TLD.Allocators = Allocators; - TLD.FCT = FCT; - pthread_setspecific(ProfilingKey, &ThreadStorage); +thread_local ProfilingData TLD{{0}, {0}}; +thread_local atomic_uint8_t ReentranceGuard{0}; + +// We use a separate guard for ensuring that for this thread, if we're already +// cleaning up, that any signal handlers don't attempt to cleanup nor +// initialise. +thread_local atomic_uint8_t TLDInitGuard{0}; + +// We also use a separate latch to signal that the thread is exiting, and +// non-essential work should be ignored (things like recording events, etc.). 
+thread_local atomic_uint8_t ThreadExitingLatch{0}; + +static ProfilingData *getThreadLocalData() XRAY_NEVER_INSTRUMENT { + thread_local auto ThreadOnce = []() XRAY_NEVER_INSTRUMENT { + pthread_setspecific(ProfilingKey, &TLD); return false; }(); (void)ThreadOnce; - auto &TLD = *reinterpret_cast<ProfilingData *>(&ThreadStorage); - - if (UNLIKELY(TLD.Allocators == nullptr || TLD.FCT == nullptr)) { - auto *Allocators = - reinterpret_cast<FunctionCallTrie::Allocators *>(&AllocatorsStorage); - new (Allocators) FunctionCallTrie::Allocators(); - *Allocators = FunctionCallTrie::InitAllocators(); - auto *FCT = reinterpret_cast<FunctionCallTrie *>(&FunctionCallTrieStorage); - new (FCT) FunctionCallTrie(*Allocators); - TLD.Allocators = Allocators; - TLD.FCT = FCT; + RecursionGuard TLDInit(TLDInitGuard); + if (!TLDInit) + return nullptr; + + if (atomic_load_relaxed(&ThreadExitingLatch)) + return nullptr; + + uptr Allocators = 0; + if (atomic_compare_exchange_strong(&TLD.Allocators, &Allocators, 1, + memory_order_acq_rel)) { + new (&AllocatorsStorage) + FunctionCallTrie::Allocators(FunctionCallTrie::InitAllocators()); + Allocators = reinterpret_cast<uptr>( + reinterpret_cast<FunctionCallTrie::Allocators *>(&AllocatorsStorage)); + atomic_store(&TLD.Allocators, Allocators, memory_order_release); } - return *reinterpret_cast<ProfilingData *>(&ThreadStorage); + uptr FCT = 0; + if (atomic_compare_exchange_strong(&TLD.FCT, &FCT, 1, memory_order_acq_rel)) { + new (&FunctionCallTrieStorage) FunctionCallTrie( + *reinterpret_cast<FunctionCallTrie::Allocators *>(Allocators)); + FCT = reinterpret_cast<uptr>( + reinterpret_cast<FunctionCallTrie *>(&FunctionCallTrieStorage)); + atomic_store(&TLD.FCT, FCT, memory_order_release); + } + + if (FCT == 1) + return nullptr; + + return &TLD; } static void cleanupTLD() XRAY_NEVER_INSTRUMENT { - auto &TLD = *reinterpret_cast<ProfilingData *>(&ThreadStorage); - if (TLD.Allocators != nullptr && TLD.FCT != nullptr) { - TLD.FCT->~FunctionCallTrie(); - TLD.Allocators->~Allocators(); - TLD.FCT = nullptr; - TLD.Allocators = nullptr; - } + RecursionGuard TLDInit(TLDInitGuard); + if (!TLDInit) + return; + + auto FCT = atomic_exchange(&TLD.FCT, 0, memory_order_acq_rel); + if (FCT == reinterpret_cast<uptr>(reinterpret_cast<FunctionCallTrie *>( + &FunctionCallTrieStorage))) + reinterpret_cast<FunctionCallTrie *>(FCT)->~FunctionCallTrie(); + + auto Allocators = atomic_exchange(&TLD.Allocators, 0, memory_order_acq_rel); + if (Allocators == + reinterpret_cast<uptr>( + reinterpret_cast<FunctionCallTrie::Allocators *>(&AllocatorsStorage))) + reinterpret_cast<FunctionCallTrie::Allocators *>(Allocators)->~Allocators(); +} + +static void postCurrentThreadFCT(ProfilingData &T) XRAY_NEVER_INSTRUMENT { + RecursionGuard TLDInit(TLDInitGuard); + if (!TLDInit) + return; + + uptr P = atomic_load(&T.FCT, memory_order_acquire); + if (P != reinterpret_cast<uptr>( + reinterpret_cast<FunctionCallTrie *>(&FunctionCallTrieStorage))) + return; + + auto FCT = reinterpret_cast<FunctionCallTrie *>(P); + DCHECK_NE(FCT, nullptr); + + if (!FCT->getRoots().empty()) + profileCollectorService::post(*FCT, GetTid()); + + cleanupTLD(); } } // namespace @@ -104,9 +149,6 @@ const char *profilingCompilerDefinedFlags() XRAY_NEVER_INSTRUMENT { #endif } -atomic_sint32_t ProfileFlushStatus = { - XRayLogFlushStatus::XRAY_LOG_NOT_FLUSHING}; - XRayLogFlushStatus profilingFlush() XRAY_NEVER_INSTRUMENT { if (atomic_load(&ProfilerLogStatus, memory_order_acquire) != XRayLogInitStatus::XRAY_LOG_FINALIZED) { @@ -115,14 +157,27 @@ 
XRayLogFlushStatus profilingFlush() XRAY_NEVER_INSTRUMENT { return XRayLogFlushStatus::XRAY_LOG_NOT_FLUSHING; } - s32 Result = XRayLogFlushStatus::XRAY_LOG_NOT_FLUSHING; - if (!atomic_compare_exchange_strong(&ProfilerLogFlushStatus, &Result, - XRayLogFlushStatus::XRAY_LOG_FLUSHING, - memory_order_acq_rel)) { + RecursionGuard SignalGuard(ReentranceGuard); + if (!SignalGuard) { if (Verbosity()) - Report("Not flushing profiles, implementation still finalizing.\n"); + Report("Cannot finalize properly inside a signal handler!\n"); + atomic_store(&ProfilerLogFlushStatus, + XRayLogFlushStatus::XRAY_LOG_NOT_FLUSHING, + memory_order_release); + return XRayLogFlushStatus::XRAY_LOG_NOT_FLUSHING; } + s32 Previous = atomic_exchange(&ProfilerLogFlushStatus, + XRayLogFlushStatus::XRAY_LOG_FLUSHING, + memory_order_acq_rel); + if (Previous == XRayLogFlushStatus::XRAY_LOG_FLUSHING) { + if (Verbosity()) + Report("Not flushing profiles, implementation still flushing.\n"); + return XRayLogFlushStatus::XRAY_LOG_FLUSHING; + } + + postCurrentThreadFCT(TLD); + // At this point, we'll create the file that will contain the profile, but // only if the options say so. if (!profilingFlags()->no_flush) { @@ -150,33 +205,19 @@ XRayLogFlushStatus profilingFlush() XRAY_NEVER_INSTRUMENT { } } - profileCollectorService::reset(); - - // Flush the current thread's local data structures as well. + // Clean up the current thread's TLD information as well. cleanupTLD(); + profileCollectorService::reset(); + + atomic_store(&ProfilerLogFlushStatus, XRayLogFlushStatus::XRAY_LOG_FLUSHED, + memory_order_release); atomic_store(&ProfilerLogStatus, XRayLogFlushStatus::XRAY_LOG_FLUSHED, memory_order_release); return XRayLogFlushStatus::XRAY_LOG_FLUSHED; } -namespace { - -thread_local atomic_uint8_t ReentranceGuard{0}; - -static void postCurrentThreadFCT(ProfilingData &TLD) XRAY_NEVER_INSTRUMENT { - if (TLD.Allocators == nullptr || TLD.FCT == nullptr) - return; - - if (!TLD.FCT->getRoots().empty()) - profileCollectorService::post(*TLD.FCT, GetTid()); - - cleanupTLD(); -} - -} // namespace - void profilingHandleArg0(int32_t FuncId, XRayEntryType Entry) XRAY_NEVER_INSTRUMENT { unsigned char CPU; @@ -186,22 +227,29 @@ void profilingHandleArg0(int32_t FuncId, return; auto Status = atomic_load(&ProfilerLogStatus, memory_order_acquire); + if (UNLIKELY(Status == XRayLogInitStatus::XRAY_LOG_UNINITIALIZED || + Status == XRayLogInitStatus::XRAY_LOG_INITIALIZING)) + return; + if (UNLIKELY(Status == XRayLogInitStatus::XRAY_LOG_FINALIZED || Status == XRayLogInitStatus::XRAY_LOG_FINALIZING)) { - auto &TLD = getThreadLocalData(); postCurrentThreadFCT(TLD); return; } - auto &TLD = getThreadLocalData(); + auto T = getThreadLocalData(); + if (T == nullptr) + return; + + auto FCT = reinterpret_cast<FunctionCallTrie *>(atomic_load_relaxed(&T->FCT)); switch (Entry) { case XRayEntryType::ENTRY: case XRayEntryType::LOG_ARGS_ENTRY: - TLD.FCT->enterFunction(FuncId, TSC, CPU); + FCT->enterFunction(FuncId, TSC, CPU); break; case XRayEntryType::EXIT: case XRayEntryType::TAIL: - TLD.FCT->exitFunction(FuncId, TSC, CPU); + FCT->exitFunction(FuncId, TSC, CPU); break; default: // FIXME: Handle bugs. @@ -227,15 +275,14 @@ XRayLogInitStatus profilingFinalize() XRAY_NEVER_INSTRUMENT { // Wait a grace period to allow threads to see that we're finalizing. SleepForMillis(profilingFlags()->grace_period_ms); - // We also want to make sure that the current thread's data is cleaned up, if - // we have any. 
We need to ensure that the call to postCurrentThreadFCT() is - // guarded by our recursion guard. - auto &TLD = getThreadLocalData(); - { - RecursionGuard G(ReentranceGuard); - if (G) - postCurrentThreadFCT(TLD); - } + // If we for some reason are entering this function from an instrumented + // handler, we bail out. + RecursionGuard G(ReentranceGuard); + if (!G) + return static_cast<XRayLogInitStatus>(CurrentStatus); + + // Post the current thread's data if we have any. + postCurrentThreadFCT(TLD); // Then we force serialize the log data. profileCollectorService::serialize(); @@ -248,6 +295,10 @@ XRayLogInitStatus profilingFinalize() XRAY_NEVER_INSTRUMENT { XRayLogInitStatus profilingLoggingInit(UNUSED size_t BufferSize, UNUSED size_t BufferMax, void *Options, size_t OptionsSize) XRAY_NEVER_INSTRUMENT { + RecursionGuard G(ReentranceGuard); + if (!G) + return XRayLogInitStatus::XRAY_LOG_UNINITIALIZED; + s32 CurrentStatus = XRayLogInitStatus::XRAY_LOG_UNINITIALIZED; if (!atomic_compare_exchange_strong(&ProfilerLogStatus, &CurrentStatus, XRayLogInitStatus::XRAY_LOG_INITIALIZING, @@ -282,39 +333,51 @@ profilingLoggingInit(UNUSED size_t BufferSize, UNUSED size_t BufferMax, // We need to set up the exit handlers. static pthread_once_t Once = PTHREAD_ONCE_INIT; - pthread_once(&Once, +[] { - pthread_key_create(&ProfilingKey, +[](void *P) { - // This is the thread-exit handler. - auto &TLD = *reinterpret_cast<ProfilingData *>(P); - if (TLD.Allocators == nullptr && TLD.FCT == nullptr) - return; - - { - // If we're somehow executing this while inside a non-reentrant-friendly - // context, we skip attempting to post the current thread's data. - RecursionGuard G(ReentranceGuard); - if (G) - postCurrentThreadFCT(TLD); - } - }); - - // We also need to set up an exit handler, so that we can get the profile - // information at exit time. We use the C API to do this, to not rely on C++ - // ABI functions for registering exit handlers. - Atexit(+[] { - // Finalize and flush. - if (profilingFinalize() != XRAY_LOG_FINALIZED) { - cleanupTLD(); - return; - } - if (profilingFlush() != XRAY_LOG_FLUSHED) { - cleanupTLD(); - return; - } - if (Verbosity()) - Report("XRay Profile flushed at exit."); - }); - }); + pthread_once( + &Once, +[] { + pthread_key_create( + &ProfilingKey, +[](void *P) XRAY_NEVER_INSTRUMENT { + if (atomic_exchange(&ThreadExitingLatch, 1, memory_order_acq_rel)) + return; + + if (P == nullptr) + return; + + auto T = reinterpret_cast<ProfilingData *>(P); + if (atomic_load_relaxed(&T->Allocators) == 0) + return; + + { + // If we're somehow executing this while inside a + // non-reentrant-friendly context, we skip attempting to post + // the current thread's data. + RecursionGuard G(ReentranceGuard); + if (!G) + return; + + postCurrentThreadFCT(*T); + } + }); + + // We also need to set up an exit handler, so that we can get the + // profile information at exit time. We use the C API to do this, to not + // rely on C++ ABI functions for registering exit handlers. + Atexit(+[]() XRAY_NEVER_INSTRUMENT { + if (atomic_exchange(&ThreadExitingLatch, 1, memory_order_acq_rel)) + return; + + auto Cleanup = + at_scope_exit([]() XRAY_NEVER_INSTRUMENT { cleanupTLD(); }); + + // Finalize and flush. 
+ if (profilingFinalize() != XRAY_LOG_FINALIZED || + profilingFlush() != XRAY_LOG_FLUSHED) + return; + + if (Verbosity()) + Report("XRay Profile flushed at exit."); + }); + }); __xray_log_set_buffer_iterator(profileCollectorService::nextBuffer); __xray_set_handler(profilingHandleArg0); diff --git a/compiler-rt/lib/xray/xray_segmented_array.h b/compiler-rt/lib/xray/xray_segmented_array.h index 42f53be1e01..d4feace381c 100644 --- a/compiler-rt/lib/xray/xray_segmented_array.h +++ b/compiler-rt/lib/xray/xray_segmented_array.h @@ -32,14 +32,9 @@ namespace __xray { /// is destroyed. When an Array is destroyed, it will destroy elements in the /// backing store but will not free the memory. template <class T> class Array { - struct SegmentBase { - SegmentBase *Prev; - SegmentBase *Next; - }; - - // We want each segment of the array to be cache-line aligned, and elements of - // the array be offset from the beginning of the segment. - struct Segment : SegmentBase { + struct Segment { + Segment *Prev; + Segment *Next; char Data[1]; }; @@ -62,91 +57,35 @@ public: // kCacheLineSize-multiple segments, minus the size of two pointers. // // - Request cacheline-multiple sized elements from the allocator. - static constexpr size_t AlignedElementStorageSize = + static constexpr uint64_t AlignedElementStorageSize = sizeof(typename std::aligned_storage<sizeof(T), alignof(T)>::type); - static constexpr size_t SegmentSize = - nearest_boundary(sizeof(Segment) + next_pow2(sizeof(T)), kCacheLineSize); + static constexpr uint64_t SegmentControlBlockSize = sizeof(Segment *) * 2; + + static constexpr uint64_t SegmentSize = nearest_boundary( + SegmentControlBlockSize + next_pow2(sizeof(T)), kCacheLineSize); using AllocatorType = Allocator<SegmentSize>; - static constexpr size_t ElementsPerSegment = - (SegmentSize - sizeof(Segment)) / next_pow2(sizeof(T)); + static constexpr uint64_t ElementsPerSegment = + (SegmentSize - SegmentControlBlockSize) / next_pow2(sizeof(T)); static_assert(ElementsPerSegment > 0, "Must have at least 1 element per segment."); - static SegmentBase SentinelSegment; + static Segment SentinelSegment; - using size_type = size_t; + using size_type = uint64_t; private: - AllocatorType *Alloc; - SegmentBase *Head = &SentinelSegment; - SegmentBase *Tail = &SentinelSegment; - size_t Size = 0; - - // Here we keep track of segments in the freelist, to allow us to re-use - // segments when elements are trimmed off the end. - SegmentBase *Freelist = &SentinelSegment; - - Segment *NewSegment() XRAY_NEVER_INSTRUMENT { - // We need to handle the case in which enough elements have been trimmed to - // allow us to re-use segments we've allocated before. For this we look into - // the Freelist, to see whether we need to actually allocate new blocks or - // just re-use blocks we've already seen before. - if (Freelist != &SentinelSegment) { - auto *FreeSegment = Freelist; - Freelist = FreeSegment->Next; - FreeSegment->Next = &SentinelSegment; - Freelist->Prev = &SentinelSegment; - return static_cast<Segment *>(FreeSegment); - } - - auto SegmentBlock = Alloc->Allocate(); - if (SegmentBlock.Data == nullptr) - return nullptr; - - // Placement-new the Segment element at the beginning of the SegmentBlock. 
- auto S = reinterpret_cast<Segment *>(SegmentBlock.Data); - new (S) SegmentBase{&SentinelSegment, &SentinelSegment}; - return S; - } - - Segment *InitHeadAndTail() XRAY_NEVER_INSTRUMENT { - DCHECK_EQ(Head, &SentinelSegment); - DCHECK_EQ(Tail, &SentinelSegment); - auto Segment = NewSegment(); - if (Segment == nullptr) - return nullptr; - DCHECK_EQ(Segment->Next, &SentinelSegment); - DCHECK_EQ(Segment->Prev, &SentinelSegment); - Head = Tail = static_cast<SegmentBase *>(Segment); - return Segment; - } - - Segment *AppendNewSegment() XRAY_NEVER_INSTRUMENT { - auto S = NewSegment(); - if (S == nullptr) - return nullptr; - DCHECK_NE(Tail, &SentinelSegment); - DCHECK_EQ(Tail->Next, &SentinelSegment); - DCHECK_EQ(S->Prev, &SentinelSegment); - DCHECK_EQ(S->Next, &SentinelSegment); - Tail->Next = S; - S->Prev = Tail; - Tail = S; - return static_cast<Segment *>(Tail); - } - // This Iterator models a BidirectionalIterator. template <class U> class Iterator { - SegmentBase *S = &SentinelSegment; - size_t Offset = 0; - size_t Size = 0; + Segment *S = &SentinelSegment; + uint64_t Offset = 0; + uint64_t Size = 0; public: - Iterator(SegmentBase *IS, size_t Off, size_t S) XRAY_NEVER_INSTRUMENT + Iterator(Segment *IS, uint64_t Off, uint64_t S) XRAY_NEVER_INSTRUMENT : S(IS), Offset(Off), Size(S) {} @@ -215,7 +154,7 @@ private: // We need to compute the character-aligned pointer, offset from the // segment's Data location to get the element in the position of Offset. - auto Base = static_cast<Segment *>(S)->Data; + auto Base = &S->Data; auto AlignedOffset = Base + (RelOff * AlignedElementStorageSize); return *reinterpret_cast<U *>(AlignedOffset); } @@ -223,17 +162,183 @@ private: U *operator->() const XRAY_NEVER_INSTRUMENT { return &(**this); } }; + AllocatorType *Alloc; + Segment *Head; + Segment *Tail; + + // Here we keep track of segments in the freelist, to allow us to re-use + // segments when elements are trimmed off the end. + Segment *Freelist; + uint64_t Size; + + // =============================== + // In the following implementation, we work through the algorithms and the + // list operations using the following notation: + // + // - pred(s) is the predecessor (previous node accessor) and succ(s) is + // the successor (next node accessor). + // + // - S is a sentinel segment, which has the following property: + // + // pred(S) == succ(S) == S + // + // - @ is a loop operator, which can imply pred(s) == s if it appears on + // the left of s, or succ(s) == S if it appears on the right of s. + // + // - sL <-> sR : means a bidirectional relation between sL and sR, which + // means: + // + // succ(sL) == sR && pred(SR) == sL + // + // - sL -> sR : implies a unidirectional relation between sL and SR, + // with the following properties: + // + // succ(sL) == sR + // + // sL <- sR : implies a unidirectional relation between sR and sL, + // with the following properties: + // + // pred(sR) == sL + // + // =============================== + + Segment *NewSegment() XRAY_NEVER_INSTRUMENT { + // We need to handle the case in which enough elements have been trimmed to + // allow us to re-use segments we've allocated before. For this we look into + // the Freelist, to see whether we need to actually allocate new blocks or + // just re-use blocks we've already seen before. 
+ if (Freelist != &SentinelSegment) { + // The current state of lists resemble something like this at this point: + // + // Freelist: @S@<-f0->...<->fN->@S@ + // ^ Freelist + // + // We want to perform a splice of `f0` from Freelist to a temporary list, + // which looks like: + // + // Templist: @S@<-f0->@S@ + // ^ FreeSegment + // + // Our algorithm preconditions are: + DCHECK_EQ(Freelist->Prev, &SentinelSegment); + + // Then the algorithm we implement is: + // + // SFS = Freelist + // Freelist = succ(Freelist) + // if (Freelist != S) + // pred(Freelist) = S + // succ(SFS) = S + // pred(SFS) = S + // + auto *FreeSegment = Freelist; + Freelist = Freelist->Next; + + // Note that we need to handle the case where Freelist is now pointing to + // S, which we don't want to be overwriting. + // TODO: Determine whether the cost of the branch is higher than the cost + // of the blind assignment. + if (Freelist != &SentinelSegment) + Freelist->Prev = &SentinelSegment; + + FreeSegment->Next = &SentinelSegment; + FreeSegment->Prev = &SentinelSegment; + + // Our postconditions are: + DCHECK_EQ(Freelist->Prev, &SentinelSegment); + DCHECK_NE(FreeSegment, &SentinelSegment); + return FreeSegment; + } + + auto SegmentBlock = Alloc->Allocate(); + if (SegmentBlock.Data == nullptr) + return nullptr; + + // Placement-new the Segment element at the beginning of the SegmentBlock. + new (SegmentBlock.Data) Segment{&SentinelSegment, &SentinelSegment, {0}}; + auto SB = reinterpret_cast<Segment *>(SegmentBlock.Data); + return SB; + } + + Segment *InitHeadAndTail() XRAY_NEVER_INSTRUMENT { + DCHECK_EQ(Head, &SentinelSegment); + DCHECK_EQ(Tail, &SentinelSegment); + auto S = NewSegment(); + if (S == nullptr) + return nullptr; + DCHECK_EQ(S->Next, &SentinelSegment); + DCHECK_EQ(S->Prev, &SentinelSegment); + DCHECK_NE(S, &SentinelSegment); + Head = S; + Tail = S; + DCHECK_EQ(Head, Tail); + DCHECK_EQ(Tail->Next, &SentinelSegment); + DCHECK_EQ(Tail->Prev, &SentinelSegment); + return S; + } + + Segment *AppendNewSegment() XRAY_NEVER_INSTRUMENT { + auto S = NewSegment(); + if (S == nullptr) + return nullptr; + DCHECK_NE(Tail, &SentinelSegment); + DCHECK_EQ(Tail->Next, &SentinelSegment); + DCHECK_EQ(S->Prev, &SentinelSegment); + DCHECK_EQ(S->Next, &SentinelSegment); + S->Prev = Tail; + Tail->Next = S; + Tail = S; + DCHECK_EQ(S, S->Prev->Next); + DCHECK_EQ(Tail->Next, &SentinelSegment); + return S; + } + public: - explicit Array(AllocatorType &A) XRAY_NEVER_INSTRUMENT : Alloc(&A) {} + explicit Array(AllocatorType &A) XRAY_NEVER_INSTRUMENT + : Alloc(&A), + Head(&SentinelSegment), + Tail(&SentinelSegment), + Freelist(&SentinelSegment), + Size(0) {} + + Array() XRAY_NEVER_INSTRUMENT : Alloc(nullptr), + Head(&SentinelSegment), + Tail(&SentinelSegment), + Freelist(&SentinelSegment), + Size(0) {} Array(const Array &) = delete; - Array(Array &&O) NOEXCEPT : Alloc(O.Alloc), - Head(O.Head), - Tail(O.Tail), - Size(O.Size) { + Array &operator=(const Array &) = delete; + + Array(Array &&O) XRAY_NEVER_INSTRUMENT : Alloc(O.Alloc), + Head(O.Head), + Tail(O.Tail), + Freelist(O.Freelist), + Size(O.Size) { + O.Alloc = nullptr; + O.Head = &SentinelSegment; + O.Tail = &SentinelSegment; + O.Size = 0; + O.Freelist = &SentinelSegment; + } + + Array &operator=(Array &&O) XRAY_NEVER_INSTRUMENT { + Alloc = O.Alloc; + O.Alloc = nullptr; + Head = O.Head; O.Head = &SentinelSegment; + Tail = O.Tail; O.Tail = &SentinelSegment; + Freelist = O.Freelist; + O.Freelist = &SentinelSegment; + Size = O.Size; O.Size = 0; + return *this; + } + + ~Array() 
XRAY_NEVER_INSTRUMENT { + for (auto &E : *this) + (&E)->~T(); } bool empty() const XRAY_NEVER_INSTRUMENT { return Size == 0; } @@ -243,52 +348,71 @@ public: return *Alloc; } - size_t size() const XRAY_NEVER_INSTRUMENT { return Size; } + uint64_t size() const XRAY_NEVER_INSTRUMENT { return Size; } - T *Append(const T &E) XRAY_NEVER_INSTRUMENT { - if (UNLIKELY(Head == &SentinelSegment)) - if (InitHeadAndTail() == nullptr) + template <class... Args> + T *AppendEmplace(Args &&... args) XRAY_NEVER_INSTRUMENT { + DCHECK((Size == 0 && Head == &SentinelSegment && Head == Tail) || + (Size != 0 && Head != &SentinelSegment && Tail != &SentinelSegment)); + if (UNLIKELY(Head == &SentinelSegment)) { + auto R = InitHeadAndTail(); + if (R == nullptr) return nullptr; + } + + DCHECK_NE(Head, &SentinelSegment); + DCHECK_NE(Tail, &SentinelSegment); auto Offset = Size % ElementsPerSegment; if (UNLIKELY(Size != 0 && Offset == 0)) if (AppendNewSegment() == nullptr) return nullptr; - auto Base = static_cast<Segment *>(Tail)->Data; + DCHECK_NE(Tail, &SentinelSegment); + auto Base = &Tail->Data; auto AlignedOffset = Base + (Offset * AlignedElementStorageSize); - auto Position = reinterpret_cast<T *>(AlignedOffset); - *Position = E; + DCHECK_LE(AlignedOffset + sizeof(T), + reinterpret_cast<unsigned char *>(Tail) + SegmentSize); + + // In-place construct at Position. + new (AlignedOffset) T{std::forward<Args>(args)...}; ++Size; - return Position; + return reinterpret_cast<T *>(AlignedOffset); } - template <class... Args> - T *AppendEmplace(Args &&... args) XRAY_NEVER_INSTRUMENT { - if (UNLIKELY(Head == &SentinelSegment)) - if (InitHeadAndTail() == nullptr) + T *Append(const T &E) XRAY_NEVER_INSTRUMENT { + // FIXME: This is a duplication of AppenEmplace with the copy semantics + // explicitly used, as a work-around to GCC 4.8 not invoking the copy + // constructor with the placement new with braced-init syntax. + DCHECK((Size == 0 && Head == &SentinelSegment && Head == Tail) || + (Size != 0 && Head != &SentinelSegment && Tail != &SentinelSegment)); + if (UNLIKELY(Head == &SentinelSegment)) { + auto R = InitHeadAndTail(); + if (R == nullptr) return nullptr; + } + + DCHECK_NE(Head, &SentinelSegment); + DCHECK_NE(Tail, &SentinelSegment); auto Offset = Size % ElementsPerSegment; - auto *LatestSegment = Tail; - if (UNLIKELY(Size != 0 && Offset == 0)) { - LatestSegment = AppendNewSegment(); - if (LatestSegment == nullptr) + if (UNLIKELY(Size != 0 && Offset == 0)) + if (AppendNewSegment() == nullptr) return nullptr; - } DCHECK_NE(Tail, &SentinelSegment); - auto Base = static_cast<Segment *>(LatestSegment)->Data; + auto Base = &Tail->Data; auto AlignedOffset = Base + (Offset * AlignedElementStorageSize); - auto Position = reinterpret_cast<T *>(AlignedOffset); + DCHECK_LE(AlignedOffset + sizeof(T), + reinterpret_cast<unsigned char *>(Tail) + SegmentSize); // In-place construct at Position. - new (Position) T{std::forward<Args>(args)...}; + new (AlignedOffset) T(E); ++Size; - return reinterpret_cast<T *>(Position); + return reinterpret_cast<T *>(AlignedOffset); } - T &operator[](size_t Offset) const XRAY_NEVER_INSTRUMENT { + T &operator[](uint64_t Offset) const XRAY_NEVER_INSTRUMENT { DCHECK_LE(Offset, Size); // We need to traverse the array enough times to find the element at Offset. 
auto S = Head; @@ -297,7 +421,7 @@ public: Offset -= ElementsPerSegment; DCHECK_NE(S, &SentinelSegment); } - auto Base = static_cast<Segment *>(S)->Data; + auto Base = &S->Data; auto AlignedOffset = Base + (Offset * AlignedElementStorageSize); auto Position = reinterpret_cast<T *>(AlignedOffset); return *reinterpret_cast<T *>(Position); @@ -332,41 +456,172 @@ public: /// Remove N Elements from the end. This leaves the blocks behind, and not /// require allocation of new blocks for new elements added after trimming. - void trim(size_t Elements) XRAY_NEVER_INSTRUMENT { - if (Elements == 0) - return; - + void trim(uint64_t Elements) XRAY_NEVER_INSTRUMENT { auto OldSize = Size; - Elements = Elements >= Size ? Size : Elements; + Elements = Elements > Size ? Size : Elements; Size -= Elements; - DCHECK_NE(Head, &SentinelSegment); - DCHECK_NE(Tail, &SentinelSegment); - - for (auto SegmentsToTrim = (nearest_boundary(OldSize, ElementsPerSegment) - - nearest_boundary(Size, ElementsPerSegment)) / - ElementsPerSegment; - SegmentsToTrim > 0; --SegmentsToTrim) { - - // We want to short-circuit if the trace is already empty. - if (Head == &SentinelSegment && Head == Tail) - return; - - // Put the tail into the Freelist. - auto *FreeSegment = Tail; - Tail = Tail->Prev; - if (Tail == &SentinelSegment) - Head = Tail; - else - Tail->Next = &SentinelSegment; - + // We compute the number of segments we're going to return from the tail by + // counting how many elements have been trimmed. Given the following: + // + // - Each segment has N valid positions, where N > 0 + // - The previous size > current size + // + // To compute the number of segments to return, we need to perform the + // following calculations for the number of segments required given 'x' + // elements: + // + // f(x) = { + // x == 0 : 0 + // , 0 < x <= N : 1 + // , N < x <= max : x / N + (x % N ? 1 : 0) + // } + // + // We can simplify this down to: + // + // f(x) = { + // x == 0 : 0, + // , 0 < x <= max : x / N + (x < N || x % N ? 1 : 0) + // } + // + // And further down to: + // + // f(x) = x ? x / N + (x < N || x % N ? 1 : 0) : 0 + // + // We can then perform the following calculation `s` which counts the number + // of segments we need to remove from the end of the data structure: + // + // s(p, c) = f(p) - f(c) + // + // If we treat p = previous size, and c = current size, and given the + // properties above, the possible range for s(...) is [0..max(typeof(p))/N] + // given that typeof(p) == typeof(c). + auto F = [](uint64_t X) { + return X ? (X / ElementsPerSegment) + + (X < ElementsPerSegment || X % ElementsPerSegment ? 1 : 0) + : 0; + }; + auto PS = F(OldSize); + auto CS = F(Size); + DCHECK_GE(PS, CS); + auto SegmentsToTrim = PS - CS; + for (auto I = 0uL; I < SegmentsToTrim; ++I) { + // Here we place the current tail segment to the freelist. To do this + // appropriately, we need to perform a splice operation on two + // bidirectional linked-lists. In particular, we have the current state of + // the doubly-linked list of segments: + // + // @S@ <- s0 <-> s1 <-> ... 
<-> sT -> @S@ + // + DCHECK_NE(Head, &SentinelSegment); + DCHECK_NE(Tail, &SentinelSegment); DCHECK_EQ(Tail->Next, &SentinelSegment); - FreeSegment->Next = Freelist; - FreeSegment->Prev = &SentinelSegment; - if (Freelist != &SentinelSegment) - Freelist->Prev = FreeSegment; - Freelist = FreeSegment; + + if (Freelist == &SentinelSegment) { + // Our two lists at this point are in this configuration: + // + // Freelist: (potentially) @S@ + // Mainlist: @S@<-s0<->s1<->...<->sPT<->sT->@S@ + // ^ Head ^ Tail + // + // The end state for us will be this configuration: + // + // Freelist: @S@<-sT->@S@ + // Mainlist: @S@<-s0<->s1<->...<->sPT->@S@ + // ^ Head ^ Tail + // + // The first step for us is to hold a reference to the tail of Mainlist, + // which in our notation is represented by sT. We call this our "free + // segment" which is the segment we are placing on the Freelist. + // + // sF = sT + // + // Then, we also hold a reference to the "pre-tail" element, which we + // call sPT: + // + // sPT = pred(sT) + // + // We want to splice sT into the beginning of the Freelist, which in + // an empty Freelist means placing a segment whose predecessor and + // successor is the sentinel segment. + // + // The splice operation then can be performed in the following + // algorithm: + // + // succ(sPT) = S + // pred(sT) = S + // succ(sT) = Freelist + // Freelist = sT + // Tail = sPT + // + auto SPT = Tail->Prev; + SPT->Next = &SentinelSegment; + Tail->Prev = &SentinelSegment; + Tail->Next = Freelist; + Freelist = Tail; + Tail = SPT; + + // Our post-conditions here are: + DCHECK_EQ(Tail->Next, &SentinelSegment); + DCHECK_EQ(Freelist->Prev, &SentinelSegment); + } else { + // In the other case, where the Freelist is not empty, we perform the + // following transformation instead: + // + // This transforms the current state: + // + // Freelist: @S@<-f0->@S@ + // ^ Freelist + // Mainlist: @S@<-s0<->s1<->...<->sPT<->sT->@S@ + // ^ Head ^ Tail + // + // Into the following: + // + // Freelist: @S@<-sT<->f0->@S@ + // ^ Freelist + // Mainlist: @S@<-s0<->s1<->...<->sPT->@S@ + // ^ Head ^ Tail + // + // The algorithm is: + // + // sFH = Freelist + // sPT = pred(sT) + // pred(SFH) = sT + // succ(sT) = Freelist + // pred(sT) = S + // succ(sPT) = S + // Tail = sPT + // Freelist = sT + // + auto SFH = Freelist; + auto SPT = Tail->Prev; + auto ST = Tail; + SFH->Prev = ST; + ST->Next = Freelist; + ST->Prev = &SentinelSegment; + SPT->Next = &SentinelSegment; + Tail = SPT; + Freelist = ST; + + // Our post-conditions here are: + DCHECK_EQ(Tail->Next, &SentinelSegment); + DCHECK_EQ(Freelist->Prev, &SentinelSegment); + DCHECK_EQ(Freelist->Next->Prev, Freelist); + } } + + // Now in case we've spliced all the segments in the end, we ensure that the + // main list is "empty", or both the head and tail pointing to the sentinel + // segment. + if (Tail == &SentinelSegment) + Head = Tail; + + DCHECK( + (Size == 0 && Head == &SentinelSegment && Tail == &SentinelSegment) || + (Size != 0 && Head != &SentinelSegment && Tail != &SentinelSegment)); + DCHECK( + (Freelist != &SentinelSegment && Freelist->Prev == &SentinelSegment) || + (Freelist == &SentinelSegment && Tail->Next == &SentinelSegment)); } // Provide iterators. @@ -388,8 +643,8 @@ public: // ensure that storage for the SentinelSegment is defined and has a single // address. 
template <class T> -typename Array<T>::SegmentBase Array<T>::SentinelSegment{ - &Array<T>::SentinelSegment, &Array<T>::SentinelSegment}; +typename Array<T>::Segment Array<T>::SentinelSegment{ + &Array<T>::SentinelSegment, &Array<T>::SentinelSegment, {'\0'}}; } // namespace __xray |
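
The new PlacementNewOnAlignedStorage tests above exercise the pattern the profiling runtime now leans on throughout: objects live in `std::aligned_storage` members, are constructed with placement new, torn down with explicit destructor calls, and re-constructed in the same bytes. The following is a reduced sketch of that lifecycle under those assumptions, using a stand-in `Counter` type rather than the XRay trie and allocators.

#include <new>
#include <type_traits>

// Illustrative stand-in for the trie; only the lifetime management matters.
struct Counter {
  int Value;
  explicit Counter(int V) : Value(V) {}
};

int main() {
  // Raw storage with the right size and alignment; no object lives here yet.
  std::aligned_storage<sizeof(Counter), alignof(Counter)>::type Storage;

  // First lifetime: construct in place.
  auto *C = new (&Storage) Counter(1);

  // End it explicitly; the bytes remain usable.
  C->~Counter();

  // Second lifetime in the same storage, mirroring how the tests destroy and
  // re-create the trie and its allocators without reallocating.
  C = new (&Storage) Counter(2);
  int Result = C->Value;
  C->~Counter();
  return Result == 2 ? 0 : 1;
}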