Diffstat (limited to 'compiler-rt/lib/xray/xray_profile_collector.cc')
-rw-r--r-- | compiler-rt/lib/xray/xray_profile_collector.cc | 285
1 file changed, 285 insertions, 0 deletions
diff --git a/compiler-rt/lib/xray/xray_profile_collector.cc b/compiler-rt/lib/xray/xray_profile_collector.cc
new file mode 100644
index 00000000000..28eb096f48e
--- /dev/null
+++ b/compiler-rt/lib/xray/xray_profile_collector.cc
@@ -0,0 +1,285 @@
+//===-- xray_profile_collector.cc ------------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a part of XRay, a dynamic runtime instrumentation system.
+//
+// This implements the interface for the profileCollectorService.
+//
+//===----------------------------------------------------------------------===//
+#include "xray_profile_collector.h"
+#include "sanitizer_common/sanitizer_common.h"
+#include "sanitizer_common/sanitizer_vector.h"
+#include "xray_profiler_flags.h"
+#include <memory>
+#include <utility>
+
+namespace __xray {
+namespace profileCollectorService {
+
+namespace {
+
+SpinMutex GlobalMutex;
+struct ThreadTrie {
+  tid_t TId;
+  FunctionCallTrie *Trie;
+};
+Vector<ThreadTrie> ThreadTries;
+
+struct ProfileBuffer {
+  void *Data;
+  size_t Size;
+};
+Vector<ProfileBuffer> ProfileBuffers;
+
+struct BlockHeader {
+  u32 BlockSize;
+  u32 BlockNum;
+  u64 ThreadId;
+};
+
+FunctionCallTrie::Allocators *GlobalAllocators = nullptr;
+
+} // namespace
+
+void post(const FunctionCallTrie &T, tid_t TId) {
+  static pthread_once_t Once = PTHREAD_ONCE_INIT;
+  pthread_once(&Once, +[] {
+    SpinMutexLock Lock(&GlobalMutex);
+    GlobalAllocators = reinterpret_cast<FunctionCallTrie::Allocators *>(
+        InternalAlloc(sizeof(FunctionCallTrie::Allocators)));
+    new (GlobalAllocators) FunctionCallTrie::Allocators();
+    *GlobalAllocators = FunctionCallTrie::InitAllocators();
+  });
+  DCHECK_NE(GlobalAllocators, nullptr);
+
+  ThreadTrie *Item = nullptr;
+  {
+    SpinMutexLock Lock(&GlobalMutex);
+    if (GlobalAllocators == nullptr)
+      return;
+
+    Item = ThreadTries.PushBack();
+    Item->TId = TId;
+
+    // Here we're using the internal allocator instead of the managed allocator
+    // because:
+    //
+    // 1) We're not using the segmented array data structure to host
+    //    FunctionCallTrie objects. We're using a Vector (from sanitizer_common)
+    //    which works like a std::vector<...> keeping elements contiguous in
+    //    memory. The segmented array data structure assumes that elements are
+    //    trivially destructible, where FunctionCallTrie isn't.
+    //
+    // 2) Using a managed allocator means we need to manage that separately,
+    //    which complicates the nature of this code. To get around that, we're
+    //    using the internal allocator instead, which has its own global state
+    //    and is decoupled from the lifetime management required by the managed
+    //    allocator we have in XRay.
+    //
+    Item->Trie = reinterpret_cast<FunctionCallTrie *>(
+        InternalAlloc(sizeof(FunctionCallTrie)));
+    DCHECK_NE(Item->Trie, nullptr);
+    new (Item->Trie) FunctionCallTrie(*GlobalAllocators);
+  }
+  DCHECK_NE(Item, nullptr);
+
+  T.deepCopyInto(*Item->Trie);
+}
+
+// A PathArray represents the function id's representing a stack trace. In this
+// context a path is almost always represented from the leaf function in a call
+// stack to a root of the call trie.
+using PathArray = Array<int32_t>;
+
+struct ProfileRecord {
+  using PathAllocator = typename PathArray::AllocatorType;
+
+  // The Path in this record is the function id's from the leaf to the root of
+  // the function call stack as represented from a FunctionCallTrie.
+  PathArray *Path = nullptr;
+  const FunctionCallTrie::Node *Node = nullptr;
+
+  // Constructor for in-place construction.
+  ProfileRecord(PathAllocator &A, const FunctionCallTrie::Node *N)
+      : Path([&] {
+          auto P =
+              reinterpret_cast<PathArray *>(InternalAlloc(sizeof(PathArray)));
+          new (P) PathArray(A);
+          return P;
+        }()),
+        Node(N) {}
+};
+
+namespace {
+
+using ProfileRecordArray = Array<ProfileRecord>;
+
+// Walk a depth-first traversal of each root of the FunctionCallTrie to generate
+// the path(s) and the data associated with the path.
+static void populateRecords(ProfileRecordArray &PRs,
+                            ProfileRecord::PathAllocator &PA,
+                            const FunctionCallTrie &Trie) {
+  using StackArray = Array<const FunctionCallTrie::Node *>;
+  using StackAllocator = typename StackArray::AllocatorType;
+  StackAllocator StackAlloc(profilerFlags()->stack_allocator_max, 0);
+  StackArray DFSStack(StackAlloc);
+  for (const auto R : Trie.getRoots()) {
+    DFSStack.Append(R);
+    while (!DFSStack.empty()) {
+      auto Node = DFSStack.back();
+      DFSStack.trim(1);
+      auto Record = PRs.AppendEmplace(PA, Node);
+      DCHECK_NE(Record, nullptr);
+
+      // Traverse the Node's parents and as we're doing so, get the FIds in
+      // the order they appear.
+      for (auto N = Node; N != nullptr; N = N->Parent)
+        Record->Path->Append(N->FId);
+      DCHECK(!Record->Path->empty());
+
+      for (const auto C : Node->Callees)
+        DFSStack.Append(C.NodePtr);
+    }
+  }
+}
+
+static void serializeRecords(ProfileBuffer *Buffer, const BlockHeader &Header,
+                             const ProfileRecordArray &ProfileRecords) {
+  auto NextPtr = static_cast<char *>(
+                     internal_memcpy(Buffer->Data, &Header, sizeof(Header))) +
+                 sizeof(Header);
+  for (const auto &Record : ProfileRecords) {
+    // List of IDs follow:
+    for (const auto FId : *Record.Path)
+      NextPtr =
+          static_cast<char *>(internal_memcpy(NextPtr, &FId, sizeof(FId))) +
+          sizeof(FId);
+
+    // Add the sentinel here.
+    constexpr int32_t SentinelFId = 0;
+    NextPtr = static_cast<char *>(
+                  internal_memset(NextPtr, SentinelFId, sizeof(SentinelFId))) +
+              sizeof(SentinelFId);
+
+    // Add the node data here.
+    NextPtr =
+        static_cast<char *>(internal_memcpy(NextPtr, &Record.Node->CallCount,
+                                            sizeof(Record.Node->CallCount))) +
+        sizeof(Record.Node->CallCount);
+    NextPtr = static_cast<char *>(
+                  internal_memcpy(NextPtr, &Record.Node->CumulativeLocalTime,
+                                  sizeof(Record.Node->CumulativeLocalTime))) +
+              sizeof(Record.Node->CumulativeLocalTime);
+  }
+
+  DCHECK_EQ(NextPtr - static_cast<char *>(Buffer->Data), Buffer->Size);
+}
+
+} // namespace
+
+void serialize() {
+  SpinMutexLock Lock(&GlobalMutex);
+
+  // Clear out the global ProfileBuffers.
+  for (uptr I = 0; I < ProfileBuffers.Size(); ++I)
+    InternalFree(ProfileBuffers[I].Data);
+  ProfileBuffers.Reset();
+
+  if (ThreadTries.Size() == 0)
+    return;
+
+  // Then repopulate the global ProfileBuffers.
+  for (u32 I = 0; I < ThreadTries.Size(); ++I) {
+    using ProfileRecordAllocator = typename ProfileRecordArray::AllocatorType;
+    ProfileRecordAllocator PRAlloc(profilerFlags()->global_allocator_max, 0);
+    ProfileRecord::PathAllocator PathAlloc(
+        profilerFlags()->global_allocator_max, 0);
+    ProfileRecordArray ProfileRecords(PRAlloc);
+
+    // First, we want to compute the amount of space we're going to need. We'll
+    // use a local allocator and an __xray::Array<...> to store the intermediary
+    // data, then compute the size as we're going along. Then we'll allocate the
+    // contiguous space to contain the thread buffer data.
+    const auto &Trie = *ThreadTries[I].Trie;
+    if (Trie.getRoots().empty())
+      continue;
+    populateRecords(ProfileRecords, PathAlloc, Trie);
+    DCHECK(!Trie.getRoots().empty());
+    DCHECK(!ProfileRecords.empty());
+
+    // Go through each record, to compute the sizes.
+    //
+    // header size = block size (4 bytes)
+    //               + block number (4 bytes)
+    //               + thread id (8 bytes)
+    // record size = path ids (4 bytes * number of ids + sentinel 4 bytes)
+    //               + call count (8 bytes)
+    //               + local time (8 bytes)
+    //               + end of record (8 bytes)
+    u32 CumulativeSizes = 0;
+    for (const auto &Record : ProfileRecords)
+      CumulativeSizes += 20 + (4 * Record.Path->size());
+
+    BlockHeader Header{16 + CumulativeSizes, I, ThreadTries[I].TId};
+    auto Buffer = ProfileBuffers.PushBack();
+    Buffer->Size = sizeof(Header) + CumulativeSizes;
+    Buffer->Data = InternalAlloc(Buffer->Size, nullptr, 64);
+    DCHECK_NE(Buffer->Data, nullptr);
+    serializeRecords(Buffer, Header, ProfileRecords);
+
+    // Now clean up the ProfileRecords array, one at a time.
+    for (auto &Record : ProfileRecords) {
+      Record.Path->~PathArray();
+      InternalFree(Record.Path);
+    }
+  }
+}
+
+void reset() {
+  SpinMutexLock Lock(&GlobalMutex);
+  // Clear out the profile buffers that have been serialized.
+  for (uptr I = 0; I < ProfileBuffers.Size(); ++I)
+    InternalFree(ProfileBuffers[I].Data);
+  ProfileBuffers.Reset();
+
+  // Clear out the function call tries per thread.
+  for (uptr I = 0; I < ThreadTries.Size(); ++I) {
+    auto &T = ThreadTries[I];
+    T.Trie->~FunctionCallTrie();
+    InternalFree(T.Trie);
+  }
+  ThreadTries.Reset();
+
+  // Reset the global allocators.
+  if (GlobalAllocators != nullptr) {
+    GlobalAllocators->~Allocators();
+    InternalFree(GlobalAllocators);
+    GlobalAllocators = nullptr;
+  }
+  GlobalAllocators = reinterpret_cast<FunctionCallTrie::Allocators *>(
+      InternalAlloc(sizeof(FunctionCallTrie::Allocators)));
+  new (GlobalAllocators) FunctionCallTrie::Allocators();
+  *GlobalAllocators = FunctionCallTrie::InitAllocators();
+}
+
+XRayBuffer nextBuffer(XRayBuffer B) {
+  SpinMutexLock Lock(&GlobalMutex);
+  if (B.Data == nullptr && ProfileBuffers.Size())
+    return {ProfileBuffers[0].Data, ProfileBuffers[0].Size};
+
+  BlockHeader Header;
+  internal_memcpy(&Header, B.Data, sizeof(BlockHeader));
+  auto NextBlock = Header.BlockNum + 1;
+  if (NextBlock < ProfileBuffers.Size())
+    return {ProfileBuffers[NextBlock].Data, ProfileBuffers[NextBlock].Size};
+  return {nullptr, 0};
+}
+
+} // namespace profileCollectorService
+} // namespace __xray
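Based on serializeRecords() above, each serialized block starts with a 16-byte header (u32 block size, u32 block number, u64 thread id) followed by one record per call path: the 32-bit function ids from leaf to root, a 32-bit zero sentinel, then a 64-bit call count and a 64-bit cumulative local time. A record with a 3-deep path therefore occupies 20 + 4 * 3 = 32 bytes. The sketch below shows how a consumer might decode one such block. It is illustrative only: DecodedRecord and decodeBlock are hypothetical names, not part of XRay, and the code assumes native endianness and that no real function id is 0 (the sentinel value used by the collector).

// Hypothetical consumer-side decoder for one block produced by serialize().
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <vector>

struct DecodedRecord {
  std::vector<int32_t> Path;         // Function ids, leaf first.
  uint64_t CallCount = 0;
  uint64_t CumulativeLocalTime = 0;
};

static std::vector<DecodedRecord> decodeBlock(const char *Data, size_t Size) {
  std::vector<DecodedRecord> Records;
  const char *P = Data;
  const char *End = Data + Size;
  // Skip the block header: block size (4) + block number (4) + thread id (8).
  P += 16;
  while (P < End) {
    DecodedRecord R;
    // Read function ids until the 32-bit zero sentinel.
    for (;;) {
      int32_t FId;
      std::memcpy(&FId, P, sizeof(FId));
      P += sizeof(FId);
      if (FId == 0)
        break;
      R.Path.push_back(FId);
    }
    // The node data follows the sentinel.
    std::memcpy(&R.CallCount, P, sizeof(R.CallCount));
    P += sizeof(R.CallCount);
    std::memcpy(&R.CumulativeLocalTime, P, sizeof(R.CumulativeLocalTime));
    P += sizeof(R.CumulativeLocalTime);
    Records.push_back(R);
  }
  return Records;
}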
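nextBuffer() implements a simple block-iteration protocol: passing {nullptr, 0} yields the first per-thread block, passing a previously returned buffer yields the block after it, and {nullptr, 0} comes back once all blocks are exhausted. The following is a minimal usage sketch under a few stated assumptions: the declarations from "xray_profile_collector.h" (including XRayBuffer) are visible, serialize() has already been called and produced at least one block, and handleBlock is a hypothetical callback rather than an XRay API.

// Hypothetical walk over all serialized blocks via nextBuffer().
#include "xray_profile_collector.h"

void consumeAllBlocks(void (*handleBlock)(const void *Data, size_t Size)) {
  // Start from an empty buffer; the collector returns block 0 first.
  XRayBuffer B{nullptr, 0};
  while (true) {
    B = __xray::profileCollectorService::nextBuffer(B);
    if (B.Data == nullptr)
      break; // No more blocks.
    handleBlock(B.Data, B.Size);
  }
}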