From 7cc8294252577238eb99bad42c3bc7dd92f4794d Mon Sep 17 00:00:00 2001 From: Dan Crowell Date: Sat, 23 Sep 2017 22:24:28 -0500 Subject: Debug improvements for exceptions and OOM hangs There are two main changes in this commit: 1) Forcing an assert if we cannot allocate pages after 10,000 attempts to yield. 2) Adding a backtrace for a lot of exception paths. Change-Id: I755ada753b78abed56e553f7c669f0f98ae68700 Reviewed-on: http://ralgit01.raleigh.ibm.com/gerrit1/58224 Tested-by: Jenkins Server Tested-by: Jenkins OP Build CI Tested-by: Jenkins OP HW Tested-by: FSP CI Jenkins Reviewed-by: Prachi Gupta Reviewed-by: Thi N. Tran Reviewed-by: William G. Hoffa --- src/kernel/exception.C | 6 ++++-- src/kernel/machchk.C | 2 +- src/kernel/misc.C | 45 +++++++++++++++++++++++++++++++++++++++++++++ src/kernel/pagemgr.C | 12 ++++++++---- 4 files changed, 58 insertions(+), 7 deletions(-) (limited to 'src/kernel') diff --git a/src/kernel/exception.C b/src/kernel/exception.C index cf2a35c81..ca05bf3a1 100644 --- a/src/kernel/exception.C +++ b/src/kernel/exception.C @@ -37,6 +37,7 @@ #include #include #include +#include namespace ExceptionHandles @@ -62,7 +63,8 @@ void kernel_execute_prog_ex() } if (!handled) { - printk("Program exception, killing task %d\n", t->tid); + printk( "Program exception, killing task %d, SRR0=0x%lX, SRR1=0x%lX\n", + t->tid, getSRR0(), getSRR1() ); MAGIC_INSTRUCTION(MAGIC_BREAK_ON_ERROR); TaskManager::endTask(t, NULL, TASK_STATUS_CRASHED); } @@ -107,7 +109,7 @@ void kernel_execute_data_storage() "Exception Type: %lx\n" "Instruction where it occurred: %p\n", t->tid, getDAR(), getDSISR(), t->context.nip); - MAGIC_INSTRUCTION(MAGIC_BREAK_ON_ERROR); + KernelMisc::printkBacktrace(t); TaskManager::endTask(t, NULL, TASK_STATUS_CRASHED); } } diff --git a/src/kernel/machchk.C b/src/kernel/machchk.C index 2a96b5896..776ede0ce 100644 --- a/src/kernel/machchk.C +++ b/src/kernel/machchk.C @@ -149,7 +149,7 @@ void setCheckstopData(uint64_t i_xstopAddr, uint64_t 
i_xstopData) g_xstopRegPtr = reinterpret_cast(i_xstopAddr |VmmManager::FORCE_PHYS_ADDR); g_xstopRegValue = i_xstopData; - printk( "Set MchChk Xstop: %p=%.16lX\n", g_xstopRegPtr, g_xstopRegValue ); + printk( "Arm MchChk Xstop: %p=%.16lX\n", g_xstopRegPtr, g_xstopRegValue ); // Now that the machine check handler can do the xscom we // can set MSR[ME]=1 to enable the regular machine check diff --git a/src/kernel/misc.C b/src/kernel/misc.C index b602ed707..f6aa69bda 100644 --- a/src/kernel/misc.C +++ b/src/kernel/misc.C @@ -585,6 +585,51 @@ namespace KernelMisc writeScratchReg(l_scratch_addr, data); }; + /** + * @brief Collect the backtrace for the given task (nullptr: start from framePointer()) and print it with printk + */ + void printkBacktrace(task_t* i_task) + { + uint64_t* l_frame = nullptr; + uint32_t l_tid = 0; + bool l_kernelSpace = true; + if( i_task == nullptr ) //user-space + { + l_kernelSpace = false; + printk("U:"); + l_frame = static_cast(framePointer()); + l_tid = task_gettid(); + } + else //kernel-space + { + printk("K:"); + l_frame = reinterpret_cast( i_task->context.gprs[1] ); + l_tid = i_task->tid; + } + + printk("Backtrace for %d:\n ", l_tid ); + printkd("frame=%p\n",l_frame);isync(); + while (l_frame != NULL) + { + printkd("\nf=%p\n",l_frame); isync(); + if( l_kernelSpace ) + { + uint64_t* frame_p = reinterpret_cast + (VmmManager::findPhysicalAddress( reinterpret_cast + (l_frame) )); + printkd("frame_p=%p\n",frame_p); isync(); + l_frame = frame_p; + } + if( (0 != *l_frame) && (0 != l_frame[2]) ) + { + printk( "<-0x%lX", l_frame[2] ); + } + + l_frame = reinterpret_cast(*l_frame); + } + printk("\n"); + } + }; diff --git a/src/kernel/pagemgr.C b/src/kernel/pagemgr.C index f5c4d406a..42545470a 100644 --- a/src/kernel/pagemgr.C +++ b/src/kernel/pagemgr.C @@ -37,6 +37,7 @@ #include #include #include +#include size_t PageManager::cv_coalesce_count = 0; @@ -155,7 +156,7 @@ void* PageManager::allocatePage(size_t n, bool userspace) // In non-kernel mode, make a system-call to allocate in kernel-mode. 
if (!KernelMisc::in_kernel_mode()) { - size_t attempts = 0; + size_t l_attempts = 0; while (NULL == page) { page = _syscall1(Systemcalls::MM_ALLOC_PAGES, @@ -165,11 +166,14 @@ void* PageManager::allocatePage(size_t n, bool userspace) // will eventually free up (ex. VMM flushes). if (NULL == page) { - attempts++; - if( attempts == 10000 ) //arbitrarily huge number + l_attempts++; + if( l_attempts == 10000 ) { - printk("Cannot allocate %ld pages\n", n); + printk( "Cannot allocate %ld pages to %d!\n", + n, task_gettid() ); MAGIC_INSTRUCTION(MAGIC_BREAK_ON_ERROR); + KernelMisc::printkBacktrace(nullptr); + task_crash(); } task_yield(); } -- cgit v1.2.1