From 5d1b6b7b99c356d92098a72787aca71d50dbbc4b Mon Sep 17 00:00:00 2001 From: Erhan Kurubas Date: Mon, 11 Mar 2024 08:19:54 +0100 Subject: [PATCH] feat(coredump): save twdt panic output to coredump elf file --- components/esp_system/include/esp_task_wdt.h | 27 ++- components/esp_system/task_wdt/task_wdt.c | 134 ++++++++------ components/espcoredump/src/core_dump_elf.c | 174 +++++++++++++----- tools/test_apps/system/panic/pytest_panic.py | 17 ++ .../system/panic/test_panic_util/panic_dut.py | 19 +- 5 files changed, 271 insertions(+), 100 deletions(-) diff --git a/components/esp_system/include/esp_task_wdt.h b/components/esp_system/include/esp_task_wdt.h index c5c1d5fa11..776d29755e 100644 --- a/components/esp_system/include/esp_task_wdt.h +++ b/components/esp_system/include/esp_task_wdt.h @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: 2015-2022 Espressif Systems (Shanghai) CO LTD + * SPDX-FileCopyrightText: 2015-2023 Espressif Systems (Shanghai) CO LTD * * SPDX-License-Identifier: Apache-2.0 */ @@ -179,6 +179,31 @@ esp_err_t esp_task_wdt_status(TaskHandle_t task_handle); */ void __attribute__((weak)) esp_task_wdt_isr_user_handler(void); +typedef void (*task_wdt_msg_handler)(void *opaque, const char *msg); + +/** + * @brief Prints or retrieves information about tasks/users that triggered the Task Watchdog Timeout. + * + * This function provides various operations to handle tasks/users that did not reset the Task Watchdog in time. + * It can print detailed information about these tasks/users, such as their names, associated CPUs, and whether they have been reset. + * Additionally, it can retrieve the total length of the printed information or the CPU affinity of the failing tasks. + * + * @param[in] msg_handler Optional message handler function that will be called for each printed line. + * @param[in] opaque Optional pointer to opaque data that will be passed to the message handler function. + * @param[out] cpus_fail Optional pointer to an integer where the CPU affinity of the failing tasks will be stored. + * + * @return + * - ESP_OK: The function executed successfully. + * - ESP_FAIL: No triggered tasks were found, and thus no information was printed or retrieved. + * + * @note + * - If `msg_handler` is not provided, the information will be printed to console using ESP_EARLY_LOGE. + * - If `msg_handler` is provided, the function will send the printed information to the provided message handler function. + * - If `cpus_fail` is provided, the function will store the CPU affinity of the failing tasks in the provided integer. + * - During the execution of this function, logging is allowed in critical sections, as TWDT timeouts are considered fatal errors. + */ +esp_err_t esp_task_wdt_print_triggered_tasks(task_wdt_msg_handler msg_handler, void *opaque, int *cpus_fail); + #ifdef __cplusplus } #endif diff --git a/components/esp_system/task_wdt/task_wdt.c b/components/esp_system/task_wdt/task_wdt.c index 9bd7770ca2..f61db4ff6a 100644 --- a/components/esp_system/task_wdt/task_wdt.c +++ b/components/esp_system/task_wdt/task_wdt.c @@ -1,10 +1,11 @@ /* - * SPDX-FileCopyrightText: 2015-2022 Espressif Systems (Shanghai) CO LTD + * SPDX-FileCopyrightText: 2015-2024 Espressif Systems (Shanghai) CO LTD * * SPDX-License-Identifier: Apache-2.0 */ #include +#include #include #include #include @@ -331,6 +332,33 @@ static void subscribe_idle(uint32_t core_mask) * */ +static UBaseType_t get_task_affinity(const TaskHandle_t xTask) +{ + if (xTask == NULL) { + /* User entry, we cannot predict on which core it is scheduled to run, + * so let's mark all cores as failing */ +#if configNUM_CORES > 1 + return BIT(1) | BIT(0); +#else + return BIT(0); +#endif + } + +#if CONFIG_FREERTOS_SMP + #if configNUM_CORES > 1 + return vTaskCoreAffinityGet(xTask); + #else + return BIT(0); + #endif +#else + BaseType_t task_affinity = xTaskGetAffinity(xTask); + if (task_affinity == 0 || task_affinity == 1) { + return BIT(task_affinity); + } + return BIT(1) | BIT(0); +#endif +} + #if CONFIG_IDF_TARGET_ARCH_RISCV static void task_wdt_timeout_handling(int cores_fail, bool panic) @@ -505,65 +533,17 @@ static void task_wdt_isr(void *arg) portENTER_CRITICAL_ISR(&spinlock); esp_task_wdt_impl_timeout_triggered(p_twdt_obj->impl_ctx); - // If there are no entries, there's nothing to do. - if (SLIST_EMPTY(&p_twdt_obj->entries_slist)) { - portEXIT_CRITICAL_ISR(&spinlock); - return; - } - // Find what entries triggered the TWDT timeout (i.e., which entries have not been reset) - /* - Note: We are currently in a critical section, thus under normal circumstances, logging should not be allowed. - However, TWDT timeouts count as fatal errors, thus reporting the fatal error is considered more important than - minimizing interrupt latency. Thus we allow logging in critical sections in this narrow case. - */ - ESP_EARLY_LOGE(TAG, "Task watchdog got triggered. The following tasks/users did not reset the watchdog in time:"); - twdt_entry_t *entry; /* Keep a bitmap of CPU cores having tasks that have not reset TWDT. * Bit 0 represents core 0, bit 1 represents core 1, and so on. */ int cpus_fail = 0; bool panic = p_twdt_obj->panic; - SLIST_FOREACH(entry, &p_twdt_obj->entries_slist, slist_entry) { - if (!entry->has_reset) { - if (entry->task_handle) { -#if CONFIG_FREERTOS_SMP -#if configNUM_CORES > 1 - // Log the task's name and its affinity - const UBaseType_t affinity = vTaskCoreAffinityGet(entry->task_handle); - ESP_EARLY_LOGE(TAG, " - %s (0x%x)", pcTaskGetName(entry->task_handle), affinity); - cpus_fail |= affinity; -#else // configNUM_CORES > 1 - // Log the task's name - ESP_EARLY_LOGE(TAG, " - %s", pcTaskGetName(entry->task_handle)); - cpus_fail |= BIT(0); -#endif // configNUM_CORES > 1 -#else // CONFIG_FREERTOS_SMP - BaseType_t task_affinity = xTaskGetAffinity(entry->task_handle); - const char *cpu; - if (task_affinity == 0) { - cpu = DRAM_STR("CPU 0"); - cpus_fail |= BIT(0); - } else if (task_affinity == 1) { - cpu = DRAM_STR("CPU 1"); - cpus_fail |= BIT(1); - } else { - cpu = DRAM_STR("CPU 0/1"); - cpus_fail |= BIT(1) | BIT(0); - } - ESP_EARLY_LOGE(TAG, " - %s (%s)", pcTaskGetName(entry->task_handle), cpu); -#endif // CONFIG_FREERTOS_SMP - } else { - /* User entry, we cannot predict on which core it is scheduled to run, - * so let's mark all cores as failing */ -#if configNUM_CORES > 1 - cpus_fail = BIT(1) | BIT(0); -#else // configNUM_CORES > 1 - cpus_fail = BIT(0); -#endif // configNUM_CORES > 1 - ESP_EARLY_LOGE(TAG, " - %s", entry->user_name); - } - } - } + if (esp_task_wdt_print_triggered_tasks(NULL, NULL, &cpus_fail) != ESP_OK) { + // If there are no entries, there's nothing to do. + portEXIT_CRITICAL_ISR(&spinlock); + return; + } + ESP_EARLY_LOGE(TAG, "%s", DRAM_STR("Tasks currently running:")); for (int x = 0; x < portNUM_PROCESSORS; x++) { ESP_EARLY_LOGE(TAG, "CPU %d: %s", x, pcTaskGetName(xTaskGetCurrentTaskHandleForCPU(x))); @@ -855,3 +835,47 @@ esp_err_t esp_task_wdt_status(TaskHandle_t task_handle) return ret; } + +esp_err_t esp_task_wdt_print_triggered_tasks(task_wdt_msg_handler msg_handler, void *opaque, int *cpus_fail) +{ + if (SLIST_EMPTY(&p_twdt_obj->entries_slist)) { + return ESP_FAIL; + } + + twdt_entry_t *entry; + const char *caption = "Task watchdog got triggered. " + "The following tasks/users did not reset the watchdog in time:"; + + if (msg_handler == NULL) { + ESP_EARLY_LOGE(TAG, "%s", caption); + } else { + msg_handler(opaque, caption); + } + + // Find what entries triggered the TWDT timeout (i.e., which entries have not been reset) + SLIST_FOREACH(entry, &p_twdt_obj->entries_slist, slist_entry) { + if (!entry->has_reset) { + const char *cpu; + const char *name = entry->task_handle ? pcTaskGetName(entry->task_handle) : entry->user_name; + const UBaseType_t affinity = get_task_affinity(entry->task_handle); + if (cpus_fail) { + *cpus_fail |= affinity; + } + if (affinity == BIT(0)) { + cpu = " (CPU 0)"; + } else if (affinity == BIT(1)) { + cpu = " (CPU 1)"; + } else { + cpu = " (CPU 0/1)"; + } + if (msg_handler == NULL) { + ESP_EARLY_LOGE(TAG, " - %s%s", name, cpu); + } else { + msg_handler(opaque, "\n - "); + msg_handler(opaque, name); + msg_handler(opaque, cpu); + } + } + } + return ESP_OK; +} diff --git a/components/espcoredump/src/core_dump_elf.c b/components/espcoredump/src/core_dump_elf.c index 8a85c4d3f5..c5db612d59 100644 --- a/components/espcoredump/src/core_dump_elf.c +++ b/components/espcoredump/src/core_dump_elf.c @@ -12,6 +12,7 @@ #include "esp_core_dump_port.h" #include "esp_core_dump_port_impl.h" #include "esp_core_dump_common.h" +#include "esp_task_wdt.h" #ifdef CONFIG_ESP_COREDUMP_DATA_FORMAT_ELF #include // for the MIN macro @@ -33,6 +34,8 @@ #define ELF_NOTE_NAME_MAX_SIZE 32 #define ELF_APP_SHA256_SIZE 66 +#define ELF_ESP_CORE_DUMP_PANIC_DETAILS_NOTE_NAME "ESP_PANIC_DETAILS" + #define ELF_CHECK_ERR(a, ret_val, str, ...) \ if (!(a)) { \ ESP_COREDUMP_LOGE("%s(%u): " str, __FUNCTION__, __LINE__, ##__VA_ARGS__); \ @@ -80,18 +83,23 @@ typedef struct _core_dump_elf_t uint32_t elf_next_data_offset; uint16_t segs_count; core_dump_write_data_t write_data; + uint32_t note_data_size; /* can be used where static storage needed */ } core_dump_elf_t; +typedef struct { + core_dump_elf_t *self; + uint32_t total_size; + bool size_only; +} core_dump_elf_opaque_t; + // Represents lightweight implementation to save core dump data into ELF formatted binary -#define ALIGN(b, var) var = align(b, var) - #if CONFIG_ESP_COREDUMP_DATA_FORMAT_ELF -static inline uint32_t align(uint32_t width, uint32_t in) -{ - return (in + (width - 1)) & -width; -} +#ifdef ALIGN_UP +#undef ALIGN_UP +#endif +#define ALIGN_UP(x, a) (((x) + (a) - 1) & ~((a) - 1)) esp_err_t esp_core_dump_store(void) __attribute__((alias("esp_core_dump_write_elf"))); @@ -158,13 +166,11 @@ static int elf_add_segment(core_dump_elf_t *self, { esp_err_t err = ESP_FAIL; elf_phdr seg_hdr = { 0 }; - int data_len = data_sz; + int data_len = ALIGN_UP(data_sz, 4); ELF_CHECK_ERR((data != NULL), ELF_PROC_ERR_OTHER, "Invalid data for segment."); - ALIGN(4, data_len); - if (self->elf_stage == ELF_STAGE_CALC_SPACE) { self->segs_count++; return data_len + sizeof(elf_phdr); @@ -193,6 +199,29 @@ static int elf_add_segment(core_dump_elf_t *self, return data_len; } +static int elf_write_note_header(core_dump_elf_t *self, + const char* name, uint32_t name_len, uint32_t data_sz, uint32_t type) +{ + // temporary aligned buffer for note name + static char name_buffer[ELF_NOTE_NAME_MAX_SIZE] = { 0 }; + elf_note note_hdr = { 0 }; + + memcpy((void*)name_buffer, (void*)name, name_len); + note_hdr.n_namesz = name_len; + note_hdr.n_descsz = data_sz; + note_hdr.n_type = type; + // write note header + esp_err_t err = esp_core_dump_write_data(&self->write_data, ¬e_hdr, sizeof(note_hdr)); + ELF_CHECK_ERR((err == ESP_OK), ELF_PROC_ERR_WRITE_FAIL, + "Write ELF note header failure (%d)", err); + // write note name + err = esp_core_dump_write_data(&self->write_data, name_buffer, name_len); + ELF_CHECK_ERR((err == ESP_OK), ELF_PROC_ERR_WRITE_FAIL, + "Write ELF note name failure (%d)", err); + + return err; +} + static int elf_write_note(core_dump_elf_t *self, const char* name, uint32_t type, @@ -200,38 +229,34 @@ static int elf_write_note(core_dump_elf_t *self, uint32_t data_sz) { esp_err_t err = ESP_FAIL; - // temporary buffer for note name - static char name_buffer[ELF_NOTE_NAME_MAX_SIZE] = { 0 }; - elf_note note_hdr = { 0 }; - uint32_t name_len = strlen(name) + 1; // get name length including terminator - uint32_t data_len = data_sz; + uint32_t name_len = ALIGN_UP(strlen(name) + 1, 4); // get name length including terminator + uint32_t data_len = ALIGN_UP(data_sz, 4); - ELF_CHECK_ERR(data, ELF_PROC_ERR_OTHER, - "Invalid data pointer %x.", (uint32_t)data); ELF_CHECK_ERR((name_len <= ELF_NOTE_NAME_MAX_SIZE), 0, "Segment note name is too long %d.", name_len); - ALIGN(4, data_len); - ALIGN(4, name_len); - uint32_t note_size = name_len + data_len + sizeof(elf_note); - ALIGN(4, note_size); + uint32_t note_size = ALIGN_UP(name_len + data_len + sizeof(elf_note), 4); // write segment data during second pass if (self->elf_stage == ELF_STAGE_PLACE_DATA) { - memcpy((void*)name_buffer, (void*)name, name_len); - note_hdr.n_namesz = name_len; - note_hdr.n_descsz = data_sz; - note_hdr.n_type = type; - // write note header - err = esp_core_dump_write_data(&self->write_data, (void*)¬e_hdr, sizeof(note_hdr)); - ELF_CHECK_ERR((err == ESP_OK), ELF_PROC_ERR_WRITE_FAIL, - "Write ELF note header failure (%d)", err); - // write note name - err = esp_core_dump_write_data(&self->write_data, (void*)name_buffer, name_len); - ELF_CHECK_ERR((err == ESP_OK), ELF_PROC_ERR_WRITE_FAIL, - "Write ELF note name failure (%d)", err); - // write note data - err = esp_core_dump_write_data(&self->write_data, (void*)data, data_len); + ELF_CHECK_ERR(data, ELF_PROC_ERR_OTHER, "Invalid data pointer %x.", (uint32_t)data); + err = elf_write_note_header(self, name, name_len, data_sz, type); + if (err != ESP_OK) { + return err; + } + + // note data must be aligned in memory. we write aligned byte structures and panic details in strings, + // which might not be aligned by default. Therefore, we need to verify alignment and add padding if necessary. + err = esp_core_dump_write_data(&self->write_data, data, data_sz); + if (err == ESP_OK) { + int pad_size = data_len - data_sz; + if (pad_size != 0) { + uint8_t pad_bytes[3] = {0}; + ESP_COREDUMP_LOG_PROCESS("Core dump note data needs %d bytes padding", pad_size); + err = esp_core_dump_write_data(&self->write_data, pad_bytes, pad_size); + } + } + ELF_CHECK_ERR((err == ESP_OK), ELF_PROC_ERR_WRITE_FAIL, "Write ELF note data failure (%d)", err); ESP_COREDUMP_LOG_PROCESS("Add note size=%d, start_off=0x%x", @@ -495,6 +520,65 @@ static int elf_write_core_dump_user_data(core_dump_elf_t *self) return total_sz; } +#if CONFIG_ESP_TASK_WDT_EN +static void elf_write_core_dump_note_cb(void *opaque, const char *data) +{ + core_dump_elf_opaque_t *param = opaque; + core_dump_elf_t *self = param->self; + const size_t data_len = strlen(data); + + ESP_COREDUMP_LOG_PROCESS("Core dump note cb data_len:(%d)", data_len); + + param->total_size += data_len; + + if (!param->size_only) { + esp_err_t err = esp_core_dump_write_data(&self->write_data, (void *)data, data_len); + if (err != ESP_OK) { + param->total_size = 0; + } + } +} + +static int elf_add_wdt_panic_details(core_dump_elf_t *self) +{ + uint32_t name_len = ALIGN_UP(sizeof(ELF_ESP_CORE_DUMP_PANIC_DETAILS_NOTE_NAME), 4); + core_dump_elf_opaque_t param = { + .self = self, + .total_size = 0, + .size_only = false, + }; + + if (self->elf_stage == ELF_STAGE_CALC_SPACE) { + param.size_only = true; + esp_task_wdt_print_triggered_tasks(elf_write_core_dump_note_cb, ¶m, NULL); + ELF_CHECK_ERR((param.total_size > 0), ELF_PROC_ERR_OTHER, "wdt panic message len is zero!"); + self->note_data_size = param.total_size; + } else if (self->elf_stage == ELF_STAGE_PLACE_DATA) { + esp_err_t err = elf_write_note_header(self, + ELF_ESP_CORE_DUMP_PANIC_DETAILS_NOTE_NAME, + name_len, + self->note_data_size, + ELF_ESP_CORE_DUMP_PANIC_DETAILS_TYPE); + if (err != ESP_OK) { + return err; + } + + esp_task_wdt_print_triggered_tasks(elf_write_core_dump_note_cb, ¶m, NULL); + ELF_CHECK_ERR((param.total_size > 0), ELF_PROC_ERR_WRITE_FAIL, "Write ELF note data failure (%d)", err); + const uint32_t mod = self->note_data_size & 3; + if (mod != 0) { + uint8_t pad_bytes[3] = {0}; + uint32_t pad_size = 4 - mod; + ESP_COREDUMP_LOG_PROCESS("Core dump note needs %d bytes padding", pad_size); + err = esp_core_dump_write_data(&self->write_data, pad_bytes, pad_size); + ELF_CHECK_ERR((err == ESP_OK), ELF_PROC_ERR_WRITE_FAIL, "Write ELF note padding failure (%d)", err); + } + } + + return ALIGN_UP(name_len + ALIGN_UP(self->note_data_size, 4) + sizeof(elf_note), 4); +} +#endif //CONFIG_ESP_TASK_WDT_EN + static int elf_write_core_dump_info(core_dump_elf_t *self) { void *extra_info = NULL; @@ -527,13 +611,22 @@ static int elf_write_core_dump_info(core_dump_elf_t *self) ELF_CHECK_ERR((ret > 0), ret, "Extra info note write failed. Returned (%d).", ret); data_len += ret; +#if CONFIG_ESP_TASK_WDT_EN + extern bool g_twdt_isr; + if (g_twdt_isr) { + ret = elf_add_wdt_panic_details(self); + if (ret <= 0) { return ret; } + data_len += ret; + } +#endif + if (g_panic_abort_details && strlen(g_panic_abort_details) > 0) { ret = elf_add_note(self, - "ESP_PANIC_DETAILS", - ELF_ESP_CORE_DUMP_PANIC_DETAILS_TYPE, - g_panic_abort_details, - strlen(g_panic_abort_details)); - ELF_CHECK_ERR((ret > 0), ret, "Panic details note write failed. Returned (%d).", ret); + ELF_ESP_CORE_DUMP_PANIC_DETAILS_NOTE_NAME, + ELF_ESP_CORE_DUMP_PANIC_DETAILS_TYPE, + g_panic_abort_details, + strlen(g_panic_abort_details)); + ELF_CHECK_ERR((ret > 0), ret, "Panic details note write failed. Returned (%d).", ret); data_len += ret; } @@ -760,8 +853,7 @@ static void esp_core_dump_parse_note_section(uint8_t *coredump_data, elf_note_co break; } } - consumed_note_sz += note->n_namesz + note->n_descsz + sizeof(elf_note); - ALIGN(4, consumed_note_sz); + consumed_note_sz += ALIGN_UP(note->n_namesz + note->n_descsz + sizeof(elf_note), 4); } } } diff --git a/tools/test_apps/system/panic/pytest_panic.py b/tools/test_apps/system/panic/pytest_panic.py index 7b87fca39a..8ef653941d 100644 --- a/tools/test_apps/system/panic/pytest_panic.py +++ b/tools/test_apps/system/panic/pytest_panic.py @@ -106,10 +106,19 @@ def test_task_wdt_cpu0(dut: PanicTestDut, config: str, test_func_name: str) -> N dut.expect_elf_sha256() dut.expect_none('Guru Meditation') + coredump_pattern = (PANIC_ABORT_PREFIX + + 'Task watchdog got triggered. ' + 'The following tasks/users did not reset the watchdog in time:\n - ') + if dut.is_multi_core: + coredump_pattern += 'IDLE0 (CPU 0)' + else: + coredump_pattern += 'IDLE (CPU 0)' + common_test( dut, config, expected_backtrace=get_default_backtrace(test_func_name), + expected_coredump=[coredump_pattern] ) @@ -134,10 +143,14 @@ def test_task_wdt_cpu1(dut: PanicTestDut, config: str, test_func_name: str) -> N dut.expect_elf_sha256() dut.expect_none('Guru Meditation') + coredump_pattern = (PANIC_ABORT_PREFIX + + 'Task watchdog got triggered. ' + 'The following tasks/users did not reset the watchdog in time:\n - IDLE1 (CPU 1)') common_test( dut, config, expected_backtrace=expected_backtrace, + expected_coredump=[coredump_pattern] ) @@ -166,10 +179,14 @@ def test_task_wdt_both_cpus(dut: PanicTestDut, config: str, test_func_name: str) dut.expect_elf_sha256() dut.expect_none('Guru Meditation') + coredump_pattern = (PANIC_ABORT_PREFIX + + 'Task watchdog got triggered. ' + 'The following tasks/users did not reset the watchdog in time:\n - IDLE1 (CPU 1)\n - IDLE0 (CPU 0)') common_test( dut, config, expected_backtrace=expected_backtrace, + expected_coredump=[coredump_pattern] ) diff --git a/tools/test_apps/system/panic/test_panic_util/panic_dut.py b/tools/test_apps/system/panic/test_panic_util/panic_dut.py index beb7965d6b..0a01c20f04 100644 --- a/tools/test_apps/system/panic/test_panic_util/panic_dut.py +++ b/tools/test_apps/system/panic/test_panic_util/panic_dut.py @@ -1,14 +1,23 @@ -# SPDX-FileCopyrightText: 2022-2023 Espressif Systems (Shanghai) CO LTD +# SPDX-FileCopyrightText: 2022-2024 Espressif Systems (Shanghai) CO LTD # SPDX-License-Identifier: Unlicense OR CC0-1.0 import logging import os import re import subprocess import sys -from typing import Any, Dict, List, Optional, TextIO, Union +from typing import Any +from typing import Dict +from typing import List +from typing import Optional +from typing import TextIO +from typing import Union import pexpect -from panic_utils import NoGdbProcessError, attach_logger, quote_string, sha256, verify_valid_gdb_subprocess +from panic_utils import attach_logger +from panic_utils import NoGdbProcessError +from panic_utils import quote_string +from panic_utils import sha256 +from panic_utils import verify_valid_gdb_subprocess from pygdbmi.gdbcontroller import GdbController from pytest_embedded_idf.app import IdfApp from pytest_embedded_idf.dut import IdfDut @@ -51,6 +60,10 @@ class PanicTestDut(IdfDut): def is_xtensa(self) -> bool: return self.target in self.XTENSA_TARGETS + @property + def is_multi_core(self) -> bool: + return self.target in ['esp32', 'esp32s3'] + def run_test_func(self, test_func_name: str) -> None: self.expect_exact('Enter test name:') self.write(test_func_name)