Merge branch 'save_twdt_to_coredump_v5.2' into 'release/v5.2'

feat(coredump): save twdt panic output to coredump elf file (v5.2)

See merge request espressif/esp-idf!27756
This commit is contained in:
Jiang Jiang Jian 2023-12-11 14:51:33 +08:00
commit 499625be33
5 changed files with 257 additions and 96 deletions

View File

@ -1,5 +1,5 @@
/*
* SPDX-FileCopyrightText: 2015-2022 Espressif Systems (Shanghai) CO LTD
* SPDX-FileCopyrightText: 2015-2023 Espressif Systems (Shanghai) CO LTD
*
* SPDX-License-Identifier: Apache-2.0
*/
@ -179,6 +179,31 @@ esp_err_t esp_task_wdt_status(TaskHandle_t task_handle);
*/
void __attribute__((weak)) esp_task_wdt_isr_user_handler(void);
typedef void (*task_wdt_msg_handler)(void *opaque, const char *msg);
/**
* @brief Prints or retrieves information about tasks/users that triggered the Task Watchdog Timeout.
*
* This function provides various operations to handle tasks/users that did not reset the Task Watchdog in time.
* It can print detailed information about these tasks/users, such as their names, associated CPUs, and whether they have been reset.
* Additionally, it can retrieve the total length of the printed information or the CPU affinity of the failing tasks.
*
* @param[in] msg_handler Optional message handler function that will be called for each printed line.
* @param[in] opaque Optional pointer to opaque data that will be passed to the message handler function.
* @param[out] cpus_fail Optional pointer to an integer where the CPU affinity of the failing tasks will be stored.
*
* @return
* - ESP_OK: The function executed successfully.
* - ESP_FAIL: No triggered tasks were found, and thus no information was printed or retrieved.
*
* @note
* - If `msg_handler` is not provided, the information will be printed to console using ESP_EARLY_LOGE.
* - If `msg_handler` is provided, the function will send the printed information to the provided message handler function.
* - If `cpus_fail` is provided, the function will store the CPU affinity of the failing tasks in the provided integer.
* - During the execution of this function, logging is allowed in critical sections, as TWDT timeouts are considered fatal errors.
*/
esp_err_t esp_task_wdt_print_triggered_tasks(task_wdt_msg_handler msg_handler, void *opaque, int *cpus_fail);
#ifdef __cplusplus
}
#endif

View File

@ -5,6 +5,7 @@
*/
#include <stdint.h>
#include <string.h>
#include <stdbool.h>
#include <stdio.h>
#include <sys/queue.h>
@ -337,6 +338,32 @@ static void subscribe_idle(uint32_t core_mask)
*
*/
static UBaseType_t get_task_affinity(const TaskHandle_t xTask)
{
if (xTask == NULL) {
/* User entry, we cannot predict on which core it is scheduled to run,
* so let's mark all cores as failing */
#if configNUM_CORES > 1
return BIT(1) | BIT(0);
#else
return BIT(0);
#endif
}
#if CONFIG_FREERTOS_SMP
#if configNUM_CORES > 1
return vTaskCoreAffinityGet(xTask);
#else
return BIT(0);
#endif
#else
BaseType_t task_affinity = xTaskGetCoreID(xTask);
if (task_affinity == 0 || task_affinity == 1) {
return BIT(task_affinity);
}
return BIT(1) | BIT(0);
#endif
}
/**
* Function simulating an abort coming from the interrupted task of the current
@ -456,65 +483,17 @@ static void task_wdt_isr(void *arg)
portENTER_CRITICAL_ISR(&spinlock);
esp_task_wdt_impl_timeout_triggered(p_twdt_obj->impl_ctx);
// If there are no entries, there's nothing to do.
if (SLIST_EMPTY(&p_twdt_obj->entries_slist)) {
portEXIT_CRITICAL_ISR(&spinlock);
return;
}
// Find what entries triggered the TWDT timeout (i.e., which entries have not been reset)
/*
Note: We are currently in a critical section, thus under normal circumstances, logging should not be allowed.
However, TWDT timeouts count as fatal errors, thus reporting the fatal error is considered more important than
minimizing interrupt latency. Thus we allow logging in critical sections in this narrow case.
*/
ESP_EARLY_LOGE(TAG, "Task watchdog got triggered. The following tasks/users did not reset the watchdog in time:");
twdt_entry_t *entry;
/* Keep a bitmap of CPU cores having tasks that have not reset TWDT.
* Bit 0 represents core 0, bit 1 represents core 1, and so on. */
int cpus_fail = 0;
bool panic = p_twdt_obj->panic;
SLIST_FOREACH(entry, &p_twdt_obj->entries_slist, slist_entry) {
if (!entry->has_reset) {
if (entry->task_handle) {
#if CONFIG_FREERTOS_SMP
#if configNUM_CORES > 1
// Log the task's name and its affinity
const UBaseType_t affinity = vTaskCoreAffinityGet(entry->task_handle);
ESP_EARLY_LOGE(TAG, " - %s (0x%x)", pcTaskGetName(entry->task_handle), affinity);
cpus_fail |= affinity;
#else // configNUM_CORES > 1
// Log the task's name
ESP_EARLY_LOGE(TAG, " - %s", pcTaskGetName(entry->task_handle));
cpus_fail |= BIT(0);
#endif // configNUM_CORES > 1
#else // CONFIG_FREERTOS_SMP
BaseType_t task_affinity = xTaskGetAffinity(entry->task_handle);
const char *cpu;
if (task_affinity == 0) {
cpu = DRAM_STR("CPU 0");
cpus_fail |= BIT(0);
} else if (task_affinity == 1) {
cpu = DRAM_STR("CPU 1");
cpus_fail |= BIT(1);
} else {
cpu = DRAM_STR("CPU 0/1");
cpus_fail |= BIT(1) | BIT(0);
}
ESP_EARLY_LOGE(TAG, " - %s (%s)", pcTaskGetName(entry->task_handle), cpu);
#endif // CONFIG_FREERTOS_SMP
} else {
/* User entry, we cannot predict on which core it is scheduled to run,
* so let's mark all cores as failing */
#if configNUM_CORES > 1
cpus_fail = BIT(1) | BIT(0);
#else // configNUM_CORES > 1
cpus_fail = BIT(0);
#endif // configNUM_CORES > 1
ESP_EARLY_LOGE(TAG, " - %s", entry->user_name);
}
}
}
if (esp_task_wdt_print_triggered_tasks(NULL, NULL, &cpus_fail) != ESP_OK) {
// If there are no entries, there's nothing to do.
portEXIT_CRITICAL_ISR(&spinlock);
return;
}
ESP_EARLY_LOGE(TAG, "%s", DRAM_STR("Tasks currently running:"));
for (int x = 0; x < portNUM_PROCESSORS; x++) {
ESP_EARLY_LOGE(TAG, "CPU %d: %s", x, pcTaskGetName(xTaskGetCurrentTaskHandleForCPU(x)));
@ -806,3 +785,47 @@ esp_err_t esp_task_wdt_status(TaskHandle_t task_handle)
return ret;
}
esp_err_t esp_task_wdt_print_triggered_tasks(task_wdt_msg_handler msg_handler, void *opaque, int *cpus_fail)
{
if (SLIST_EMPTY(&p_twdt_obj->entries_slist)) {
return ESP_FAIL;
}
twdt_entry_t *entry;
const char *caption = "Task watchdog got triggered. "
"The following tasks/users did not reset the watchdog in time:";
if (msg_handler == NULL) {
ESP_EARLY_LOGE(TAG, "%s", caption);
} else {
msg_handler(opaque, caption);
}
// Find what entries triggered the TWDT timeout (i.e., which entries have not been reset)
SLIST_FOREACH(entry, &p_twdt_obj->entries_slist, slist_entry) {
if (!entry->has_reset) {
const char *cpu;
const char *name = entry->task_handle ? pcTaskGetName(entry->task_handle) : entry->user_name;
const UBaseType_t affinity = get_task_affinity(entry->task_handle);
if (cpus_fail) {
*cpus_fail |= affinity;
}
if (affinity == BIT(0)) {
cpu = " (CPU 0)";
} else if (affinity == BIT(1)) {
cpu = " (CPU 1)";
} else {
cpu = " (CPU 0/1)";
}
if (msg_handler == NULL) {
ESP_EARLY_LOGE(TAG, " - %s%s", name, cpu);
} else {
msg_handler(opaque, "\n - ");
msg_handler(opaque, name);
msg_handler(opaque, cpu);
}
}
}
return ESP_OK;
}

View File

@ -14,6 +14,7 @@
#include "esp_core_dump_port_impl.h"
#include "esp_core_dump_common.h"
#include "hal/efuse_hal.h"
#include "esp_task_wdt.h"
#ifdef CONFIG_ESP_COREDUMP_DATA_FORMAT_ELF
#include <sys/param.h> // for the MIN macro
@ -35,6 +36,8 @@
#define ELF_NOTE_NAME_MAX_SIZE 32
#define ELF_APP_SHA256_SIZE 66
#define ELF_ESP_CORE_DUMP_PANIC_DETAILS_NOTE_NAME "ESP_PANIC_DETAILS"
#define ELF_CHECK_ERR(a, ret_val, str, ...) \
if (!(a)) { \
ESP_COREDUMP_LOGE("%s(%u): " str, __FUNCTION__, __LINE__, ##__VA_ARGS__); \
@ -82,18 +85,23 @@ typedef struct _core_dump_elf_t
uint32_t elf_next_data_offset;
uint16_t segs_count;
core_dump_write_config_t * write_cfg;
uint32_t note_data_size; /* can be used where static storage needed */
} core_dump_elf_t;
typedef struct {
core_dump_elf_t *self;
uint32_t total_size;
bool size_only;
} core_dump_elf_opaque_t;
// Represents lightweight implementation to save core dump data into ELF formatted binary
#define ALIGN(b, var) var = align(b, var)
#if CONFIG_ESP_COREDUMP_DATA_FORMAT_ELF
static inline uint32_t align(uint32_t width, uint32_t in)
{
return (in + (width - 1)) & -width;
}
#ifdef ALIGN_UP
#undef ALIGN_UP
#endif
#define ALIGN_UP(x, a) (((x) + (a) - 1) & ~((a) - 1))
// Builds elf header and check all data offsets
static int elf_write_file_header(core_dump_elf_t *self, uint32_t seg_count)
@ -158,13 +166,11 @@ static int elf_add_segment(core_dump_elf_t *self,
{
esp_err_t err = ESP_FAIL;
elf_phdr seg_hdr = { 0 };
int data_len = data_sz;
int data_len = ALIGN_UP(data_sz, 4);
ELF_CHECK_ERR((data != NULL), ELF_PROC_ERR_OTHER,
"Invalid data for segment.");
ALIGN(4, data_len);
if (self->elf_stage == ELF_STAGE_CALC_SPACE) {
self->segs_count++;
return data_len + sizeof(elf_phdr);
@ -193,6 +199,29 @@ static int elf_add_segment(core_dump_elf_t *self,
return data_len;
}
static int elf_write_note_header(core_dump_elf_t *self,
const char* name, uint32_t name_len, uint32_t data_sz, uint32_t type)
{
// temporary aligned buffer for note name
static char name_buffer[ELF_NOTE_NAME_MAX_SIZE] = { 0 };
elf_note note_hdr = { 0 };
memcpy((void*)name_buffer, (void*)name, name_len);
note_hdr.n_namesz = name_len;
note_hdr.n_descsz = data_sz;
note_hdr.n_type = type;
// write note header
esp_err_t err = self->write_cfg->write(self->write_cfg->priv, &note_hdr, sizeof(note_hdr));
ELF_CHECK_ERR((err == ESP_OK), ELF_PROC_ERR_WRITE_FAIL,
"Write ELF note header failure (%d)", err);
// write note name
err = self->write_cfg->write(self->write_cfg->priv, name_buffer, name_len);
ELF_CHECK_ERR((err == ESP_OK), ELF_PROC_ERR_WRITE_FAIL,
"Write ELF note name failure (%d)", err);
return err;
}
static int elf_write_note(core_dump_elf_t *self,
const char* name,
uint32_t type,
@ -200,38 +229,34 @@ static int elf_write_note(core_dump_elf_t *self,
uint32_t data_sz)
{
esp_err_t err = ESP_FAIL;
// temporary buffer for note name
static char name_buffer[ELF_NOTE_NAME_MAX_SIZE] = { 0 };
elf_note note_hdr = { 0 };
uint32_t name_len = strlen(name) + 1; // get name length including terminator
uint32_t data_len = data_sz;
uint32_t name_len = ALIGN_UP(strlen(name) + 1, 4); // get name length including terminator
uint32_t data_len = ALIGN_UP(data_sz, 4);
ELF_CHECK_ERR(data, ELF_PROC_ERR_OTHER,
"Invalid data pointer %x.", (uint32_t)data);
ELF_CHECK_ERR((name_len <= ELF_NOTE_NAME_MAX_SIZE), 0,
"Segment note name is too long %d.", name_len);
ALIGN(4, data_len);
ALIGN(4, name_len);
uint32_t note_size = name_len + data_len + sizeof(elf_note);
ALIGN(4, note_size);
uint32_t note_size = ALIGN_UP(name_len + data_len + sizeof(elf_note), 4);
// write segment data during second pass
if (self->elf_stage == ELF_STAGE_PLACE_DATA) {
memcpy((void*)name_buffer, (void*)name, name_len);
note_hdr.n_namesz = name_len;
note_hdr.n_descsz = data_sz;
note_hdr.n_type = type;
// write note header
err = self->write_cfg->write(self->write_cfg->priv, (void*)&note_hdr, sizeof(note_hdr));
ELF_CHECK_ERR((err == ESP_OK), ELF_PROC_ERR_WRITE_FAIL,
"Write ELF note header failure (%d)", err);
// write note name
err = self->write_cfg->write(self->write_cfg->priv, (void*)name_buffer, name_len);
ELF_CHECK_ERR((err == ESP_OK), ELF_PROC_ERR_WRITE_FAIL,
"Write ELF note name failure (%d)", err);
// write note data
err = self->write_cfg->write(self->write_cfg->priv, (void*)data, data_len);
ELF_CHECK_ERR(data, ELF_PROC_ERR_OTHER, "Invalid data pointer %x.", (uint32_t)data);
err = elf_write_note_header(self, name, name_len, data_sz, type);
if (err != ESP_OK) {
return err;
}
// note data must be aligned in memory. we write aligned byte structures and panic details in strings,
// which might not be aligned by default. Therefore, we need to verify alignment and add padding if necessary.
err = self->write_cfg->write(self->write_cfg->priv, data, data_sz);
if (err == ESP_OK) {
int pad_size = data_len - data_sz;
if (pad_size != 0) {
uint8_t pad_bytes[3] = {0};
ESP_COREDUMP_LOG_PROCESS("Core dump note data needs %d bytes padding", pad_size);
err = self->write_cfg->write(self->write_cfg->priv, pad_bytes, pad_size);
}
}
ELF_CHECK_ERR((err == ESP_OK), ELF_PROC_ERR_WRITE_FAIL,
"Write ELF note data failure (%d)", err);
ESP_COREDUMP_LOG_PROCESS("Add note size=%d, start_off=0x%x",
@ -495,6 +520,65 @@ static int elf_write_core_dump_user_data(core_dump_elf_t *self)
return total_sz;
}
#if CONFIG_ESP_TASK_WDT_EN
static void elf_write_core_dump_note_cb(void *opaque, const char *data)
{
core_dump_elf_opaque_t *param = opaque;
core_dump_elf_t *self = param->self;
const size_t data_len = strlen(data);
ESP_COREDUMP_LOG_PROCESS("Core dump note cb data_len:(%d)", data_len);
param->total_size += data_len;
if (!param->size_only) {
esp_err_t err = self->write_cfg->write(self->write_cfg->priv, (void *)data, data_len);
if (err != ESP_OK) {
param->total_size = 0;
}
}
}
static int elf_add_wdt_panic_details(core_dump_elf_t *self)
{
uint32_t name_len = ALIGN_UP(sizeof(ELF_ESP_CORE_DUMP_PANIC_DETAILS_NOTE_NAME), 4);
core_dump_elf_opaque_t param = {
.self = self,
.total_size = 0,
.size_only = false,
};
if (self->elf_stage == ELF_STAGE_CALC_SPACE) {
param.size_only = true;
esp_task_wdt_print_triggered_tasks(elf_write_core_dump_note_cb, &param, NULL);
ELF_CHECK_ERR((param.total_size > 0), ELF_PROC_ERR_OTHER, "wdt panic message len is zero!");
self->note_data_size = param.total_size;
} else if (self->elf_stage == ELF_STAGE_PLACE_DATA) {
esp_err_t err = elf_write_note_header(self,
ELF_ESP_CORE_DUMP_PANIC_DETAILS_NOTE_NAME,
name_len,
self->note_data_size,
ELF_ESP_CORE_DUMP_PANIC_DETAILS_TYPE);
if (err != ESP_OK) {
return err;
}
esp_task_wdt_print_triggered_tasks(elf_write_core_dump_note_cb, &param, NULL);
ELF_CHECK_ERR((param.total_size > 0), ELF_PROC_ERR_WRITE_FAIL, "Write ELF note data failure (%d)", err);
const uint32_t mod = self->note_data_size & 3;
if (mod != 0) {
uint8_t pad_bytes[3] = {0};
uint32_t pad_size = 4 - mod;
ESP_COREDUMP_LOG_PROCESS("Core dump note needs %d bytes padding", pad_size);
err = self->write_cfg->write(self->write_cfg->priv, pad_bytes, pad_size);
ELF_CHECK_ERR((err == ESP_OK), ELF_PROC_ERR_WRITE_FAIL, "Write ELF note padding failure (%d)", err);
}
}
return ALIGN_UP(name_len + ALIGN_UP(self->note_data_size, 4) + sizeof(elf_note), 4);
}
#endif //CONFIG_ESP_TASK_WDT_EN
static int elf_write_core_dump_info(core_dump_elf_t *self)
{
void *extra_info = NULL;
@ -530,13 +614,22 @@ static int elf_write_core_dump_info(core_dump_elf_t *self)
ELF_CHECK_ERR((ret > 0), ret, "Extra info note write failed. Returned (%d).", ret);
data_len += ret;
#if CONFIG_ESP_TASK_WDT_EN
extern bool g_twdt_isr;
if (g_twdt_isr) {
ret = elf_add_wdt_panic_details(self);
if (ret <= 0) { return ret; }
data_len += ret;
}
#endif
if (g_panic_abort_details && strlen(g_panic_abort_details) > 0) {
ret = elf_add_note(self,
"ESP_PANIC_DETAILS",
ELF_ESP_CORE_DUMP_PANIC_DETAILS_TYPE,
g_panic_abort_details,
strlen(g_panic_abort_details));
ELF_CHECK_ERR((ret > 0), ret, "Panic details note write failed. Returned (%d).", ret);
ELF_ESP_CORE_DUMP_PANIC_DETAILS_NOTE_NAME,
ELF_ESP_CORE_DUMP_PANIC_DETAILS_TYPE,
g_panic_abort_details,
strlen(g_panic_abort_details));
ELF_CHECK_ERR((ret > 0), ret, "Panic details note write failed. Returned (%d).", ret);
data_len += ret;
}
@ -769,8 +862,7 @@ static void esp_core_dump_parse_note_section(uint8_t *coredump_data, elf_note_co
break;
}
}
consumed_note_sz += note->n_namesz + note->n_descsz + sizeof(elf_note);
ALIGN(4, consumed_note_sz);
consumed_note_sz += ALIGN_UP(note->n_namesz + note->n_descsz + sizeof(elf_note), 4);
}
}
}

View File

@ -123,10 +123,19 @@ def test_task_wdt_cpu0(dut: PanicTestDut, config: str, test_func_name: str) -> N
dut.expect_elf_sha256()
dut.expect_none('Guru Meditation')
coredump_pattern = (PANIC_ABORT_PREFIX +
'Task watchdog got triggered. '
'The following tasks/users did not reset the watchdog in time:\n - ')
if dut.is_multi_core:
coredump_pattern += 'IDLE0 (CPU 0)'
else:
coredump_pattern += 'IDLE (CPU 0)'
common_test(
dut,
config,
expected_backtrace=get_default_backtrace(test_func_name),
expected_coredump=[coredump_pattern]
)
@ -151,10 +160,14 @@ def test_task_wdt_cpu1(dut: PanicTestDut, config: str, test_func_name: str) -> N
dut.expect_elf_sha256()
dut.expect_none('Guru Meditation')
coredump_pattern = (PANIC_ABORT_PREFIX +
'Task watchdog got triggered. '
'The following tasks/users did not reset the watchdog in time:\n - IDLE1 (CPU 1)')
common_test(
dut,
config,
expected_backtrace=expected_backtrace,
expected_coredump=[coredump_pattern]
)
@ -183,10 +196,14 @@ def test_task_wdt_both_cpus(dut: PanicTestDut, config: str, test_func_name: str)
dut.expect_elf_sha256()
dut.expect_none('Guru Meditation')
coredump_pattern = (PANIC_ABORT_PREFIX +
'Task watchdog got triggered. '
'The following tasks/users did not reset the watchdog in time:\n - IDLE1 (CPU 1)\n - IDLE0 (CPU 0)')
common_test(
dut,
config,
expected_backtrace=expected_backtrace,
expected_coredump=[coredump_pattern]
)

View File

@ -51,6 +51,10 @@ class PanicTestDut(IdfDut):
def is_xtensa(self) -> bool:
return self.target in self.XTENSA_TARGETS
@property
def is_multi_core(self) -> bool:
return self.target in ['esp32', 'esp32s3']
def run_test_func(self, test_func_name: str) -> None:
self.expect_exact('Enter test name:')
self.write(test_func_name)