From 256ef127ddde51de5eb07659656bbbaf62a16e4d Mon Sep 17 00:00:00 2001 From: Armando Date: Thu, 12 Sep 2024 09:52:52 +0800 Subject: [PATCH 1/2] fix(linker): fixed extern linker symbol type from int to char --- components/esp_psram/mmu_psram_flash.c | 16 ++++++++++++---- components/esp_psram/mmu_psram_flash_v2.c | 16 ++++++++++++---- components/esp_system/port/cpu_start.c | 17 +++++++++++++---- components/spi_flash/flash_mmap.c | 8 ++++---- 4 files changed, 41 insertions(+), 16 deletions(-) diff --git a/components/esp_psram/mmu_psram_flash.c b/components/esp_psram/mmu_psram_flash.c index 64baaaa36c..bb9288410a 100644 --- a/components/esp_psram/mmu_psram_flash.c +++ b/components/esp_psram/mmu_psram_flash.c @@ -136,10 +136,18 @@ esp_err_t mmu_config_psram_rodata_segment(uint32_t start_page, uint32_t psram_si /*---------------------------------------------------------------------------- Part 2 APIs (See @Backgrounds on top of this file) -------------------------------------------------------------------------------*/ -extern int _instruction_reserved_start; -extern int _instruction_reserved_end; -extern int _rodata_reserved_start; -extern int _rodata_reserved_end; +/** + * If using `int`, then for CLANG, with enabled optimization when inlined function is provided with the address of external symbol, the two least bits of the constant used inside that function get cleared. + * Optimizer assumes that address of external symbol should be aligned to 4-bytes and therefore aligns constant value used for bitwise AND operation with that address. + * + * This means `extern int _instruction_reserved_start;` can be unaligned to 4 bytes, whereas using `char` can solve this issue. 
+ * + * As we only use the addresses of these symbols, we declare them as `char` here + */ +extern char _instruction_reserved_start; +extern char _instruction_reserved_end; +extern char _rodata_reserved_start; +extern char _rodata_reserved_end; //------------------------------------Copy Flash .text to PSRAM-------------------------------------// #if CONFIG_SPIRAM_FETCH_INSTRUCTIONS diff --git a/components/esp_psram/mmu_psram_flash_v2.c b/components/esp_psram/mmu_psram_flash_v2.c index be78092322..cf5cd0e2f0 100644 --- a/components/esp_psram/mmu_psram_flash_v2.c +++ b/components/esp_psram/mmu_psram_flash_v2.c @@ -28,10 +28,18 @@ #define ALIGN_UP_BY(num, align) (((num) + ((align) - 1)) & ~((align) - 1)) #define ALIGN_DOWN_BY(num, align) ((num) & (~((align) - 1))) -extern int _instruction_reserved_start; -extern int _instruction_reserved_end; -extern int _rodata_reserved_start; -extern int _rodata_reserved_end; +/** + * If using `int`, then for CLANG, with enabled optimization when inlined function is provided with the address of external symbol, the two least bits of the constant used inside that function get cleared. + * Optimizer assumes that address of external symbol should be aligned to 4-bytes and therefore aligns constant value used for bitwise AND operation with that address. + * + * This means `extern int _instruction_reserved_start;` can be unaligned to 4 bytes, whereas using `char` can solve this issue. 
+ * + * As we only use the addresses of these symbols, we declare them as `char` here + */ +extern char _instruction_reserved_start; +extern char _instruction_reserved_end; +extern char _rodata_reserved_start; +extern char _rodata_reserved_end; const static char *TAG = "mmu_psram"; static uint32_t s_irom_vaddr_start; diff --git a/components/esp_system/port/cpu_start.c b/components/esp_system/port/cpu_start.c index 0edd50c156..5d4e92ffae 100644 --- a/components/esp_system/port/cpu_start.c +++ b/components/esp_system/port/cpu_start.c @@ -136,10 +136,19 @@ extern int _rtc_bss_end; extern int _bss_bt_start; extern int _bss_bt_end; #endif // CONFIG_BT_LE_RELEASE_IRAM_SUPPORTED -extern int _instruction_reserved_start; -extern int _instruction_reserved_end; -extern int _rodata_reserved_start; -extern int _rodata_reserved_end; + +/** + * If using `int`, then for CLANG, with enabled optimization when inlined function is provided with the address of external symbol, the two least bits of the constant used inside that function get cleared. + * Optimizer assumes that address of external symbol should be aligned to 4-bytes and therefore aligns constant value used for bitwise AND operation with that address. + * + * This means `extern int _instruction_reserved_start;` can be unaligned to 4 bytes, whereas using `char` can solve this issue. 
+ * + * As we only use the addresses of these symbols, we declare them as `char` here + */ +extern char _instruction_reserved_start; +extern char _instruction_reserved_end; +extern char _rodata_reserved_start; +extern char _rodata_reserved_end; extern int _vector_table; #if SOC_INT_CLIC_SUPPORTED diff --git a/components/spi_flash/flash_mmap.c b/components/spi_flash/flash_mmap.c index 28e3f40959..19d347889a 100644 --- a/components/spi_flash/flash_mmap.c +++ b/components/spi_flash/flash_mmap.c @@ -36,13 +36,13 @@ #include "spi_flash_mmap.h" #if CONFIG_SPIRAM_FETCH_INSTRUCTIONS -extern int _instruction_reserved_start; -extern int _instruction_reserved_end; +extern char _instruction_reserved_start; +extern char _instruction_reserved_end; #endif #if CONFIG_SPIRAM_RODATA -extern int _rodata_reserved_start; -extern int _rodata_reserved_end; +extern char _rodata_reserved_start; +extern char _rodata_reserved_end; #endif #if !CONFIG_SPI_FLASH_ROM_IMPL From 6c4fb48783f1bd01ab254e9e3da1a820cc3e3d33 Mon Sep 17 00:00:00 2001 From: Armando Date: Thu, 12 Sep 2024 13:09:01 +0800 Subject: [PATCH 2/2] doc(psram): clarify why psram speed is faster than flash --- .../soc/esp32c5/include/soc/Kconfig.soc_caps.in | 4 ++++ components/soc/esp32c5/include/soc/soc_caps.h | 3 +++ .../soc/esp32c61/include/soc/Kconfig.soc_caps.in | 4 ++++ components/soc/esp32c61/include/soc/soc_caps.h | 3 +++ docs/en/api-guides/external-ram.rst | 14 +++++++++++--- docs/zh_CN/api-guides/external-ram.rst | 12 +++++++++--- examples/system/xip_from_psram/README.md | 4 ++-- .../main/xip_from_psram_example_main.c | 12 +++++++++--- .../system/xip_from_psram/pytest_xip_from_psram.py | 2 ++ 9 files changed, 47 insertions(+), 11 deletions(-) diff --git a/components/soc/esp32c5/include/soc/Kconfig.soc_caps.in b/components/soc/esp32c5/include/soc/Kconfig.soc_caps.in index 8740fab913..b6deae6d49 100644 --- a/components/soc/esp32c5/include/soc/Kconfig.soc_caps.in +++ b/components/soc/esp32c5/include/soc/Kconfig.soc_caps.in @@ -1007,6 
+1007,10 @@ config SOC_SPI_MAX_PRE_DIVIDER int default 16 +config SOC_SPIRAM_XIP_SUPPORTED + bool + default y + config SOC_SPI_MEM_SUPPORT_AUTO_WAIT_IDLE bool default y diff --git a/components/soc/esp32c5/include/soc/soc_caps.h b/components/soc/esp32c5/include/soc/soc_caps.h index c4fd2d2547..6f2418d8c7 100644 --- a/components/soc/esp32c5/include/soc/soc_caps.h +++ b/components/soc/esp32c5/include/soc/soc_caps.h @@ -426,6 +426,9 @@ #define SOC_MEMSPI_IS_INDEPENDENT 1 #define SOC_SPI_MAX_PRE_DIVIDER 16 +/*-------------------------- SPIRAM CAPS ----------------------------------------*/ +#define SOC_SPIRAM_XIP_SUPPORTED 1 + /*-------------------------- SPI MEM CAPS ---------------------------------------*/ #define SOC_SPI_MEM_SUPPORT_AUTO_WAIT_IDLE (1) #define SOC_SPI_MEM_SUPPORT_AUTO_SUSPEND (1) diff --git a/components/soc/esp32c61/include/soc/Kconfig.soc_caps.in b/components/soc/esp32c61/include/soc/Kconfig.soc_caps.in index e825165289..5288c73e64 100644 --- a/components/soc/esp32c61/include/soc/Kconfig.soc_caps.in +++ b/components/soc/esp32c61/include/soc/Kconfig.soc_caps.in @@ -507,6 +507,10 @@ config SOC_MEMSPI_IS_INDEPENDENT bool default y +config SOC_SPIRAM_XIP_SUPPORTED + bool + default y + config SOC_SPI_MEM_SUPPORT_AUTO_WAIT_IDLE bool default y diff --git a/components/soc/esp32c61/include/soc/soc_caps.h b/components/soc/esp32c61/include/soc/soc_caps.h index c56d34abbb..e02eced912 100644 --- a/components/soc/esp32c61/include/soc/soc_caps.h +++ b/components/soc/esp32c61/include/soc/soc_caps.h @@ -309,6 +309,9 @@ #define SOC_SPI_PERIPH_SUPPORT_MULTILINE_MODE(host_id) ({(void)host_id; 1;}) #define SOC_MEMSPI_IS_INDEPENDENT 1 +/*-------------------------- SPIRAM CAPS ----------------------------------------*/ +#define SOC_SPIRAM_XIP_SUPPORTED 1 + /*-------------------------- SPI MEM CAPS ---------------------------------------*/ #define SOC_SPI_MEM_SUPPORT_AUTO_WAIT_IDLE (1) #define SOC_SPI_MEM_SUPPORT_AUTO_SUSPEND (1) diff --git 
a/docs/en/api-guides/external-ram.rst b/docs/en/api-guides/external-ram.rst index 6648ac83ad..f8cebef14d 100644 --- a/docs/en/api-guides/external-ram.rst +++ b/docs/en/api-guides/external-ram.rst @@ -167,13 +167,13 @@ By applying the macro ``EXT_RAM_NOINIT_ATTR``, data could be moved from the inte The benefits of XiP from PSRAM is: - - PSRAM access speed is faster than Flash access. So the performance is better. + - PSRAM access speed may be faster than Flash access, so the overall application performance may be better. For example, if the PSRAM is an Octal mode (8-line-PSRAM) and is configured to 80 MHz, then it is faster than a Quad flash (4-line-flash) which is configured to 80 MHz. - The cache will not be disabled during an SPI1 flash operation, thus optimizing the code execution performance during SPI1 flash operations. For ISRs, ISR callbacks and data which might be accessed during this period, you do not need to place them in internal RAM, thus internal RAM usage can be optimized. This feature is useful for high throughput peripheral involved applications to improve the performance during SPI1 flash operations. :example:`system/xip_from_psram` demonstrates the usage of XiP from PSRAM, optimizing internal RAM usage and avoiding cache disabling during flash operations from user call (e.g., flash erase/read/write operations). - .. only:: esp32p4 + .. only:: not (esp32s2 or esp32s3) .. _external_ram_config_xip: @@ -182,7 +182,15 @@ By applying the macro ``EXT_RAM_NOINIT_ATTR``, data could be moved from the inte The :ref:`CONFIG_SPIRAM_XIP_FROM_PSRAM` option enables the executable in place (XiP) from PSRAM feature. With this option sections that are normally placed in flash, ``.text`` (for instructions) and ``.rodata`` (for read only data), will be loaded in PSRAM. - With this option enabled, the cache will not be disabled during an SPI1 flash operation, so code that requires executing during an SPI1 flash operation does not have to be placed in internal RAM. 
Because P4 flash and PSRAM are using two separate SPI buses, moving flash content to PSRAM will actually increase the load of the PSRAM MSPI bus, so the exact impact on performance will be dependent on your app usage of PSRAM. For example, as the PSRAM bus speed could be much faster than flash bus speed, if the instructions and data that are used to be in flash are not accessed very frequently, you might get better performance with this option enabled. We suggest doing performance profiling to determine if enabling this option. + With this option enabled, the cache will not be disabled during an SPI1 flash operation, so code that requires executing during an SPI1 flash operation does not have to be placed in internal RAM. + + .. only:: SOC_MMU_PER_EXT_MEM_TARGET + + Because {IDF_TARGET_NAME} flash and PSRAM are using two separate SPI buses, moving flash content to PSRAM will actually increase the load of the PSRAM MSPI bus, so the exact impact on performance will be dependent on your app usage of PSRAM. + + For example, the PSRAM bus speed could be faster than the flash bus speed (e.g., if the PSRAM is in HEX mode (16-line PSRAM on ESP32-P4) and is configured to 200 MHz, then it is much faster than a Quad flash (4-line-flash) which is configured to 80 MHz). + + If the instructions and data that used to be in flash are not accessed very frequently, you should get better performance with this option enabled. We suggest doing performance profiling to determine how enabling this option will impact your system. 
Restrictions ============ diff --git a/docs/zh_CN/api-guides/external-ram.rst b/docs/zh_CN/api-guides/external-ram.rst index 559d44779a..92bfa9eafc 100644 --- a/docs/zh_CN/api-guides/external-ram.rst +++ b/docs/zh_CN/api-guides/external-ram.rst @@ -167,13 +167,13 @@ ESP-IDF 启动过程中,片外 RAM 被映射到数据虚拟地址空间,该 在 PSRAM 中直接执行代码的好处包括: - - PSRAM 访问速度快于 flash,因此性能更好。 + - PSRAM 访问速度可能快于 flash,因此性能可能更好。例如,如果使用的 PSRAM 是八线的,且被配置为 80 MHz,而 flash 是四线的,且被配置为 80 MHz,那么 PSRAM 的访问速度是快于 flash 的。 - 在进行 SPI1 flash 操作期间,cache 仍然保持启用状态,这样可以优化代码执行性能。由于无需把中断服务程序 (ISR)、ISR 回调和在此期间可能被访问的数据放置在片上 RAM 中,片上 RAM 可用于其他用途,从而提高了使用效率。这个特性适用于需要处理大量数据的高吞吐量外设应用,能显著提高 SPI1 flash 操作期间的性能。 :example:`system/xip_from_psram` 演示了如何从 PSRAM 直接执行代码,从而优化内部 RAM 的使用,并避免用户调用 flash 操作(例如闪存擦除/读取/写入操作)时关闭 cache。 - .. only:: esp32p4 + .. only:: not (esp32s2 or esp32s3) .. _external_ram_config_xip: @@ -182,7 +182,13 @@ ESP-IDF 启动过程中,片外 RAM 被映射到数据虚拟地址空间,该 启用 :ref:`CONFIG_SPIRAM_XIP_FROM_PSRAM` 选项后能在 PSRAM 中直接执行代码。通常放置在 flash 中的段,如 ``.text`` 部分的数据(用于指令)和 ``.rodata`` 部分的数据(用于只读数据),将被加载到 PSRAM 中。 - 启用此选项后,SPI1 flash 操作期间 cache 保持启用状态,因此需要执行的代码在此期间不必放置在内部 RAM 中。由于 ESP32-P4 flash 和 PSRAM 使用两个独立的 SPI 总线,将 flash 内容移动到 PSRAM 实际上增加了 PSRAM MSPI 总线的负载,因此访问速度相对较慢。应用程序在运行过程中对 PSRAM 的使用会直接影响整体性能。因此,建议先进行性能分析以确定启用此选项是否会显著影响应用程序性能。 + 启用此选项后,SPI1 flash 操作期间 cache 保持启用状态,因此需要执行的代码在此期间不必放置在内部 RAM 中。 + + .. 
only:: SOC_MMU_PER_EXT_MEM_TARGET + + 由于 {IDF_TARGET_NAME} flash 和 PSRAM 使用两个独立的 SPI 总线,将 flash 内容移动到 PSRAM 实际上增加了 PSRAM MSPI 总线的负载,因此对性能的具体影响取决于应用程序对 PSRAM 的使用情况。 + + 例如,PSRAM 的访问速度可能快于 flash (比如在 ESP32-P4 上,选择的 PSRAM 是十六线的并将其配置为 200 MHz,此时 PSRAM 的访问速度远快于一颗被配置为 80 MHz 的四线 flash 芯片),如果这些之前在 flash 中被就地执行的指令和数据不是十分频繁地被访问,则使能这个选项会增加系统的性能。建议先进行性能分析以确定启用此选项是否会显著影响应用程序性能。 片外 RAM 使用限制 =================== diff --git a/examples/system/xip_from_psram/README.md b/examples/system/xip_from_psram/README.md index 8d54b7287f..a4612395dc 100644 --- a/examples/system/xip_from_psram/README.md +++ b/examples/system/xip_from_psram/README.md @@ -1,5 +1,5 @@ -| Supported Targets | ESP32-P4 | ESP32-S2 | ESP32-S3 | -| ----------------- | -------- | -------- | -------- | +| Supported Targets | ESP32-C5 | ESP32-C61 | ESP32-P4 | ESP32-S2 | ESP32-S3 | +| ----------------- | -------- | --------- | -------- | -------- | -------- | # XIP (Execute-In-Place) From PSRAM Example diff --git a/examples/system/xip_from_psram/main/xip_from_psram_example_main.c b/examples/system/xip_from_psram/main/xip_from_psram_example_main.c index a9c9639c42..c66b91c6e3 100644 --- a/examples/system/xip_from_psram/main/xip_from_psram_example_main.c +++ b/examples/system/xip_from_psram/main/xip_from_psram_example_main.c @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: 2022 Espressif Systems (Shanghai) CO LTD + * SPDX-FileCopyrightText: 2022-2024 Espressif Systems (Shanghai) CO LTD * * SPDX-License-Identifier: Unlicense OR CC0-1.0 */ @@ -14,6 +14,12 @@ #include "esp_flash.h" #include "esp_timer.h" +#if CONFIG_IDF_TARGET_ESP32C5 || CONFIG_IDF_TARGET_ESP32C61 +#define EXAMPLE_TIMER_ALERT_TIME (1 * 3 * 1000) +#else +#define EXAMPLE_TIMER_ALERT_TIME (1 * 10 * 1000) +#endif + static void oneshot_timer_callback(void* arg); static void cb_in_psram(void); static void cb_in_iram(void); @@ -41,14 +47,14 @@ void app_main(void) ESP_LOGI(TAG, "found partition '%s' at offset 0x%"PRIx32" with size 0x%"PRIx32, part->label, part->address, part->size); 
ESP_ERROR_CHECK(esp_flash_erase_region(part->flash_chip, part->address, part->size)); - ESP_ERROR_CHECK(esp_timer_start_once(oneshot_timer, 1 * 10 * 1000)); + ESP_ERROR_CHECK(esp_timer_start_once(oneshot_timer, EXAMPLE_TIMER_ALERT_TIME)); ESP_ERROR_CHECK(esp_flash_erase_region(part->flash_chip, part->address, part->size)); ESP_LOGI(TAG, "callback(in PSRAM) response time: %d us", time_cb_end - time_cb_start); instructions_in_psram = false; - ESP_ERROR_CHECK(esp_timer_start_once(oneshot_timer, 1 * 10 * 1000)); + ESP_ERROR_CHECK(esp_timer_start_once(oneshot_timer, EXAMPLE_TIMER_ALERT_TIME)); ESP_ERROR_CHECK(esp_flash_erase_region(part->flash_chip, part->address, part->size)); ESP_LOGI(TAG, "callback(in IRAM) response time: %d us", time_cb_end - time_cb_start); diff --git a/examples/system/xip_from_psram/pytest_xip_from_psram.py b/examples/system/xip_from_psram/pytest_xip_from_psram.py index a9e9ed9107..53398620f1 100644 --- a/examples/system/xip_from_psram/pytest_xip_from_psram.py +++ b/examples/system/xip_from_psram/pytest_xip_from_psram.py @@ -7,6 +7,8 @@ from pytest_embedded.dut import Dut @pytest.mark.esp32s2 @pytest.mark.esp32s3 @pytest.mark.esp32p4 +@pytest.mark.esp32c5 +@pytest.mark.esp32c61 @pytest.mark.generic # in order to build the default sdkconfig(the CI won't build the sdkconfig.defaults if there is a sdkconfig.ci.xx) @pytest.mark.parametrize(