Mirror of https://github.com/espressif/esp-idf.git (synced 2024-10-05 20:47:46 -04:00)

commit 55ea6b78a7

    Merge branch 'feature/async_memcpy_rewrite' into 'master'

    feat(async_memcpy): refactor driver code to support different DMA backends

    See merge request espressif/esp-idf!25050
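For orientation before the diff itself: the refactor keeps the public API stable while swapping the backend underneath. A minimal usage sketch of that API (esp_async_memcpy_install / esp_async_memcpy / esp_async_memcpy_uninstall, as declared in esp_async_memcpy.h; the 4 KB buffers, the ASYNC_MEMCPY_DEFAULT_CONFIG starting point, and the semaphore used to signal completion are illustrative assumptions, not part of this merge request):

#include <string.h>
#include "freertos/FreeRTOS.h"
#include "freertos/semphr.h"
#include "esp_err.h"
#include "esp_heap_caps.h"
#include "esp_async_memcpy.h"

// ISR-context callback: wake the waiting task once the DMA copy finishes
static bool copy_done_cb(async_memcpy_handle_t mcp, async_memcpy_event_t *event, void *cb_args)
{
    SemaphoreHandle_t done = (SemaphoreHandle_t)cb_args;
    BaseType_t high_task_wakeup = pdFALSE;
    xSemaphoreGiveFromISR(done, &high_task_wakeup);
    return high_task_wakeup == pdTRUE; // tell the ISR whether to yield
}

void example_async_copy(void)
{
    async_memcpy_config_t config = ASYNC_MEMCPY_DEFAULT_CONFIG();
    async_memcpy_handle_t mcp = NULL;
    ESP_ERROR_CHECK(esp_async_memcpy_install(&config, &mcp));

    SemaphoreHandle_t done = xSemaphoreCreateBinary();
    uint8_t *src = heap_caps_malloc(4096, MALLOC_CAP_DMA);
    uint8_t *dst = heap_caps_malloc(4096, MALLOC_CAP_DMA);
    memset(src, 0x5A, 4096);

    ESP_ERROR_CHECK(esp_async_memcpy(mcp, dst, src, 4096, copy_done_cb, done));
    xSemaphoreTake(done, portMAX_DELAY); // block until the EOF callback fires

    ESP_ERROR_CHECK(esp_async_memcpy_uninstall(mcp));
}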
@@ -60,16 +60,18 @@ if(NOT BOOTLOADER_BUILD)
     endif()
 
     if(CONFIG_SOC_GDMA_SUPPORTED)
-        list(APPEND srcs "dma/gdma.c" "dma/async_memcpy_impl_gdma.c")
-    endif()
-
-    if(CONFIG_SOC_CP_DMA_SUPPORTED)
-        list(APPEND srcs "dma/async_memcpy_impl_cp_dma.c")
+        list(APPEND srcs "dma/gdma.c")
     endif()
 
     if(CONFIG_SOC_ASYNC_MEMCPY_SUPPORTED)
         list(APPEND srcs "dma/esp_async_memcpy.c")
-    endif()
+        if(CONFIG_SOC_GDMA_SUPPORTED)
+            list(APPEND srcs "dma/async_memcpy_gdma.c")
+        endif() # CONFIG_SOC_GDMA_SUPPORTED
+        if(CONFIG_SOC_CP_DMA_SUPPORTED)
+            list(APPEND srcs "dma/async_memcpy_cp_dma.c")
+        endif() # CONFIG_SOC_CP_DMA_SUPPORTED
+    endif() # CONFIG_SOC_ASYNC_MEMCPY_SUPPORTED
 
     if(CONFIG_SOC_GDMA_SUPPORT_ETM)
         list(APPEND srcs "dma/gdma_etm.c")
components/esp_hw_support/dma/async_memcpy_cp_dma.c (new file, 358 lines)
@@ -0,0 +1,358 @@
/*
 * SPDX-FileCopyrightText: 2020-2023 Espressif Systems (Shanghai) CO LTD
 *
 * SPDX-License-Identifier: Apache-2.0
 */

#include <string.h>
#include <stdatomic.h>
#include <sys/queue.h>
#include <sys/param.h>
#include "sdkconfig.h"
#include "freertos/FreeRTOS.h"
#include "soc/soc_caps.h"
#include "soc/interrupts.h"
#include "esp_log.h"
#include "esp_check.h"
#include "esp_attr.h"
#include "esp_err.h"
#include "esp_intr_alloc.h"
#include "esp_memory_utils.h"
#include "esp_async_memcpy.h"
#include "esp_async_memcpy_priv.h"
#include "hal/cp_dma_hal.h"
#include "hal/cp_dma_ll.h"
#include "hal/dma_types.h"

static const char *TAG = "async_mcp.cpdma";

/// @brief Transaction object for async memcpy
/// @note - The DMA descriptors need to be 4-byte aligned
/// @note - The DMA descriptor link list is allocated dynamically from DMA-able memory
/// @note - Because of the eof_node, the transaction object should also be allocated from DMA-able memory
typedef struct async_memcpy_transaction_t {
    dma_descriptor_align4_t eof_node;      // this is the DMA node which acts as the EOF descriptor (RX path only)
    dma_descriptor_align4_t *tx_desc_link; // descriptor link list, the length of the link is determined by the copy buffer size
    dma_descriptor_align4_t *rx_desc_link; // descriptor link list, the length of the link is determined by the copy buffer size
    intptr_t tx_start_desc_addr;           // TX start descriptor address
    intptr_t rx_start_desc_addr;           // RX start descriptor address
    async_memcpy_isr_cb_t cb;              // user callback
    void *cb_args;                         // user callback args
    STAILQ_ENTRY(async_memcpy_transaction_t) idle_queue_entry;  // Entry for the idle queue
    STAILQ_ENTRY(async_memcpy_transaction_t) ready_queue_entry; // Entry for the ready queue
} async_memcpy_transaction_t;

/// @brief Context of async memcpy driver
/// @note - It maintains two queues: one for idle transaction objects, one for ready transaction objects
/// @note - Transaction objects are allocated from DMA-able memory
/// @note - The number of transaction objects is determined by the backlog parameter
typedef struct {
    async_memcpy_context_t parent;  // Parent IO interface
    size_t sram_trans_align;        // DMA transfer alignment (both in size and address) for SRAM memory
    size_t max_single_dma_buffer;   // max DMA buffer size by a single descriptor
    cp_dma_hal_context_t hal;       // CPDMA HAL
    intr_handle_t intr;             // CPDMA interrupt handle
    portMUX_TYPE spin_lock;         // spin lock to prevent threads and the ISR from accessing the same resource simultaneously
    _Atomic async_memcpy_fsm_t fsm; // driver state machine, state changes should be atomic
    async_memcpy_transaction_t *transaction_pool; // transaction object pool
    STAILQ_HEAD(, async_memcpy_transaction_t) idle_queue_head;  // Head of the idle queue
    STAILQ_HEAD(, async_memcpy_transaction_t) ready_queue_head; // Head of the ready queue
} async_memcpy_cpdma_context_t;
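A note on the layout above: eof_node is deliberately the first member of the transaction, and the RX EOF interrupt handler later in this file maps the descriptor address reported by the hardware back to its owning transaction with __containerof. A standalone sketch of that idiom (generic names; nothing below is taken from the driver itself):

#include <stddef.h>
#include <stdio.h>

// same idea as __containerof in the driver: recover the struct address
// from the address of one of its members
#define my_containerof(ptr, type, member) \
    ((type *)((char *)(ptr) - offsetof(type, member)))

typedef struct {
    int eof_node; // stands in for the DMA EOF descriptor
    int payload;
} fake_transaction_t;

int main(void)
{
    fake_transaction_t trans = { .payload = 42 };
    int *eof = &trans.eof_node; // what the "ISR" would get back from the hardware
    fake_transaction_t *t = my_containerof(eof, fake_transaction_t, eof_node);
    printf("%d\n", t->payload); // prints 42
    return 0;
}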
static void mcp_default_isr_handler(void *args);
static esp_err_t mcp_cpdma_del(async_memcpy_context_t *ctx);
static esp_err_t mcp_cpdma_memcpy(async_memcpy_context_t *ctx, void *dst, void *src, size_t n, async_memcpy_isr_cb_t cb_isr, void *cb_args);

static esp_err_t mcp_cpdma_destroy(async_memcpy_cpdma_context_t *mcp_dma)
{
    if (mcp_dma->transaction_pool) {
        free(mcp_dma->transaction_pool);
    }
    if (mcp_dma->intr) {
        esp_intr_free(mcp_dma->intr);
    }
    if (mcp_dma->hal.dev) { // non-NULL means the HAL has been initialized
        cp_dma_hal_stop(&mcp_dma->hal);
        cp_dma_hal_deinit(&mcp_dma->hal);
    }
    free(mcp_dma);
    return ESP_OK;
}

esp_err_t esp_async_memcpy_install_cpdma(const async_memcpy_config_t *config, async_memcpy_handle_t *mcp)
{
    esp_err_t ret = ESP_OK;
    async_memcpy_cpdma_context_t *mcp_dma = NULL;
    ESP_RETURN_ON_FALSE(config && mcp, ESP_ERR_INVALID_ARG, TAG, "invalid argument");
    // allocate memory for the driver context from internal memory
    mcp_dma = heap_caps_calloc(1, sizeof(async_memcpy_cpdma_context_t), MALLOC_CAP_INTERNAL | MALLOC_CAP_8BIT);
    ESP_GOTO_ON_FALSE(mcp_dma, ESP_ERR_NO_MEM, err, TAG, "no mem for driver context");
    uint32_t trans_queue_len = config->backlog ? config->backlog : DEFAULT_TRANSACTION_QUEUE_LENGTH;
    // allocate memory for the transaction pool, 4-byte aligned because trans->eof_node requires that alignment
    mcp_dma->transaction_pool = heap_caps_aligned_calloc(4, trans_queue_len, sizeof(async_memcpy_transaction_t),
                                                         MALLOC_CAP_INTERNAL | MALLOC_CAP_8BIT | MALLOC_CAP_DMA);
    ESP_GOTO_ON_FALSE(mcp_dma->transaction_pool, ESP_ERR_NO_MEM, err, TAG, "no mem for transaction pool");

    // Init HAL context
    cp_dma_hal_config_t hal_config = {};
    cp_dma_hal_init(&mcp_dma->hal, &hal_config);

    ESP_GOTO_ON_ERROR(esp_intr_alloc(ETS_DMA_COPY_INTR_SOURCE, 0, mcp_default_isr_handler, mcp_dma, &mcp_dma->intr),
                      err, TAG, "install isr failed");

    // initialize transaction queues
    STAILQ_INIT(&mcp_dma->idle_queue_head);
    STAILQ_INIT(&mcp_dma->ready_queue_head);
    // pick transactions from the pool and insert them into the idle queue
    for (int i = 0; i < trans_queue_len; i++) {
        STAILQ_INSERT_TAIL(&mcp_dma->idle_queue_head, &mcp_dma->transaction_pool[i], idle_queue_entry);
    }

    // initialize other members
    portMUX_INITIALIZE(&mcp_dma->spin_lock);
    atomic_init(&mcp_dma->fsm, MCP_FSM_IDLE);
    mcp_dma->sram_trans_align = config->sram_trans_align;
    size_t trans_align = config->sram_trans_align;
    mcp_dma->max_single_dma_buffer = trans_align ? ALIGN_DOWN(DMA_DESCRIPTOR_BUFFER_MAX_SIZE, trans_align) : DMA_DESCRIPTOR_BUFFER_MAX_SIZE;
    mcp_dma->parent.del = mcp_cpdma_del;
    mcp_dma->parent.memcpy = mcp_cpdma_memcpy;
    // return driver object
    *mcp = &mcp_dma->parent;
    return ESP_OK;

err:
    if (mcp_dma) {
        mcp_cpdma_destroy(mcp_dma);
    }
    return ret;
}

esp_err_t esp_async_memcpy_install(const async_memcpy_config_t *config, async_memcpy_handle_t *asmcp)
    __attribute__((alias("esp_async_memcpy_install_cpdma")));

static esp_err_t mcp_cpdma_del(async_memcpy_context_t *ctx)
{
    async_memcpy_cpdma_context_t *mcp_dma = __containerof(ctx, async_memcpy_cpdma_context_t, parent);
    // check if there are pending transactions
    ESP_RETURN_ON_FALSE(STAILQ_EMPTY(&mcp_dma->ready_queue_head), ESP_ERR_INVALID_STATE, TAG, "there are pending transactions");
    // check if the driver is in IDLE state
    ESP_RETURN_ON_FALSE(atomic_load(&mcp_dma->fsm) == MCP_FSM_IDLE, ESP_ERR_INVALID_STATE, TAG, "driver is not in IDLE state");
    return mcp_cpdma_destroy(mcp_dma);
}

static void mount_tx_buffer_to_dma(dma_descriptor_align4_t *desc_array, int num_desc,
                                   uint8_t *buf, size_t buf_sz, size_t max_single_dma_buffer)
{
    uint32_t prepared_length = 0;
    size_t len = buf_sz;
    for (int i = 0; i < num_desc - 1; i++) {
        desc_array[i].buffer = &buf[prepared_length];
        desc_array[i].dw0.owner = DMA_DESCRIPTOR_BUFFER_OWNER_DMA;
        desc_array[i].dw0.suc_eof = 0;
        desc_array[i].dw0.size = max_single_dma_buffer;
        desc_array[i].dw0.length = max_single_dma_buffer;
        desc_array[i].next = &desc_array[i + 1];
        prepared_length += max_single_dma_buffer;
        len -= max_single_dma_buffer;
    }
    // take special care of the EOF descriptor
    desc_array[num_desc - 1].buffer = &buf[prepared_length];
    desc_array[num_desc - 1].next = NULL;
    desc_array[num_desc - 1].dw0.owner = DMA_DESCRIPTOR_BUFFER_OWNER_DMA;
    desc_array[num_desc - 1].dw0.suc_eof = 1;
    desc_array[num_desc - 1].dw0.size = len;
    desc_array[num_desc - 1].dw0.length = len;
}
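To make the descriptor math above concrete, a tiny self-check (assuming DMA_DESCRIPTOR_BUFFER_MAX_SIZE is 4095 and no extra alignment constraint; both are assumptions for the sketch):

#include <stddef.h>
#include <stdio.h>

int main(void)
{
    size_t max_single = 4095; // max payload carried by one descriptor
    size_t n = 10000;         // bytes to copy
    unsigned num_desc = (n + max_single - 1) / max_single; // ceiling division -> 3
    size_t tail = n - (num_desc - 1) * max_single;         // payload of the EOF node
    printf("%u descriptors, EOF node carries %zu bytes\n", num_desc, tail); // 3, 1810
    return 0;
}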
static void mount_rx_buffer_to_dma(dma_descriptor_align4_t *desc_array, int num_desc, dma_descriptor_align4_t *eof_desc,
                                   uint8_t *buf, size_t buf_sz, size_t max_single_dma_buffer)
{
    uint32_t prepared_length = 0;
    size_t len = buf_sz;
    if (desc_array) {
        assert(num_desc > 0);
        for (int i = 0; i < num_desc; i++) {
            desc_array[i].buffer = &buf[prepared_length];
            desc_array[i].dw0.owner = DMA_DESCRIPTOR_BUFFER_OWNER_DMA;
            desc_array[i].dw0.size = max_single_dma_buffer;
            desc_array[i].dw0.length = max_single_dma_buffer;
            desc_array[i].next = &desc_array[i + 1];
            prepared_length += max_single_dma_buffer;
            len -= max_single_dma_buffer;
        }
        desc_array[num_desc - 1].next = eof_desc;
    }
    eof_desc->buffer = &buf[prepared_length];
    eof_desc->next = NULL;
    eof_desc->dw0.owner = DMA_DESCRIPTOR_BUFFER_OWNER_DMA;
    eof_desc->dw0.size = len;
    eof_desc->dw0.length = len;
}

/// @brief Helper function to get one transaction from the ready queue
/// @note This function is allowed to be called in ISR context
static async_memcpy_transaction_t *try_pop_trans_from_ready_queue(async_memcpy_cpdma_context_t *mcp_dma)
{
    async_memcpy_transaction_t *trans = NULL;
    portENTER_CRITICAL_SAFE(&mcp_dma->spin_lock);
    trans = STAILQ_FIRST(&mcp_dma->ready_queue_head);
    if (trans) {
        STAILQ_REMOVE_HEAD(&mcp_dma->ready_queue_head, ready_queue_entry);
    }
    portEXIT_CRITICAL_SAFE(&mcp_dma->spin_lock);
    return trans;
}

/// @brief Helper function to start a pending transaction
/// @note This function is allowed to be called in ISR context
static void try_start_pending_transaction(async_memcpy_cpdma_context_t *mcp_dma)
{
    async_memcpy_fsm_t expected_fsm = MCP_FSM_IDLE;
    async_memcpy_transaction_t *trans = NULL;
    if (atomic_compare_exchange_strong(&mcp_dma->fsm, &expected_fsm, MCP_FSM_RUN_WAIT)) {
        trans = try_pop_trans_from_ready_queue(mcp_dma);
        if (trans) {
            atomic_store(&mcp_dma->fsm, MCP_FSM_RUN);
            cp_dma_hal_set_desc_base_addr(&mcp_dma->hal, trans->tx_start_desc_addr, trans->rx_start_desc_addr);
            cp_dma_hal_start(&mcp_dma->hal); // enable DMA and interrupt
        } else {
            atomic_store(&mcp_dma->fsm, MCP_FSM_IDLE);
        }
    }
}

/// @brief Helper function to get one transaction from the idle queue
/// @note This function is allowed to be called in ISR context
static async_memcpy_transaction_t *try_pop_trans_from_idle_queue(async_memcpy_cpdma_context_t *mcp_dma)
{
    async_memcpy_transaction_t *trans = NULL;
    portENTER_CRITICAL_SAFE(&mcp_dma->spin_lock);
    trans = STAILQ_FIRST(&mcp_dma->idle_queue_head);
    if (trans) {
        STAILQ_REMOVE_HEAD(&mcp_dma->idle_queue_head, idle_queue_entry);
    }
    portEXIT_CRITICAL_SAFE(&mcp_dma->spin_lock);
    return trans;
}

static esp_err_t mcp_cpdma_memcpy(async_memcpy_context_t *ctx, void *dst, void *src, size_t n, async_memcpy_isr_cb_t cb_isr, void *cb_args)
{
    esp_err_t ret = ESP_OK;
    async_memcpy_cpdma_context_t *mcp_dma = __containerof(ctx, async_memcpy_cpdma_context_t, parent);
    ESP_RETURN_ON_FALSE(esp_ptr_internal(src) && esp_ptr_internal(dst), ESP_ERR_INVALID_ARG, TAG, "CP_DMA can only access SRAM");
    // alignment check
    if (mcp_dma->sram_trans_align) {
        ESP_RETURN_ON_FALSE((((intptr_t)dst & (mcp_dma->sram_trans_align - 1)) == 0), ESP_ERR_INVALID_ARG, TAG, "buffer address not aligned: %p -> %p", src, dst);
        ESP_RETURN_ON_FALSE(((n & (mcp_dma->sram_trans_align - 1)) == 0), ESP_ERR_INVALID_ARG, TAG,
                            "copy size should align to %d bytes", mcp_dma->sram_trans_align);
    }
    async_memcpy_transaction_t *trans = NULL;
    // pick one transaction node from the idle queue
    trans = try_pop_trans_from_idle_queue(mcp_dma);
    // check if we got the transaction object successfully
    ESP_RETURN_ON_FALSE(trans, ESP_ERR_INVALID_STATE, TAG, "no free node in the idle queue");

    // calculate how many descriptors we need
    size_t max_single_dma_buffer = mcp_dma->max_single_dma_buffer;
    uint32_t num_desc_per_path = (n + max_single_dma_buffer - 1) / max_single_dma_buffer;
    // allocate DMA descriptors, which need strict alignment
    trans->tx_desc_link = heap_caps_aligned_calloc(4, num_desc_per_path, sizeof(dma_descriptor_align4_t),
                                                   MALLOC_CAP_INTERNAL | MALLOC_CAP_8BIT | MALLOC_CAP_DMA);
    ESP_GOTO_ON_FALSE(trans->tx_desc_link, ESP_ERR_NO_MEM, err, TAG, "no mem for DMA descriptors");
    // we don't have to allocate the EOF descriptor: trans->eof_node is used as the RX EOF descriptor
    if (num_desc_per_path > 1) {
        trans->rx_desc_link = heap_caps_aligned_calloc(4, num_desc_per_path - 1, sizeof(dma_descriptor_align4_t),
                                                       MALLOC_CAP_INTERNAL | MALLOC_CAP_8BIT | MALLOC_CAP_DMA);
        ESP_GOTO_ON_FALSE(trans->rx_desc_link, ESP_ERR_NO_MEM, err, TAG, "no mem for DMA descriptors");
    } else {
        // small copy buffer: trans->eof_node alone is sufficient
        trans->rx_desc_link = NULL;
    }

    // (preload) mount src data to the TX descriptors
    mount_tx_buffer_to_dma(trans->tx_desc_link, num_desc_per_path, src, n, max_single_dma_buffer);
    // (preload) mount dst data to the RX descriptors
    mount_rx_buffer_to_dma(trans->rx_desc_link, num_desc_per_path - 1, &trans->eof_node, dst, n, max_single_dma_buffer);

    // save other transaction context
    trans->cb = cb_isr;
    trans->cb_args = cb_args;
    trans->tx_start_desc_addr = (intptr_t)trans->tx_desc_link;
    trans->rx_start_desc_addr = trans->rx_desc_link ? (intptr_t)trans->rx_desc_link : (intptr_t)&trans->eof_node;

    portENTER_CRITICAL(&mcp_dma->spin_lock);
    // insert the transaction into the ready queue
    STAILQ_INSERT_TAIL(&mcp_dma->ready_queue_head, trans, ready_queue_entry);
    portEXIT_CRITICAL(&mcp_dma->spin_lock);

    // check the driver state: if there's no running transaction, start this new one
    try_start_pending_transaction(mcp_dma);

    return ESP_OK;

err:
    if (trans) {
        if (trans->tx_desc_link) {
            free(trans->tx_desc_link);
            trans->tx_desc_link = NULL;
        }
        if (trans->rx_desc_link) {
            free(trans->rx_desc_link);
            trans->rx_desc_link = NULL;
        }
        // return the transaction to the idle queue
        portENTER_CRITICAL(&mcp_dma->spin_lock);
        STAILQ_INSERT_TAIL(&mcp_dma->idle_queue_head, trans, idle_queue_entry);
        portEXIT_CRITICAL(&mcp_dma->spin_lock);
    }
    return ret;
}
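The IDLE -> RUN_WAIT -> RUN transitions used by try_start_pending_transaction above form a small lock-free guard: whoever wins the compare-exchange owns the right to program the hardware, and everyone else backs off. A stripped-down sketch of the same pattern (state names and the boolean parameter are made up for the sketch):

#include <stdatomic.h>
#include <stdbool.h>

typedef enum { FSM_IDLE, FSM_RUN_WAIT, FSM_RUN } fsm_t;
static _Atomic fsm_t s_fsm = FSM_IDLE;

// Returns true if this caller won the right to start the hardware.
// Both tasks and the ISR may call this concurrently; the CAS ensures
// only one of them moves the driver out of IDLE at a time.
static bool try_claim_start(bool have_pending_transaction)
{
    fsm_t expected = FSM_IDLE;
    if (!atomic_compare_exchange_strong(&s_fsm, &expected, FSM_RUN_WAIT)) {
        return false; // someone else is already starting or running
    }
    if (!have_pending_transaction) {
        atomic_store(&s_fsm, FSM_IDLE); // nothing to do: roll back to IDLE
        return false;
    }
    // ... program the descriptors and start the DMA here ...
    atomic_store(&s_fsm, FSM_RUN);
    return true;
}

int main(void)
{
    return try_claim_start(true) ? 0 : 1;
}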
static void mcp_default_isr_handler(void *args)
{
    bool need_yield = false;
    async_memcpy_cpdma_context_t *mcp_dma = (async_memcpy_cpdma_context_t *)args;
    // get the interrupt status and clear it
    uint32_t status = cp_dma_hal_get_intr_status(&mcp_dma->hal);
    cp_dma_hal_clear_intr_status(&mcp_dma->hal, status);

    // End-Of-Frame on RX side
    if (status & CP_DMA_LL_EVENT_RX_EOF) {
        dma_descriptor_align4_t *eof_desc = (dma_descriptor_align4_t *)cp_dma_ll_get_rx_eof_descriptor_address(mcp_dma->hal.dev);
        // get the transaction object address from the EOF descriptor address
        async_memcpy_transaction_t *trans = __containerof(eof_desc, async_memcpy_transaction_t, eof_node);

        // switch driver state from RUN to IDLE
        async_memcpy_fsm_t expected_fsm = MCP_FSM_RUN;
        if (atomic_compare_exchange_strong(&mcp_dma->fsm, &expected_fsm, MCP_FSM_IDLE_WAIT)) {
            // invoke the callback registered by the user
            async_memcpy_isr_cb_t cb = trans->cb;
            if (cb) {
                async_memcpy_event_t e = {
                    // No event data for now
                };
                need_yield = cb(&mcp_dma->parent, &e, trans->cb_args);
            }
            // recycle descriptor memory
            free(trans->tx_desc_link);
            free(trans->rx_desc_link);
            trans->tx_desc_link = NULL;
            trans->rx_desc_link = NULL;
            trans->cb = NULL;

            portENTER_CRITICAL_ISR(&mcp_dma->spin_lock);
            // insert the transaction object back into the idle queue
            STAILQ_INSERT_TAIL(&mcp_dma->idle_queue_head, trans, idle_queue_entry);
            portEXIT_CRITICAL_ISR(&mcp_dma->spin_lock);

            atomic_store(&mcp_dma->fsm, MCP_FSM_IDLE);
        }

        // try to start the next pending transaction
        try_start_pending_transaction(mcp_dma);
    }

    if (need_yield) {
        portYIELD_FROM_ISR();
    }
}
components/esp_hw_support/dma/async_memcpy_gdma.c (new file, 496 lines)
@@ -0,0 +1,496 @@
/*
 * SPDX-FileCopyrightText: 2020-2023 Espressif Systems (Shanghai) CO LTD
 *
 * SPDX-License-Identifier: Apache-2.0
 */

#include <string.h>
#include <stdatomic.h>
#include <sys/queue.h>
#include <sys/param.h>
#include "sdkconfig.h"
#include "freertos/FreeRTOS.h"
#include "soc/soc_caps.h"
#include "esp_log.h"
#include "esp_check.h"
#include "esp_attr.h"
#include "esp_err.h"
#include "esp_private/gdma.h"
#include "esp_memory_utils.h"
#include "esp_async_memcpy.h"
#include "esp_async_memcpy_priv.h"
#include "hal/dma_types.h"
#include "hal/cache_hal.h"
#include "rom/cache.h"

static const char *TAG = "async_mcp.gdma";

#define MCP_NEEDS_INVALIDATE_DST_CACHE  CONFIG_IDF_TARGET_ESP32P4
#define MCP_NEEDS_WRITE_BACK_SRC_CACHE  CONFIG_IDF_TARGET_ESP32P4
#define MCP_NEEDS_WRITE_BACK_DESC_CACHE CONFIG_IDF_TARGET_ESP32P4

#if SOC_AXI_GDMA_SUPPORTED
#define MCP_DMA_DESC_ALIGN 64
typedef dma_descriptor_align8_t mcp_dma_descriptor_t;
#elif SOC_AHB_GDMA_SUPPORTED
#define MCP_DMA_DESC_ALIGN 32
typedef dma_descriptor_align4_t mcp_dma_descriptor_t;
#else
#error "Unsupported GDMA type"
#endif

/// @brief Transaction object for async memcpy
/// @note - GDMA requires the DMA descriptors to be 4- or 8-byte aligned
/// @note - The DMA descriptor link list is allocated dynamically from DMA-able memory
/// @note - Because of the eof_node, the transaction object should also be allocated from DMA-able memory
typedef struct async_memcpy_transaction_t {
    mcp_dma_descriptor_t eof_node;      // this is the DMA node which acts as the EOF descriptor (RX path only)
    mcp_dma_descriptor_t *tx_desc_link; // descriptor link list, the length of the link is determined by the copy buffer size
    mcp_dma_descriptor_t *rx_desc_link; // descriptor link list, the length of the link is determined by the copy buffer size
    intptr_t tx_start_desc_addr;        // TX start descriptor address
    intptr_t rx_start_desc_addr;        // RX start descriptor address
    intptr_t memcpy_dst_addr;           // memcpy destination address
    size_t memcpy_size;                 // memcpy size
    async_memcpy_isr_cb_t cb;           // user callback
    void *cb_args;                      // user callback args
    STAILQ_ENTRY(async_memcpy_transaction_t) idle_queue_entry;  // Entry for the idle queue
    STAILQ_ENTRY(async_memcpy_transaction_t) ready_queue_entry; // Entry for the ready queue
} async_memcpy_transaction_t;

/// @brief Context of async memcpy driver
/// @note - It maintains two queues: one for idle transaction objects, one for ready transaction objects
/// @note - Transaction objects are allocated from DMA-able memory
/// @note - The number of transaction objects is determined by the backlog parameter
typedef struct {
    async_memcpy_context_t parent;    // Parent IO interface
    size_t sram_trans_align;          // DMA transfer alignment (both in size and address) for SRAM memory
    size_t psram_trans_align;         // DMA transfer alignment (both in size and address) for PSRAM memory
    size_t max_single_dma_buffer;     // max DMA buffer size by a single descriptor
    int gdma_bus_id;                  // GDMA bus id (AHB, AXI, etc.)
    gdma_channel_handle_t tx_channel; // GDMA TX channel handle
    gdma_channel_handle_t rx_channel; // GDMA RX channel handle
    portMUX_TYPE spin_lock;           // spin lock to prevent threads and the ISR from accessing the same resource simultaneously
    _Atomic async_memcpy_fsm_t fsm;   // driver state machine, state changes should be atomic
    async_memcpy_transaction_t *transaction_pool; // transaction object pool
    STAILQ_HEAD(, async_memcpy_transaction_t) idle_queue_head;  // Head of the idle queue
    STAILQ_HEAD(, async_memcpy_transaction_t) ready_queue_head; // Head of the ready queue
} async_memcpy_gdma_context_t;
static bool mcp_gdma_rx_eof_callback(gdma_channel_handle_t dma_chan, gdma_event_data_t *event_data, void *user_data);
static esp_err_t mcp_gdma_del(async_memcpy_context_t *ctx);
static esp_err_t mcp_gdma_memcpy(async_memcpy_context_t *ctx, void *dst, void *src, size_t n, async_memcpy_isr_cb_t cb_isr, void *cb_args);
#if SOC_GDMA_SUPPORT_ETM
static esp_err_t mcp_new_etm_event(async_memcpy_context_t *ctx, async_memcpy_etm_event_t event_type, esp_etm_event_handle_t *out_event);
#endif // SOC_GDMA_SUPPORT_ETM

static esp_err_t mcp_gdma_destroy(async_memcpy_gdma_context_t *mcp_gdma)
{
    if (mcp_gdma->transaction_pool) {
        free(mcp_gdma->transaction_pool);
    }
    if (mcp_gdma->tx_channel) {
        gdma_disconnect(mcp_gdma->tx_channel);
        gdma_del_channel(mcp_gdma->tx_channel);
    }
    if (mcp_gdma->rx_channel) {
        gdma_disconnect(mcp_gdma->rx_channel);
        gdma_del_channel(mcp_gdma->rx_channel);
    }
    free(mcp_gdma);
    return ESP_OK;
}

static esp_err_t esp_async_memcpy_install_gdma_template(const async_memcpy_config_t *config, async_memcpy_handle_t *mcp,
                                                        esp_err_t (*new_channel)(const gdma_channel_alloc_config_t *, gdma_channel_handle_t *),
                                                        int gdma_bus_id)
{
    esp_err_t ret = ESP_OK;
    async_memcpy_gdma_context_t *mcp_gdma = NULL;
    ESP_RETURN_ON_FALSE(config && mcp, ESP_ERR_INVALID_ARG, TAG, "invalid argument");
    // allocate memory for the driver context from internal memory
    mcp_gdma = heap_caps_calloc(1, sizeof(async_memcpy_gdma_context_t), MALLOC_CAP_INTERNAL | MALLOC_CAP_8BIT);
    ESP_GOTO_ON_FALSE(mcp_gdma, ESP_ERR_NO_MEM, err, TAG, "no mem for driver context");
    uint32_t trans_queue_len = config->backlog ? config->backlog : DEFAULT_TRANSACTION_QUEUE_LENGTH;
    // allocate memory for the transaction pool
    mcp_gdma->transaction_pool = heap_caps_aligned_calloc(MCP_DMA_DESC_ALIGN, trans_queue_len, sizeof(async_memcpy_transaction_t),
                                                          MALLOC_CAP_INTERNAL | MALLOC_CAP_8BIT | MALLOC_CAP_DMA);
    ESP_GOTO_ON_FALSE(mcp_gdma->transaction_pool, ESP_ERR_NO_MEM, err, TAG, "no mem for transaction pool");

    // create TX channel and RX channel; they should reside in the same DMA pair
    gdma_channel_alloc_config_t tx_alloc_config = {
        .flags.reserve_sibling = 1,
        .direction = GDMA_CHANNEL_DIRECTION_TX,
    };
    ESP_GOTO_ON_ERROR(new_channel(&tx_alloc_config, &mcp_gdma->tx_channel), err, TAG, "failed to create GDMA TX channel");
    gdma_channel_alloc_config_t rx_alloc_config = {
        .direction = GDMA_CHANNEL_DIRECTION_RX,
        .sibling_chan = mcp_gdma->tx_channel,
    };
    ESP_GOTO_ON_ERROR(new_channel(&rx_alloc_config, &mcp_gdma->rx_channel), err, TAG, "failed to create GDMA RX channel");

    // initialize GDMA channels
    gdma_trigger_t m2m_trigger = GDMA_MAKE_TRIGGER(GDMA_TRIG_PERIPH_M2M, 0);
    // get a free DMA trigger ID for memory copy
    uint32_t free_m2m_id_mask = 0;
    gdma_get_free_m2m_trig_id_mask(mcp_gdma->tx_channel, &free_m2m_id_mask);
    m2m_trigger.instance_id = __builtin_ctz(free_m2m_id_mask);
    gdma_connect(mcp_gdma->rx_channel, m2m_trigger);
    gdma_connect(mcp_gdma->tx_channel, m2m_trigger);

    gdma_transfer_ability_t transfer_ability = {
        .sram_trans_align = config->sram_trans_align,
        .psram_trans_align = config->psram_trans_align,
    };
    ESP_GOTO_ON_ERROR(gdma_set_transfer_ability(mcp_gdma->tx_channel, &transfer_ability), err, TAG, "set tx trans ability failed");
    ESP_GOTO_ON_ERROR(gdma_set_transfer_ability(mcp_gdma->rx_channel, &transfer_ability), err, TAG, "set rx trans ability failed");

    // register the RX EOF callback
    gdma_rx_event_callbacks_t cbs = {
        .on_recv_eof = mcp_gdma_rx_eof_callback,
    };
    ESP_GOTO_ON_ERROR(gdma_register_rx_event_callbacks(mcp_gdma->rx_channel, &cbs, mcp_gdma), err, TAG, "failed to register RX EOF callback");

    // initialize transaction queues
    STAILQ_INIT(&mcp_gdma->idle_queue_head);
    STAILQ_INIT(&mcp_gdma->ready_queue_head);
    // pick transactions from the pool and insert them into the idle queue
    for (int i = 0; i < trans_queue_len; i++) {
        STAILQ_INSERT_TAIL(&mcp_gdma->idle_queue_head, &mcp_gdma->transaction_pool[i], idle_queue_entry);
    }

    // initialize other members
    portMUX_INITIALIZE(&mcp_gdma->spin_lock);
    atomic_init(&mcp_gdma->fsm, MCP_FSM_IDLE);
    mcp_gdma->gdma_bus_id = gdma_bus_id;
    // if psram_trans_align is configured as zero, fall back to the data cache line size
    uint32_t data_cache_line_size = cache_hal_get_cache_line_size(CACHE_TYPE_DATA);
    size_t psram_trans_align = MAX(data_cache_line_size, config->psram_trans_align);
    size_t trans_align = MAX(config->sram_trans_align, psram_trans_align);
    mcp_gdma->max_single_dma_buffer = ALIGN_DOWN(DMA_DESCRIPTOR_BUFFER_MAX_SIZE, trans_align);
    mcp_gdma->psram_trans_align = psram_trans_align;
    mcp_gdma->sram_trans_align = config->sram_trans_align;
    mcp_gdma->parent.del = mcp_gdma_del;
    mcp_gdma->parent.memcpy = mcp_gdma_memcpy;
#if SOC_GDMA_SUPPORT_ETM
    mcp_gdma->parent.new_etm_event = mcp_new_etm_event;
#endif
    // return driver object
    *mcp = &mcp_gdma->parent;
    return ESP_OK;

err:
    if (mcp_gdma) {
        mcp_gdma_destroy(mcp_gdma);
    }
    return ret;
}
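About the trigger selection in the template above: gdma_get_free_m2m_trig_id_mask reports the free M2M trigger instances as a bitmask, and __builtin_ctz (count trailing zeros, a GCC/Clang builtin) picks the lowest free instance. A one-line illustration:

#include <stdio.h>

int main(void)
{
    unsigned free_m2m_id_mask = 0x0C; // instances 2 and 3 are free (binary 1100)
    printf("%d\n", __builtin_ctz(free_m2m_id_mask)); // prints 2: the lowest set bit
    return 0;
}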
#if SOC_AHB_GDMA_SUPPORTED
esp_err_t esp_async_memcpy_install_gdma_ahb(const async_memcpy_config_t *config, async_memcpy_handle_t *mcp)
{
    return esp_async_memcpy_install_gdma_template(config, mcp, gdma_new_ahb_channel, SOC_GDMA_BUS_AHB);
}

/// default installation falls back to the AHB GDMA
esp_err_t esp_async_memcpy_install(const async_memcpy_config_t *config, async_memcpy_handle_t *asmcp)
    __attribute__((alias("esp_async_memcpy_install_gdma_ahb")));
#endif // SOC_AHB_GDMA_SUPPORTED

#if SOC_AXI_GDMA_SUPPORTED
esp_err_t esp_async_memcpy_install_gdma_axi(const async_memcpy_config_t *config, async_memcpy_handle_t *mcp)
{
    return esp_async_memcpy_install_gdma_template(config, mcp, gdma_new_axi_channel, SOC_GDMA_BUS_AXI);
}
#endif // SOC_AXI_GDMA_SUPPORTED

static esp_err_t mcp_gdma_del(async_memcpy_context_t *ctx)
{
    async_memcpy_gdma_context_t *mcp_gdma = __containerof(ctx, async_memcpy_gdma_context_t, parent);
    // check if there are pending transactions
    ESP_RETURN_ON_FALSE(STAILQ_EMPTY(&mcp_gdma->ready_queue_head), ESP_ERR_INVALID_STATE, TAG, "there are pending transactions");
    // check if the driver is in IDLE state
    ESP_RETURN_ON_FALSE(atomic_load(&mcp_gdma->fsm) == MCP_FSM_IDLE, ESP_ERR_INVALID_STATE, TAG, "driver is not in IDLE state");
    return mcp_gdma_destroy(mcp_gdma);
}

static void mount_tx_buffer_to_dma(mcp_dma_descriptor_t *desc_array, int num_desc,
                                   uint8_t *buf, size_t buf_sz, size_t max_single_dma_buffer)
{
    uint32_t prepared_length = 0;
    size_t len = buf_sz;
    for (int i = 0; i < num_desc - 1; i++) {
        desc_array[i].buffer = &buf[prepared_length];
        desc_array[i].dw0.owner = DMA_DESCRIPTOR_BUFFER_OWNER_DMA;
        desc_array[i].dw0.suc_eof = 0;
        desc_array[i].dw0.size = max_single_dma_buffer;
        desc_array[i].dw0.length = max_single_dma_buffer;
        desc_array[i].next = &desc_array[i + 1];
        prepared_length += max_single_dma_buffer;
        len -= max_single_dma_buffer;
    }
    // take special care of the EOF descriptor
    desc_array[num_desc - 1].buffer = &buf[prepared_length];
    desc_array[num_desc - 1].next = NULL;
    desc_array[num_desc - 1].dw0.owner = DMA_DESCRIPTOR_BUFFER_OWNER_DMA;
    desc_array[num_desc - 1].dw0.suc_eof = 1;
    desc_array[num_desc - 1].dw0.size = len;
    desc_array[num_desc - 1].dw0.length = len;

#if MCP_NEEDS_WRITE_BACK_DESC_CACHE
    Cache_WriteBack_Addr(CACHE_MAP_L1_DCACHE, (uint32_t)desc_array, sizeof(mcp_dma_descriptor_t) * num_desc);
#endif
}

static void mount_rx_buffer_to_dma(mcp_dma_descriptor_t *desc_array, int num_desc, mcp_dma_descriptor_t *eof_desc,
                                   uint8_t *buf, size_t buf_sz, size_t max_single_dma_buffer)
{
    uint32_t prepared_length = 0;
    size_t len = buf_sz;
    if (desc_array) {
        assert(num_desc > 0);
        for (int i = 0; i < num_desc; i++) {
            desc_array[i].buffer = &buf[prepared_length];
            desc_array[i].dw0.owner = DMA_DESCRIPTOR_BUFFER_OWNER_DMA;
            desc_array[i].dw0.size = max_single_dma_buffer;
            desc_array[i].dw0.length = max_single_dma_buffer;
            desc_array[i].next = &desc_array[i + 1];
            prepared_length += max_single_dma_buffer;
            len -= max_single_dma_buffer;
        }
        desc_array[num_desc - 1].next = eof_desc;
    }
    eof_desc->buffer = &buf[prepared_length];
    eof_desc->next = NULL;
    eof_desc->dw0.owner = DMA_DESCRIPTOR_BUFFER_OWNER_DMA;
    eof_desc->dw0.size = len;
    eof_desc->dw0.length = len;

#if MCP_NEEDS_WRITE_BACK_DESC_CACHE
    if (desc_array) {
        Cache_WriteBack_Addr(CACHE_MAP_L1_DCACHE, (uint32_t)desc_array, sizeof(mcp_dma_descriptor_t) * num_desc);
    }
    Cache_WriteBack_Addr(CACHE_MAP_L1_DCACHE, (uint32_t)eof_desc, sizeof(mcp_dma_descriptor_t));
#endif
}
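The descriptor write-back above is one half of the cache-coherence contract this backend follows on cache-equipped targets such as ESP32-P4; the other half is the destination invalidation in the RX EOF callback further down. The overall order, sketched with stub functions standing in for the ROM cache helpers (the stubs are illustrative, not real API names):

#include <stdint.h>
#include <stddef.h>
#include <stdio.h>

// stand-ins for the ROM cache helpers used by the driver
static void write_back(const void *addr, size_t n) { printf("WB  %p %zu\n", addr, n); }
static void invalidate(const void *addr, size_t n) { printf("INV %p %zu\n", addr, n); }

int main(void)
{
    uint8_t src[64], dst[64], descs[32];
    // 1) CPU filled src and built the descriptors -> push them out of the cache
    write_back(src, sizeof(src));
    write_back(descs, sizeof(descs));
    // 2) ... DMA runs, copies src -> dst, raises RX EOF ...
    // 3) DMA wrote dst behind the CPU cache -> drop stale lines before reading
    invalidate(dst, sizeof(dst));
    return 0;
}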
/// @brief Helper function to get one transaction from the ready queue
/// @note This function is allowed to be called in ISR context
static async_memcpy_transaction_t *try_pop_trans_from_ready_queue(async_memcpy_gdma_context_t *mcp_gdma)
{
    async_memcpy_transaction_t *trans = NULL;
    portENTER_CRITICAL_SAFE(&mcp_gdma->spin_lock);
    trans = STAILQ_FIRST(&mcp_gdma->ready_queue_head);
    if (trans) {
        STAILQ_REMOVE_HEAD(&mcp_gdma->ready_queue_head, ready_queue_entry);
    }
    portEXIT_CRITICAL_SAFE(&mcp_gdma->spin_lock);
    return trans;
}

/// @brief Helper function to start a pending transaction
/// @note This function is allowed to be called in ISR context
static void try_start_pending_transaction(async_memcpy_gdma_context_t *mcp_gdma)
{
    async_memcpy_fsm_t expected_fsm = MCP_FSM_IDLE;
    async_memcpy_transaction_t *trans = NULL;
    if (atomic_compare_exchange_strong(&mcp_gdma->fsm, &expected_fsm, MCP_FSM_RUN_WAIT)) {
        trans = try_pop_trans_from_ready_queue(mcp_gdma);
        if (trans) {
            atomic_store(&mcp_gdma->fsm, MCP_FSM_RUN);
            gdma_start(mcp_gdma->rx_channel, trans->rx_start_desc_addr);
            gdma_start(mcp_gdma->tx_channel, trans->tx_start_desc_addr);
        } else {
            atomic_store(&mcp_gdma->fsm, MCP_FSM_IDLE);
        }
    }
}

/// @brief Helper function to get one transaction from the idle queue
/// @note This function is allowed to be called in ISR context
static async_memcpy_transaction_t *try_pop_trans_from_idle_queue(async_memcpy_gdma_context_t *mcp_gdma)
{
    async_memcpy_transaction_t *trans = NULL;
    portENTER_CRITICAL_SAFE(&mcp_gdma->spin_lock);
    trans = STAILQ_FIRST(&mcp_gdma->idle_queue_head);
    if (trans) {
        STAILQ_REMOVE_HEAD(&mcp_gdma->idle_queue_head, idle_queue_entry);
    }
    portEXIT_CRITICAL_SAFE(&mcp_gdma->spin_lock);
    return trans;
}

static bool check_buffer_aligned(async_memcpy_gdma_context_t *mcp_gdma, void *src, void *dst, size_t n)
{
    bool valid = true;
    if (esp_ptr_external_ram(dst)) {
        if (mcp_gdma->psram_trans_align) {
            valid = valid && (((intptr_t)dst & (mcp_gdma->psram_trans_align - 1)) == 0);
            valid = valid && ((n & (mcp_gdma->psram_trans_align - 1)) == 0);
        }
    } else {
        if (mcp_gdma->sram_trans_align) {
            valid = valid && (((intptr_t)dst & (mcp_gdma->sram_trans_align - 1)) == 0);
            valid = valid && ((n & (mcp_gdma->sram_trans_align - 1)) == 0);
        }
    }
    return valid;
}
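check_buffer_aligned relies on the usual power-of-two trick: addr & (align - 1) keeps only the low bits of the address, and those must all be zero for an aligned address. A tiny self-check (the addresses are arbitrary examples):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

// power-of-two alignment test, as used by check_buffer_aligned above
static bool is_aligned(uintptr_t addr, size_t align)
{
    return (addr & (align - 1)) == 0; // align must be a power of two
}

int main(void)
{
    printf("%d\n", is_aligned(0x3FC80040, 32)); // 1: low 5 bits are zero
    printf("%d\n", is_aligned(0x3FC80044, 32)); // 0: low bits 0x04 are set
    return 0;
}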
static esp_err_t mcp_gdma_memcpy(async_memcpy_context_t *ctx, void *dst, void *src, size_t n, async_memcpy_isr_cb_t cb_isr, void *cb_args)
{
    esp_err_t ret = ESP_OK;
    async_memcpy_gdma_context_t *mcp_gdma = __containerof(ctx, async_memcpy_gdma_context_t, parent);
    // buffer location check
#if SOC_AHB_GDMA_SUPPORTED && !SOC_AHB_GDMA_SUPPORT_PSRAM
    if (mcp_gdma->gdma_bus_id == SOC_GDMA_BUS_AHB) {
        ESP_RETURN_ON_FALSE(esp_ptr_internal(src) && esp_ptr_internal(dst), ESP_ERR_INVALID_ARG, TAG, "AHB GDMA can only access SRAM");
    }
#endif // SOC_AHB_GDMA_SUPPORTED && !SOC_AHB_GDMA_SUPPORT_PSRAM
#if SOC_AXI_GDMA_SUPPORTED && !SOC_AXI_GDMA_SUPPORT_PSRAM
    if (mcp_gdma->gdma_bus_id == SOC_GDMA_BUS_AXI) {
        ESP_RETURN_ON_FALSE(esp_ptr_internal(src) && esp_ptr_internal(dst), ESP_ERR_INVALID_ARG, TAG, "AXI_DMA can only access SRAM");
    }
#endif // SOC_AXI_GDMA_SUPPORTED && !SOC_AXI_GDMA_SUPPORT_PSRAM
    // alignment check
    ESP_RETURN_ON_FALSE(check_buffer_aligned(mcp_gdma, src, dst, n), ESP_ERR_INVALID_ARG, TAG, "buffer not aligned: %p -> %p, sz=%zu", src, dst, n);

    async_memcpy_transaction_t *trans = NULL;
    // pick one transaction node from the idle queue
    trans = try_pop_trans_from_idle_queue(mcp_gdma);
    // check if we got the transaction object successfully
    ESP_RETURN_ON_FALSE(trans, ESP_ERR_INVALID_STATE, TAG, "no free node in the idle queue");

    // calculate how many descriptors we need
    size_t max_single_dma_buffer = mcp_gdma->max_single_dma_buffer;
    uint32_t num_desc_per_path = (n + max_single_dma_buffer - 1) / max_single_dma_buffer;
    // allocate DMA descriptors, which need strict alignment
    trans->tx_desc_link = heap_caps_aligned_calloc(MCP_DMA_DESC_ALIGN, num_desc_per_path, sizeof(mcp_dma_descriptor_t),
                                                   MALLOC_CAP_INTERNAL | MALLOC_CAP_8BIT | MALLOC_CAP_DMA);
    ESP_GOTO_ON_FALSE(trans->tx_desc_link, ESP_ERR_NO_MEM, err, TAG, "no mem for DMA descriptors");
    // we don't have to allocate the EOF descriptor: trans->eof_node is used as the RX EOF descriptor
    if (num_desc_per_path > 1) {
        trans->rx_desc_link = heap_caps_aligned_calloc(MCP_DMA_DESC_ALIGN, num_desc_per_path - 1, sizeof(mcp_dma_descriptor_t),
                                                       MALLOC_CAP_INTERNAL | MALLOC_CAP_8BIT | MALLOC_CAP_DMA);
        ESP_GOTO_ON_FALSE(trans->rx_desc_link, ESP_ERR_NO_MEM, err, TAG, "no mem for DMA descriptors");
    } else {
        // small copy buffer: trans->eof_node alone is sufficient
        trans->rx_desc_link = NULL;
    }

    // (preload) mount src data to the TX descriptors
    mount_tx_buffer_to_dma(trans->tx_desc_link, num_desc_per_path, src, n, max_single_dma_buffer);
    // (preload) mount dst data to the RX descriptors
    mount_rx_buffer_to_dma(trans->rx_desc_link, num_desc_per_path - 1, &trans->eof_node, dst, n, max_single_dma_buffer);

    // if the source data is in the cache, write it back so the DMA can see the latest data
#if MCP_NEEDS_WRITE_BACK_SRC_CACHE
    int write_back_map = CACHE_MAP_L1_DCACHE;
    if (esp_ptr_external_ram(src)) {
        write_back_map |= CACHE_MAP_L2_CACHE;
    }
    Cache_WriteBack_Addr(write_back_map, (uint32_t)src, n);
#endif

    // save other transaction context
    trans->cb = cb_isr;
    trans->cb_args = cb_args;
    trans->memcpy_size = n;
    trans->memcpy_dst_addr = (intptr_t)dst;
    trans->tx_start_desc_addr = (intptr_t)trans->tx_desc_link;
    trans->rx_start_desc_addr = trans->rx_desc_link ? (intptr_t)trans->rx_desc_link : (intptr_t)&trans->eof_node;

    portENTER_CRITICAL(&mcp_gdma->spin_lock);
    // insert the transaction into the ready queue
    STAILQ_INSERT_TAIL(&mcp_gdma->ready_queue_head, trans, ready_queue_entry);
    portEXIT_CRITICAL(&mcp_gdma->spin_lock);

    // check the driver state: if there's no running transaction, start this new one
    try_start_pending_transaction(mcp_gdma);

    return ESP_OK;

err:
    if (trans) {
        if (trans->tx_desc_link) {
            free(trans->tx_desc_link);
            trans->tx_desc_link = NULL;
        }
        if (trans->rx_desc_link) {
            free(trans->rx_desc_link);
            trans->rx_desc_link = NULL;
        }
        // return the transaction to the idle queue
        portENTER_CRITICAL(&mcp_gdma->spin_lock);
        STAILQ_INSERT_TAIL(&mcp_gdma->idle_queue_head, trans, idle_queue_entry);
        portEXIT_CRITICAL(&mcp_gdma->spin_lock);
    }
    return ret;
}
static bool mcp_gdma_rx_eof_callback(gdma_channel_handle_t dma_chan, gdma_event_data_t *event_data, void *user_data)
{
    bool need_yield = false;
    async_memcpy_gdma_context_t *mcp_gdma = (async_memcpy_gdma_context_t *)user_data;
    mcp_dma_descriptor_t *eof_desc = (mcp_dma_descriptor_t *)event_data->rx_eof_desc_addr;
    // get the transaction object address from the EOF descriptor address
    async_memcpy_transaction_t *trans = __containerof(eof_desc, async_memcpy_transaction_t, eof_node);

    // switch driver state from RUN to IDLE
    async_memcpy_fsm_t expected_fsm = MCP_FSM_RUN;
    if (atomic_compare_exchange_strong(&mcp_gdma->fsm, &expected_fsm, MCP_FSM_IDLE_WAIT)) {
        // if the destination data is in the cache, invalidate it so the CPU can see the latest data
#if MCP_NEEDS_INVALIDATE_DST_CACHE
        int write_back_map = CACHE_MAP_L1_DCACHE;
        if (esp_ptr_external_ram((const void *)trans->memcpy_dst_addr)) {
            write_back_map |= CACHE_MAP_L2_CACHE;
        }
        Cache_Invalidate_Addr(write_back_map, (uint32_t)trans->memcpy_dst_addr, trans->memcpy_size);
#endif

        // invoke the callback registered by the user
        async_memcpy_isr_cb_t cb = trans->cb;
        if (cb) {
            async_memcpy_event_t e = {
                // No event data for now
            };
            need_yield = cb(&mcp_gdma->parent, &e, trans->cb_args);
        }
        // recycle descriptor memory
        if (trans->tx_desc_link) {
            free(trans->tx_desc_link);
            trans->tx_desc_link = NULL;
        }
        if (trans->rx_desc_link) {
            free(trans->rx_desc_link);
            trans->rx_desc_link = NULL;
        }
        trans->cb = NULL;

        portENTER_CRITICAL_ISR(&mcp_gdma->spin_lock);
        // insert the transaction object back into the idle queue
        STAILQ_INSERT_TAIL(&mcp_gdma->idle_queue_head, trans, idle_queue_entry);
        portEXIT_CRITICAL_ISR(&mcp_gdma->spin_lock);

        atomic_store(&mcp_gdma->fsm, MCP_FSM_IDLE);
    }

    // try to start the next pending transaction
    try_start_pending_transaction(mcp_gdma);

    return need_yield;
}

#if SOC_GDMA_SUPPORT_ETM
static esp_err_t mcp_new_etm_event(async_memcpy_context_t *ctx, async_memcpy_etm_event_t event_type, esp_etm_event_handle_t *out_event)
{
    async_memcpy_gdma_context_t *mcp_gdma = __containerof(ctx, async_memcpy_gdma_context_t, parent);
    if (event_type == ASYNC_MEMCPY_ETM_EVENT_COPY_DONE) {
        // use the RX EOF to indicate the async memcpy done event
        gdma_etm_event_config_t etm_event_conf = {
            .event_type = GDMA_ETM_EVENT_EOF,
        };
        return gdma_new_etm_event(mcp_gdma->rx_channel, &etm_event_conf, out_event);
    } else {
        return ESP_ERR_NOT_SUPPORTED;
    }
}
#endif // SOC_GDMA_SUPPORT_ETM
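To close the loop on the ETM hook above, a hedged sketch of binding the copy-done event to an ETM channel through the esp_etm driver (the task handle is assumed to come from some other ETM-capable peripheral, and error cleanup is omitted):

#include "esp_err.h"
#include "esp_etm.h"
#include "esp_async_memcpy.h"

// assumes `mcp` was returned by esp_async_memcpy_install() on a chip with ETM support
void example_bind_copy_done_to_etm(async_memcpy_handle_t mcp, esp_etm_task_handle_t task)
{
    esp_etm_channel_config_t ch_conf = {};
    esp_etm_channel_handle_t channel = NULL;
    ESP_ERROR_CHECK(esp_etm_new_channel(&ch_conf, &channel));

    esp_etm_event_handle_t copy_done = NULL;
    ESP_ERROR_CHECK(esp_async_memcpy_new_etm_event(mcp, ASYNC_MEMCPY_ETM_EVENT_COPY_DONE, &copy_done));

    // route: async memcpy RX EOF event -> user-provided ETM task, no CPU involved
    ESP_ERROR_CHECK(esp_etm_channel_connect(channel, copy_done, task));
    ESP_ERROR_CHECK(esp_etm_channel_enable(channel));
}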
components/esp_hw_support/dma/async_memcpy_impl_cp_dma.c (deleted)
@@ -1,92 +0,0 @@
/*
 * SPDX-FileCopyrightText: 2020-2021 Espressif Systems (Shanghai) CO LTD
 *
 * SPDX-License-Identifier: Apache-2.0
 */

#include "freertos/FreeRTOS.h"
#include "soc/periph_defs.h"
#include "soc/soc_memory_layout.h"
#include "hal/cp_dma_hal.h"
#include "hal/cp_dma_ll.h"
#include "esp_log.h"
#include "esp_attr.h"
#include "esp_err.h"
#include "esp_etm.h"
#include "esp_async_memcpy_impl.h"

IRAM_ATTR static void async_memcpy_impl_default_isr_handler(void *args)
{
    async_memcpy_impl_t *mcp_impl = (async_memcpy_impl_t *)args;

    portENTER_CRITICAL_ISR(&mcp_impl->hal_lock);
    uint32_t status = cp_dma_hal_get_intr_status(&mcp_impl->hal);
    cp_dma_hal_clear_intr_status(&mcp_impl->hal, status);
    portEXIT_CRITICAL_ISR(&mcp_impl->hal_lock);

    // End-Of-Frame on RX side
    if (status & CP_DMA_LL_EVENT_RX_EOF) {
        mcp_impl->rx_eof_addr = cp_dma_ll_get_rx_eof_descriptor_address(mcp_impl->hal.dev);
        async_memcpy_isr_on_rx_done_event(mcp_impl);
    }

    if (mcp_impl->isr_need_yield) {
        mcp_impl->isr_need_yield = false;
        portYIELD_FROM_ISR();
    }
}

esp_err_t async_memcpy_impl_init(async_memcpy_impl_t *impl)
{
    esp_err_t ret = ESP_OK;

    impl->hal_lock = (portMUX_TYPE)portMUX_INITIALIZER_UNLOCKED;
    cp_dma_hal_config_t config = {};
    cp_dma_hal_init(&impl->hal, &config);

    ret = esp_intr_alloc(ETS_DMA_COPY_INTR_SOURCE, ESP_INTR_FLAG_IRAM, async_memcpy_impl_default_isr_handler, impl, &impl->intr);
    return ret;
}

esp_err_t async_memcpy_impl_deinit(async_memcpy_impl_t *impl)
{
    esp_err_t ret = ESP_OK;

    cp_dma_hal_deinit(&impl->hal);
    ret = esp_intr_free(impl->intr);
    return ret;
}

esp_err_t async_memcpy_impl_start(async_memcpy_impl_t *impl, intptr_t outlink_base, intptr_t inlink_base)
{
    cp_dma_hal_set_desc_base_addr(&impl->hal, outlink_base, inlink_base);
    cp_dma_hal_start(&impl->hal); // enable DMA and interrupt
    return ESP_OK;
}

esp_err_t async_memcpy_impl_stop(async_memcpy_impl_t *impl)
{
    cp_dma_hal_stop(&impl->hal); // disable DMA and interrupt
    return ESP_OK;
}

esp_err_t async_memcpy_impl_restart(async_memcpy_impl_t *impl)
{
    cp_dma_hal_restart_rx(&impl->hal);
    cp_dma_hal_restart_tx(&impl->hal);
    return ESP_OK;
}

esp_err_t async_memcpy_impl_new_etm_event(async_memcpy_impl_t *impl, async_memcpy_etm_event_t event_type, esp_etm_event_handle_t *out_event)
{
    (void)impl;
    (void)event_type;
    (void)out_event;
    return ESP_ERR_NOT_SUPPORTED;
}

bool async_memcpy_impl_is_buffer_address_valid(async_memcpy_impl_t *impl, void *src, void *dst)
{
    // CP_DMA can only access SRAM
    return esp_ptr_internal(src) && esp_ptr_internal(dst);
}
components/esp_hw_support/dma/async_memcpy_impl_gdma.c (deleted)
@@ -1,151 +0,0 @@
/*
 * SPDX-FileCopyrightText: 2020-2021 Espressif Systems (Shanghai) CO LTD
 *
 * SPDX-License-Identifier: Apache-2.0
 */

#include "freertos/FreeRTOS.h"
#include "soc/periph_defs.h"
#include "soc/soc_memory_layout.h"
#include "soc/soc_caps.h"
#include "esp_private/periph_ctrl.h"
#include "esp_log.h"
#include "esp_attr.h"
#include "esp_err.h"
#include "esp_async_memcpy_impl.h"
#if SOC_APM_SUPPORTED
#include "hal/apm_ll.h"
#endif

IRAM_ATTR static bool async_memcpy_impl_rx_eof_callback(gdma_channel_handle_t dma_chan, gdma_event_data_t *event_data, void *user_data)
{
    async_memcpy_impl_t *mcp_impl = (async_memcpy_impl_t *)user_data;
    mcp_impl->rx_eof_addr = event_data->rx_eof_desc_addr;

    async_memcpy_isr_on_rx_done_event(mcp_impl);
    return mcp_impl->isr_need_yield;
}

esp_err_t async_memcpy_impl_init(async_memcpy_impl_t *impl)
{
    esp_err_t ret = ESP_OK;
    // create TX channel and reserve a sibling channel for future use
    gdma_channel_alloc_config_t tx_alloc_config = {
        .flags.reserve_sibling = 1,
        .direction = GDMA_CHANNEL_DIRECTION_TX,
    };
    ret = gdma_new_channel(&tx_alloc_config, &impl->tx_channel);
    if (ret != ESP_OK) {
        goto err;
    }

    // create RX channel and specify that it should reside in the same pair as TX
    gdma_channel_alloc_config_t rx_alloc_config = {
        .direction = GDMA_CHANNEL_DIRECTION_RX,
        .sibling_chan = impl->tx_channel,
    };
    ret = gdma_new_channel(&rx_alloc_config, &impl->rx_channel);
    if (ret != ESP_OK) {
        goto err;
    }

    gdma_trigger_t m2m_trigger = GDMA_MAKE_TRIGGER(GDMA_TRIG_PERIPH_M2M, 0);
    // get a free DMA trigger ID for memory copy
    uint32_t free_m2m_id_mask = 0;
    gdma_get_free_m2m_trig_id_mask(impl->tx_channel, &free_m2m_id_mask);
    m2m_trigger.instance_id = __builtin_ctz(free_m2m_id_mask);
    gdma_connect(impl->rx_channel, m2m_trigger);
    gdma_connect(impl->tx_channel, m2m_trigger);

    gdma_strategy_config_t strategy_config = {
        .auto_update_desc = true,
        .owner_check = true,
    };

    gdma_transfer_ability_t transfer_ability = {
        .sram_trans_align = impl->sram_trans_align,
        .psram_trans_align = impl->psram_trans_align,
    };
    ret = gdma_set_transfer_ability(impl->tx_channel, &transfer_ability);
    if (ret != ESP_OK) {
        goto err;
    }
    ret = gdma_set_transfer_ability(impl->rx_channel, &transfer_ability);
    if (ret != ESP_OK) {
        goto err;
    }
    gdma_apply_strategy(impl->tx_channel, &strategy_config);
    gdma_apply_strategy(impl->rx_channel, &strategy_config);

#if SOC_APM_SUPPORTED
    // APM strategy: trusted mode
    // TODO: IDF-5354 GDMA for M2M usage only needs read and write permissions; we should disable the execute permission via the APM controller
    apm_tee_ll_set_master_secure_mode(APM_LL_MASTER_GDMA + m2m_trigger.instance_id, APM_LL_SECURE_MODE_TEE);
#endif // SOC_APM_SUPPORTED

    gdma_rx_event_callbacks_t cbs = {
        .on_recv_eof = async_memcpy_impl_rx_eof_callback
    };
    ret = gdma_register_rx_event_callbacks(impl->rx_channel, &cbs, impl);

err:
    return ret;
}

esp_err_t async_memcpy_impl_deinit(async_memcpy_impl_t *impl)
{
    gdma_disconnect(impl->rx_channel);
    gdma_disconnect(impl->tx_channel);
    gdma_del_channel(impl->rx_channel);
    gdma_del_channel(impl->tx_channel);
    return ESP_OK;
}

esp_err_t async_memcpy_impl_start(async_memcpy_impl_t *impl, intptr_t outlink_base, intptr_t inlink_base)
{
    gdma_start(impl->rx_channel, inlink_base);
    gdma_start(impl->tx_channel, outlink_base);
    return ESP_OK;
}

esp_err_t async_memcpy_impl_stop(async_memcpy_impl_t *impl)
{
    gdma_stop(impl->rx_channel);
    gdma_stop(impl->tx_channel);
    return ESP_OK;
}

esp_err_t async_memcpy_impl_restart(async_memcpy_impl_t *impl)
{
    gdma_append(impl->rx_channel);
    gdma_append(impl->tx_channel);
    return ESP_OK;
}

esp_err_t async_memcpy_impl_new_etm_event(async_memcpy_impl_t *impl, async_memcpy_etm_event_t event_type, esp_etm_event_handle_t *out_event)
{
    if (event_type == ASYNC_MEMCPY_ETM_EVENT_COPY_DONE) {
        // use the RX EOF to indicate the async memcpy done event
        gdma_etm_event_config_t etm_event_conf = {
            .event_type = GDMA_ETM_EVENT_EOF,
        };
        return gdma_new_etm_event(impl->rx_channel, &etm_event_conf, out_event);
    } else {
        return ESP_ERR_NOT_SUPPORTED;
    }
}

bool async_memcpy_impl_is_buffer_address_valid(async_memcpy_impl_t *impl, void *src, void *dst)
{
    bool valid = true;
    if (esp_ptr_external_ram(dst)) {
        if (impl->psram_trans_align) {
            valid = valid && (((intptr_t)dst & (impl->psram_trans_align - 1)) == 0);
        }
    } else {
        if (impl->sram_trans_align) {
            valid = valid && (((intptr_t)dst & (impl->sram_trans_align - 1)) == 0);
        }
    }
    return valid;
}
components/esp_hw_support/dma/esp_async_memcpy.c
@@ -1,311 +1,31 @@
 /*
- * SPDX-FileCopyrightText: 2020-2022 Espressif Systems (Shanghai) CO LTD
+ * SPDX-FileCopyrightText: 2020-2023 Espressif Systems (Shanghai) CO LTD
  *
  * SPDX-License-Identifier: Apache-2.0
  */
 
-#include <sys/param.h>
-#include "freertos/FreeRTOS.h"
-#include "freertos/semphr.h"
-#include "hal/dma_types.h"
 #include "esp_check.h"
-#include "esp_heap_caps.h"
 #include "esp_log.h"
 #include "esp_async_memcpy.h"
-#include "esp_async_memcpy_impl.h"
 #include "esp_async_memcpy_priv.h"
 
-static const char *TAG = "async_memcpy";
+static const char *TAG = "async_mcp";
 
-#define ALIGN_DOWN(val, align) ((val) & ~((align) - 1))
-
-/**
- * @brief Type of async mcp stream
- * mcp stream inherits DMA descriptor; besides that, it has a callback function member
- */
-typedef struct {
-    dma_descriptor_t desc;
-    async_memcpy_isr_cb_t cb;
-    void *cb_args;
-} async_memcpy_stream_t;
-
-/**
- * @brief Type of async mcp driver context
- */
-typedef struct async_memcpy_context_t {
-    async_memcpy_impl_t mcp_impl;            // implementation layer
-    portMUX_TYPE spinlock;                   // spinlock, prevents operating descriptors concurrently
-    intr_handle_t intr_hdl;                  // interrupt handle
-    uint32_t flags;                          // extra driver flags
-    dma_descriptor_t *tx_desc;               // pointer to the next free TX descriptor
-    dma_descriptor_t *rx_desc;               // pointer to the next free RX descriptor
-    dma_descriptor_t *next_rx_desc_to_check; // pointer to the next RX descriptor to recycle
-    uint32_t max_stream_num;                 // maximum number of streams
-    size_t max_dma_buffer_size;              // maximum DMA buffer size
-    async_memcpy_stream_t *out_streams;      // pointer to the first TX stream
-    async_memcpy_stream_t *in_streams;       // pointer to the first RX stream
-    async_memcpy_stream_t streams_pool[0];   // stream pool (TX + RX), the size is configured during driver installation
-} async_memcpy_context_t;
-
-esp_err_t esp_async_memcpy_install(const async_memcpy_config_t *config, async_memcpy_t *asmcp)
+esp_err_t esp_async_memcpy_uninstall(async_memcpy_handle_t asmcp)
 {
-    esp_err_t ret = ESP_OK;
-    async_memcpy_context_t *mcp_hdl = NULL;
-
-    ESP_GOTO_ON_FALSE(config, ESP_ERR_INVALID_ARG, err, TAG, "configuration can't be null");
-    ESP_GOTO_ON_FALSE(asmcp, ESP_ERR_INVALID_ARG, err, TAG, "can't assign mcp handle to null");
-
-    // context memory size + stream pool size
-    size_t total_malloc_size = sizeof(async_memcpy_context_t) + sizeof(async_memcpy_stream_t) * config->backlog * 2;
-    // to work when the cache is disabled, the driver handle should be located in SRAM
-    mcp_hdl = heap_caps_calloc(1, total_malloc_size, MALLOC_CAP_8BIT | MALLOC_CAP_DMA | MALLOC_CAP_INTERNAL);
-    ESP_GOTO_ON_FALSE(mcp_hdl, ESP_ERR_NO_MEM, err, TAG, "allocate context memory failed");
-
-    mcp_hdl->flags = config->flags;
-    mcp_hdl->out_streams = mcp_hdl->streams_pool;
-    mcp_hdl->in_streams = mcp_hdl->streams_pool + config->backlog;
-    mcp_hdl->max_stream_num = config->backlog;
-
-    // circle TX/RX descriptors
-    for (size_t i = 0; i < mcp_hdl->max_stream_num; i++) {
-        mcp_hdl->out_streams[i].desc.dw0.owner = DMA_DESCRIPTOR_BUFFER_OWNER_CPU;
-        mcp_hdl->out_streams[i].desc.next = &mcp_hdl->out_streams[i + 1].desc;
-        mcp_hdl->in_streams[i].desc.dw0.owner = DMA_DESCRIPTOR_BUFFER_OWNER_CPU;
-        mcp_hdl->in_streams[i].desc.next = &mcp_hdl->in_streams[i + 1].desc;
-    }
-    mcp_hdl->out_streams[mcp_hdl->max_stream_num - 1].desc.next = &mcp_hdl->out_streams[0].desc;
-    mcp_hdl->in_streams[mcp_hdl->max_stream_num - 1].desc.next = &mcp_hdl->in_streams[0].desc;
-
-    mcp_hdl->tx_desc = &mcp_hdl->out_streams[0].desc;
-    mcp_hdl->rx_desc = &mcp_hdl->in_streams[0].desc;
-    mcp_hdl->next_rx_desc_to_check = &mcp_hdl->in_streams[0].desc;
-    mcp_hdl->spinlock = (portMUX_TYPE)portMUX_INITIALIZER_UNLOCKED;
-    mcp_hdl->mcp_impl.sram_trans_align = config->sram_trans_align;
-    mcp_hdl->mcp_impl.psram_trans_align = config->psram_trans_align;
-    size_t trans_align = MAX(config->sram_trans_align, config->psram_trans_align);
-    mcp_hdl->max_dma_buffer_size = trans_align ? ALIGN_DOWN(DMA_DESCRIPTOR_BUFFER_MAX_SIZE, trans_align) : DMA_DESCRIPTOR_BUFFER_MAX_SIZE;
-
-    // initialize implementation layer
-    ret = async_memcpy_impl_init(&mcp_hdl->mcp_impl);
-    ESP_GOTO_ON_ERROR(ret, err, TAG, "DMA M2M init failed");
-
-    ESP_LOGD(TAG, "installed memory to memory copy channel at %p", mcp_hdl);
-
-    *asmcp = mcp_hdl;
-
-    async_memcpy_impl_start(&mcp_hdl->mcp_impl, (intptr_t)&mcp_hdl->out_streams[0].desc, (intptr_t)&mcp_hdl->in_streams[0].desc);
-
-    return ESP_OK;
-err:
-    if (mcp_hdl) {
-        free(mcp_hdl);
-    }
-    if (asmcp) {
-        *asmcp = NULL;
-    }
-    return ret;
+    ESP_RETURN_ON_FALSE(asmcp, ESP_ERR_INVALID_ARG, TAG, "invalid argument");
+    return asmcp->del(asmcp);
 }
 
-esp_err_t esp_async_memcpy_uninstall(async_memcpy_t asmcp)
+esp_err_t esp_async_memcpy(async_memcpy_handle_t asmcp, void *dst, void *src, size_t n, async_memcpy_isr_cb_t cb_isr, void *cb_args)
 {
-    esp_err_t ret = ESP_OK;
-    ESP_GOTO_ON_FALSE(asmcp, ESP_ERR_INVALID_ARG, err, TAG, "mcp handle can't be null");
-
-    async_memcpy_impl_stop(&asmcp->mcp_impl);
-    async_memcpy_impl_deinit(&asmcp->mcp_impl);
-    free(asmcp);
-err:
-    return ret;
+    ESP_RETURN_ON_FALSE(asmcp && dst && src && n, ESP_ERR_INVALID_ARG, TAG, "invalid argument");
+    return asmcp->memcpy(asmcp, dst, src, n, cb_isr, cb_args);
 }
 
-esp_err_t esp_async_memcpy_new_etm_event(async_memcpy_t asmcp, async_memcpy_etm_event_t event_type, esp_etm_event_handle_t *out_event)
+#if SOC_GDMA_SUPPORT_ETM
+esp_err_t esp_async_memcpy_new_etm_event(async_memcpy_handle_t asmcp, async_memcpy_etm_event_t event_type, esp_etm_event_handle_t *out_event)
 {
     ESP_RETURN_ON_FALSE(asmcp, ESP_ERR_INVALID_ARG, TAG, "mcp handle can't be null");
-    return async_memcpy_impl_new_etm_event(&asmcp->mcp_impl, event_type, out_event);
 }
 
-static int async_memcpy_prepare_receive(async_memcpy_t asmcp, void *buffer, size_t size, dma_descriptor_t **start_desc, dma_descriptor_t **end_desc)
-{
-    uint32_t prepared_length = 0;
-    uint8_t *buf = (uint8_t *)buffer;
-    dma_descriptor_t *desc = asmcp->rx_desc; // descriptor iterator
-    dma_descriptor_t *start = desc;
-    dma_descriptor_t *end = desc;
-
-    while (size > asmcp->max_dma_buffer_size) {
-        if (desc->dw0.owner != DMA_DESCRIPTOR_BUFFER_OWNER_DMA) {
-            desc->dw0.suc_eof = 0;
-            desc->dw0.size = asmcp->max_dma_buffer_size;
-            desc->buffer = &buf[prepared_length];
-            desc = desc->next; // move to the next descriptor
-            prepared_length += asmcp->max_dma_buffer_size;
-            size -= asmcp->max_dma_buffer_size;
-        } else {
-            // out of RX descriptors
-            goto _exit;
-        }
-    }
-    if (size) {
-        if (desc->dw0.owner != DMA_DESCRIPTOR_BUFFER_OWNER_DMA) {
-            end = desc; // the last descriptor used
-            desc->dw0.suc_eof = 0;
-            desc->dw0.size = size;
-            desc->buffer = &buf[prepared_length];
-            desc = desc->next; // move to the next descriptor
-            prepared_length += size;
-        } else {
-            // out of RX descriptors
-            goto _exit;
-        }
-    }
-
-_exit:
-    *start_desc = start;
-    *end_desc = end;
-    return prepared_length;
-}
-
-static int async_memcpy_prepare_transmit(async_memcpy_t asmcp, void *buffer, size_t len, dma_descriptor_t **start_desc, dma_descriptor_t **end_desc)
-{
-    uint32_t prepared_length = 0;
-    uint8_t *buf = (uint8_t *)buffer;
-    dma_descriptor_t *desc = asmcp->tx_desc; // descriptor iterator
-    dma_descriptor_t *start = desc;
-    dma_descriptor_t *end = desc;
-
-    while (len > asmcp->max_dma_buffer_size) {
-        if (desc->dw0.owner != DMA_DESCRIPTOR_BUFFER_OWNER_DMA) {
-            desc->dw0.suc_eof = 0; // not the end of the transaction
-            desc->dw0.size = asmcp->max_dma_buffer_size;
-            desc->dw0.length = asmcp->max_dma_buffer_size;
-            desc->buffer = &buf[prepared_length];
-            desc = desc->next; // move to the next descriptor
-            prepared_length += asmcp->max_dma_buffer_size;
-            len -= asmcp->max_dma_buffer_size;
-        } else {
-            // out of TX descriptors
-            goto _exit;
-        }
-    }
-    if (len) {
-        if (desc->dw0.owner != DMA_DESCRIPTOR_BUFFER_OWNER_DMA) {
-            end = desc; // the last descriptor used
-            desc->dw0.suc_eof = 1; // end of the transaction
-            desc->dw0.size = len;
-            desc->dw0.length = len;
-            desc->buffer = &buf[prepared_length];
-            desc = desc->next; // move to the next descriptor
-            prepared_length += len;
-        } else {
-            // out of TX descriptors
-            goto _exit;
|
||||
}
|
||||
}
|
||||
|
||||
*start_desc = start;
|
||||
*end_desc = end;
|
||||
_exit:
|
||||
return prepared_length;
|
||||
}
|
||||
|
||||
static bool async_memcpy_get_next_rx_descriptor(async_memcpy_t asmcp, dma_descriptor_t *eof_desc, dma_descriptor_t **next_desc)
|
||||
{
|
||||
dma_descriptor_t *next = asmcp->next_rx_desc_to_check;
|
||||
// additional check, to avoid potential interrupt got triggered by mistake
|
||||
if (next->dw0.owner == DMA_DESCRIPTOR_BUFFER_OWNER_CPU) {
|
||||
asmcp->next_rx_desc_to_check = asmcp->next_rx_desc_to_check->next;
|
||||
*next_desc = next;
|
||||
// return if we need to continue
|
||||
return eof_desc == next ? false : true;
|
||||
}
|
||||
|
||||
*next_desc = NULL;
|
||||
return false;
|
||||
}
|
||||
|
||||
esp_err_t esp_async_memcpy(async_memcpy_t asmcp, void *dst, void *src, size_t n, async_memcpy_isr_cb_t cb_isr, void *cb_args)
|
||||
{
|
||||
esp_err_t ret = ESP_OK;
|
||||
dma_descriptor_t *rx_start_desc = NULL;
|
||||
dma_descriptor_t *rx_end_desc = NULL;
|
||||
dma_descriptor_t *tx_start_desc = NULL;
|
||||
dma_descriptor_t *tx_end_desc = NULL;
|
||||
size_t rx_prepared_size = 0;
|
||||
size_t tx_prepared_size = 0;
|
||||
ESP_GOTO_ON_FALSE(asmcp, ESP_ERR_INVALID_ARG, err, TAG, "mcp handle can't be null");
|
||||
ESP_GOTO_ON_FALSE(async_memcpy_impl_is_buffer_address_valid(&asmcp->mcp_impl, src, dst), ESP_ERR_INVALID_ARG, err, TAG, "buffer address not valid: %p -> %p", src, dst);
|
||||
ESP_GOTO_ON_FALSE(n <= asmcp->max_dma_buffer_size * asmcp->max_stream_num, ESP_ERR_INVALID_ARG, err, TAG, "buffer size too large");
|
||||
if (asmcp->mcp_impl.sram_trans_align) {
|
||||
ESP_GOTO_ON_FALSE(((n & (asmcp->mcp_impl.sram_trans_align - 1)) == 0), ESP_ERR_INVALID_ARG, err, TAG, "copy size should align to %d bytes", asmcp->mcp_impl.sram_trans_align);
|
||||
}
|
||||
if (asmcp->mcp_impl.psram_trans_align) {
|
||||
ESP_GOTO_ON_FALSE(((n & (asmcp->mcp_impl.psram_trans_align - 1)) == 0), ESP_ERR_INVALID_ARG, err, TAG, "copy size should align to %d bytes", asmcp->mcp_impl.psram_trans_align);
|
||||
}
|
||||
|
||||
// Prepare TX and RX descriptor
|
||||
portENTER_CRITICAL_SAFE(&asmcp->spinlock);
|
||||
rx_prepared_size = async_memcpy_prepare_receive(asmcp, dst, n, &rx_start_desc, &rx_end_desc);
|
||||
tx_prepared_size = async_memcpy_prepare_transmit(asmcp, src, n, &tx_start_desc, &tx_end_desc);
|
||||
if (rx_start_desc && tx_start_desc && (rx_prepared_size == n) && (tx_prepared_size == n)) {
|
||||
// register user callback to the last descriptor
|
||||
async_memcpy_stream_t *mcp_stream = __containerof(rx_end_desc, async_memcpy_stream_t, desc);
|
||||
mcp_stream->cb = cb_isr;
|
||||
mcp_stream->cb_args = cb_args;
|
||||
// restart RX firstly
|
||||
dma_descriptor_t *desc = rx_start_desc;
|
||||
while (desc != rx_end_desc) {
|
||||
desc->dw0.owner = DMA_DESCRIPTOR_BUFFER_OWNER_DMA;
|
||||
desc = desc->next;
|
||||
}
|
||||
desc->dw0.owner = DMA_DESCRIPTOR_BUFFER_OWNER_DMA;
|
||||
asmcp->rx_desc = desc->next;
|
||||
// restart TX secondly
|
||||
desc = tx_start_desc;
|
||||
while (desc != tx_end_desc) {
|
||||
desc->dw0.owner = DMA_DESCRIPTOR_BUFFER_OWNER_DMA;
|
||||
desc = desc->next;
|
||||
}
|
||||
desc->dw0.owner = DMA_DESCRIPTOR_BUFFER_OWNER_DMA;
|
||||
asmcp->tx_desc = desc->next;
|
||||
async_memcpy_impl_restart(&asmcp->mcp_impl);
|
||||
}
|
||||
portEXIT_CRITICAL_SAFE(&asmcp->spinlock);
|
||||
|
||||
// It's unlikely that we have space for rx descriptor but no space for tx descriptor
|
||||
// Both tx and rx descriptor should move in the same pace
|
||||
ESP_GOTO_ON_FALSE(rx_prepared_size == n, ESP_FAIL, err, TAG, "out of rx descriptor");
|
||||
ESP_GOTO_ON_FALSE(tx_prepared_size == n, ESP_FAIL, err, TAG, "out of tx descriptor");
|
||||
|
||||
err:
|
||||
return ret;
|
||||
}
|
||||
|
||||
IRAM_ATTR void async_memcpy_isr_on_rx_done_event(async_memcpy_impl_t *impl)
|
||||
{
|
||||
bool to_continue = false;
|
||||
async_memcpy_stream_t *in_stream = NULL;
|
||||
dma_descriptor_t *next_desc = NULL;
|
||||
async_memcpy_context_t *asmcp = __containerof(impl, async_memcpy_context_t, mcp_impl);
|
||||
|
||||
// get the RX eof descriptor address
|
||||
dma_descriptor_t *eof = (dma_descriptor_t *)impl->rx_eof_addr;
|
||||
// traversal all unchecked descriptors
|
||||
do {
|
||||
portENTER_CRITICAL_ISR(&asmcp->spinlock);
|
||||
// There is an assumption that the usage of rx descriptors are in the same pace as tx descriptors (this is determined by M2M DMA working mechanism)
|
||||
// And once the rx descriptor is recycled, the corresponding tx desc is guaranteed to be returned by DMA
|
||||
to_continue = async_memcpy_get_next_rx_descriptor(asmcp, eof, &next_desc);
|
||||
portEXIT_CRITICAL_ISR(&asmcp->spinlock);
|
||||
if (next_desc) {
|
||||
in_stream = __containerof(next_desc, async_memcpy_stream_t, desc);
|
||||
// invoke user registered callback if available
|
||||
if (in_stream->cb) {
|
||||
async_memcpy_event_t e = {0};
|
||||
if (in_stream->cb(asmcp, &e, in_stream->cb_args)) {
|
||||
impl->isr_need_yield = true;
|
||||
}
|
||||
in_stream->cb = NULL;
|
||||
in_stream->cb_args = NULL;
|
||||
}
|
||||
}
|
||||
} while (to_continue);
|
||||
ESP_RETURN_ON_FALSE(asmcp && out_event, ESP_ERR_INVALID_ARG, TAG, "invalid argument");
|
||||
return asmcp->new_etm_event(asmcp, event_type, out_event);
|
||||
}
|
||||
#endif
|
||||
|
46
components/esp_hw_support/dma/esp_async_memcpy_priv.h
Normal file
@ -0,0 +1,46 @@
/*
 * SPDX-FileCopyrightText: 2020-2023 Espressif Systems (Shanghai) CO LTD
 *
 * SPDX-License-Identifier: Apache-2.0
 */

#pragma once

#include <stdint.h>
#include <stdbool.h>
#include "esp_err.h"
#include "esp_etm.h"
#include "esp_async_memcpy.h"
#include "soc/soc_caps.h"

#define ALIGN_DOWN(val, align) ((val) & ~((align) - 1))

#define DEFAULT_TRANSACTION_QUEUE_LENGTH 4

#ifdef __cplusplus
extern "C" {
#endif

typedef enum {
    MCP_FSM_IDLE_WAIT, /// intermediate state, for state changes from others to IDLE
    MCP_FSM_IDLE,
    MCP_FSM_RUN_WAIT,  /// intermediate state, for state changes from others to RUN
    MCP_FSM_RUN,
} async_memcpy_fsm_t;

typedef struct async_memcpy_context_t async_memcpy_context_t;

struct async_memcpy_context_t {
    /// @brief Start a new async memcpy transaction
    esp_err_t (*memcpy)(async_memcpy_context_t *ctx, void *dst, void *src, size_t n, async_memcpy_isr_cb_t cb_isr, void *cb_args);
#if SOC_GDMA_SUPPORT_ETM
    /// @brief Create ETM event handle of specific event type
    esp_err_t (*new_etm_event)(async_memcpy_context_t *ctx, async_memcpy_etm_event_t event_type, esp_etm_event_handle_t *out_event);
#endif // SOC_GDMA_SUPPORT_ETM
    /// @brief Delete async memcpy driver context
    esp_err_t (*del)(async_memcpy_context_t *ctx);
};

#ifdef __cplusplus
}
#endif
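The ops table above is what makes the public API a thin dispatcher: each DMA backend allocates a context that embeds this struct and fills in the function pointers, and esp_async_memcpy() / esp_async_memcpy_uninstall() simply forward through them. Below is a minimal sketch of that wiring, not part of this patch; the my_backend_* names are hypothetical, and the CPU memcpy stands in for a real descriptor-based DMA setup (the real backends live in async_memcpy_cp_dma.c and async_memcpy_gdma.c):

#include <string.h>
#include <stdlib.h>
#include "esp_heap_caps.h"
#include "esp_async_memcpy_priv.h"

// hypothetical backend: only the dispatch wiring is shown, not the DMA programming
static esp_err_t my_backend_memcpy(async_memcpy_context_t *ctx, void *dst, void *src, size_t n,
                                   async_memcpy_isr_cb_t cb_isr, void *cb_args)
{
    memcpy(dst, src, n); // placeholder; a real backend builds a descriptor chain and starts the engine
    if (cb_isr) {
        async_memcpy_event_t evt = {0};
        cb_isr(ctx, &evt, cb_args); // a real backend invokes this from the DMA ISR
    }
    return ESP_OK;
}

static esp_err_t my_backend_del(async_memcpy_context_t *ctx)
{
    free(ctx);
    return ESP_OK;
}

esp_err_t my_backend_install(const async_memcpy_config_t *config, async_memcpy_handle_t *mcp)
{
    // keep the context in internal RAM so it stays reachable when the cache is disabled
    async_memcpy_context_t *ctx = heap_caps_calloc(1, sizeof(async_memcpy_context_t),
                                                   MALLOC_CAP_INTERNAL | MALLOC_CAP_8BIT);
    if (!ctx) {
        return ESP_ERR_NO_MEM;
    }
    ctx->memcpy = my_backend_memcpy; // esp_async_memcpy() forwards here
    ctx->del = my_backend_del;       // esp_async_memcpy_uninstall() forwards here
    *mcp = ctx;
    return ESP_OK;
}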
@ -1,29 +1,33 @@
/*
 * SPDX-FileCopyrightText: 2020-2021 Espressif Systems (Shanghai) CO LTD
 * SPDX-FileCopyrightText: 2020-2023 Espressif Systems (Shanghai) CO LTD
 *
 * SPDX-License-Identifier: Apache-2.0
 */

#pragma once

#include <stdint.h>
#include <stdbool.h>
#include "soc/soc_caps.h"
#include "esp_err.h"
#include "esp_etm.h"

#ifdef __cplusplus
extern "C" {
#endif

#include <stdint.h>
#include <stdbool.h>
#include "esp_err.h"
#include "esp_etm.h"

/**
 * @brief Type of async memcpy handle
 *
 * @brief Async memory copy driver handle
 */
typedef struct async_memcpy_context_t *async_memcpy_t;
typedef struct async_memcpy_context_t *async_memcpy_handle_t;

/** @cond */
/// @brief legacy driver handle type
typedef async_memcpy_handle_t async_memcpy_t;
/** @endcond */

/**
 * @brief Type of async memcpy event object
 *
 * @brief Async memory copy event data
 */
typedef struct {
    void *data; /*!< Event data */
@ -40,14 +44,13 @@ typedef struct {
 * @note User can call OS primitives (semaphore, mutex, etc) in the callback function.
 *       Keep in mind, if any OS primitive wakes a high priority task up, the callback should return true.
 */
typedef bool (*async_memcpy_isr_cb_t)(async_memcpy_t mcp_hdl, async_memcpy_event_t *event, void *cb_args);
typedef bool (*async_memcpy_isr_cb_t)(async_memcpy_handle_t mcp_hdl, async_memcpy_event_t *event, void *cb_args);

/**
 * @brief Type of async memcpy configuration
 *
 */
typedef struct {
    uint32_t backlog;         /*!< Maximum number of streams that can be handled simultaneously */
    uint32_t backlog;         /*!< Maximum number of transactions that can be prepared in the background */
    size_t sram_trans_align;  /*!< DMA transfer alignment (both in size and address) for SRAM memory */
    size_t psram_trans_align; /*!< DMA transfer alignment (both in size and address) for PSRAM memory */
    uint32_t flags;           /*!< Extra flags to control async memcpy feature */
@ -55,7 +58,6 @@ typedef struct {

/**
 * @brief Default configuration for async memcpy
 *
 */
#define ASYNC_MEMCPY_DEFAULT_CONFIG() \
    { \
@ -65,36 +67,86 @@ typedef struct {
        .flags = 0, \
    }

#if SOC_AHB_GDMA_SUPPORTED
/**
 * @brief Install async memcpy driver
 * @brief Install async memcpy driver, with AHB-GDMA as the backend
 *
 * @param[in] config Configuration of async memcpy
 * @param[out] asmcp Handle of async memcpy that is returned from this API. If driver installation fails, asmcp will be assigned to NULL.
 * @param[out] mcp Returned driver handle
 * @return
 *      - ESP_OK: Install async memcpy driver successfully
 *      - ESP_ERR_INVALID_ARG: Install async memcpy driver failed because of invalid argument
 *      - ESP_ERR_NO_MEM: Install async memcpy driver failed because out of memory
 *      - ESP_FAIL: Install async memcpy driver failed because of other error
 */
esp_err_t esp_async_memcpy_install(const async_memcpy_config_t *config, async_memcpy_t *asmcp);
esp_err_t esp_async_memcpy_install_gdma_ahb(const async_memcpy_config_t *config, async_memcpy_handle_t *mcp);
#endif // SOC_AHB_GDMA_SUPPORTED

#if SOC_AXI_GDMA_SUPPORTED
/**
 * @brief Install async memcpy driver, with AXI-GDMA as the backend
 *
 * @param[in] config Configuration of async memcpy
 * @param[out] mcp Returned driver handle
 * @return
 *      - ESP_OK: Install async memcpy driver successfully
 *      - ESP_ERR_INVALID_ARG: Install async memcpy driver failed because of invalid argument
 *      - ESP_ERR_NO_MEM: Install async memcpy driver failed because out of memory
 *      - ESP_FAIL: Install async memcpy driver failed because of other error
 */
esp_err_t esp_async_memcpy_install_gdma_axi(const async_memcpy_config_t *config, async_memcpy_handle_t *mcp);
#endif // SOC_AXI_GDMA_SUPPORTED

#if SOC_CP_DMA_SUPPORTED
/**
 * @brief Install async memcpy driver, with CPDMA as the backend
 *
 * @note CPDMA is a CPU peripheral, designed for memory copy.
 *
 * @param[in] config Configuration of async memcpy
 * @param[out] mcp Returned driver handle
 * @return
 *      - ESP_OK: Install async memcpy driver successfully
 *      - ESP_ERR_INVALID_ARG: Install async memcpy driver failed because of invalid argument
 *      - ESP_ERR_NO_MEM: Install async memcpy driver failed because out of memory
 *      - ESP_FAIL: Install async memcpy driver failed because of other error
 */
esp_err_t esp_async_memcpy_install_cpdma(const async_memcpy_config_t *config, async_memcpy_handle_t *mcp);
#endif // SOC_CP_DMA_SUPPORTED

/**
 * @brief Install async memcpy driver with the default DMA backend
 *
 * @note On chips with CPDMA support, CPDMA is the default choice.
 *       On chips with AHB-GDMA support, AHB-GDMA is the default choice.
 *
 * @param[in] config Configuration of async memcpy
 * @param[out] mcp Returned driver handle
 * @return
 *      - ESP_OK: Install async memcpy driver successfully
 *      - ESP_ERR_INVALID_ARG: Install async memcpy driver failed because of invalid argument
 *      - ESP_ERR_NO_MEM: Install async memcpy driver failed because out of memory
 *      - ESP_FAIL: Install async memcpy driver failed because of other error
 */
esp_err_t esp_async_memcpy_install(const async_memcpy_config_t *config, async_memcpy_handle_t *mcp);

/**
 * @brief Uninstall async memcpy driver
 *
 * @param[in] asmcp Handle of async memcpy driver that returned from esp_async_memcpy_install
 * @param[in] mcp Handle of async memcpy driver that is returned from `esp_async_memcpy_install`
 * @return
 *      - ESP_OK: Uninstall async memcpy driver successfully
 *      - ESP_ERR_INVALID_ARG: Uninstall async memcpy driver failed because of invalid argument
 *      - ESP_FAIL: Uninstall async memcpy driver failed because of other error
 */
esp_err_t esp_async_memcpy_uninstall(async_memcpy_t asmcp);
esp_err_t esp_async_memcpy_uninstall(async_memcpy_handle_t mcp);

/**
 * @brief Send an asynchronous memory copy request
 *
 * @note The callback function is invoked in interrupt context, never do blocking jobs in the callback.
 *
 * @param[in] asmcp Handle of async memcpy driver that returned from esp_async_memcpy_install
 * @param[in] mcp Handle of async memcpy driver that is returned from `esp_async_memcpy_install`
 * @param[in] dst Destination address (copy to)
 * @param[in] src Source address (copy from)
 * @param[in] n Number of bytes to copy
@ -105,8 +157,9 @@ esp_err_t esp_async_memcpy_uninstall(async_memcpy_t asmcp);
 *      - ESP_ERR_INVALID_ARG: Send memory copy request failed because of invalid argument
 *      - ESP_FAIL: Send memory copy request failed because of other error
 */
esp_err_t esp_async_memcpy(async_memcpy_t asmcp, void *dst, void *src, size_t n, async_memcpy_isr_cb_t cb_isr, void *cb_args);
esp_err_t esp_async_memcpy(async_memcpy_handle_t mcp, void *dst, void *src, size_t n, async_memcpy_isr_cb_t cb_isr, void *cb_args);

#if SOC_GDMA_SUPPORT_ETM
/**
 * @brief Async memory copy specific events that are supported by the ETM module
 */
@ -119,17 +172,17 @@ typedef enum {
 *
 * @note The created ETM event object can be deleted later by calling `esp_etm_del_event`
 *
 * @param[in] asmcp Handle of async memcpy driver that returned from `esp_async_memcpy_install`
 * @param[in] mcp Handle of async memcpy driver that is returned from `esp_async_memcpy_install`
 * @param[in] event_type ETM event type
 * @param[out] out_event Returned ETM event handle
 * @return
 *      - ESP_OK: Get ETM event successfully
 *      - ESP_ERR_INVALID_ARG: Get ETM event failed because of invalid argument
 *      - ESP_ERR_NOT_SUPPORTED: Get ETM event failed because the DMA hardware doesn't support the ETM submodule
 *      - ESP_FAIL: Get ETM event failed because of other error
 */
esp_err_t esp_async_memcpy_new_etm_event(async_memcpy_t asmcp, async_memcpy_etm_event_t event_type, esp_etm_event_handle_t *out_event);
esp_err_t esp_async_memcpy_new_etm_event(async_memcpy_handle_t mcp, async_memcpy_etm_event_t event_type, esp_etm_event_handle_t *out_event);
#endif // SOC_GDMA_SUPPORT_ETM

#ifdef __cplusplus
}
@ -1,120 +0,0 @@
/*
 * SPDX-FileCopyrightText: 2020-2021 Espressif Systems (Shanghai) CO LTD
 *
 * SPDX-License-Identifier: Apache-2.0
 */

#ifdef __cplusplus
extern "C" {
#endif

#include <stdint.h>
#include <stdbool.h>
#include "esp_err.h"
#include "esp_intr_alloc.h"
#include "esp_etm.h"
#include "soc/soc_caps.h"
#include "hal/dma_types.h"
#include "freertos/FreeRTOS.h"
#include "esp_async_memcpy.h"

#if SOC_CP_DMA_SUPPORTED
#include "hal/cp_dma_ll.h"
#include "hal/cp_dma_hal.h"
#elif SOC_GDMA_SUPPORTED
#include "esp_private/gdma.h"
#endif

/**
 * @brief Type of async mcp implementation layer context
 *
 */
typedef struct {
#if SOC_CP_DMA_SUPPORTED
    cp_dma_hal_context_t hal; // CP DMA hal
    intr_handle_t intr;       // CP DMA interrupt handle
    portMUX_TYPE hal_lock;    // CP DMA HAL level spin lock
#elif SOC_GDMA_SUPPORTED
    gdma_channel_handle_t tx_channel;
    gdma_channel_handle_t rx_channel;
#endif
    intptr_t rx_eof_addr;
    size_t sram_trans_align;
    size_t psram_trans_align;
    bool isr_need_yield; // if current isr needs a yield for higher priority task
} async_memcpy_impl_t;

/**
 * @brief ISR callback function, invoked when the RX done event is triggered
 *
 * @param impl async mcp implementation layer context pointer
 */
void async_memcpy_isr_on_rx_done_event(async_memcpy_impl_t *impl);

/**
 * @brief Initialize async mcp implementation layer
 *
 * @param impl async mcp implementation layer context pointer
 * @return Always return ESP_OK
 */
esp_err_t async_memcpy_impl_init(async_memcpy_impl_t *impl);

/**
 * @brief Deinitialize async mcp implementation layer
 *
 * @param impl async mcp implementation layer context pointer
 * @return Always return ESP_OK
 */
esp_err_t async_memcpy_impl_deinit(async_memcpy_impl_t *impl);

/**
 * @brief Start async mcp (on implementation layer)
 *
 * @param impl async mcp implementation layer context pointer
 * @param outlink_base base descriptor address for TX DMA channel
 * @param inlink_base base descriptor address for RX DMA channel
 * @return Always return ESP_OK
 */
esp_err_t async_memcpy_impl_start(async_memcpy_impl_t *impl, intptr_t outlink_base, intptr_t inlink_base);

/**
 * @brief Stop async mcp (on implementation layer)
 *
 * @param impl async mcp implementation layer context pointer
 * @return Always return ESP_OK
 */
esp_err_t async_memcpy_impl_stop(async_memcpy_impl_t *impl);

/**
 * @brief Restart async mcp DMA engine
 *
 * @param impl async mcp implementation layer context pointer
 * @return Always return ESP_OK
 */
esp_err_t async_memcpy_impl_restart(async_memcpy_impl_t *impl);

/**
 * @brief Get ETM Event handle
 *
 * @param impl async mcp implementation layer context pointer
 * @param event_type ETM event type
 * @param out_event Returned ETM event handle
 * @return ESP_OK on success, ESP_ERR_NOT_SUPPORTED if not supported in hardware, otherwise failed
 */
esp_err_t async_memcpy_impl_new_etm_event(async_memcpy_impl_t *impl, async_memcpy_etm_event_t event_type, esp_etm_event_handle_t *out_event);

/**
 * @brief Check if the buffer addresses are valid
 * @note This is related to the underlying target (e.g. on esp32-s2, only buffers located in SRAM are supported)
 *
 * @param impl async mcp implementation layer context pointer
 * @param src Source buffer address
 * @param dst Destination buffer address
 * @return True if both addresses are valid
 */
bool async_memcpy_impl_is_buffer_address_valid(async_memcpy_impl_t *impl, void *src, void *dst);

#ifdef __cplusplus
}
#endif
@ -1,5 +1,5 @@
/*
 * SPDX-FileCopyrightText: 2021-2022 Espressif Systems (Shanghai) CO LTD
 * SPDX-FileCopyrightText: 2021-2023 Espressif Systems (Shanghai) CO LTD
 *
 * SPDX-License-Identifier: Apache-2.0
 */
@ -28,6 +28,7 @@
typedef struct {
    uint32_t seed;
    size_t buffer_size;
    size_t copy_size;
    uint8_t *src_buf;
    uint8_t *dst_buf;
    uint8_t *from_addr;
@ -43,62 +44,56 @@ static void async_memcpy_setup_testbench(memcpy_testbench_context_t *test_contex
    srand(test_context->seed);
    printf("allocating memory buffer...\r\n");
    size_t buffer_size = test_context->buffer_size;
    size_t copy_size = buffer_size;
    uint8_t *src_buf = NULL;
    uint8_t *dst_buf = NULL;
    uint8_t *from_addr = NULL;
    uint8_t *to_addr = NULL;
#if CONFIG_SPIRAM && SOC_AHB_GDMA_SUPPORT_PSRAM
    if (test_context->src_in_psram) {
        src_buf = heap_caps_malloc(buffer_size, MALLOC_CAP_SPIRAM);
        src_buf = heap_caps_aligned_alloc(test_context->align, buffer_size, MALLOC_CAP_SPIRAM);
    } else {
        src_buf = heap_caps_malloc(buffer_size, MALLOC_CAP_8BIT | MALLOC_CAP_DMA | MALLOC_CAP_INTERNAL);
        src_buf = heap_caps_aligned_alloc(test_context->align, buffer_size, MALLOC_CAP_8BIT | MALLOC_CAP_DMA | MALLOC_CAP_INTERNAL);
    }
    if (test_context->dst_in_psram) {
        dst_buf = heap_caps_calloc(1, buffer_size, MALLOC_CAP_SPIRAM);
        dst_buf = heap_caps_aligned_alloc(test_context->align, buffer_size, MALLOC_CAP_SPIRAM);
    } else {
        dst_buf = heap_caps_calloc(1, buffer_size, MALLOC_CAP_8BIT | MALLOC_CAP_DMA | MALLOC_CAP_INTERNAL);
        dst_buf = heap_caps_aligned_alloc(test_context->align, buffer_size, MALLOC_CAP_8BIT | MALLOC_CAP_DMA | MALLOC_CAP_INTERNAL);
    }
#else
    src_buf = heap_caps_malloc(buffer_size, MALLOC_CAP_8BIT | MALLOC_CAP_DMA | MALLOC_CAP_INTERNAL);
    dst_buf = heap_caps_calloc(1, buffer_size, MALLOC_CAP_8BIT | MALLOC_CAP_DMA | MALLOC_CAP_INTERNAL);
    src_buf = heap_caps_aligned_alloc(test_context->align, buffer_size, MALLOC_CAP_8BIT | MALLOC_CAP_DMA | MALLOC_CAP_INTERNAL);
    dst_buf = heap_caps_aligned_alloc(test_context->align, buffer_size, MALLOC_CAP_8BIT | MALLOC_CAP_DMA | MALLOC_CAP_INTERNAL);
#endif
    TEST_ASSERT_NOT_NULL_MESSAGE(src_buf, "allocate source buffer failed");
    TEST_ASSERT_NOT_NULL_MESSAGE(dst_buf, "allocate destination buffer failed");
    // address alignment
    from_addr = (uint8_t *)ALIGN_UP((uint32_t)(src_buf), test_context->align);
    to_addr = (uint8_t *)ALIGN_UP((uint32_t)(dst_buf), test_context->align);
    uint8_t gap = MAX(from_addr - src_buf, to_addr - dst_buf);
    buffer_size -= gap;
    // size alignment
    buffer_size = ALIGN_DOWN(buffer_size, test_context->align);
    // adding extra offset
    from_addr += test_context->offset;
    to_addr += test_context->offset;
    buffer_size -= test_context->offset;
    from_addr = src_buf + test_context->offset;
    to_addr = dst_buf + test_context->offset;
    copy_size -= test_context->offset;

    printf("...size %zu Bytes, src@%p, dst@%p\r\n", buffer_size, from_addr, to_addr);
    printf("...to copy size %zu Bytes, from @%p, to @%p\r\n", copy_size, from_addr, to_addr);
    printf("fill src buffer with random data\r\n");
    for (int i = 0; i < buffer_size; i++) {
    for (int i = 0; i < copy_size; i++) {
        from_addr[i] = rand() % 256;
    }
    // return value
    test_context->buffer_size = buffer_size;

    // save context
    test_context->copy_size = copy_size;
    test_context->src_buf = src_buf;
    test_context->dst_buf = dst_buf;
    test_context->from_addr = from_addr;
    test_context->to_addr = to_addr;
}

static void async_memcpy_verify_and_clear_testbench(uint32_t seed, uint32_t buffer_size, uint8_t *src_buf, uint8_t *dst_buf, uint8_t *from_addr, uint8_t *to_addr)
static void async_memcpy_verify_and_clear_testbench(uint32_t seed, uint32_t copy_size, uint8_t *src_buf, uint8_t *dst_buf, uint8_t *from_addr, uint8_t *to_addr)
{
    srand(seed);
    for (int i = 0; i < buffer_size; i++) {
        // check if source data has been copied to destination and source data not broken
        TEST_ASSERT_EQUAL_MESSAGE(rand() % 256, to_addr[i], "destination data doesn't match generator data");
    // check if source data has been copied to destination and source data not broken
    for (int i = 0; i < copy_size; i++) {
        TEST_ASSERT_EQUAL_MESSAGE(rand() % 256, from_addr[i], "source data doesn't match generator data");
    }
    srand(seed);
    for (int i = 0; i < buffer_size; i++) {
    // check if source data has been copied to destination
    for (int i = 0; i < copy_size; i++) {
        TEST_ASSERT_EQUAL_MESSAGE(rand() % 256, to_addr[i], "destination data doesn't match source data");
    }
    free(src_buf);
@ -108,13 +103,13 @@ static void async_memcpy_verify_and_clear_testbench(uint32_t seed, uint32_t buff
TEST_CASE("memory copy the same buffer with different content", "[async mcp]")
{
    async_memcpy_config_t config = ASYNC_MEMCPY_DEFAULT_CONFIG();
    config.backlog = 1;
    async_memcpy_t driver = NULL;
    async_memcpy_handle_t driver = NULL;
    TEST_ESP_OK(esp_async_memcpy_install(&config, &driver));
    uint8_t sbuf[256] = {0};
    uint8_t dbuf[256] = {0};
    uint8_t *sbuf = heap_caps_malloc(256, MALLOC_CAP_8BIT | MALLOC_CAP_DMA | MALLOC_CAP_INTERNAL);
    uint8_t *dbuf = heap_caps_malloc(256, MALLOC_CAP_8BIT | MALLOC_CAP_DMA | MALLOC_CAP_INTERNAL);
    for (int j = 0; j < 20; j++) {
        TEST_ESP_OK(esp_async_memcpy(driver, dbuf, sbuf, 256, NULL, NULL));
        vTaskDelay(pdMS_TO_TICKS(10));
        for (int i = 0; i < 256; i++) {
            if (sbuf[i] != dbuf[i]) {
                printf("location[%d]:s=%d,d=%d\r\n", i, sbuf[i], dbuf[i]);
@ -125,15 +120,12 @@ TEST_CASE("memory copy the same buffer with different content", "[async mcp]")
        }
    }
    TEST_ESP_OK(esp_async_memcpy_uninstall(driver));
    free(sbuf);
    free(dbuf);
}

TEST_CASE("memory copy by DMA one by one", "[async mcp]")
static void test_memory_copy_one_by_one(async_memcpy_handle_t driver)
{
    async_memcpy_config_t config = ASYNC_MEMCPY_DEFAULT_CONFIG();
    config.backlog = 4;
    async_memcpy_t driver = NULL;
    TEST_ESP_OK(esp_async_memcpy_install(&config, &driver));

    uint32_t test_buffer_len[] = {256, 512, 1024, 2048, 4096, 5011};
    memcpy_testbench_context_t test_context = {
        .align = 4,
@ -144,20 +136,79 @@ TEST_CASE("memory copy by DMA one by one", "[async mcp]")
        for (int off = 0; off < 4; off++) {
            test_context.buffer_size = test_buffer_len[i];
            test_context.seed = i;
            test_context.offset = off;
            async_memcpy_setup_testbench(&test_context);
            TEST_ESP_OK(esp_async_memcpy(driver, test_context.to_addr, test_context.from_addr, test_context.buffer_size, NULL, NULL));
            async_memcpy_verify_and_clear_testbench(test_context.seed, test_context.buffer_size, test_context.src_buf, test_context.dst_buf, test_context.from_addr, test_context.to_addr);
            vTaskDelay(pdMS_TO_TICKS(100));

            TEST_ESP_OK(esp_async_memcpy(driver, test_context.to_addr, test_context.from_addr, test_context.copy_size, NULL, NULL));
            vTaskDelay(pdMS_TO_TICKS(10));
            async_memcpy_verify_and_clear_testbench(test_context.seed, test_context.copy_size, test_context.src_buf,
                                                    test_context.dst_buf, test_context.from_addr, test_context.to_addr);
        }
    }
}

TEST_CASE("memory copy by DMA one by one", "[async mcp]")
{
    async_memcpy_config_t config = {
        .backlog = 4,
    };
    async_memcpy_handle_t driver = NULL;

#if SOC_AHB_GDMA_SUPPORTED
    printf("Testing memcpy with AHB GDMA\r\n");
    TEST_ESP_OK(esp_async_memcpy_install_gdma_ahb(&config, &driver));
    test_memory_copy_one_by_one(driver);
    TEST_ESP_OK(esp_async_memcpy_uninstall(driver));
#endif // SOC_AHB_GDMA_SUPPORTED

#if SOC_AXI_GDMA_SUPPORTED
    printf("Testing memcpy with AXI GDMA\r\n");
    TEST_ESP_OK(esp_async_memcpy_install_gdma_axi(&config, &driver));
    test_memory_copy_one_by_one(driver);
    TEST_ESP_OK(esp_async_memcpy_uninstall(driver));
#endif // SOC_AXI_GDMA_SUPPORTED

#if SOC_CP_DMA_SUPPORTED
    printf("Testing memcpy with CP DMA\r\n");
    TEST_ESP_OK(esp_async_memcpy_install_cpdma(&config, &driver));
    test_memory_copy_one_by_one(driver);
    TEST_ESP_OK(esp_async_memcpy_uninstall(driver));
#endif // SOC_CP_DMA_SUPPORTED
}

static bool test_async_memcpy_cb_v1(async_memcpy_handle_t mcp_hdl, async_memcpy_event_t *event, void *cb_args)
{
    SemaphoreHandle_t sem = (SemaphoreHandle_t)cb_args;
    BaseType_t high_task_wakeup = pdFALSE;
    xSemaphoreGiveFromISR(sem, &high_task_wakeup);
    return high_task_wakeup == pdTRUE;
}

TEST_CASE("memory copy done callback", "[async mcp]")
{
    async_memcpy_config_t config = {
        // all default
    };
    async_memcpy_handle_t driver = NULL;
    TEST_ESP_OK(esp_async_memcpy_install(&config, &driver));

    uint8_t *src_buf = heap_caps_malloc(256, MALLOC_CAP_8BIT | MALLOC_CAP_DMA | MALLOC_CAP_INTERNAL);
    uint8_t *dst_buf = heap_caps_malloc(256, MALLOC_CAP_8BIT | MALLOC_CAP_DMA | MALLOC_CAP_INTERNAL);

    SemaphoreHandle_t sem = xSemaphoreCreateBinary();
    TEST_ESP_OK(esp_async_memcpy(driver, dst_buf, src_buf, 256, test_async_memcpy_cb_v1, sem));
    TEST_ASSERT_EQUAL(pdTRUE, xSemaphoreTake(sem, pdMS_TO_TICKS(1000)));
    TEST_ESP_OK(esp_async_memcpy_uninstall(driver));
    free(src_buf);
    free(dst_buf);
    vSemaphoreDelete(sem);
}

TEST_CASE("memory copy by DMA on the fly", "[async mcp]")
{
    async_memcpy_config_t config = ASYNC_MEMCPY_DEFAULT_CONFIG();
    async_memcpy_t driver = NULL;
    async_memcpy_handle_t driver = NULL;
    TEST_ESP_OK(esp_async_memcpy_install(&config, &driver));

    uint32_t test_buffer_len[] = {512, 1024, 2048, 4096, 5011};
@ -172,10 +223,10 @@ TEST_CASE("memory copy by DMA on the fly", "[async mcp]")
        async_memcpy_setup_testbench(&test_context[i]);
    }
    for (int i = 0; i < sizeof(test_buffer_len) / sizeof(test_buffer_len[0]); i++) {
        TEST_ESP_OK(esp_async_memcpy(driver, test_context[i].to_addr, test_context[i].from_addr, test_context[i].buffer_size, NULL, NULL));
        TEST_ESP_OK(esp_async_memcpy(driver, test_context[i].to_addr, test_context[i].from_addr, test_context[i].copy_size, NULL, NULL));
    }
    for (int i = 0; i < sizeof(test_buffer_len) / sizeof(test_buffer_len[0]); i++) {
        async_memcpy_verify_and_clear_testbench(i, test_context[i].buffer_size, test_context[i].src_buf, test_context[i].dst_buf, test_context[i].from_addr, test_context[i].to_addr);
        async_memcpy_verify_and_clear_testbench(i, test_context[i].copy_size, test_context[i].src_buf, test_context[i].dst_buf, test_context[i].from_addr, test_context[i].to_addr);
    }

    // Non-aligned case
@ -186,10 +237,10 @@ TEST_CASE("memory copy by DMA on the fly", "[async mcp]")
        async_memcpy_setup_testbench(&test_context[i]);
    }
    for (int i = 0; i < sizeof(test_buffer_len) / sizeof(test_buffer_len[0]); i++) {
        TEST_ESP_OK(esp_async_memcpy(driver, test_context[i].to_addr, test_context[i].from_addr, test_context[i].buffer_size, NULL, NULL));
        TEST_ESP_OK(esp_async_memcpy(driver, test_context[i].to_addr, test_context[i].from_addr, test_context[i].copy_size, NULL, NULL));
    }
    for (int i = 0; i < sizeof(test_buffer_len) / sizeof(test_buffer_len[0]); i++) {
        async_memcpy_verify_and_clear_testbench(i, test_context[i].buffer_size, test_context[i].src_buf, test_context[i].dst_buf, test_context[i].from_addr, test_context[i].to_addr);
        async_memcpy_verify_and_clear_testbench(i, test_context[i].copy_size, test_context[i].src_buf, test_context[i].dst_buf, test_context[i].from_addr, test_context[i].to_addr);
    }

    TEST_ESP_OK(esp_async_memcpy_uninstall(driver));
@ -198,7 +249,7 @@ TEST_CASE("memory copy by DMA on the fly", "[async mcp]")
#define TEST_ASYNC_MEMCPY_BENCH_COUNTS (16)
static int s_count = 0;

static IRAM_ATTR bool test_async_memcpy_isr_cb(async_memcpy_t mcp_hdl, async_memcpy_event_t *event, void *cb_args)
static IRAM_ATTR bool test_async_memcpy_isr_cb(async_memcpy_handle_t mcp_hdl, async_memcpy_event_t *event, void *cb_args)
{
    SemaphoreHandle_t sem = (SemaphoreHandle_t)cb_args;
    BaseType_t high_task_wakeup = pdFALSE;
@ -217,7 +268,7 @@ static void memcpy_performance_test(uint32_t buffer_size)
    config.backlog = (buffer_size / DMA_DESCRIPTOR_BUFFER_MAX_SIZE + 1) * TEST_ASYNC_MEMCPY_BENCH_COUNTS;
    config.sram_trans_align = 4;   // at least 4 bytes aligned for SRAM transfer
    config.psram_trans_align = 64; // at least 64 bytes aligned for PSRAM transfer
    async_memcpy_t driver = NULL;
    async_memcpy_handle_t driver = NULL;
    int64_t elapse_us = 0;
    float throughput = 0.0;
    TEST_ESP_OK(esp_async_memcpy_install(&config, &driver));
@ -233,7 +284,7 @@ static void memcpy_performance_test(uint32_t buffer_size)
    s_count = 0;
    ccomp_timer_start();
    for (int i = 0; i < TEST_ASYNC_MEMCPY_BENCH_COUNTS; i++) {
        TEST_ESP_OK(esp_async_memcpy(driver, test_context.to_addr, test_context.from_addr, test_context.buffer_size, test_async_memcpy_isr_cb, sem));
        TEST_ESP_OK(esp_async_memcpy(driver, test_context.to_addr, test_context.from_addr, test_context.copy_size, test_async_memcpy_isr_cb, sem));
    }
    // wait for done semaphore
    TEST_ASSERT_EQUAL(pdTRUE, xSemaphoreTake(sem, pdMS_TO_TICKS(1000)));
@ -247,7 +298,7 @@ static void memcpy_performance_test(uint32_t buffer_size)
    elapse_us = ccomp_timer_stop();
    throughput = (float)test_context.buffer_size * 1e6 * TEST_ASYNC_MEMCPY_BENCH_COUNTS / 1024 / 1024 / elapse_us;
    IDF_LOG_PERFORMANCE("CPU_COPY", "%.2f MB/s, dir: SRAM->SRAM, size: %zu Bytes", throughput, test_context.buffer_size);
    async_memcpy_verify_and_clear_testbench(test_context.seed, test_context.buffer_size, test_context.src_buf, test_context.dst_buf, test_context.from_addr, test_context.to_addr);
    async_memcpy_verify_and_clear_testbench(test_context.seed, test_context.copy_size, test_context.src_buf, test_context.dst_buf, test_context.from_addr, test_context.to_addr);

#if CONFIG_SPIRAM && SOC_AHB_GDMA_SUPPORT_PSRAM
    // 2. PSRAM->PSRAM
@ -257,7 +308,7 @@ static void memcpy_performance_test(uint32_t buffer_size)
    s_count = 0;
    ccomp_timer_start();
    for (int i = 0; i < TEST_ASYNC_MEMCPY_BENCH_COUNTS; i++) {
        TEST_ESP_OK(esp_async_memcpy(driver, test_context.to_addr, test_context.from_addr, test_context.buffer_size, test_async_memcpy_isr_cb, sem));
        TEST_ESP_OK(esp_async_memcpy(driver, test_context.to_addr, test_context.from_addr, test_context.copy_size, test_async_memcpy_isr_cb, sem));
    }
    // wait for done semaphore
    TEST_ASSERT_EQUAL(pdTRUE, xSemaphoreTake(sem, pdMS_TO_TICKS(1000)));
@ -271,7 +322,7 @@ static void memcpy_performance_test(uint32_t buffer_size)
    elapse_us = ccomp_timer_stop();
    throughput = (float)test_context.buffer_size * 1e6 * TEST_ASYNC_MEMCPY_BENCH_COUNTS / 1024 / 1024 / elapse_us;
    IDF_LOG_PERFORMANCE("CPU_COPY", "%.2f MB/s, dir: PSRAM->PSRAM, size: %zu Bytes", throughput, test_context.buffer_size);
    async_memcpy_verify_and_clear_testbench(test_context.seed, test_context.buffer_size, test_context.src_buf, test_context.dst_buf, test_context.from_addr, test_context.to_addr);
    async_memcpy_verify_and_clear_testbench(test_context.seed, test_context.copy_size, test_context.src_buf, test_context.dst_buf, test_context.from_addr, test_context.to_addr);

    // 3. PSRAM->SRAM
    test_context.src_in_psram = true;
@ -280,7 +331,7 @@ static void memcpy_performance_test(uint32_t buffer_size)
    s_count = 0;
    ccomp_timer_start();
    for (int i = 0; i < TEST_ASYNC_MEMCPY_BENCH_COUNTS; i++) {
        TEST_ESP_OK(esp_async_memcpy(driver, test_context.to_addr, test_context.from_addr, test_context.buffer_size, test_async_memcpy_isr_cb, sem));
        TEST_ESP_OK(esp_async_memcpy(driver, test_context.to_addr, test_context.from_addr, test_context.copy_size, test_async_memcpy_isr_cb, sem));
    }
    // wait for done semaphore
    TEST_ASSERT_EQUAL(pdTRUE, xSemaphoreTake(sem, pdMS_TO_TICKS(1000)));
@ -294,7 +345,7 @@ static void memcpy_performance_test(uint32_t buffer_size)
    elapse_us = ccomp_timer_stop();
    throughput = (float)test_context.buffer_size * 1e6 * TEST_ASYNC_MEMCPY_BENCH_COUNTS / 1024 / 1024 / elapse_us;
    IDF_LOG_PERFORMANCE("CPU_COPY", "%.2f MB/s, dir: PSRAM->SRAM, size: %zu Bytes", throughput, test_context.buffer_size);
    async_memcpy_verify_and_clear_testbench(test_context.seed, test_context.buffer_size, test_context.src_buf, test_context.dst_buf, test_context.from_addr, test_context.to_addr);
    async_memcpy_verify_and_clear_testbench(test_context.seed, test_context.copy_size, test_context.src_buf, test_context.dst_buf, test_context.from_addr, test_context.to_addr);

    // 4. SRAM->PSRAM
    test_context.src_in_psram = false;
@ -303,7 +354,7 @@ static void memcpy_performance_test(uint32_t buffer_size)
    s_count = 0;
    ccomp_timer_start();
    for (int i = 0; i < TEST_ASYNC_MEMCPY_BENCH_COUNTS; i++) {
        TEST_ESP_OK(esp_async_memcpy(driver, test_context.to_addr, test_context.from_addr, test_context.buffer_size, test_async_memcpy_isr_cb, sem));
        TEST_ESP_OK(esp_async_memcpy(driver, test_context.to_addr, test_context.from_addr, test_context.copy_size, test_async_memcpy_isr_cb, sem));
    }
    // wait for done semaphore
    TEST_ASSERT_EQUAL(pdTRUE, xSemaphoreTake(sem, pdMS_TO_TICKS(1000)));
@ -317,7 +368,7 @@ static void memcpy_performance_test(uint32_t buffer_size)
    elapse_us = ccomp_timer_stop();
    throughput = (float)test_context.buffer_size * 1e6 * TEST_ASYNC_MEMCPY_BENCH_COUNTS / 1024 / 1024 / elapse_us;
    IDF_LOG_PERFORMANCE("CPU_COPY", "%.2f MB/s, dir: SRAM->PSRAM, size: %zu Bytes", throughput, test_context.buffer_size);
    async_memcpy_verify_and_clear_testbench(test_context.seed, test_context.buffer_size, test_context.src_buf, test_context.dst_buf, test_context.from_addr, test_context.to_addr);
    async_memcpy_verify_and_clear_testbench(test_context.seed, test_context.copy_size, test_context.src_buf, test_context.dst_buf, test_context.from_addr, test_context.to_addr);
#endif

    TEST_ESP_OK(esp_async_memcpy_uninstall(driver));
@ -6,7 +6,6 @@ from pytest_embedded import Dut


@pytest.mark.esp32s2
@pytest.mark.esp32s3
@pytest.mark.esp32c2
@pytest.mark.esp32c3
@pytest.mark.esp32c6
@ -21,3 +20,16 @@ from pytest_embedded import Dut
)
def test_dma(dut: Dut) -> None:
    dut.run_all_single_board_cases()


@pytest.mark.esp32s3
@pytest.mark.octal_psram
@pytest.mark.parametrize(
    'config',
    [
        'release',
    ],
    indirect=True,
)
def test_dma_psram(dut: Dut) -> None:
    dut.run_all_single_board_cases()

@ -0,0 +1,3 @@
CONFIG_SPIRAM=y
CONFIG_SPIRAM_MODE_OCT=y
CONFIG_SPIRAM_SPEED_80M=y
@ -45,7 +45,7 @@ TEST_CASE("async_memcpy_eof_event", "[etm]")
    TEST_ESP_OK(gpio_set_level(output_gpio, 1));

    printf("install async memcpy context\r\n");
    async_memcpy_t mcp_ctx = NULL;
    async_memcpy_handle_t mcp_ctx = NULL;
    async_memcpy_config_t config = ASYNC_MEMCPY_DEFAULT_CONFIG();
    TEST_ESP_OK(esp_async_memcpy_install(&config, &mcp_ctx));

@ -28,7 +28,7 @@ entries:
    systimer_hal (noflash)
    if TWAI_ISR_IN_IRAM = y:
        twai_hal_iram (noflash)
    if IDF_TARGET_ESP32 = n:
    if SOC_GPSPI_SUPPORTED = y && IDF_TARGET_ESP32 = n:
        spi_flash_hal_gpspi (noflash)
    if SOC_PMU_SUPPORTED = y:
        pmu_hal (noflash)

@ -19,14 +19,14 @@ typedef portMUX_TYPE multi_heap_lock_t;
   we need to use portmux spinlocks here not RTOS mutexes */
#define MULTI_HEAP_LOCK(PLOCK) do { \
        if((PLOCK) != NULL) { \
            portENTER_CRITICAL((PLOCK)); \
            portENTER_CRITICAL_SAFE((PLOCK)); \
        } \
    } while(0)


#define MULTI_HEAP_UNLOCK(PLOCK) do { \
        if ((PLOCK) != NULL) { \
            portEXIT_CRITICAL((PLOCK)); \
            portEXIT_CRITICAL_SAFE((PLOCK)); \
        } \
    } while(0)


@ -23,6 +23,10 @@ config SOC_GPTIMER_SUPPORTED
    bool
    default y

config SOC_ASYNC_MEMCPY_SUPPORTED
    bool
    default y

config SOC_SUPPORTS_SECURE_DL_MODE
    bool
    default y
@ -227,6 +231,10 @@ config SOC_GDMA_PAIRS_PER_GROUP_MAX
    int
    default 3

config SOC_AXI_GDMA_SUPPORT_PSRAM
    bool
    default y

config SOC_ETM_GROUPS
    int
    default 1
@ -38,7 +38,7 @@
// #define SOC_TWAI_SUPPORTED 1      //TODO: IDF-7470
// #define SOC_ETM_SUPPORTED 1       //TODO: IDF-7478
// #define SOC_PARLIO_SUPPORTED 1    //TODO: IDF-7471, TODO: IDF-7472
// #define SOC_ASYNC_MEMCPY_SUPPORTED 1
#define SOC_ASYNC_MEMCPY_SUPPORTED 1
// disable usb serial jtag for esp32p4, current image does not support
// #define SOC_USB_SERIAL_JTAG_SUPPORTED 1    //TODO: IDF-7496
// #define SOC_TEMP_SENSOR_SUPPORTED 1        //TODO: IDF-7482
@ -161,6 +161,7 @@
#define SOC_AHB_GDMA_VERSION 2
#define SOC_GDMA_NUM_GROUPS_MAX 2
#define SOC_GDMA_PAIRS_PER_GROUP_MAX 3
#define SOC_AXI_GDMA_SUPPORT_PSRAM 1
// #define SOC_GDMA_SUPPORT_ETM 1    // Both AHB-DMA and AXI-DMA support ETM //TODO: IDF-7478

/*-------------------------- ETM CAPS --------------------------------------*/
@ -1,60 +1,67 @@
The Async Memcpy API
====================
Asynchronous Memory Copy
========================

Overview
--------

{IDF_TARGET_NAME} has a DMA engine which can help to offload internal memory copy operations from the CPU in a asynchronous way.
{IDF_TARGET_NAME} has a DMA engine which can help to offload internal memory copy operations from the CPU in an asynchronous way.

The async memcpy API wraps all DMA configurations and operations, the signature of :cpp:func:`esp_async_memcpy` is almost the same to the standard libc one.
The async memcpy API wraps all DMA configurations and operations; the signature of :cpp:func:`esp_async_memcpy` is almost the same as that of the standard libc ``memcpy`` function.

Thanks to the benefit of the DMA, we do not have to wait for each memory copy to be done before we issue another memcpy request. By the way, it is still possible to know when memcpy is finished by listening in the memcpy callback function.
The DMA allows multiple memory copy requests to be queued up before the first one is completed, which lets computation overlap with memory copy. It is still possible to know the exact time when a memory copy request is completed, by registering an event callback.

.. only:: esp32s2
.. only:: SOC_AHB_GDMA_SUPPORT_PSRAM

    .. note::

        Memory copy from/to external PSRAM is not supported on ESP32-S2, :cpp:func:`esp_async_memcpy` will abort returning an error if buffer address is not in SRAM.
    If the async memcpy is constructed upon the AHB GDMA, it is also possible to copy data from/to PSRAM with a proper alignment.

.. only:: SOC_AXI_GDMA_SUPPORT_PSRAM

    If the async memcpy is constructed upon the AXI GDMA, it is also possible to copy data from/to PSRAM with a proper alignment.

Configure and Install Driver
----------------------------

:cpp:func:`esp_async_memcpy_install` is used to install the driver with user's configuration. Please note that async memcpy has to be called with the handle returned from :cpp:func:`esp_async_memcpy_install`.
There are several ways to install the async memcpy driver, depending on the underlying DMA engine (a short example follows the list):

.. list::

    :SOC_CP_DMA_SUPPORTED: - :cpp:func:`esp_async_memcpy_install_cpdma` is used to install the async memcpy driver based on the CP DMA engine.
    :SOC_AHB_GDMA_SUPPORTED: - :cpp:func:`esp_async_memcpy_install_gdma_ahb` is used to install the async memcpy driver based on the AHB GDMA engine.
    :SOC_AXI_GDMA_SUPPORTED: - :cpp:func:`esp_async_memcpy_install_gdma_axi` is used to install the async memcpy driver based on the AXI GDMA engine.
    - :cpp:func:`esp_async_memcpy_install` is a generic API to install the async memcpy driver with a default DMA engine. If the SOC has the CP_DMA engine, the default DMA engine is CP_DMA. Otherwise, the default DMA engine is AHB_GDMA.
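For example, to pin the driver to one particular engine instead of the default pick, a minimal sketch (the backend-specific install functions are only declared on targets whose ``soc_caps.h`` enables the corresponding peripheral):

.. code-block:: c

    async_memcpy_config_t config = ASYNC_MEMCPY_DEFAULT_CONFIG();
    async_memcpy_handle_t mcp = NULL;
    #if SOC_CP_DMA_SUPPORTED
    // CP DMA is a dedicated memory-copy engine, e.g. on ESP32-S2
    ESP_ERROR_CHECK(esp_async_memcpy_install_cpdma(&config, &mcp));
    #elif SOC_AHB_GDMA_SUPPORTED
    ESP_ERROR_CHECK(esp_async_memcpy_install_gdma_ahb(&config, &mcp));
    #endif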
Driver configuration is described in :cpp:type:`async_memcpy_config_t`:

* :cpp:member:`backlog`: This is used to configure the maximum number of DMA operations being processed at the same time.
* :cpp:member:`backlog`: This is used to configure the maximum number of memory copy transactions that can be queued up before the first one is completed. If this field is set to zero, then the default value (i.e., 4) will be applied.
* :cpp:member:`sram_trans_align`: Declare SRAM alignment for both data address and copy size, set to zero if the data has no restriction in alignment. If set to a quadruple value (i.e., 4X), the driver will enable the burst mode internally, which is helpful for some performance-related applications.
* :cpp:member:`psram_trans_align`: Declare PSRAM alignment for both data address and copy size. User has to give it a valid value (only 16, 32, 64 are supported) if the destination of memcpy is located in PSRAM. The default alignment (i.e., 16) will be applied if it is set to zero. Internally, the driver configures the size of block used by DMA to access PSRAM, according to the alignment (see the PSRAM sketch below).
* :cpp:member:`flags`: This is used to enable some special driver features.

:c:macro:`ASYNC_MEMCPY_DEFAULT_CONFIG` provides a default configuration, which sets the backlog to 8.

.. code-block:: c

::

    async_memcpy_config_t config = ASYNC_MEMCPY_DEFAULT_CONFIG();
    // update the maximum data stream supported by underlying DMA engine
    config.backlog = 16;
    async_memcpy_t driver = NULL;
    ESP_ERROR_CHECK(esp_async_memcpy_install(&config, &driver)); // install driver, return driver handle
    config.backlog = 8;
    async_memcpy_handle_t driver = NULL;
    ESP_ERROR_CHECK(esp_async_memcpy_install(&config, &driver)); // install driver with default DMA engine
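When a copy touches PSRAM, both the buffer addresses and the copy size have to follow the declared alignment, as mentioned for :cpp:member:`psram_trans_align` above. A minimal sketch, assuming a PSRAM-capable DMA backend and 64-byte alignment:

.. code-block:: c

    async_memcpy_config_t config = ASYNC_MEMCPY_DEFAULT_CONFIG();
    config.psram_trans_align = 64; // also sets the block size the DMA uses to access PSRAM
    async_memcpy_handle_t driver = NULL;
    ESP_ERROR_CHECK(esp_async_memcpy_install(&config, &driver));

    // both the addresses and the copy size respect the 64-byte alignment
    uint8_t *src = heap_caps_aligned_alloc(64, 4096, MALLOC_CAP_SPIRAM);
    uint8_t *dst = heap_caps_aligned_alloc(64, 4096, MALLOC_CAP_SPIRAM);
    ESP_ERROR_CHECK(esp_async_memcpy(driver, dst, src, 4096, NULL, NULL));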
Send Memory Copy Request
------------------------

:cpp:func:`esp_async_memcpy` is the API to send a memory copy request to the DMA engine. It must be called after the driver is installed successfully. This API is thread safe, so it can be called from different tasks.

Different from the libc version of ``memcpy``, user should also pass a callback to :cpp:func:`esp_async_memcpy`, if it is necessary to be notified when the memory copy is done. The callback is executed in the ISR context, make sure you does not violate the restriction applied to ISR handler.
Different from the libc version of ``memcpy``, you can optionally pass a callback to :cpp:func:`esp_async_memcpy`, so that you can be notified when the memory copy is finished. Note, the callback is executed in the ISR context, please make sure you will not call any blocking functions in the callback.

Besides that, the callback function should reside in IRAM space by applying ``IRAM_ATTR`` attribute. The prototype of the callback function is :cpp:type:`async_memcpy_isr_cb_t`, please note that, the callback function should return true if it wakes up a high priority task by some API like :cpp:func:`xSemaphoreGiveFromISR`.
The prototype of the callback function is :cpp:type:`async_memcpy_isr_cb_t`. The callback function should only return true if it wakes up a high priority task by RTOS APIs like :cpp:func:`xSemaphoreGiveFromISR`.

.. code-block:: c

::

    // Callback implementation, running in ISR context
    static IRAM_ATTR bool my_async_memcpy_cb(async_memcpy_t mcp_hdl, async_memcpy_event_t *event, void *cb_args)
    static bool my_async_memcpy_cb(async_memcpy_handle_t mcp_hdl, async_memcpy_event_t *event, void *cb_args)
    {
        SemaphoreHandle_t sem = (SemaphoreHandle_t)cb_args;
        BaseType_t high_task_wakeup = pdFALSE;
@ -70,10 +77,10 @@ Besides that, the callback function should reside in IRAM space by applying ``IR
    // Do something else here
    xSemaphoreTake(my_semaphore, portMAX_DELAY); // Wait until the buffer copy is done
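Since the driver queues up to ``backlog`` transactions, several copies can be in flight at once. Here is a minimal sketch that issues a few requests back to back and waits for all of them, reusing ``my_async_memcpy_cb`` from above; ``dst`` and ``src`` are assumed to be arrays of DMA-capable buffers prepared by the caller:

.. code-block:: c

    SemaphoreHandle_t done_sem = xSemaphoreCreateCounting(3, 0);
    for (int i = 0; i < 3; i++) {
        ESP_ERROR_CHECK(esp_async_memcpy(driver, dst[i], src[i], 256, my_async_memcpy_cb, done_sem));
    }
    for (int i = 0; i < 3; i++) {
        xSemaphoreTake(done_sem, portMAX_DELAY); // one give per finished copy
    }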
|
||||
Uninstall Driver (Optional)
|
||||
---------------------------
|
||||
Uninstall Driver
|
||||
----------------
|
||||
|
||||
:cpp:func:`esp_async_memcpy_uninstall` is used to uninstall asynchronous memcpy driver. It is not necessary to uninstall the driver after each memcpy operation. If you know your application will not use this driver anymore, then this API can recycle the memory for you.
|
||||
:cpp:func:`esp_async_memcpy_uninstall` is used to uninstall asynchronous memcpy driver. It is not necessary to uninstall the driver after each memcpy operation. If you know your application will not use this driver anymore, then this API can recycle the memory and other hardware resources for you.
|
||||
|
||||
.. only:: SOC_ETM_SUPPORTED and SOC_GDMA_SUPPORT_ETM
|
||||
|
||||
|