/*****************************************************************************
 *
 *  Copyright (c) 2016-2026 by Sophgo Technologies Inc. All rights reserved.
 *
 *  The material in this file is confidential and contains trade secrets
 *  of Sophgo Technologies Inc. This is proprietary information owned by
 *  Sophgo Technologies Inc. No part of this work may be disclosed,
 *  reproduced, copied, transmitted, or used in any way for any purpose,
 *  without the express written permission of Sophgo Technologies Inc.
 *
 *****************************************************************************/

/*****************************************************************************
 * BMRuntime Interface is mainly for inference.
 * We can also use it for device computation from BMLang programming.
 * Note: please use the interfaces from bmlib_runtime.h for device memory operations.
 ****************************************************************************/

#ifndef BMRUNTIME_INTERFACE_H_
#define BMRUNTIME_INTERFACE_H_

#include "bmdef.h"

#ifdef _WIN32
#define DECL_EXPORT __declspec(dllexport)
#define DECL_IMPORT __declspec(dllimport)
#else
#define DECL_EXPORT
#define DECL_IMPORT
#endif

#if defined(__cplusplus)
extern "C" {
#endif

/* --------------------------------------------------------------------------*/
/* interfaces for basic data types */

/* get the byte size of a data type */
DECL_EXPORT size_t bmrt_data_type_size(bm_data_type_t dtype);

/* convert a dims array to bm_shape_t; shape and dims should not be NULL, and
   num_dims should not be larger than BM_MAX_DIMS_NUM */
DECL_EXPORT void bmrt_shape(bm_shape_t* shape, const int* dims, int num_dims);

/* number of shape elements; shape should not be NULL and num_dims should not
   be larger than BM_MAX_DIMS_NUM */
DECL_EXPORT uint64_t bmrt_shape_count(const bm_shape_t* shape);

/* compare whether two shapes are the same */
DECL_EXPORT bool bmrt_shape_is_same(const bm_shape_t* left, const bm_shape_t* right);

/* fill a tensor with the given data type and shape; st_mode is 0 by default.
   tensor and p_bmrt should not be NULL, and the shape count should not be 0.
   It allocates device memory for tensor->device_mem, so the user should call
   bmrt_free_device(p_bmrt, tensor->device_mem) to free it. */
DECL_EXPORT bool bmrt_tensor(bm_tensor_t* tensor, void* p_bmrt, bm_data_type_t dtype, bm_shape_t shape);

/* fill a tensor with the given data type and shape; st_mode is 0 by default.
   tensor and p_bmrt should not be NULL, and the shape count should not be 0.
   It allocates device memory for tensor->device_mem on the devid-th device. */
DECL_EXPORT bool bmrt_tensor_ex(bm_tensor_t* tensor, void* p_bmrt, int devid, bm_data_type_t dtype, bm_shape_t shape);

/* fill a tensor with existing device memory; the tensor byte size should not
   be larger than the device memory size */
DECL_EXPORT void bmrt_tensor_with_device(bm_tensor_t* tensor, bm_device_mem_t device_mem,
                                         bm_data_type_t dtype, bm_shape_t shape);

/* get the tensor byte size; tensor should not be NULL */
DECL_EXPORT size_t bmrt_tensor_bytesize(const bm_tensor_t* tensor);

/* get the memory size allocated for the tensor in device memory; tensor should not be NULL */
DECL_EXPORT size_t bmrt_tensor_device_size(const bm_tensor_t* tensor);

/* print net info for debugging */
DECL_EXPORT void bmrt_print_network_info(const bm_net_info_t* net_info);
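/* Usage sketch (illustrative only, not part of the API): building a shape and a
   device tensor with the helpers above. The dims and dtype are assumed values for
   the example, and p_bmrt is assumed to be a runtime created with bmrt_create()
   declared below; error handling is omitted.

     int dims[4] = {1, 3, 224, 224};                 // hypothetical NCHW shape
     bm_shape_t shape;
     bmrt_shape(&shape, dims, 4);                    // build bm_shape_t from dims
     uint64_t elems = bmrt_shape_count(&shape);      // 1*3*224*224 elements

     bm_tensor_t tensor;
     if (bmrt_tensor(&tensor, p_bmrt, BM_FLOAT32, shape)) {
       size_t bytes = bmrt_tensor_bytesize(&tensor); // elems * sizeof(float)
       // ... fill / use tensor.device_mem ...
       bmrt_free_device(p_bmrt, tensor.device_mem);  // free it as described above
     }
*/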
/* --------------------------------------------------------------------------*/
/**
 * @name    bmrt_create
 * @brief   To create the bmruntime with bm_handle.
 * @ingroup bmruntime
 *
 * This API creates the bmruntime. It returns a void* pointer which is the pointer
 * of the bmruntime. The device id is set when the bm_handle is obtained.
 *
 * @param [in] bm_handle bm handle. It must be initialized by using bmlib.
 *
 * @retval void* the pointer of bmruntime
 */
DECL_EXPORT void* bmrt_create(bm_handle_t bm_handle);

/* --------------------------------------------------------------------------*/
/**
 * @name    bmrt_create_ex
 * @brief   To create the bmruntime with one or more bm_handles.
 * @ingroup bmruntime
 *
 * This API creates the bmruntime. It returns a void* pointer which is the pointer
 * of the bmruntime.
 *
 * @param [in] bm_handles  bm handles. They must be initialized by using bmlib.
 * @param [in] num_handles number of bm_handles.
 *
 * @retval void* the pointer of bmruntime
 */
DECL_EXPORT void *bmrt_create_ex(bm_handle_t *bm_handles, int num_handles);

/**
 * @name    bmrt_destroy
 * @brief   To destroy the bmruntime pointer
 * @ingroup bmruntime
 *
 * This API destroys the bmruntime.
 *
 * @param [in] p_bmrt Bmruntime that had been created
 */
DECL_EXPORT void bmrt_destroy(void* p_bmrt);

/**
 * @name    bmrt_get_bm_handle
 * @brief   To get the BM runtime context.
 * @ingroup bmruntime
 *
 * This API gets the BM runtime context for using BMDNN, BMCV or BMLIB.
 *
 * @param [in] p_bmrt Bmruntime that had been created
 */
DECL_EXPORT void * bmrt_get_bm_handle(void* p_bmrt);

/**
 * @name    bmrt_load_bmodel
 * @brief   To load the bmodel which is created by the BM compiler
 * @ingroup bmruntime
 *
 * This API is to load a bmodel created by the BM compiler.
 * After loading the bmodel, we can run inference of the neuron network.
 *
 * @param [in] p_bmrt      Bmruntime that had been created
 * @param [in] bmodel_path Bmodel file path.
 *
 * @retval true  Load context success.
 * @retval false Load context failed.
 */
DECL_EXPORT bool bmrt_load_bmodel(void* p_bmrt, const char *bmodel_path);

/**
 * @name    bmrt_load_bmodel_data
 * @brief   To load the bmodel which is created by the BM compiler from a buffer
 * @ingroup bmruntime
 *
 * This API is to load a bmodel created by the BM compiler.
 * After loading the bmodel, we can run inference of the neuron network.
 * Unlike bmrt_load_bmodel, the bmodel here is data in host memory.
 *
 * @param [in] p_bmrt      Bmruntime that had been created
 * @param [in] bmodel_data Pointer to the bmodel data buffer
 * @param [in] size        Bmodel data size
 *
 * @retval true  Load context success.
 * @retval false Load context failed.
 */
DECL_EXPORT bool bmrt_load_bmodel_data(void* p_bmrt, const void * bmodel_data, size_t size);

/**
 * @name    bmrt_show_neuron_network
 * @brief   To print the names of all neuron networks
 * @ingroup bmruntime
 *
 * @param [in] p_bmrt Bmruntime that had been created
 */
DECL_EXPORT void bmrt_show_neuron_network(void* p_bmrt);

/**
 * @name    bmrt_get_network_number
 * @brief   To get the number of neuron networks in the bmruntime
 * @ingroup bmruntime
 *
 * @param [in] p_bmrt Bmruntime that had been created
 *
 * @retval int The number of neuron networks.
 */
DECL_EXPORT int bmrt_get_network_number(void* p_bmrt);
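/* Usage sketch (illustrative only, not part of the API): creating a runtime,
   loading a bmodel and tearing everything down again. The device id 0 and the
   file name "example.bmodel" are assumptions for the example; bm_dev_request
   and bm_dev_free come from bmlib.

     bm_handle_t handle;
     if (bm_dev_request(&handle, 0) != BM_SUCCESS) return -1;  // open device 0
     void* p_bmrt = bmrt_create(handle);                       // create runtime
     if (bmrt_load_bmodel(p_bmrt, "example.bmodel")) {         // load networks
       int num = bmrt_get_network_number(p_bmrt);              // how many nets were loaded
       bmrt_show_neuron_network(p_bmrt);                       // print their names
       // ... run inference, see the launch interfaces below ...
     }
     bmrt_destroy(p_bmrt);                                     // destroy the runtime first
     bm_dev_free(handle);                                      // then release the handle
*/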
/**
 * @name    bmrt_get_network_names
 * @brief   To get the names of all neuron networks in the bmruntime
 * @ingroup bmruntime
 *
 * @param [in]  p_bmrt        Bmruntime that had been created
 * @param [out] network_names The names of all neuron networks. It should be declared as
 *                            (const char** networks_ = NULL) and passed as &networks_.
 *                            After this API, the user needs to free(networks_) when it is
 *                            no longer needed.
 */
DECL_EXPORT void bmrt_get_network_names(void* p_bmrt, const char*** network_names);

/**
 * @name    bmrt_get_network_info
 * @brief   To get network info by net name
 * @ingroup bmruntime
 *
 * @param [in] p_bmrt   Bmruntime that had been created
 * @param [in] net_name Network name
 *
 * @retval bm_net_info_t* Pointer to the net info; it does not need to be freed by the user.
 *                        If the net name is not found, NULL is returned.
 */
DECL_EXPORT const bm_net_info_t* bmrt_get_network_info(void* p_bmrt, const char* net_name);

/**
 * @name    bmrt_launch_tensor
 * @brief   To launch inference of the neuron network with the given input tensors
 * @ingroup bmruntime
 *
 * This API supports neuron networks that are static-compiled or dynamic-compiled.
 * After calling this API, inference on the TPU is launched and the CPU program is not
 * blocked. bm_thread_sync should be called to make sure inference has finished.
 * This API supports multiple inputs and is thread-safe.
 *
 * @param [in]  p_bmrt         Bmruntime that had been created
 * @param [in]  net_name       The name of the neuron network
 * @param [in]  input_tensors  Array of input tensors, defined like bm_tensor_t input_tensors[input_num].
 *                             The user should initialize each input tensor.
 * @param [in]  input_num      Input number
 * @param [out] output_tensors Array of output tensors, defined like bm_tensor_t output_tensors[output_num].
 *                             This interface allocates device memory to store the output data. The user
 *                             should free each device memory by bm_free_device when the result data is
 *                             no longer needed.
 * @param [in]  output_num     Output number
 *
 * @retval true  Launch success.
 * @retval false Launch failed.
 */
DECL_EXPORT bool bmrt_launch_tensor(void* p_bmrt, const char * net_name, const bm_tensor_t input_tensors[],
                                    int input_num, bm_tensor_t output_tensors[], int output_num);
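/* Usage sketch (illustrative only, not part of the API): launching one network with
   bmrt_launch_tensor and waiting for the result. The network name "net0", the host
   buffers input_data/output_data, and the existing handle/p_bmrt are assumptions for
   the example; bm_memcpy_s2d, bm_memcpy_d2s, bm_thread_sync and bm_free_device come
   from bmlib, and the bm_net_info_t fields used below follow bmdef.h.

     const bm_net_info_t* info = bmrt_get_network_info(p_bmrt, "net0");
     if (info == NULL) return -1;                       // unknown network name

     bm_tensor_t in, out;
     bmrt_tensor(&in, p_bmrt, info->input_dtypes[0], info->stages[0].input_shapes[0]);
     bm_memcpy_s2d(handle, in.device_mem, input_data);  // copy host input to device

     if (bmrt_launch_tensor(p_bmrt, "net0", &in, 1, &out, 1)) {
       bm_thread_sync(handle);                          // wait until inference finishes
       bm_memcpy_d2s(handle, output_data, out.device_mem);
       bm_free_device(handle, out.device_mem);          // output mem was allocated by the launch
     }
     bmrt_free_device(p_bmrt, in.device_mem);           // free the input tensor we allocated
*/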
/**
 * @name    bmrt_launch_tensor_ex
 * @brief   To launch inference of the neuron network with the given input tensors
 * @ingroup bmruntime
 *
 * This API supports neuron networks that are static-compiled or dynamic-compiled.
 * After calling this API, inference on the TPU is launched and the CPU program is not
 * blocked. bm_thread_sync should be called to make sure inference has finished.
 * This API supports multiple inputs and is thread-safe.
 *
 * @param [in]  p_bmrt         Bmruntime that had been created
 * @param [in]  net_name       The name of the neuron network
 * @param [in]  input_tensors  Array of input tensors, defined like bm_tensor_t input_tensors[input_num].
 *                             The user should initialize each input tensor.
 * @param [in]  input_num      Input number
 * @param [out] output_tensors Array of output tensors, defined like bm_tensor_t output_tensors[output_num].
 *                             The user can set the device_mem or stmode of the output tensors. If user_mem
 *                             is true, this interface uses the device memory of output_tensors to store the
 *                             output data and does not allocate device memory; otherwise it allocates device
 *                             memory to store the output. If user_stmode is true, the stmode in each output
 *                             tensor is used; otherwise stmode defaults to BM_STORE_1N.
 * @param [in]  output_num     Output number
 * @param [in]  user_mem       whether the device_mem of the output tensors is set
 * @param [in]  user_stmode    whether the stmode of the output tensors is set
 *
 * @retval true  Launch success.
 * @retval false Launch failed.
 */
DECL_EXPORT bool bmrt_launch_tensor_ex(void* p_bmrt, const char * net_name, const bm_tensor_t input_tensors[],
                                       int input_num, bm_tensor_t output_tensors[], int output_num,
                                       bool user_mem, bool user_stmode);

/**
 * @name    bmrt_launch_data
 * @brief   To launch inference of the neuron network with input data in system memory
 * @ingroup bmruntime
 *
 * This API supports neuron networks that are static-compiled or dynamic-compiled.
 * After calling this API, inference on the TPU is launched and the CPU program is
 * blocked until it finishes.
 * This API supports multiple inputs and is thread-safe.
 *
 * @param [in]  p_bmrt        Bmruntime that had been created
 * @param [in]  net_name      The name of the neuron network
 * @param [in]  input_datas   Array of input data, defined like void * input_datas[input_num].
 *                            The user should initialize each data pointer as input.
 * @param [in]  input_shapes  Array of input shapes, defined like bm_shape_t input_shapes[input_num].
 *                            The user should set each input shape.
 * @param [in]  input_num     Input number
 * @param [out] output_datas  Array of output data, defined like void * output_datas[output_num].
 *                            If the user does not allocate the output data, set user_mem to false and this
 *                            API will allocate the output memory; the user should free each output memory
 *                            when the output data is no longer needed. Alternatively, the user can allocate
 *                            system memory for each output data and set user_mem = true.
 * @param [out] output_shapes Array of output shapes, defined like bm_shape_t output_shapes[output_num].
 *                            It will store each output shape.
 * @param [in]  output_num    Output number
 * @param [in]  user_mem      whether output_datas[i] has allocated memory
 *
 * @retval true  Launch success.
 * @retval false Launch failed.
 */
DECL_EXPORT bool bmrt_launch_data(void* p_bmrt, const char* net_name, void* const input_datas[],
                                  const bm_shape_t input_shapes[], int input_num, void * output_datas[],
                                  bm_shape_t output_shapes[], int output_num, bool user_mem);

/**
 * @name    bmrt_trace
 * @brief   To check the runtime environment and collect info for debugging
 * @ingroup bmruntime
 *
 * This API collects runtime info for debugging. Especially when a launch result is
 * suddenly wrong, calling bmrt_trace will show whether device memories are corrupted,
 * along with other check info.
 *
 * @param [in] p_bmrt Bmruntime that had been created
 */
DECL_EXPORT void bmrt_trace(void* p_bmrt);
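/* Usage sketch (illustrative only, not part of the API): running a single-input,
   single-output network with bmrt_launch_data entirely from system memory. The
   network name "net0", the input dims and the existing p_bmrt are assumptions for
   the example; with user_mem = false the output buffer is allocated by the runtime
   (assumed here to be ordinary system memory released with free()).

     float input[1 * 3 * 224 * 224];                  // hypothetical input, filled by the caller
     void* input_datas[1] = { input };
     int dims[4] = {1, 3, 224, 224};
     bm_shape_t input_shapes[1];
     bmrt_shape(&input_shapes[0], dims, 4);

     void* output_datas[1] = { NULL };                // let the runtime allocate the output
     bm_shape_t output_shapes[1];

     if (bmrt_launch_data(p_bmrt, "net0", input_datas, input_shapes, 1,
                          output_datas, output_shapes, 1, false)) {
       // blocking call: output is ready here; output_shapes[0] holds the real output shape
       // ... consume output_datas[0] ...
       free(output_datas[0]);                         // release the runtime-allocated output
     }
*/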
/**
 * @name    bmrt_launch_tensor_multi_cores
 * @brief   To launch inference of the neuron network with the given input tensors, with support
 *          for multi-core inference.
 * @ingroup bmruntime
 *
 * This API supports neuron networks that are static-compiled or dynamic-compiled.
 * After calling this API, inference on the TPU is launched and the CPU program is not
 * blocked. bm_thread_sync_from_core should be called to make sure inference has finished.
 * This API supports multiple inputs and is thread-safe.
 *
 * @param [in]  p_bmrt         Bmruntime that had been created
 * @param [in]  net_name       The name of the neuron network
 * @param [in]  input_tensors  Array of input tensors, defined like bm_tensor_t input_tensors[input_num].
 *                             The user should initialize each input tensor.
 * @param [in]  input_num      Input number
 * @param [out] output_tensors Array of output tensors, defined like bm_tensor_t output_tensors[output_num].
 *                             The user can set the device_mem or stmode of the output tensors. If user_mem
 *                             is true, this interface uses the device memory of output_tensors to store the
 *                             output data and does not allocate device memory; otherwise it allocates device
 *                             memory to store the output. If user_stmode is true, the stmode in each output
 *                             tensor is used; otherwise stmode defaults to BM_STORE_1N.
 * @param [in]  output_num     Output number
 * @param [in]  user_mem       whether the device_mem of the output tensors is set
 * @param [in]  user_stmode    whether the stmode of the output tensors is set
 * @param [in]  core_list      list of core ids that will be used for inference
 * @param [in]  core_num       number of cores in the core list
 *
 * @retval true  Launch success.
 * @retval false Launch failed.
 */
DECL_EXPORT bool bmrt_launch_tensor_multi_cores(
    void *p_bmrt,
    const char *net_name,
    const bm_tensor_t input_tensors[],
    int input_num,
    bm_tensor_t output_tensors[],
    int output_num,
    bool user_mem,
    bool user_stmode,
    const int *core_list,
    int core_num);

/**
 * @name    bmrt_memcpy_s2d_parallel
 * @brief   To copy data from system memory to multi-device memory in parallel
 * @ingroup bmruntime
 *
 * This API can only be used when p_bmrt was created with bmrt_create_ex on multiple devices.
 * After calling this API, datas[:tensor_num[0]] will be copied to the first device,
 * datas[tensor_num[0]:tensor_num[0]+tensor_num[1]] will be copied to the second device, and so on.
 * Copies to different devices are done in parallel; copies to the same device are done in sequence.
 *
 * @param [in] p_bmrt     Bmruntime that had been created with multiple bm_handles
 * @param [in] tensors    Array of tensors that will be copied to the devices
 * @param [in] datas      Array of data pointers allocated in system memory
 * @param [in] tensor_num Array of tensor numbers that will be copied to each device
 * @param [in] device_num Device number
 */
DECL_EXPORT bool bmrt_memcpy_s2d_parallel(
    void *p_bmrt,
    bm_tensor_t tensors[],
    void *datas[],
    int tensor_num[],
    int device_num);

/**
 * @name    bmrt_memcpy_d2s_parallel
 * @brief   To copy data from multi-device memory to system memory in parallel
 * @ingroup bmruntime
 *
 * This API can only be used when p_bmrt was created with bmrt_create_ex on multiple devices.
 * After calling this API, tensors on the first device will be copied to datas[:tensor_num[0]],
 * tensors on the second device will be copied to datas[tensor_num[0]:tensor_num[0]+tensor_num[1]], and so on.
 * Copies from different devices are done in parallel; copies from the same device are done in sequence.
 *
 * @param [in] p_bmrt     Bmruntime that had been created with multiple bm_handles
 * @param [in] datas      Array of data pointers allocated in system memory
 * @param [in] tensors    Array of tensors that will be copied from the devices
 * @param [in] tensor_num Array of tensor numbers that will be copied from each device
 * @param [in] device_num Device number
 */
DECL_EXPORT bool bmrt_memcpy_d2s_parallel(
    void *p_bmrt,
    void *datas[],
    bm_tensor_t tensors[],
    int tensor_num[],
    int device_num);

#if defined (__cplusplus)
}
#endif

#endif  /* BMRUNTIME_INTERFACE_H_ */
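/* Usage sketch (illustrative only, not part of the API): the argument layout of the
   parallel copy helpers for a runtime created on two devices. The tensors, host
   buffers and per-device split below are assumptions for the example.

     // p_bmrt was created with bmrt_create_ex(handles, 2)
     bm_tensor_t tensors[3];                 // tensors[0..1] live on device 0, tensors[2] on device 1
     void* datas[3] = { in0, in1, in2 };     // host buffers, one per tensor
     int tensor_num[2] = { 2, 1 };           // 2 tensors for device 0, 1 tensor for device 1

     // datas[0:2] -> device 0, datas[2:3] -> device 1; the two devices are copied in parallel
     bmrt_memcpy_s2d_parallel(p_bmrt, tensors, datas, tensor_num, 2);

     // ... launch inference ...

     // copy results back: device 0 tensors -> datas[0:2], device 1 tensors -> datas[2:3]
     bmrt_memcpy_d2s_parallel(p_bmrt, datas, tensors, tensor_num, 2);
*/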