Commit cb5b341: release v1.2.0

Author: will.yang
Parent: 29a9fb9


44 files changed: +1249, −368 lines

CHANGELOG.md

Lines changed: 16 additions & 0 deletions

A new v1.2.0 section was added under the # CHANGELOG heading:

## v1.2.0

- Supports custom model conversion.
- Supports chat_template configuration.
- Enables multi-turn dialogue interactions.
- Implements automatic prompt cache reuse for improved inference efficiency.
- Expands maximum context length to 16K.
- Supports embedding flash storage to reduce memory usage.
- Introduces the GRQ Int4 quantization algorithm.
- Supports GPTQ-Int8 model conversion.
- Compatible with the RK3562 platform.
- Added support for visual multimodal models such as InternVL2, Janus, and Qwen2.5-VL.
- Supports CPU core configuration.
- Added support for Gemma3.
- Added support for Python 3.9/3.11/3.12.

## v1.1.0

- Support group-wise quantization (w4a16 group sizes of 32/64/128, w8a8 group sizes of 128/256/512).
- Support joint inference with LoRA model loading.
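The group-wise quantization entry above (w4a16 with group sizes of 32/64/128) can be illustrated with a small self-contained Python sketch. This shows only the generic per-group scaling idea, not RKLLM's actual implementation:

```python
# Illustrative sketch of group-wise 4-bit weight quantization (one scale per
# group of 32/64/128 weights). Generic example only, not the RKLLM code.

def quantize_w4_groupwise(weights, group_size=32):
    """Quantize a flat list of float weights to int4 ([-8, 7]) per group."""
    assert len(weights) % group_size == 0, "pad weights to a multiple of group_size"
    q, scales = [], []
    for start in range(0, len(weights), group_size):
        group = weights[start:start + group_size]
        scale = max(abs(w) for w in group) / 7.0 or 1.0  # avoid a zero scale
        scales.append(scale)
        q.extend(max(-8, min(7, round(w / scale))) for w in group)
    return q, scales

def dequantize(q, scales, group_size=32):
    """Reconstruct approximate floats: each int4 value times its group's scale."""
    return [q[i] * scales[i // group_size] for i in range(len(q))]

weights = [0.05 * ((i * 7) % 11 - 5) for i in range(64)]
q, scales = quantize_w4_groupwise(weights, group_size=32)
recon = dequantize(q, scales, group_size=32)
print("groups:", len(scales), "max error:", max(abs(a - b) for a, b in zip(weights, recon)))
```

Smaller groups track local weight ranges more closely at the cost of storing more scales, which is the trade-off behind the different `_g32`/`_g128`/`_g512` dtype variants.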

README.md

Lines changed: 84 additions & 56 deletions

Supported platforms (RK3562 Series added):

- RK3588 Series
- RK3576 Series
- RK3562 Series

# Support Models

The model list now splits the generic "Gemma models" entry into Gemma2 and Gemma3, names the exact Qwen2-VL and MiniCPM-V checkpoints, and adds three visual multimodal models:

- [x] [Qwen models](https://huggingface.co/models?search=Qwen/Qwen)
- [x] [Phi models](https://huggingface.co/models?search=microsoft/phi)
- [x] [ChatGLM3-6B](https://huggingface.co/THUDM/chatglm3-6b/tree/103caa40027ebfd8450289ca2f278eac4ff26405)
- [x] [Gemma2](https://huggingface.co/collections/google/gemma-2-release-667d6600fd5220e7b967f315)
- [x] [Gemma3](https://huggingface.co/collections/google/gemma-3-release-67c6c6f89c4f76621268bb6d)
- [x] [InternLM2 models](https://huggingface.co/collections/internlm/internlm2-65b0ce04970888799707893c)
- [x] [MiniCPM models](https://huggingface.co/collections/openbmb/minicpm-65d48bf958302b9fd25b698f)
- [x] [TeleChat models](https://huggingface.co/Tele-AI)
- [x] [Qwen2-VL-2B-Instruct](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct)
- [x] [MiniCPM-V-2_6](https://huggingface.co/openbmb/MiniCPM-V-2_6)
- [x] [DeepSeek-R1-Distill](https://huggingface.co/collections/deepseek-ai/deepseek-r1-678e1e131c0169c0bc89728d)
- [x] [Janus-Pro-1B](https://huggingface.co/deepseek-ai/Janus-Pro-1B)
- [x] [InternVL2-1B](https://huggingface.co/OpenGVLab/InternVL2-1B)
- [x] [Qwen2.5-VL-3B-Instruct](https://huggingface.co/Qwen/Qwen2.5-VL-3B-Instruct)

# Model Performance Benchmark

The LLM benchmark table was rebuilt with the platform column moved next to the model name, and rows for Qwen2-0.5B (including RK3562 results) were added:

| llm model      | platform | dtype      | seqlen | max_context | new_tokens | TTFT(ms) | Tokens/s | memory(G) |
| :------------- | :------: | :--------- | :----: | :---------: | :--------: | :------: | :------: | :-------: |
| Qwen2-0.5B     | RK3562   | w4a16_g128 | 64     | 320         | 256        | 524      | 5.67     | 0.39      |
|                | RK3562   | w4a8_g32   | 64     | 320         | 256        | 873      | 12.00    | 0.48      |
|                | RK3562   | w8a8       | 64     | 320         | 256        | 477      | 11.50    | 0.61      |
|                | RK3576   | w4a16      | 64     | 320         | 256        | 204      | 34.50    | 0.40      |
|                | RK3576   | w4a16_g128 | 64     | 320         | 256        | 212      | 32.40    | 0.40      |
|                | RK3588   | w8a8       | 64     | 320         | 256        | 79       | 41.50    | 0.62      |
|                | RK3588   | w8a8_g128  | 64     | 320         | 256        | 183      | 25.07    | 0.75      |
| TinyLLAMA-1.1B | RK3576   | w4a16      | 64     | 320         | 256        | 345      | 21.10    | 0.77      |
|                | RK3576   | w4a16_g128 | 64     | 320         | 256        | 410      | 18.50    | 0.80      |
|                | RK3588   | w8a8       | 64     | 320         | 256        | 140      | 24.21    | 1.25      |
|                | RK3588   | w8a8_g512  | 64     | 320         | 256        | 195      | 20.08    | 1.29      |
| Qwen2-1.5B     | RK3576   | w4a16      | 64     | 320         | 256        | 512      | 14.40    | 1.75      |
|                | RK3576   | w4a16_g128 | 64     | 320         | 256        | 550      | 12.75    | 1.76      |
|                | RK3588   | w8a8       | 64     | 320         | 256        | 206      | 16.46    | 2.47      |
|                | RK3588   | w8a8_g128  | 64     | 320         | 256        | 725      | 7.00     | 2.65      |
| Phi-3-3.8B     | RK3576   | w4a16      | 64     | 320         | 256        | 975      | 6.60     | 2.16      |
|                | RK3576   | w4a16_g128 | 64     | 320         | 256        | 1180     | 5.85     | 2.23      |
|                | RK3588   | w8a8       | 64     | 320         | 256        | 516      | 7.44     | 3.88      |
|                | RK3588   | w8a8_g512  | 64     | 320         | 256        | 610      | 6.13     | 3.95      |
| ChatGLM3-6B    | RK3576   | w4a16      | 64     | 320         | 256        | 1168     | 4.62     | 3.86      |
|                | RK3576   | w4a16_g128 | 64     | 320         | 256        | 1583     | 3.82     | 3.96      |
|                | RK3588   | w8a8       | 64     | 320         | 256        | 800      | 4.95     | 6.69      |
|                | RK3588   | w8a8_g128  | 64     | 320         | 256        | 2190     | 2.70     | 7.18      |
| Gemma2-2B      | RK3576   | w4a16      | 64     | 320         | 256        | 628      | 8.00     | 3.63      |
|                | RK3576   | w4a16_g128 | 64     | 320         | 256        | 776      | 7.40     | 3.63      |
|                | RK3588   | w8a8       | 64     | 320         | 256        | 342      | 9.67     | 4.84      |
|                | RK3588   | w8a8_g128  | 64     | 320         | 256        | 1055     | 5.49     | 5.14      |
| InternLM2-1.8B | RK3576   | w4a16      | 64     | 320         | 256        | 475      | 13.30    | 1.59      |
|                | RK3576   | w4a16_g128 | 64     | 320         | 256        | 572      | 11.95    | 1.62      |
|                | RK3588   | w8a8       | 64     | 320         | 256        | 206      | 15.66    | 2.38      |
|                | RK3588   | w8a8_g512  | 64     | 320         | 256        | 298      | 12.66    | 2.45      |
| MiniCPM3-4B    | RK3576   | w4a16      | 64     | 320         | 256        | 1397     | 4.80     | 2.70      |
|                | RK3576   | w4a16_g128 | 64     | 320         | 256        | 1645     | 4.39     | 2.80      |
|                | RK3588   | w8a8       | 64     | 320         | 256        | 702      | 6.15     | 4.65      |
|                | RK3588   | w8a8_g128  | 64     | 320         | 256        | 1691     | 3.42     | 5.06      |
| llama3-8B      | RK3576   | w4a16      | 64     | 320         | 256        | 1608     | 3.60     | 5.63      |
|                | RK3576   | w4a16_g128 | 64     | 320         | 256        | 2010     | 3.00     | 5.76      |
|                | RK3588   | w8a8       | 64     | 320         | 256        | 1128     | 3.79     | 9.21      |
|                | RK3588   | w8a8_g512  | 64     | 320         | 256        | 1281     | 3.05     | 9.45      |

| multimodal model | image input size | vision model dtype | vision infer time(s) | vision memory(MB) | llm model dtype | seqlen | max_context | new_tokens | TTFT(ms) | Tokens/s | llm memory(G) | platform |
| :--------------- | :--------------- | :----------------: | :------------------: | :---------------: | :-------------: | :----: | :---------: | :--------: | :------: | :------: | :-----------: | :------: |
| MiniCPM-V-2_6    | (1, 3, 448, 448) | fp16               | 2.40                 | 1031.30           | w4a16           | 128    | 256         | 128        | 2997.70  | 3.84     | 5.50          | RK3576   |
|                  |                  | fp16               | 3.27                 | 976.98            | w8a8            | 128    | 256         | 128        | 1720.60  | 4.13     | 8.88          | RK3588   |

- This performance data was collected with the CPU and NPU of each platform running at their maximum frequencies.
- The script for setting the frequencies is located in the scripts directory.
- The vision models were tested on all NPU cores with rknn-toolkit2 version 2.2.0.
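As a reading aid for the benchmark tables above, total generation latency can be roughly estimated from TTFT and the decode rate. A hedged sketch (it assumes the reported Tokens/s is a steady decode rate for every token after the first, which is only an approximation):

```python
# Rough end-to-end latency estimate from the benchmark columns
# (assumption: Tokens/s is the steady decode rate after the first token).

def estimated_total_ms(ttft_ms, tokens_per_s, new_tokens):
    """TTFT covers prefill plus the first token; the rest stream at the decode rate."""
    decode_ms = (new_tokens - 1) / tokens_per_s * 1000.0
    return ttft_ms + decode_ms

# Example using the Qwen2-1.5B / RK3576 / w4a16 row: TTFT 512 ms, 14.40 tokens/s, 256 tokens
total = estimated_total_ms(512, 14.40, 256)
print(f"estimated total: {total / 1000.0:.1f} s")
```

For that row the estimate comes out to roughly 18 seconds, dominated by decode time rather than TTFT.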

# Performance Testing Methods

1. Run the frequency-setting script from the `scripts` directory on the target platform.
2. Execute `export RKLLM_LOG_LEVEL=1` on the device to log model inference performance and memory usage.
3. Use the `eval_perf_watch_cpu.sh` script to measure CPU utilization.
4. Use the `eval_perf_watch_npu.sh` script to measure NPU utilization.
# Download

1. You can download the **latest package** from [RKLLM_SDK](https://console.zbox.filez.com/l/RJJDmB), fetch code: rkllm
Demo examples (a fourth demo was added):

1. Multimodal deployment demo: [Qwen2-VL-2B_Demo](https://github.com/airockchip/rknn-llm/tree/main/examples/Qwen2-VL-2B_Demo)
2. API usage demo: [DeepSeek-R1-Distill-Qwen-1.5B_Demo](https://github.com/airockchip/rknn-llm/tree/main/examples/DeepSeek-R1-Distill-Qwen-1.5B_Demo)
3. API server demo: [rkllm_server_demo](https://github.com/airockchip/rknn-llm/tree/main/examples/rkllm_server_demo)
4. Multimodal interactive dialogue demo: [Multimodal_Interactive_Dialogue_Demo](https://github.com/airockchip/rknn-llm/tree/main/examples/Multimodal_Interactive_Dialogue_Demo)

# Note

- The supported Python versions are:
  - Python 3.8
  - Python 3.9
  - Python 3.10
  - Python 3.11
  - Python 3.12

**Note: Before installing the package in a Python 3.12 environment, please run:**

```
export BUILD_CUDA_EXT=0
```

- On some platforms, you may encounter an error indicating that **libomp.so** cannot be found. To resolve this, locate the library in the corresponding cross-compilation toolchain and place it in the board's lib directory, at the same level as librkllmrt.so.
- Latest version: [v1.2.0](https://github.com/airockchip/rknn-llm/releases/tag/release-v1.2.0)

(The earlier note that v1.1 models are incompatible with older versions was removed in this commit.)

# RKNN Toolkit2

https://github.com/airockchip/rknn-toolkit2

# CHANGELOG

The v1.1.0 list in the README was replaced with the v1.2.0 list:

## v1.2.0

- Supports custom model conversion.
- Supports chat_template configuration.
- Enables multi-turn dialogue interactions.
- Implements automatic prompt cache reuse for improved inference efficiency.
- Expands maximum context length to 16K.
- Supports embedding flash storage to reduce memory usage.
- Introduces the GRQ Int4 quantization algorithm.
- Supports GPTQ-Int8 model conversion.
- Compatible with the RK3562 platform.
- Added support for visual multimodal models such as InternVL2, Janus, and Qwen2.5-VL.
- Supports CPU core configuration.
- Added support for Gemma3.
- Added support for Python 3.9/3.11/3.12.

For older versions, please refer to [CHANGELOG](CHANGELOG.md).
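The automatic prompt cache reuse item in the changelog above can be pictured with a short Python sketch. This illustrates only the general prefix-matching idea, not RKLLM's internals:

```python
# Illustrative sketch of automatic prompt-cache reuse (not the RKLLM internals):
# when a new prompt shares a token prefix with a cached one, only the suffix
# has to be prefilled, which is where the efficiency gain comes from.

def reusable_prefix_len(cached_tokens, new_tokens):
    """Length of the longest shared leading run of token ids."""
    n = 0
    for a, b in zip(cached_tokens, new_tokens):
        if a != b:
            break
        n += 1
    return n

cached = [101, 7592, 2088, 102]       # hypothetical token ids from a prior turn
prompt = [101, 7592, 2088, 999, 555]  # new request sharing a three-token prefix
reuse = reusable_prefix_len(cached, prompt)
print(f"reuse {reuse} cached positions, prefill {len(prompt) - reuse} new tokens")
# prints: reuse 3 cached positions, prefill 2 new tokens
```

In multi-turn chat the shared prefix (system prompt plus earlier turns) is typically long, so skipping its prefill directly reduces TTFT.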
Binary files changed (−1.46 MB, 3.24 MB, −1.28 MB, 3.89 MB); contents not shown.

examples/DeepSeek-R1-Distill-Qwen-1.5B_Demo/Readme.md

Lines changed: 10 additions & 4 deletions

## 1. Requirements

The requirements were bumped for the 1.2.0 release:

```
rkllm-toolkit==1.2.0
rkllm-runtime==1.2.0
python >= 3.8
```

In the deploy steps, a fixed-frequency script is now pushed to the device:

```
cd deploy
adb push install/demo_Linux_aarch64 /data
# push model file to device
adb push DeepSeek-R1-Distill-Qwen-1.5B.rkllm /data/demo_Linux_aarch64
# push the appropriate fixed-frequency script to the device
adb push ../../../scripts/fix_freq_rk3588.sh /data/demo_Linux_aarch64
```

### 2. Run Demo

The demo is now run after fixing frequencies and enabling performance logging (the `taskset f0` prefix on the invocation was removed):

```
adb shell
cd /data/demo_Linux_aarch64
# export lib path
export LD_LIBRARY_PATH=./lib
# Execute the fixed-frequency script
sh fix_freq_rk3588.sh
# Set the logging level for performance analysis
export RKLLM_LOG_LEVEL=1
./llm_demo /path/to/your/rkllm/model 2048 4096

# Running result
rkllm init start
```

examples/DeepSeek-R1-Distill-Qwen-1.5B_Demo/deploy/src/llm_demo.cpp

Lines changed: 21 additions & 9 deletions

The hard-coded DeepSeek prompt markers were removed in favor of the runtime chat template, flash-stored embeddings and single-turn mode were enabled, and a `clear` command was added to reset the KV cache (Chinese comments translated; indentation reconstructed from context):

```diff
@@ -21,8 +21,6 @@
 #include <csignal>
 #include <vector>
 
-#define PROMPT_TEXT_PREFIX "<|begin▁of▁sentence|><|User|>"
-#define PROMPT_TEXT_POSTFIX "<|Assistant|>"
 
 using namespace std;
 LLMHandle llmHandle = nullptr;
@@ -48,7 +46,7 @@ void callback(RKLLMResult *result, void *userdata, LLMCallState state)
         printf("\n");
     } else if (state == RKLLM_RUN_ERROR) {
         printf("\\run error\n");
-    } else if (state == RKLLM_RUN_GET_LAST_HIDDEN_LAYER) {
+    } else if (state == RKLLM_RUN_NORMAL) {
         /* When the GET_LAST_HIDDEN_LAYER feature is used, the callback returns the memory
            pointer last_hidden_layer, the token count num_tokens, and the hidden-layer size
            embd_size; the data in last_hidden_layer can be read through these three parameters.
@@ -66,7 +64,6 @@ void callback(RKLLMResult *result, void *userdata, LLMCallState state)
             std::cerr << "Failed to open the file for writing!" << std::endl;
         }
     }
-    } else if (state == RKLLM_RUN_NORMAL) {
         printf("%s", result->text);
     }
 }
@@ -97,6 +94,7 @@ int main(int argc, char **argv)
     param.max_context_len = std::atoi(argv[3]);
     param.skip_special_token = true;
     param.extend_param.base_domain_id = 0;
+    param.extend_param.embed_flash = 1;
 
     int ret = rkllm_init(&llmHandle, &param, callback);
     if (ret == 0){
@@ -118,7 +116,6 @@ int main(int argc, char **argv)
     cout << "\n*************************************************************************\n"
          << endl;
 
-    string text;
     RKLLMInput rkllm_input;
 
     // Initialize the infer parameter struct
@@ -158,7 +155,15 @@ int main(int argc, char **argv)
     // rkllm_load_prompt_cache(llmHandle, "./prompt_cache.bin"); // load the saved prompt cache
 
     rkllm_infer_params.mode = RKLLM_INFER_GENERATE;
-
+    // By default, the chat operates in single-turn mode (no context retention);
+    // 0 means no history is retained, each query is independent.
+    rkllm_infer_params.keep_history = 0;
+
+    // The model has a built-in chat template by default, which defines how prompts are
+    // formatted for conversation. Users can modify this template using this function to
+    // customize the system prompt, prefix, and postfix according to their needs.
+    rkllm_set_chat_template(llmHandle, "", "<|User|>", "<|Assistant|>");
+
     while (true)
     {
         std::string input_str;
@@ -169,6 +174,15 @@ int main(int argc, char **argv)
         {
             break;
         }
+        if (input_str == "clear")
+        {
+            ret = rkllm_clear_kv_cache(llmHandle, 1);
+            if (ret != 0)
+            {
+                printf("clear kv cache failed!\n");
+            }
+            continue;
+        }
         for (int i = 0; i < (int)pre_input.size(); i++)
         {
             if (input_str == to_string(i))
@@ -177,10 +191,8 @@ int main(int argc, char **argv)
             cout << input_str << endl;
         }
-        text = PROMPT_TEXT_PREFIX + input_str + PROMPT_TEXT_POSTFIX;
-        // text = input_str;
         rkllm_input.input_type = RKLLM_INPUT_PROMPT;
-        rkllm_input.prompt_input = (char *)text.c_str();
+        rkllm_input.prompt_input = (char *)input_str.c_str();
         printf("robot: ");
 
         // To use standard inference, set rkllm_infer_mode to RKLLM_INFER_GENERATE or leave it unset
```
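The `rkllm_set_chat_template(llmHandle, "", "<|User|>", "<|Assistant|>")` call in the demo above replaces the removed hard-coded prompt macros. A minimal Python sketch of what a (system, prefix, postfix) template does to user input; the exact concatenation inside the runtime is an assumption here:

```python
# Sketch of what a (system, prefix, postfix) chat template does to user input,
# mirroring the rkllm_set_chat_template(handle, "", "<|User|>", "<|Assistant|>")
# call in the demo. The runtime's exact concatenation is an assumption.

def apply_chat_template(user_text, system="", prefix="<|User|>", postfix="<|Assistant|>"):
    """Wrap raw user text with the configured system prompt, prefix, and postfix."""
    return f"{system}{prefix}{user_text}{postfix}"

print(apply_chat_template("Hello"))  # prints: <|User|>Hello<|Assistant|>
```

Because the template is applied inside the runtime, the demo can pass `input_str` directly as the prompt instead of wrapping it with `PROMPT_TEXT_PREFIX`/`PROMPT_TEXT_POSTFIX` in C++.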
