intel · lvliang-intel · Apr 17, 2026 · Apr 17, 2026 · Apr 17, 2026 · Apr 19, 2026
diff --git a/.azure-pipelines/build-auto-round-lib.yml b/.azure-pipelines/build-auto-round-lib.yml
@@ -0,0 +1,17 @@
+trigger:  # CI trigger: a push must match both the branch filter and the path filters below
+  branches:
+    include:
+      - main
+  paths:
+    include:
+      - auto_round_extension/ark/**  # anything under the ark extension directory
+      - .azure-pipelines/build-auto-round-lib.yml  # this pipeline definition itself
+      - .azure-pipelines/template/lib-build-template.yml  # the stage template referenced below
+    exclude:  # Markdown files are left out of the path match
+      - "*.md"
+      - "**/*.md"
+
+stages:
+  - template: template/lib-build-template.yml  # the stages are defined in this template file
+    parameters:
+      publishToTestPyPI: true  # NOTE(review): presumably makes the template upload to TestPyPI - confirm in the template
diff --git a/.azure-pipelines/code-scan.yml b/.azure-pipelines/code-scan.yml
@@ -24,19 +24,6 @@ variables:
   CODE_SCAN_LOG_PATH: ".azure-pipelines/scripts/codeScan/scanLog"
 
 stages:
-
-  - stage: BanditCodeScan
-    displayName: Bandit Code Scan
-    dependsOn: []
-    jobs:
-      - job: Bandit
-        displayName: Bandit
-        steps:
-          - template: template/code-scan-template.yml
-            parameters:
-              codeScanFileName: "bandit"
-              uploadPath: "bandit.log"
-
   - stage: PylintCodeScan
     displayName: Pylint Code Scan
     dependsOn: []

diff --git a/.azure-pipelines/docker/Dockerfile_xpu.devel b/.azure-pipelines/docker/Dockerfile_xpu.devel
@@ -17,7 +17,10 @@ RUN apt-get update && apt-get install -y --no-install-recommends --fix-missing \
     wget \
     bc \
     jq \
-    vim
+    vim \
+    python3.12 \
+    python3.12-dev \
+    python3-pip
 
 RUN apt-get install -y software-properties-common \
     && add-apt-repository -y ppa:kobuk-team/intel-graphics \

diff --git a/.azure-pipelines/scripts/codeScan/bandit/bandit.sh b/.azure-pipelines/scripts/codeScan/bandit/bandit.sh
diff --git a/.azure-pipelines/scripts/cuda_unit_test/monitor_gpu.py b/.azure-pipelines/scripts/cuda_unit_test/monitor_gpu.py
@@ -0,0 +1,170 @@
+# /// script
+# requires-python = ">=3.8"
+# dependencies = [
+#     "nvitop",
+#     "plotille",
+# ]
+# ///
+
+import math
+import os
+import sys
+import time
+
+import plotille
+from nvitop import Device
+
+SIGNAL_FILE = "stop_monitor.flag"
+DATA_FILE = "gpu_metrics.csv"
+
+
+def run_daemon():
+    """Sample GPU 0 every 5 seconds and stream the readings to ``DATA_FILE``.
+
+    Any stale ``SIGNAL_FILE`` / ``DATA_FILE`` is removed first. One
+    ``elapsed_sec,gpu_util_pct,mem_used_gb`` row is then written and flushed
+    per sample until ``SIGNAL_FILE`` appears in the working directory.
+
+    If the device cannot be opened, the error is printed and the process exits
+    with status 0 (NOTE(review): presumably so that a missing GPU does not
+    fail the surrounding job - confirm).
+    """
+    for stale_path in (SIGNAL_FILE, DATA_FILE):
+        try:
+            os.remove(stale_path)
+        except FileNotFoundError:
+            pass
+
+    print("GPU Monitor Daemon started.")
+
+    try:
+        device = Device(0)  # Monitor GPU 0; adjust if multiple GPUs are present
+    except Exception as e:
+        print(f"NVML error: {e}")
+        sys.exit(0)
+
+    sample_interval_sec = 5
+    # Monotonic clock: elapsed time must not jump if the wall clock is adjusted.
+    start_time = time.monotonic()
+    last_error = None
+    print("Daemon is running. Streaming metrics to CSV every 5 seconds...")
+
+    with open(DATA_FILE, "w") as f:
+        f.write("elapsed_sec,gpu_util_pct,mem_used_gb\n")
+
+        while not os.path.exists(SIGNAL_FILE):
+            try:
+                elapsed_sec = int(time.monotonic() - start_time)
+                util = device.gpu_utilization()
+                mem_bytes = device.memory_used()
+
+                # Only real numbers go into the CSV. None, and any non-numeric
+                # placeholder the library may return for an unavailable
+                # reading (NOTE(review): confirm against the nvitop docs),
+                # are recorded as zero so every row stays parseable.
+                util_val = int(util) if isinstance(util, (int, float)) else 0
+                if isinstance(mem_bytes, (int, float)):
+                    mem_gb = round(mem_bytes / (1024**3), 2)
+                else:
+                    mem_gb = 0.0
+
+                f.write(f"{elapsed_sec},{util_val},{mem_gb}\n")
+                f.flush()
+                last_error = None
+            except Exception as e:
+                # Monitoring is best-effort: keep running, but report the
+                # failure once per distinct message instead of hiding it or
+                # repeating it on every sample.
+                message = f"{type(e).__name__}: {e}"
+                if message != last_error:
+                    print(f"Failed to sample GPU metrics: {message}", file=sys.stderr)
+                    last_error = message
+
+            # Sleep in one-second slices so the stop flag is noticed promptly;
+            # stop_and_plot() only waits two seconds before reading the file.
+            for _ in range(sample_interval_sec):
+                if os.path.exists(SIGNAL_FILE):
+                    break
+                time.sleep(1.0)
+
+    print("Stop signal received. Daemon exiting.")
+
+
+def stop_and_plot():
+    """Signal the daemon to stop, then chart the metrics it recorded.
+
+    Creates ``SIGNAL_FILE``, waits two seconds, reads ``DATA_FILE`` and prints
+    two terminal charts through ``plotille``: GPU utilization and GPU memory
+    used. Rows that cannot be parsed as numbers are skipped (and counted)
+    rather than aborting the whole report.
+
+    Exits with status 1 when ``DATA_FILE`` does not exist; returns without
+    plotting when the file holds no usable rows.
+    """
+    with open(SIGNAL_FILE, "w") as f:
+        f.write("STOP")
+
+    # Brief pause so the daemon has a chance to see the flag before we read.
+    time.sleep(2)
+
+    if not os.path.exists(DATA_FILE):
+        print("Error: Could not find GPU metrics data file.")
+        sys.exit(1)
+
+    timestamps_sec = []
+    gpu_util = []
+    mem_gb = []
+    skipped_rows = 0
+
+    with open(DATA_FILE, "r") as f:
+        next(f, None)  # skip header
+        for line in f:
+            row = line.strip()
+            if not row:
+                continue
+            try:
+                sec_text, util_text, mem_text = row.split(",")
+                sec, util, mem = int(sec_text), int(util_text), float(mem_text)
+            except ValueError:
+                # Wrong column count or a non-numeric cell; one bad row must
+                # not discard every other sample.
+                skipped_rows += 1
+                continue
+            if not math.isfinite(mem):
+                # NaN/inf would make the axis-limit arithmetic below fail.
+                skipped_rows += 1
+                continue
+            timestamps_sec.append(sec)
+            gpu_util.append(util)
+            mem_gb.append(mem)
+
+    if skipped_rows:
+        print(f"Skipped {skipped_rows} unreadable row(s) in {DATA_FILE}.")
+
+    if not timestamps_sec:
+        print("No valid data to plot.")
+        return
+
+    # --- Dynamic time unit selection ---
+    max_sec = max(timestamps_sec)
+    if max_sec < 300:
+        x_data = timestamps_sec
+        x_label = "Time (Seconds)"
+    elif max_sec < 7200:
+        x_data = [round(t / 60.0, 2) for t in timestamps_sec]
+        x_label = "Time (Minutes)"
+    else:
+        x_data = [round(t / 3600.0, 2) for t in timestamps_sec]
+        x_label = "Time (Hours)"
+
+    # --- Compute clean axis upper limits ---
+    # SCALE is both the chart height passed to plotille and the multiplier for
+    # the y limits, so the utilization axis spans 0..SCALE * 10 (0..110).
+    SCALE = 11
+
+    peak_x = max(x_data)
+    peak_mem = max(mem_gb)
+    max_x = peak_x if peak_x > 0 else 1
+    max_mem = peak_mem if peak_mem > 0 else 1
+
+    x_lim = math.ceil(max_x / 8.0) * 8
+    y_lim_mem = math.ceil(max_mem / 10.0) * SCALE
+
+    # ==========================================
+    # Workaround for Boundary Clipping:
+    # Nudge values that sit exactly on the axis limit slightly inward
+    # so plotille does not clip them at the canvas edge.
+    # ==========================================
+    safe_x_data = [min(x, x_lim - 0.0001) for x in x_data]
+    safe_mem_gb = [min(m, y_lim_mem - 0.001) for m in mem_gb]
+
+    # (banner title, y-axis label, y values, y-axis maximum, name used in errors)
+    charts = (
+        (" GPU Utilization (%) ", "GPU Util (%)", gpu_util, SCALE * 10, "GPU Utilization"),
+        (" GPU Memory Used (GB) ", "Memory (GB)", safe_mem_gb, y_lim_mem, "GPU Memory"),
+    )
+
+    for banner, y_label, y_data, y_max, chart_name in charts:
+        print("\n" + "=" * 35 + banner + "=" * 35)
+        try:
+            print(
+                plotille.plot(
+                    safe_x_data,
+                    y_data,
+                    height=SCALE,
+                    width=80,
+                    X_label=x_label,
+                    Y_label=y_label,
+                    x_min=0,
+                    x_max=x_lim,
+                    y_min=0,
+                    y_max=y_max,
+                    interp="linear",
+                )
+            )
+        except Exception as e:
+            # A failure in one chart should not prevent the other from printing.
+            print(f"Failed to plot {chart_name}: {e}")
+
+
+if __name__ == "__main__":
+    # Command-line entry point: the first argument selects the sub-command.
+    commands = {"daemon": run_daemon, "stop": stop_and_plot}
+    usage = "Usage: python monitor_gpu.py [daemon|stop]"
+
+    if len(sys.argv) < 2:
+        print(usage)
+        sys.exit(1)
+
+    handler = commands.get(sys.argv[1])
+    if handler is None:
+        # An unrecognised sub-command is a usage error as well: exit non-zero
+        # so a mistyped invocation is not reported as success.
+        print("Unknown command.")
+        print(usage)
+        sys.exit(1)
+
+    handler()
diff --git a/.azure-pipelines/scripts/cuda_unit_test/run_cuda_ut.sh b/.azure-pipelines/scripts/cuda_unit_test/run_cuda_ut.sh
@@ -62,8 +62,7 @@ function run_unit_test() {
     # install unit test dependencies
     echo "##[group]set up UT env..."
     cd "${BUILD_SOURCESDIRECTORY}" || exit 1
-    uv pip install https://github.com/XuehaoSun/GPTQModel/releases/download/v5.8.0/gptqmodel-5.8.0+cu128torch2.11-cp312-cp312-linux_x86_64.whl
-    uv pip install -r https://raw.githubusercontent.com/ModelCloud/GPTQModel/refs/tags/v5.8.0/requirements.txt
+    uv pip install torch==2.11.0 torchvision --index-url https://download.pytorch.org/whl/cu128
     uv pip install https://github.com/XuehaoSun/llama-cpp-python/releases/download/v0.3.16/llama_cpp_python-0.3.16-cp312-cp312-linux_x86_64.whl
     uv pip install 'git+https://github.com/ggml-org/llama.cpp.git#subdirectory=gguf-py'
     uv pip install -r test/test_cuda/requirements.txt
@@ -112,7 +111,10 @@ function run_unit_test_llmc() {
     rm -rf /root/.venv
     uv venv --python=3.12 /root/.venv
     uv pip install -U pytest-cov pytest-html
-    BUILD_TYPE="nightly" uv pip install -r test/test_cuda/requirements_llmc.txt --extra-index-url https://download.pytorch.org/whl/cu128 --index-strategy unsafe-best-match
+    BUILD_TYPE="nightly" uv pip install \
+        -r test/test_cuda/requirements_llmc.txt \
+        --extra-index-url https://download.pytorch.org/whl/cu130 \
+        --index-strategy unsafe-best-match
     uv pip install .
     uv pip list
     echo "##[endgroup]"
@@ -138,7 +140,10 @@ function run_unit_test_sglang() {
     rm -rf /root/.venv
     uv venv --python=3.12 /root/.venv
     uv pip install -U pytest-cov pytest-html
-    uv pip install -r test/test_cuda/requirements_sglang.txt
+    uv pip install -r test/test_cuda/requirements_sglang.txt \
+        --prerelease=allow \
+        --extra-index-url https://download.pytorch.org/whl/cu130 \
+        --index-strategy unsafe-best-match
     uv pip install .
     uv pip list
     echo "##[endgroup]"
@@ -163,7 +168,10 @@ function run_unit_test_vllm() {
     rm -rf /root/.venv
     uv venv --python=3.12 /root/.venv
     uv pip install -U pytest-cov pytest-html
-    uv pip install -r test/test_cuda/requirements_vllm.txt --extra-index-url https://download.pytorch.org/whl/cu128 --index-strategy unsafe-best-match
+    vllm_version=$(curl -s https://api.github.com/repos/vllm-project/vllm/releases/latest | jq -r .tag_name | sed 's/^v//')
+    uv pip install -r test/test_cuda/requirements_vllm.txt \
+        --extra-index-url https://download.pytorch.org/whl/cu130 \
+        --index-strategy unsafe-best-match
     uv pip install .
     uv pip list
     echo "##[endgroup]"

diff --git a/.azure-pipelines/scripts/cuda_unit_test/runpod_manager.py b/.azure-pipelines/scripts/cuda_unit_test/runpod_manager.py
@@ -11,6 +11,7 @@
     "NVIDIA RTX 6000 Ada Generation",
     "NVIDIA L40S",
     "NVIDIA RTX PRO 6000 Blackwell Server Edition",
+    "NVIDIA RTX PRO 6000 Blackwell Workstation Edition",
 ]
 DATA_CENTER_IDS = [
     "AP-JP-1",
@@ -40,10 +41,11 @@
     "US-TX-4",
     "US-WA-1",
 ]
-DATA_CENTER_BAN_LIST = ["EUR-IS-2", "US-IL-1"]
+DATA_CENTER_BAN_LIST = ["EUR-IS-1", "EUR-IS-2", "US-IL-1"]
 DATA_CENTER_SELECT_LIST = [dc for dc in DATA_CENTER_IDS if dc not in DATA_CENTER_BAN_LIST]
 REQUIRED_COUNT = 1
 IMAGES_NAME = "xuehaosu/azure-agent:v0.1"
+CUDA_VERSIONS = ["13.0", "12.9", "12.8", "12.7", "12.6", "12.5", "12.4", "12.3", "12.2", "12.1", "12.0", "11.8"]
 
 
 def run_create_pod(api_key, payload):
@@ -89,6 +91,7 @@ def create_pod(args):
         env_dict = {kv.split("=", 1)[0]: kv.split("=", 1)[1] for kv in args.env}
 
     payload = {
+        "allowedCudaVersions": [args.cuda_version] if (args.cuda_version and args.cuda_version in CUDA_VERSIONS) else CUDA_VERSIONS,
         "cloudType": "SECURE",
         "containerDiskInGb": args.container_disk_size,
         "dataCenterIds": DATA_CENTER_SELECT_LIST,
@@ -195,6 +198,7 @@ def main():
     parser.add_argument("--gpu_count", type=int, default=1)
     parser.add_argument("--container_disk_size", type=int, default=50)
     parser.add_argument("--part", type=int, default=0)
+    parser.add_argument("--cuda_version", help="CUDA version for the pod")
     parser.add_argument("--env", nargs="*", help="Environment variables in KEY=VALUE format")
 
     args = parser.parse_args()

diff --git a/.azure-pipelines/scripts/ut/collect_result.py b/.azure-pipelines/scripts/ut/collect_result.py
@@ -114,8 +114,12 @@ def _determine_status(self, content: str) -> TestStatus:
         return TestStatus.NO_TESTS
 
     def _extract_duration(self, content: str) -> str:
-        if match := self.TIME_PATTERN.search(content):
-            total_seconds = int(float(match.group(1)))
+        last_match = None
+        for match in self.TIME_PATTERN.finditer(content):
+            last_match = match
+
+        if last_match:
+            total_seconds = int(float(last_match.group(1)))
             hours, remainder = divmod(total_seconds, 3600)
             minutes, seconds = divmod(remainder, 60)
             return f"{hours:02d}:{minutes:02d}:{seconds:02d}"