llama.cpp + Qwen3.6-35B-A3B + Qwen Code Companion

export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda/lib64
export CUDA_HOME=/usr/local/cuda/
export PATH=$PATH:/usr/local/cuda/bin

# 3080Ti
cmake -B build -DGGML_CUDA=ON -DCMAKE_CUDA_ARCHITECTURES="86"
# 5090D (Blackwell, compute capability 12.0; needs CUDA 12.8+)
cmake -B build -DGGML_CUDA=ON -DCMAKE_CUDA_ARCHITECTURES="120"

cmake --build ./build/ --config Release -j 8

cd ./build/bin/
./llama-server -h

./llama-server \
  -m /data/ornith-1.0-9b-Q8_0.gguf \
  --host 0.0.0.0 \
  --port 8099 \
  --flash-attn on \
  --repeat-penalty 1.1 \
  --reasoning-preserve \
  --temperature 0.6 \
  --top-p 0.95 \
  --top-k 20 \
  -ctk q8_0 \
  -ctv q8_0 \
  -c 262144
存在死循环的情况


./llama-server -m /data/Qwen3.6-35B-A3B-APEX-I-Quality.gguf \
  --host 0.0.0.0       \
  --port 8099          \
  --flash-attn on      \
  --repeat-penalty 1.1 \
  --reasoning-preserve \
  --temperature 0.6    \
  --top-p 0.95         \
  --top-k 20           \
  -ctk q8_0            \
  -ctv q8_0            \
  --split-mode layer   \
  -b 2048              \
  --no-mmap            \
  -c 262144 --n-cpu-moe 32 -ub 2048

256K上下文 -c 262144 --n-cpu-moe 30 -ub 512
3080Ti  推理409t/s,推理耗时52.64s,输出38t/s
256K上下文 -c 262144 --n-cpu-moe 32 -ub 2048
3080Ti  推理1203t/s,推理耗时19.96s,输出37t/s

192K上下文 -c 196608 --n-cpu-moe 29 -ub 512
3080Ti  推理419t/s,推理耗时51.34s,输出39t/s
192K上下文 -c 196608 --n-cpu-moe 30 -ub 2048
3080Ti  推理1285t/s,推理耗时18.89s,输出39t/s

160K上下文 -c 163840 --n-cpu-moe 27 -ub 512
3080Ti  推理442t/s,推理耗时48.84s,输出40t/s
160K上下文 -c 163840 --n-cpu-moe 29 -ub 2048
3080Ti  推理1314t/s,推理耗时18.52s,输出38t/s

128K上下文 -c 131072 --n-cpu-moe 26 -ub 512
3080Ti  推理456t/s,推理耗时47.56s,输出42t/s
128K上下文 -c 131072 --n-cpu-moe 27 -ub 2048
3080Ti  推理1377t/s,推理耗时17.68s,输出40t/s

5090D 32G:
./llama-server -m /data/Qwen3.6-35B-A3B-APEX-I-Balanced.gguf \
  --host 0.0.0.0       \
  --port 8099          \
  --flash-attn on      \
  --repeat-penalty 1.1 \
  --reasoning-preserve \
  --temperature 0.6    \
  --top-p 0.95         \
  --top-k 20           \
  -ctk q8_0            \
  -ctv q8_0            \
  --split-mode layer   \
  -b 2048              \
  --no-mmap            \
  -c 262144 --n-cpu-moe 0 -ub 2048
推理9138t/s,推理耗时0.5s,输出163t/s

设置Qt路径:
CMakeLists.txt
set(CMAKE_PREFIX_PATH "D:/Qt/Qt5.10.1/5.10.1/msvc2015")
set(Qt5_DIR             "${CMAKE_PREFIX_PATH}/lib/cmake/Qt5")
set(Qt5Widgets_DIR      "${CMAKE_PREFIX_PATH}/lib/cmake/Qt5Widgets")
set(Qt5Gui_DIR          "${CMAKE_PREFIX_PATH}/lib/cmake/Qt5Gui")
set(Qt5Core_DIR         "${CMAKE_PREFIX_PATH}/lib/cmake/Qt5Core")
set(Qt5Network_DIR         "${CMAKE_PREFIX_PATH}/lib/cmake/Qt5Network")
set(Qt5LinguistTools_DIR         "${CMAKE_PREFIX_PATH}/lib/cmake/Qt5LinguistTools")
set(QT_QMAKE_EXECUTABLE "${CMAKE_PREFIX_PATH}/bin/qmake.exe")

解决环境问题:
C:\Program Files (x86)\MSBuild\Microsoft.Cpp\v4.0\V140\Platforms\Win32\Platform.Default.props
Line 26:
v110改为v140
<PlatformToolset ...>v140</PlatformToolset>

添加编译类型:
.vscode/cmake-variants.json
{
  "buildType": {
    "default": "debug",
    "description": "The type of build",
    "choices": {
      "debug": {
        "short": "Debug",
        "long": "Emit debug information without optimizing",
        "buildType": "Debug"
      },
      "release": {
        "short": "Release",
        "long": "Optimize for speed without debug information",
        "buildType": "Release"
      },
      "release_debug": {
        "short": "Release_Debug",
        "long": "Optimize with optimization and debug symbols for profiling",
        "buildType": "RelWithDebInfo"
      }
    }
  }
}