diff --git a/.cargo/config.toml b/.cargo/config.toml
new file mode 100644
index 0000000..6c00888
--- /dev/null
+++ b/.cargo/config.toml
@@ -0,0 +1,2 @@
+[target.x86_64-pc-windows-msvc]
+linker = "lld-link"
diff --git a/.gitattributes b/.gitattributes
index 2bffe04..0db1056 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -5,4 +5,6 @@
# Ignore binary files >:(
*.png binary
-*.jpg binary
\ No newline at end of file
+*.jpg binary
+*.ico binary
+*.icns binary
\ No newline at end of file
diff --git a/.gitea/issue_template/bug_report.yaml b/.gitea/issue_template/bug_report.yaml
index bf17745..24ef890 100644
--- a/.gitea/issue_template/bug_report.yaml
+++ b/.gitea/issue_template/bug_report.yaml
@@ -1,6 +1,6 @@
name: 🐛 Bug Report
description: Something isn't working as expected? Let us know!
-title: '[BUG] - '
+title: "[BUG] - "
labels:
- "status/awaiting triage"
body:
@@ -50,7 +50,7 @@ body:
description: The operating system you are using, including the version/build number.
validations:
required: true
-# Remove this section for non-web apps.
+ # Remove this section for non-web apps.
- type: input
id: browser
attributes:
@@ -66,4 +66,3 @@ body:
- No
validations:
required: true
-
diff --git a/.gitea/issue_template/config.yml b/.gitea/issue_template/config.yml
index f1cdc86..268aa3e 100644
--- a/.gitea/issue_template/config.yml
+++ b/.gitea/issue_template/config.yml
@@ -2,4 +2,4 @@ blank_issues_enabled: false
contact_links:
- name: "Discord"
url: "https://chat.nhcarrigan.com"
- about: "Chat with us directly."
\ No newline at end of file
+ about: "Chat with us directly."
diff --git a/.gitea/issue_template/feature_proposal.yml b/.gitea/issue_template/feature_proposal.yml
index b3fae97..37af46c 100644
--- a/.gitea/issue_template/feature_proposal.yml
+++ b/.gitea/issue_template/feature_proposal.yml
@@ -1,6 +1,6 @@
name: 💭 Feature Proposal
description: Have an idea for how we can improve? Share it here!
-title: '[FEAT] - '
+title: "[FEAT] - "
labels:
- "status/awaiting triage"
body:
diff --git a/.gitea/issue_template/other.yml b/.gitea/issue_template/other.yml
index 2f1335f..51ae0cd 100644
--- a/.gitea/issue_template/other.yml
+++ b/.gitea/issue_template/other.yml
@@ -1,6 +1,6 @@
name: ❓ Other Issue
description: I have something that is neither a bug nor a feature request.
-title: '[OTHER] - '
+title: "[OTHER] - "
labels:
- "status/awaiting triage"
body:
diff --git a/.gitea/workflows/ci.yml b/.gitea/workflows/ci.yml
new file mode 100644
index 0000000..e98af87
--- /dev/null
+++ b/.gitea/workflows/ci.yml
@@ -0,0 +1,201 @@
+name: CI
+
+on:
+ push:
+ branches: [main]
+ pull_request:
+ branches: [main]
+ workflow_dispatch:
+
+jobs:
+ lint-and-test:
+ name: Lint & Test
+ runs-on: ubuntu-latest
+
+ steps:
+ - name: Checkout code
+ uses: actions/checkout@v4
+
+ - name: Install Linux dependencies
+ run: |
+ sudo apt-get update
+ sudo apt-get install -y \
+ libwebkit2gtk-4.1-dev \
+ librsvg2-dev \
+ patchelf \
+ libgtk-3-dev \
+ libayatana-appindicator3-dev \
+ libasound2-dev \
+ pkg-config \
+ libclang-dev \
+ cmake
+
+ - name: Setup pnpm
+ uses: pnpm/action-setup@v4
+ with:
+ version: 9
+
+ - name: Setup Node.js
+ uses: actions/setup-node@v4
+ with:
+ node-version: 22
+ cache: pnpm
+
+ - name: Install frontend dependencies
+ run: pnpm install
+
+ - name: Run ESLint
+ run: pnpm lint
+
+ - name: Run Prettier check
+ run: pnpm format:check
+
+ - name: Build frontend
+ run: pnpm build
+
+ - name: Run frontend tests
+ run: pnpm test
+
+ - name: Setup Rust
+ uses: dtolnay/rust-toolchain@stable
+ with:
+ components: clippy
+
+ - name: Cache Rust dependencies
+ uses: actions/cache@v4
+ with:
+ path: |
+ ~/.cargo/bin/
+ ~/.cargo/registry/index/
+ ~/.cargo/registry/cache/
+ ~/.cargo/git/db/
+ src-tauri/target/
+ key: ${{ runner.os }}-cargo-${{ hashFiles('**/Cargo.lock') }}
+
+ - name: Run Clippy
+ working-directory: src-tauri
+ run: cargo clippy --all-targets --all-features -- -D warnings
+
+ - name: Run Rust tests
+ working-directory: src-tauri
+ run: cargo test
+
+ build-linux:
+ name: Build Linux
+ runs-on: ubuntu-latest
+ needs: lint-and-test
+
+ steps:
+ - name: Checkout code
+ uses: actions/checkout@v4
+
+ - name: Install Linux dependencies
+ run: |
+ sudo apt-get update
+ sudo apt-get install -y \
+ libwebkit2gtk-4.1-dev \
+ librsvg2-dev \
+ patchelf \
+ libgtk-3-dev \
+ libayatana-appindicator3-dev \
+ libasound2-dev \
+ pkg-config \
+ libclang-dev \
+ cmake \
+ xdg-utils
+
+ - name: Setup pnpm
+ uses: pnpm/action-setup@v4
+ with:
+ version: 9
+
+ - name: Setup Node.js
+ uses: actions/setup-node@v4
+ with:
+ node-version: 22
+ cache: pnpm
+
+ - name: Install frontend dependencies
+ run: pnpm install
+
+ - name: Setup Rust
+ uses: dtolnay/rust-toolchain@stable
+
+ - name: Cache Rust dependencies
+ uses: actions/cache@v4
+ with:
+ path: |
+ ~/.cargo/bin/
+ ~/.cargo/registry/index/
+ ~/.cargo/registry/cache/
+ ~/.cargo/git/db/
+ src-tauri/target/
+ key: ${{ runner.os }}-cargo-${{ hashFiles('**/Cargo.lock') }}
+
+ - name: Build Linux
+ run: pnpm build:linux
+
+ build-windows:
+ name: Build Windows (cross-compile)
+ runs-on: ubuntu-latest
+ needs: lint-and-test
+
+ steps:
+ - name: Checkout code
+ uses: actions/checkout@v4
+
+ - name: Install Linux dependencies for cross-compilation
+ run: |
+ sudo apt-get update
+ sudo apt-get install -y \
+ libwebkit2gtk-4.1-dev \
+ librsvg2-dev \
+ patchelf \
+ libgtk-3-dev \
+ libayatana-appindicator3-dev \
+ libasound2-dev \
+ pkg-config \
+ libclang-dev \
+ cmake \
+ clang \
+ lld \
+ llvm \
+ nsis
+
+ - name: Setup pnpm
+ uses: pnpm/action-setup@v4
+ with:
+ version: 9
+
+ - name: Setup Node.js
+ uses: actions/setup-node@v4
+ with:
+ node-version: 22
+ cache: pnpm
+
+ - name: Install frontend dependencies
+ run: pnpm install
+
+ - name: Setup Rust
+ uses: dtolnay/rust-toolchain@stable
+ with:
+ targets: x86_64-pc-windows-msvc
+
+ - name: Install cargo-xwin
+ run: |
+ curl -fsSL https://github.com/rust-cross/cargo-xwin/releases/download/v0.20.2/cargo-xwin-v0.20.2.x86_64-unknown-linux-musl.tar.gz | tar xz
+ sudo mv cargo-xwin /usr/local/bin/
+
+ - name: Cache Rust dependencies
+ uses: actions/cache@v4
+ with:
+ path: |
+ ~/.cargo/bin/
+ ~/.cargo/registry/index/
+ ~/.cargo/registry/cache/
+ ~/.cargo/git/db/
+ src-tauri/target/
+ key: ${{ runner.os }}-cargo-windows-${{ hashFiles('**/Cargo.lock') }}
+
+ - name: Build Windows
+ run: pnpm build:windows
diff --git a/.gitea/workflows/security.yml b/.gitea/workflows/security.yml
index 3169f83..e7a3807 100644
--- a/.gitea/workflows/security.yml
+++ b/.gitea/workflows/security.yml
@@ -2,11 +2,11 @@ name: Security Scan and Upload
on:
push:
- branches: [ main ]
+ branches: [main]
pull_request:
- branches: [ main ]
+ branches: [main]
schedule:
- - cron: '0 0 * * 1'
+ - cron: "0 0 * * 1"
workflow_dispatch:
jobs:
@@ -24,18 +24,18 @@ jobs:
env:
DD_URL: ${{ secrets.DD_URL }}
DD_TOKEN: ${{ secrets.DD_TOKEN }}
- PRODUCT_NAME: ${{ github.repository }}
- PRODUCT_TYPE_ID: 1
+ PRODUCT_NAME: ${{ github.repository }}
+ PRODUCT_TYPE_ID: 1
run: |
sudo apt-get install jq -y > /dev/null
-
+
echo "Checking connection to $DD_URL..."
-
+
# Check if product exists - capture HTTP code to debug connection issues
RESPONSE=$(curl --write-out "%{http_code}" --silent --output /tmp/response.json \
-H "Authorization: Token $DD_TOKEN" \
"$DD_URL/api/v2/products/?name=$PRODUCT_NAME")
-
+
# If response is not 200, print error
if [ "$RESPONSE" != "200" ]; then
echo "::error::Failed to query DefectDojo. HTTP Code: $RESPONSE"
@@ -44,7 +44,7 @@ jobs:
fi
COUNT=$(cat /tmp/response.json | jq -r '.count')
-
+
if [ "$COUNT" = "0" ]; then
echo "Creating product '$PRODUCT_NAME'..."
curl -s -X POST "$DD_URL/api/v2/products/" \
@@ -75,7 +75,7 @@ jobs:
echo "Uploading Trivy results..."
# Generate today's date in YYYY-MM-DD format
TODAY=$(date +%Y-%m-%d)
-
+
HTTP_CODE=$(curl --write-out "%{http_code}" --output response.txt --silent -X POST "$DD_URL/api/v2/import-scan/" \
-H "Authorization: Token $DD_TOKEN" \
-F "active=true" \
@@ -86,7 +86,7 @@ jobs:
-F "scan_date=$TODAY" \
-F "auto_create_context=true" \
-F "file=@trivy-results.json")
-
+
if [[ "$HTTP_CODE" != "200" && "$HTTP_CODE" != "201" ]]; then
echo "::error::Upload Failed with HTTP $HTTP_CODE"
echo "--- SERVER RESPONSE ---"
@@ -154,7 +154,7 @@ jobs:
run: |
echo "Uploading Semgrep results..."
TODAY=$(date +%Y-%m-%d)
-
+
HTTP_CODE=$(curl --write-out "%{http_code}" --output response.txt --silent -X POST "$DD_URL/api/v2/import-scan/" \
-H "Authorization: Token $DD_TOKEN" \
-F "active=true" \
@@ -174,4 +174,4 @@ jobs:
exit 1
else
echo "Upload Success!"
- fi
\ No newline at end of file
+ fi
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..97f284b
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,61 @@
+# Logs
+logs
+*.log
+npm-debug.log*
+yarn-debug.log*
+yarn-error.log*
+pnpm-debug.log*
+lerna-debug.log*
+
+node_modules
+dist
+dist-ssr
+*.local
+
+# Editor directories and files
+.vscode/*
+!.vscode/extensions.json
+.idea
+.DS_Store
+*.suo
+*.ntvs*
+*.njsproj
+*.sln
+*.sw?
+
+# Python
+__pycache__/
+*.py[cod]
+*$py.class
+*.so
+.Python
+env/
+venv/
+ENV/
+.venv/
+*.egg-info/
+
+# Models - large ML model files
+models/
+src/pretrained_models/
+src-tauri/resources/models/
+*.gguf
+*.bin
+
+# Tauri
+src-tauri/target/
+src-tauri/WixTools/
+src-tauri/resources/
+
+# Build outputs
+build/
+
+# App data
+recordings/
+transcripts/
+summaries/
+
+# Environment
+.env
+*.env.local
+prod.env
diff --git a/.prettierignore b/.prettierignore
new file mode 100644
index 0000000..582afc9
--- /dev/null
+++ b/.prettierignore
@@ -0,0 +1,8 @@
+build/
+.svelte-kit/
+dist/
+src-tauri/target/
+src-tauri/gen/
+node_modules/
+.pnpm-store/
+pnpm-lock.yaml
diff --git a/.prettierrc b/.prettierrc
new file mode 100644
index 0000000..1a88ab1
--- /dev/null
+++ b/.prettierrc
@@ -0,0 +1,7 @@
+{
+ "semi": true,
+ "singleQuote": false,
+ "tabWidth": 2,
+ "trailingComma": "es5",
+ "printWidth": 100
+}
diff --git a/.vscode/extensions.json b/.vscode/extensions.json
new file mode 100644
index 0000000..24d7cc6
--- /dev/null
+++ b/.vscode/extensions.json
@@ -0,0 +1,3 @@
+{
+ "recommendations": ["tauri-apps.tauri-vscode", "rust-lang.rust-analyzer"]
+}
diff --git a/PROJECT_INFO.md b/PROJECT_INFO.md
new file mode 100644
index 0000000..16dc1a3
--- /dev/null
+++ b/PROJECT_INFO.md
@@ -0,0 +1,120 @@
+# Chronara - Local Meeting Transcription & Summarization
+
+A Windows desktop application that transcribes, diarizes, and summarizes meetings using only locally-running models.
+
+## Features
+
+- 🎙️ Real-time audio transcription with speaker diarization (WhisperX)
+- 📝 Intelligent meeting summarization (Llama 3.2)
+- 🖥️ Everything runs locally - no cloud services required
+- 📦 All models bundled - no separate downloads needed
+
+## Tech Stack
+
+- **Transcription**: WhisperX (Whisper + speaker diarization)
+- **Summarization**: Llama 3.2 1B/3B
+- **Backend**: Python with FastAPI
+- **Frontend**: Tauri + React
+- **Model Runtime**: llama-cpp-python
+
+## Project Structure
+
+```
+chronara/
+├── src/
+│ ├── backend/ # Python FastAPI backend
+│ ├── components/ # React components
+│ └── App.tsx # Main React app
+├── src-tauri/ # Tauri configuration
+├── models/ # Bundled model files
+├── scripts/ # Build and setup scripts
+└── assets/ # Icons, resources
+```
+
+## Development Setup
+
+### Prerequisites
+
+- Node.js 18+ with pnpm
+- Python 3.10+
+- Rust (for Tauri)
+- Windows build tools (for native modules)
+
+### Installation
+
+1. Clone the repository:
+
+```bash
+git clone https://github.com/naomi-lgbt/chronara.git
+cd chronara
+```
+
+2. Install frontend dependencies:
+
+```bash
+pnpm install
+```
+
+3. Install Python dependencies:
+
+```bash
+pip install -r requirements.txt
+```
+
+4. Download the AI models:
+
+```bash
+python scripts/download_models.py
+```
+
+5. Run in development mode:
+
+```bash
+pnpm tauri:dev
+```
+
+## Building for Production
+
+### Windows
+
+1. Download models if not already done:
+
+```bash
+python scripts/download_models.py
+```
+
+2. Build the Windows executable:
+
+```bash
+python scripts/build_windows.py
+```
+
+The installer will be created in `src-tauri/target/release/bundle/nsis/`.
+
+## Usage
+
+1. **Start Recording**: Click the "Start Recording" button to begin capturing audio
+2. **Real-time Transcription**: Watch as the conversation is transcribed with speaker labels
+3. **Generate Summary**: After recording, click "Generate Summary" for an AI-powered meeting summary
+4. **Export**: Download both the full transcript and summary as text files
+
+## Model Information
+
+### Transcription (WhisperX)
+
+- **Model**: OpenAI Whisper base model with WhisperX enhancements
+- **Features**: Speaker diarization, timestamp alignment
+- **Size**: ~150MB
+
+### Summarization (Llama 3.2)
+
+- **1B Model**: Fast, good for basic summaries (~1.2GB)
+- **3B Model**: Better quality summaries (~2.5GB)
+- **Format**: GGUF quantized models
+
+## Privacy & Security
+
+- All processing happens locally on your machine
+- No audio or text data is sent to external servers
+- Models are bundled with the application
+- Meeting data stays on your device
diff --git a/README.md b/README.md
index 47a840e..e98cb85 100644
--- a/README.md
+++ b/README.md
@@ -1,16 +1,6 @@
-# New Repository Template
+# Chronara
-This template contains all of our basic files for a new GitHub repository. There is also a handy workflow that will create an issue on a new repository made from this template, with a checklist for the steps we usually take in setting up a new repository.
-
-If you're starting a Node.JS project with TypeScript, we have a [specific template](https://github.com/naomi-lgbt/nodejs-typescript-template) for that purpose.
-
-## Readme
-
-Delete all of the above text (including this line), and uncomment the below text to use our standard readme template.
-
-
+We may be contacted through our [Chat Server](http://chat.nhcarrigan.com) or via email at `contact@nhcarrigan.com`.
diff --git a/check-all.sh b/check-all.sh
new file mode 100755
index 0000000..30dbedc
--- /dev/null
+++ b/check-all.sh
@@ -0,0 +1,40 @@
+#!/bin/bash
+set -e
+
+echo "🔍 Running all checks..."
+echo "========================================"
+
+echo ""
+echo "📦 Installing dependencies..."
+pnpm install
+
+echo ""
+echo "🔎 Running ESLint..."
+pnpm lint
+
+echo ""
+echo "💅 Running Prettier check..."
+pnpm format:check
+
+echo ""
+echo "🏗️ Building frontend..."
+pnpm build
+
+echo ""
+echo "🧪 Running frontend tests..."
+pnpm test
+
+echo ""
+echo "🦀 Running Clippy..."
+cd src-tauri
+cargo clippy --all-targets --all-features -- -D warnings
+
+echo ""
+echo "🧪 Running Rust tests..."
+cargo test
+
+cd ..
+
+echo ""
+echo "========================================"
+echo "✅ All checks passed!"
diff --git a/eslint.config.js b/eslint.config.js
new file mode 100644
index 0000000..2050d88
--- /dev/null
+++ b/eslint.config.js
@@ -0,0 +1,34 @@
+import js from "@eslint/js";
+import tseslint from "typescript-eslint";
+import reactHooks from "eslint-plugin-react-hooks";
+import reactRefresh from "eslint-plugin-react-refresh";
+import prettier from "eslint-config-prettier";
+import globals from "globals";
+
+export default tseslint.config(
+ js.configs.recommended,
+ ...tseslint.configs.recommended,
+ prettier,
+ {
+ languageOptions: {
+ globals: {
+ ...globals.browser,
+ ...globals.node,
+ },
+ },
+ },
+ {
+ files: ["**/*.{ts,tsx}"],
+ plugins: {
+ "react-hooks": reactHooks,
+ "react-refresh": reactRefresh,
+ },
+ rules: {
+ ...reactHooks.configs.recommended.rules,
+ "react-refresh/only-export-components": ["warn", { allowConstantExport: true }],
+ },
+ },
+ {
+ ignores: ["build/", "dist/", "src-tauri/target/", "node_modules/"],
+ }
+);
diff --git a/index.html b/index.html
new file mode 100644
index 0000000..ff93803
--- /dev/null
+++ b/index.html
@@ -0,0 +1,14 @@
+<!doctype html>
+<html lang="en">
+  <head>
+    <meta charset="UTF-8" />
+    <link rel="icon" type="image/svg+xml" href="/vite.svg" />
+    <meta name="viewport" content="width=device-width, initial-scale=1.0" />
+    <title>Tauri + React + Typescript</title>
+  </head>
+
+  <body>
+    <div id="root"></div>
+    <script type="module" src="/src/main.tsx"></script>
+  </body>
+</html>
diff --git a/package.json b/package.json
new file mode 100644
index 0000000..612aebe
--- /dev/null
+++ b/package.json
@@ -0,0 +1,49 @@
+{
+ "name": "chronara",
+ "private": true,
+ "version": "0.1.0",
+ "type": "module",
+ "scripts": {
+ "dev": "vite",
+ "build": "tsc && vite build",
+ "lint": "eslint src",
+ "lint:fix": "eslint src --fix",
+ "format": "prettier --write .",
+ "format:check": "prettier --check .",
+ "preview": "vite preview",
+ "tauri": "tauri",
+ "tauri:dev": "tauri dev",
+ "build:linux": "tauri build",
+ "build:windows": "./scripts/build-windows-nsis.sh",
+ "build:all": "pnpm build:linux && pnpm build:windows",
+ "test": "vitest run",
+ "test:watch": "vitest",
+ "test:coverage": "vitest run --coverage"
+ },
+ "dependencies": {
+ "@tauri-apps/api": "^2",
+ "@tauri-apps/plugin-opener": "^2",
+ "react": "^19.1.0",
+ "react-dom": "^19.1.0"
+ },
+ "devDependencies": {
+ "@eslint/js": "^9.19.0",
+ "@tauri-apps/cli": "^2",
+ "@testing-library/jest-dom": "^6.9.1",
+ "@testing-library/react": "^16.3.0",
+ "@types/react": "^19.1.8",
+ "@types/react-dom": "^19.1.6",
+ "@vitejs/plugin-react": "^4.6.0",
+ "eslint": "^9.19.0",
+ "eslint-config-prettier": "^10.1.8",
+ "eslint-plugin-react-hooks": "^5.2.0",
+ "eslint-plugin-react-refresh": "^0.4.20",
+ "globals": "^17.0.0",
+ "jsdom": "^27.4.0",
+ "prettier": "^3.8.0",
+ "typescript": "~5.8.3",
+ "typescript-eslint": "^8.53.0",
+ "vite": "^7.0.4",
+ "vitest": "^4.0.17"
+ }
+}
diff --git a/patches/llama-cpp-sys-2/Cargo.lock b/patches/llama-cpp-sys-2/Cargo.lock
new file mode 100644
index 0000000..3ff5401
--- /dev/null
+++ b/patches/llama-cpp-sys-2/Cargo.lock
@@ -0,0 +1,373 @@
+# This file is automatically @generated by Cargo.
+# It is not intended for manual editing.
+version = 4
+
+[[package]]
+name = "aho-corasick"
+version = "1.1.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8e60d3430d3a69478ad0993f19238d2df97c507009a52b3c10addcd7f6bcb916"
+dependencies = [
+ "memchr",
+]
+
+[[package]]
+name = "bindgen"
+version = "0.72.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "993776b509cfb49c750f11b8f07a46fa23e0a1386ffc01fb1e7d343efc387895"
+dependencies = [
+ "bitflags",
+ "cexpr",
+ "clang-sys",
+ "itertools",
+ "log",
+ "prettyplease",
+ "proc-macro2",
+ "quote",
+ "regex",
+ "rustc-hash",
+ "shlex",
+ "syn",
+]
+
+[[package]]
+name = "bitflags"
+version = "2.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "cf4b9d6a944f767f8e5e0db018570623c85f3d925ac718db4e06d0187adb21c1"
+
+[[package]]
+name = "cc"
+version = "1.2.49"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "90583009037521a116abf44494efecd645ba48b6622457080f080b85544e2215"
+dependencies = [
+ "find-msvc-tools",
+ "jobserver",
+ "libc",
+ "shlex",
+]
+
+[[package]]
+name = "cexpr"
+version = "0.6.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6fac387a98bb7c37292057cffc56d62ecb629900026402633ae9160df93a8766"
+dependencies = [
+ "nom",
+]
+
+[[package]]
+name = "cfg-if"
+version = "1.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
+
+[[package]]
+name = "clang-sys"
+version = "1.8.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0b023947811758c97c59bf9d1c188fd619ad4718dcaa767947df1cadb14f39f4"
+dependencies = [
+ "glob",
+ "libc",
+ "libloading",
+]
+
+[[package]]
+name = "cmake"
+version = "0.1.56"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b042e5d8a74ae91bb0961acd039822472ec99f8ab0948cbf6d1369588f8be586"
+dependencies = [
+ "cc",
+]
+
+[[package]]
+name = "either"
+version = "1.12.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3dca9240753cf90908d7e4aac30f630662b02aebaa1b58a3cadabdb23385b58b"
+
+[[package]]
+name = "find-msvc-tools"
+version = "0.1.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3a3076410a55c90011c298b04d0cfa770b00fa04e1e3c97d3f6c9de105a03844"
+
+[[package]]
+name = "find_cuda_helper"
+version = "0.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f9f9e65c593dd01ac77daad909ea4ad17f0d6d1776193fc8ea766356177abdad"
+dependencies = [
+ "glob",
+]
+
+[[package]]
+name = "glob"
+version = "0.3.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0cc23270f6e1808e30a928bdc84dea0b9b4136a8bc82338574f23baf47bbd280"
+
+[[package]]
+name = "itertools"
+version = "0.12.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ba291022dbbd398a455acf126c1e341954079855bc60dfdda641363bd6922569"
+dependencies = [
+ "either",
+]
+
+[[package]]
+name = "jobserver"
+version = "0.1.31"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d2b099aaa34a9751c5bf0878add70444e1ed2dd73f347be99003d4577277de6e"
+dependencies = [
+ "libc",
+]
+
+[[package]]
+name = "libc"
+version = "0.2.155"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "97b3888a4aecf77e811145cadf6eef5901f4782c53886191b2f693f24761847c"
+
+[[package]]
+name = "libloading"
+version = "0.8.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0c2a198fb6b0eada2a8df47933734e6d35d350665a33a3593d7164fa52c75c19"
+dependencies = [
+ "cfg-if",
+ "windows-targets",
+]
+
+[[package]]
+name = "llama-cpp-sys-2"
+version = "0.1.132"
+dependencies = [
+ "bindgen",
+ "cc",
+ "cmake",
+ "find_cuda_helper",
+ "glob",
+ "walkdir",
+]
+
+[[package]]
+name = "log"
+version = "0.4.21"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "90ed8c1e510134f979dbc4f070f87d4313098b704861a105fe34231c70a3901c"
+
+[[package]]
+name = "memchr"
+version = "2.7.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3"
+
+[[package]]
+name = "minimal-lexical"
+version = "0.2.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a"
+
+[[package]]
+name = "nom"
+version = "7.1.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d273983c5a657a70a3e8f2a01329822f3b8c8172b73826411a55751e404a0a4a"
+dependencies = [
+ "memchr",
+ "minimal-lexical",
+]
+
+[[package]]
+name = "prettyplease"
+version = "0.2.20"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5f12335488a2f3b0a83b14edad48dca9879ce89b2edd10e80237e4e852dd645e"
+dependencies = [
+ "proc-macro2",
+ "syn",
+]
+
+[[package]]
+name = "proc-macro2"
+version = "1.0.85"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "22244ce15aa966053a896d1accb3a6e68469b97c7f33f284b99f0d576879fc23"
+dependencies = [
+ "unicode-ident",
+]
+
+[[package]]
+name = "quote"
+version = "1.0.36"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0fa76aaf39101c457836aec0ce2316dbdc3ab723cdda1c6bd4e6ad4208acaca7"
+dependencies = [
+ "proc-macro2",
+]
+
+[[package]]
+name = "regex"
+version = "1.10.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b91213439dad192326a0d7c6ee3955910425f441d7038e0d6933b0aec5c4517f"
+dependencies = [
+ "aho-corasick",
+ "memchr",
+ "regex-automata",
+ "regex-syntax",
+]
+
+[[package]]
+name = "regex-automata"
+version = "0.4.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "38caf58cc5ef2fed281f89292ef23f6365465ed9a41b7a7754eb4e26496c92df"
+dependencies = [
+ "aho-corasick",
+ "memchr",
+ "regex-syntax",
+]
+
+[[package]]
+name = "regex-syntax"
+version = "0.8.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7a66a03ae7c801facd77a29370b4faec201768915ac14a721ba36f20bc9c209b"
+
+[[package]]
+name = "rustc-hash"
+version = "2.1.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "357703d41365b4b27c590e3ed91eabb1b663f07c4c084095e60cbed4362dff0d"
+
+[[package]]
+name = "same-file"
+version = "1.0.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "93fc1dc3aaa9bfed95e02e6eadabb4baf7e3078b0bd1b4d7b6b0b68378900502"
+dependencies = [
+ "winapi-util",
+]
+
+[[package]]
+name = "shlex"
+version = "1.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64"
+
+[[package]]
+name = "syn"
+version = "2.0.87"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "25aa4ce346d03a6dcd68dd8b4010bcb74e54e62c90c573f394c46eae99aba32d"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "unicode-ident",
+]
+
+[[package]]
+name = "unicode-ident"
+version = "1.0.12"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b"
+
+[[package]]
+name = "walkdir"
+version = "2.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "29790946404f91d9c5d06f9874efddea1dc06c5efe94541a7d6863108e3a5e4b"
+dependencies = [
+ "same-file",
+ "winapi-util",
+]
+
+[[package]]
+name = "winapi-util"
+version = "0.1.9"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "cf221c93e13a30d793f7645a0e7762c55d169dbb0a49671918a2319d289b10bb"
+dependencies = [
+ "windows-sys",
+]
+
+[[package]]
+name = "windows-sys"
+version = "0.52.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d"
+dependencies = [
+ "windows-targets",
+]
+
+[[package]]
+name = "windows-targets"
+version = "0.52.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6f0713a46559409d202e70e28227288446bf7841d3211583a4b53e3f6d96e7eb"
+dependencies = [
+ "windows_aarch64_gnullvm",
+ "windows_aarch64_msvc",
+ "windows_i686_gnu",
+ "windows_i686_gnullvm",
+ "windows_i686_msvc",
+ "windows_x86_64_gnu",
+ "windows_x86_64_gnullvm",
+ "windows_x86_64_msvc",
+]
+
+[[package]]
+name = "windows_aarch64_gnullvm"
+version = "0.52.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7088eed71e8b8dda258ecc8bac5fb1153c5cffaf2578fc8ff5d61e23578d3263"
+
+[[package]]
+name = "windows_aarch64_msvc"
+version = "0.52.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9985fd1504e250c615ca5f281c3f7a6da76213ebd5ccc9561496568a2752afb6"
+
+[[package]]
+name = "windows_i686_gnu"
+version = "0.52.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "88ba073cf16d5372720ec942a8ccbf61626074c6d4dd2e745299726ce8b89670"
+
+[[package]]
+name = "windows_i686_gnullvm"
+version = "0.52.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "87f4261229030a858f36b459e748ae97545d6f1ec60e5e0d6a3d32e0dc232ee9"
+
+[[package]]
+name = "windows_i686_msvc"
+version = "0.52.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "db3c2bf3d13d5b658be73463284eaf12830ac9a26a90c717b7f771dfe97487bf"
+
+[[package]]
+name = "windows_x86_64_gnu"
+version = "0.52.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4e4246f76bdeff09eb48875a0fd3e2af6aada79d409d33011886d3e1581517d9"
+
+[[package]]
+name = "windows_x86_64_gnullvm"
+version = "0.52.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "852298e482cd67c356ddd9570386e2862b5673c85bd5f88df9ab6802b334c596"
+
+[[package]]
+name = "windows_x86_64_msvc"
+version = "0.52.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "bec47e5bfd1bff0eeaf6d8b485cc1074891a197ab4225d504cb7a1ab88b02bf0"
diff --git a/patches/llama-cpp-sys-2/Cargo.toml b/patches/llama-cpp-sys-2/Cargo.toml
new file mode 100644
index 0000000..075fc13
--- /dev/null
+++ b/patches/llama-cpp-sys-2/Cargo.toml
@@ -0,0 +1,104 @@
+# THIS FILE IS AUTOMATICALLY GENERATED BY CARGO
+#
+# When uploading crates to the registry Cargo will automatically
+# "normalize" Cargo.toml files for maximal compatibility
+# with all versions of Cargo and also rewrite `path` dependencies
+# to registry (e.g., crates.io) dependencies.
+#
+# If you are reading this file be aware that the original Cargo.toml
+# will likely look very different (and much more reasonable).
+# See Cargo.toml.orig for the original contents.
+
+[package]
+edition = "2021"
+name = "llama-cpp-sys-2"
+version = "0.1.132"
+build = "build.rs"
+links = "llama"
+include = [
+ "wrapper.h",
+ "wrapper_mtmd.h",
+ "build.rs",
+ "/src",
+ "/llama.cpp/common/**/*.h",
+ "/llama.cpp/common/**/*.hpp",
+ "/llama.cpp/common/**/*.cpp",
+ "/llama.cpp/ggml/include/*.h",
+ "/llama.cpp/ggml/src/*.h",
+ "/llama.cpp/ggml/src/*.c",
+ "/llama.cpp/ggml/src/*.cpp",
+ "/llama.cpp/src/*.h",
+ "/llama.cpp/src/*.cpp",
+ "/llama.cpp/src/models/*.h",
+ "/llama.cpp/src/models/*.cpp",
+ "/llama.cpp/tools/mtmd/*.h",
+ "/llama.cpp/tools/mtmd/*.cpp",
+ "/llama.cpp/convert_hf_to_gguf.py",
+ "/llama.cpp/common/build-info.cpp.in",
+ "/llama.cpp/ggml/src/ggml-cuda.cu",
+ "/llama.cpp/ggml/src/ggml-metal.m",
+ "/llama.cpp/ggml/src/ggml-metal.metal",
+ "/llama.cpp/include/llama.h",
+ "/llama.cpp/include/llama-cpp.h",
+ "/llama.cpp/ggml/src/ggml-cpu/**/*",
+ "/llama.cpp/ggml/src/ggml-cuda/**/*",
+ "/llama.cpp/ggml/src/ggml-metal/**/*",
+ "/llama.cpp/ggml/src/ggml-vulkan/**/*",
+ "/llama.cpp/ggml/src/llamafile/sgemm.h",
+ "/llama.cpp/ggml/src/llamafile/sgemm.cpp",
+ "/llama.cpp/pocs",
+ "/llama.cpp/vendor",
+ "/llama.cpp/CMakeLists.txt",
+ "/llama.cpp/common/CMakeLists.txt",
+ "/llama.cpp/ggml/CMakeLists.txt",
+ "/llama.cpp/ggml/src/CMakeLists.txt",
+ "/llama.cpp/src/CMakeLists.txt",
+ "/llama.cpp/cmake",
+ "/llama.cpp/ggml/cmake",
+ "/llama.cpp/common/cmake",
+]
+autolib = false
+autobins = false
+autoexamples = false
+autotests = false
+autobenches = false
+description = "Low Level Bindings to llama.cpp"
+readme = "README.md"
+license = "MIT OR Apache-2.0"
+repository = "https://github.com/utilityai/llama-cpp-rs"
+
+[features]
+cuda = []
+cuda-no-vmm = ["cuda"]
+dynamic-link = []
+metal = []
+mtmd = []
+openmp = []
+shared-stdcxx = []
+system-ggml = []
+vulkan = []
+
+[lib]
+name = "llama_cpp_sys_2"
+path = "src/lib.rs"
+
+[dependencies]
+
+[build-dependencies.bindgen]
+version = "0.72.1"
+
+[build-dependencies.cc]
+version = "1.2.49"
+features = ["parallel"]
+
+[build-dependencies.cmake]
+version = "0.1"
+
+[build-dependencies.find_cuda_helper]
+version = "0.2.0"
+
+[build-dependencies.glob]
+version = "0.3.3"
+
+[build-dependencies.walkdir]
+version = "2"
diff --git a/patches/llama-cpp-sys-2/Cargo.toml.orig b/patches/llama-cpp-sys-2/Cargo.toml.orig
new file mode 100644
index 0000000..161901f
--- /dev/null
+++ b/patches/llama-cpp-sys-2/Cargo.toml.orig
@@ -0,0 +1,85 @@
+[package]
+name = "llama-cpp-sys-2"
+description = "Low Level Bindings to llama.cpp"
+version = "0.1.132"
+edition = "2021"
+license = "MIT OR Apache-2.0"
+repository = "https://github.com/utilityai/llama-cpp-rs"
+links = "llama"
+
+include = [
+ "wrapper.h",
+ "wrapper_mtmd.h",
+ "build.rs",
+ "/src",
+
+ "/llama.cpp/common/**/*.h",
+ "/llama.cpp/common/**/*.hpp",
+ "/llama.cpp/common/**/*.cpp",
+ "/llama.cpp/ggml/include/*.h",
+ "/llama.cpp/ggml/src/*.h",
+ "/llama.cpp/ggml/src/*.c",
+ "/llama.cpp/ggml/src/*.cpp",
+ "/llama.cpp/src/*.h",
+ "/llama.cpp/src/*.cpp",
+ "/llama.cpp/src/models/*.h",
+ "/llama.cpp/src/models/*.cpp",
+ "/llama.cpp/tools/mtmd/*.h",
+ "/llama.cpp/tools/mtmd/*.cpp",
+
+ "/llama.cpp/convert_hf_to_gguf.py", # Yes, it's required
+ "/llama.cpp/common/build-info.cpp.in",
+
+ "/llama.cpp/ggml/src/ggml-cuda.cu",
+ "/llama.cpp/ggml/src/ggml-metal.m",
+ "/llama.cpp/ggml/src/ggml-metal.metal",
+
+ "/llama.cpp/include/llama.h",
+ "/llama.cpp/include/llama-cpp.h",
+
+ "/llama.cpp/ggml/src/ggml-cpu/**/*",
+ "/llama.cpp/ggml/src/ggml-cuda/**/*",
+ "/llama.cpp/ggml/src/ggml-metal/**/*",
+ "/llama.cpp/ggml/src/ggml-vulkan/**/*",
+
+ "/llama.cpp/ggml/src/llamafile/sgemm.h",
+ "/llama.cpp/ggml/src/llamafile/sgemm.cpp",
+
+ "/llama.cpp/pocs",
+ "/llama.cpp/vendor",
+
+ "/llama.cpp/CMakeLists.txt",
+ "/llama.cpp/common/CMakeLists.txt",
+ "/llama.cpp/ggml/CMakeLists.txt",
+ "/llama.cpp/ggml/src/CMakeLists.txt",
+ "/llama.cpp/src/CMakeLists.txt",
+
+ "/llama.cpp/cmake",
+ "/llama.cpp/ggml/cmake",
+ "/llama.cpp/common/cmake",
+]
+
+# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
+
+[dependencies]
+
+[build-dependencies]
+bindgen = { workspace = true }
+cc = { workspace = true, features = ["parallel"] }
+cmake = "0.1"
+find_cuda_helper = "0.2.0"
+glob = "0.3.3"
+walkdir = "2"
+
+[features]
+cuda = []
+# Disables the need to dynamically link against libcuda.so / cuda.dll
+cuda-no-vmm = ["cuda"]
+metal = []
+dynamic-link = []
+vulkan = []
+openmp = []
+# Only has an impact on Android.
+shared-stdcxx = []
+system-ggml = []
+mtmd = []
diff --git a/patches/llama-cpp-sys-2/README.md b/patches/llama-cpp-sys-2/README.md
new file mode 100644
index 0000000..69dd473
--- /dev/null
+++ b/patches/llama-cpp-sys-2/README.md
@@ -0,0 +1,5 @@
+# llama-cpp-sys
+
+Raw bindings to llama.cpp with cuda support.
+
+See [llama-cpp-2](https://crates.io/crates/llama-cpp-2) for a safe API.
diff --git a/patches/llama-cpp-sys-2/build.rs b/patches/llama-cpp-sys-2/build.rs
new file mode 100644
index 0000000..de22890
--- /dev/null
+++ b/patches/llama-cpp-sys-2/build.rs
@@ -0,0 +1,952 @@
+use cmake::Config;
+use glob::glob;
+use std::env;
+use std::path::{Path, PathBuf};
+use std::process::Command;
+use walkdir::DirEntry;
+
+enum WindowsVariant {
+ Msvc,
+ Other,
+}
+
+enum AppleVariant {
+ MacOS,
+ Other,
+}
+
+enum TargetOs {
+ Windows(WindowsVariant),
+ Apple(AppleVariant),
+ Linux,
+ Android,
+}
+
+macro_rules! debug_log {
+ ($($arg:tt)*) => {
+ if std::env::var("BUILD_DEBUG").is_ok() {
+ println!("cargo:warning=[DEBUG] {}", format!($($arg)*));
+ }
+ };
+}
+
+fn parse_target_os() -> Result<(TargetOs, String), String> {
+ let target = env::var("TARGET").unwrap();
+
+ if target.contains("windows") {
+ if target.ends_with("-windows-msvc") {
+ Ok((TargetOs::Windows(WindowsVariant::Msvc), target))
+ } else {
+ Ok((TargetOs::Windows(WindowsVariant::Other), target))
+ }
+ } else if target.contains("apple") {
+ if target.ends_with("-apple-darwin") {
+ Ok((TargetOs::Apple(AppleVariant::MacOS), target))
+ } else {
+ Ok((TargetOs::Apple(AppleVariant::Other), target))
+ }
+ } else if target.contains("android")
+ || target == "aarch64-linux-android"
+ || target == "armv7-linux-androideabi"
+ || target == "i686-linux-android"
+ || target == "x86_64-linux-android"
+ {
+ // Handle both full android targets and short names like arm64-v8a that cargo ndk might use
+ Ok((TargetOs::Android, target))
+ } else if target.contains("linux") {
+ Ok((TargetOs::Linux, target))
+ } else {
+ Err(target)
+ }
+}
+
+fn get_cargo_target_dir() -> Result<PathBuf, Box<dyn std::error::Error>> {
+ let out_dir = env::var("OUT_DIR")?;
+ let path = PathBuf::from(out_dir);
+ let target_dir = path
+ .ancestors()
+ .nth(3)
+ .ok_or("OUT_DIR is not deep enough")?;
+ Ok(target_dir.to_path_buf())
+}
+
+fn extract_lib_names(out_dir: &Path, build_shared_libs: bool) -> Vec<String> {
+ // Use CARGO_CFG_TARGET_OS to detect TARGET platform, not HOST
+ // This fixes cross-compilation from Linux to Windows
+ let target_os = std::env::var("CARGO_CFG_TARGET_OS").unwrap_or_default();
+ let lib_pattern = if target_os == "windows" {
+ "*.lib"
+ } else if target_os == "macos" {
+ if build_shared_libs {
+ "*.dylib"
+ } else {
+ "*.a"
+ }
+ } else if build_shared_libs {
+ "*.so"
+ } else {
+ "*.a"
+ };
+ let libs_dir = out_dir.join("lib*");
+ let pattern = libs_dir.join(lib_pattern);
+ debug_log!("Extract libs {}", pattern.display());
+
+ let mut lib_names: Vec<String> = Vec::new();
+
+ // Process the libraries based on the pattern
+ for entry in glob(pattern.to_str().unwrap()).unwrap() {
+ match entry {
+ Ok(path) => {
+ let stem = path.file_stem().unwrap();
+ let stem_str = stem.to_str().unwrap();
+
+ // Remove the "lib" prefix if present
+ let lib_name = if stem_str.starts_with("lib") {
+ stem_str.strip_prefix("lib").unwrap_or(stem_str)
+ } else {
+ if path.extension() == Some(std::ffi::OsStr::new("a")) {
+ let target = path.parent().unwrap().join(format!("lib{}.a", stem_str));
+ std::fs::rename(&path, &target).unwrap_or_else(|e| {
+ panic!("Failed to rename {path:?} to {target:?}: {e:?}");
+ })
+ }
+ stem_str
+ };
+ lib_names.push(lib_name.to_string());
+ }
+ Err(e) => println!("cargo:warning=error={}", e),
+ }
+ }
+ lib_names
+}
+
+fn extract_lib_assets(out_dir: &Path) -> Vec<PathBuf> {
+ // Use CARGO_CFG_TARGET_OS to detect TARGET platform, not HOST
+ // This fixes cross-compilation from Linux to Windows
+ let target_os = std::env::var("CARGO_CFG_TARGET_OS").unwrap_or_default();
+ let shared_lib_pattern = if target_os == "windows" {
+ "*.dll"
+ } else if target_os == "macos" {
+ "*.dylib"
+ } else {
+ "*.so"
+ };
+
+ let shared_libs_dir = if target_os == "windows" { "bin" } else { "lib" };
+ let libs_dir = out_dir.join(shared_libs_dir);
+ let pattern = libs_dir.join(shared_lib_pattern);
+ debug_log!("Extract lib assets {}", pattern.display());
+ let mut files = Vec::new();
+
+ for entry in glob(pattern.to_str().unwrap()).unwrap() {
+ match entry {
+ Ok(path) => {
+ files.push(path);
+ }
+ Err(e) => eprintln!("cargo:warning=error={}", e),
+ }
+ }
+
+ files
+}
+
+fn macos_link_search_path() -> Option<String> {
+ let output = Command::new("clang")
+ .arg("--print-search-dirs")
+ .output()
+ .ok()?;
+ if !output.status.success() {
+ println!(
+ "failed to run 'clang --print-search-dirs', continuing without a link search path"
+ );
+ return None;
+ }
+
+ let stdout = String::from_utf8_lossy(&output.stdout);
+ for line in stdout.lines() {
+ if line.contains("libraries: =") {
+ let path = line.split('=').nth(1)?;
+ return Some(format!("{}/lib/darwin", path));
+ }
+ }
+
+ println!("failed to determine link search path, continuing without it");
+ None
+}
+
+fn validate_android_ndk(ndk_path: &str) -> Result<(), String> {
+ let ndk_path = Path::new(ndk_path);
+
+ if !ndk_path.exists() {
+ return Err(format!(
+ "Android NDK path does not exist: {}",
+ ndk_path.display()
+ ));
+ }
+
+ let toolchain_file = ndk_path.join("build/cmake/android.toolchain.cmake");
+ if !toolchain_file.exists() {
+ return Err(format!(
+ "Android NDK toolchain file not found: {}\n\
+ This indicates an incomplete NDK installation.",
+ toolchain_file.display()
+ ));
+ }
+
+ Ok(())
+}
+
+fn is_hidden(e: &DirEntry) -> bool {
+ e.file_name()
+ .to_str()
+ .map(|s| s.starts_with('.'))
+ .unwrap_or_default()
+}
+
+fn main() {
+ println!("cargo:rerun-if-changed=build.rs");
+
+ let (target_os, target_triple) =
+ parse_target_os().unwrap_or_else(|t| panic!("Failed to parse target os {t}"));
+ let out_dir = PathBuf::from(env::var("OUT_DIR").unwrap());
+
+ let target_dir = get_cargo_target_dir().unwrap();
+ let manifest_dir = env::var("CARGO_MANIFEST_DIR").expect("Failed to get CARGO_MANIFEST_DIR");
+ let llama_src = Path::new(&manifest_dir).join("llama.cpp");
+ let build_shared_libs = cfg!(feature = "dynamic-link");
+
+ let build_shared_libs = std::env::var("LLAMA_BUILD_SHARED_LIBS")
+ .map(|v| v == "1")
+ .unwrap_or(build_shared_libs);
+ let profile = env::var("LLAMA_LIB_PROFILE").unwrap_or("Release".to_string());
+ let static_crt = env::var("LLAMA_STATIC_CRT")
+ .map(|v| v == "1")
+ .unwrap_or(false);
+
+ println!("cargo:rerun-if-env-changed=LLAMA_LIB_PROFILE");
+ println!("cargo:rerun-if-env-changed=LLAMA_BUILD_SHARED_LIBS");
+ println!("cargo:rerun-if-env-changed=LLAMA_STATIC_CRT");
+
+ debug_log!("TARGET: {}", target_triple);
+ debug_log!("CARGO_MANIFEST_DIR: {}", manifest_dir);
+ debug_log!("TARGET_DIR: {}", target_dir.display());
+ debug_log!("OUT_DIR: {}", out_dir.display());
+ debug_log!("BUILD_SHARED: {}", build_shared_libs);
+
+ // Make sure that changes to the llama.cpp project trigger a rebuild.
+ let rebuild_on_children_of = [
+ llama_src.join("src"),
+ llama_src.join("ggml/src"),
+ llama_src.join("common"),
+ ];
+ for entry in walkdir::WalkDir::new(&llama_src)
+ .into_iter()
+ .filter_entry(|e| !is_hidden(e))
+ {
+ let entry = entry.expect("Failed to obtain entry");
+ let rebuild = entry
+ .file_name()
+ .to_str()
+ .map(|f| f.starts_with("CMake"))
+ .unwrap_or_default()
+ || rebuild_on_children_of
+ .iter()
+ .any(|src_folder| entry.path().starts_with(src_folder));
+ if rebuild {
+ println!("cargo:rerun-if-changed={}", entry.path().display());
+ }
+ }
+
+ // Speed up build
+ env::set_var(
+ "CMAKE_BUILD_PARALLEL_LEVEL",
+ std::thread::available_parallelism()
+ .unwrap()
+ .get()
+ .to_string(),
+ );
+
+ // Bindings
+ let mut bindings_builder = bindgen::Builder::default()
+ .header("wrapper.h")
+ .clang_arg(format!("-I{}", llama_src.join("include").display()))
+ .clang_arg(format!("-I{}", llama_src.join("ggml/include").display()))
+ .parse_callbacks(Box::new(bindgen::CargoCallbacks::new()))
+ .derive_partialeq(true)
+ .allowlist_function("ggml_.*")
+ .allowlist_type("ggml_.*")
+ .allowlist_function("llama_.*")
+ .allowlist_type("llama_.*")
+ .prepend_enum_name(false);
+
+ // Configure mtmd feature if enabled
+ if cfg!(feature = "mtmd") {
+ bindings_builder = bindings_builder
+ .header("wrapper_mtmd.h")
+ .allowlist_function("mtmd_.*")
+ .allowlist_type("mtmd_.*");
+ }
+
+ // Configure Android-specific bindgen settings
+ if matches!(target_os, TargetOs::Android) {
+ // Detect Android NDK from environment variables
+ let android_ndk = env::var("ANDROID_NDK")
+ .or_else(|_| env::var("ANDROID_NDK_ROOT"))
+ .or_else(|_| env::var("NDK_ROOT"))
+ .or_else(|_| env::var("CARGO_NDK_ANDROID_NDK"))
+ .or_else(|_| {
+ // Try to auto-detect NDK from Android SDK
+ if let Some(home) = env::home_dir() {
+ let android_home = env::var("ANDROID_HOME")
+ .or_else(|_| env::var("ANDROID_SDK_ROOT"))
+ .unwrap_or_else(|_| format!("{}/Android/Sdk", home.display()));
+
+ let ndk_dir = format!("{}/ndk", android_home);
+ if let Ok(entries) = std::fs::read_dir(&ndk_dir) {
+ let mut versions: Vec<_> = entries
+ .filter_map(|e| e.ok())
+ .filter(|e| e.file_type().map(|t| t.is_dir()).unwrap_or(false))
+ .filter_map(|e| e.file_name().to_str().map(|s| s.to_string()))
+ .collect();
+ versions.sort();
+ if let Some(latest) = versions.last() {
+ return Ok(format!("{}/{}", ndk_dir, latest));
+ }
+ }
+ }
+ Err(env::VarError::NotPresent)
+ })
+ .unwrap_or_else(|_| {
+ panic!(
+ "Android NDK not found. Please set one of: ANDROID_NDK, NDK_ROOT, ANDROID_NDK_ROOT\n\
+ Current target: {}\n\
+ Download from: https://developer.android.com/ndk/downloads",
+ target_triple
+ );
+ });
+
+ // Get Android API level
+ let android_api = env::var("ANDROID_API_LEVEL")
+ .or_else(|_| env::var("ANDROID_PLATFORM").map(|p| p.replace("android-", "")))
+ .or_else(|_| env::var("CARGO_NDK_ANDROID_PLATFORM").map(|p| p.replace("android-", "")))
+ .unwrap_or_else(|_| "28".to_string());
+
+ // Determine host platform
+ let host_tag = if cfg!(target_os = "macos") {
+ "darwin-x86_64"
+ } else if cfg!(target_os = "linux") {
+ "linux-x86_64"
+ } else if cfg!(target_os = "windows") {
+ "windows-x86_64"
+ } else {
+ panic!("Unsupported host platform for Android NDK");
+ };
+
+ // Map Rust target to Android architecture
+ let android_target_prefix = if target_triple.contains("aarch64") {
+ "aarch64-linux-android"
+ } else if target_triple.contains("armv7") {
+ "arm-linux-androideabi"
+ } else if target_triple.contains("x86_64") {
+ "x86_64-linux-android"
+ } else if target_triple.contains("i686") {
+ "i686-linux-android"
+ } else {
+ panic!("Unsupported Android target: {}", target_triple);
+ };
+
+ // Setup Android toolchain paths
+ let toolchain_path = format!("{}/toolchains/llvm/prebuilt/{}", android_ndk, host_tag);
+ let sysroot = format!("{}/sysroot", toolchain_path);
+
+ // Validate toolchain existence
+ if !std::path::Path::new(&toolchain_path).exists() {
+ panic!(
+ "Android NDK toolchain not found at: {}\n\
+ Please ensure you have the correct Android NDK for your platform.",
+ toolchain_path
+ );
+ }
+
+ // Find clang builtin includes
+ let clang_builtin_includes = {
+ let clang_lib_path = format!("{}/lib/clang", toolchain_path);
+ std::fs::read_dir(&clang_lib_path).ok().and_then(|entries| {
+ entries
+ .filter_map(|e| e.ok())
+ .find(|entry| {
+ entry.file_type().map(|t| t.is_dir()).unwrap_or(false)
+ && entry
+ .file_name()
+ .to_str()
+ .map(|name| name.chars().next().unwrap_or('0').is_ascii_digit())
+ .unwrap_or(false)
+ })
+ .and_then(|entry| {
+ let include_path =
+ format!("{}/{}/include", clang_lib_path, entry.file_name().to_str()?);
+ if std::path::Path::new(&include_path).exists() {
+ Some(include_path)
+ } else {
+ None
+ }
+ })
+ })
+ };
+
+ // Configure bindgen for Android
+ bindings_builder = bindings_builder
+ .clang_arg(format!("--sysroot={}", sysroot))
+ .clang_arg(format!("-D__ANDROID_API__={}", android_api))
+ .clang_arg("-D__ANDROID__");
+
+ // Add include paths in correct order
+ if let Some(ref builtin_includes) = clang_builtin_includes {
+ bindings_builder = bindings_builder
+ .clang_arg("-isystem")
+ .clang_arg(builtin_includes);
+ }
+
+ bindings_builder = bindings_builder
+ .clang_arg("-isystem")
+ .clang_arg(format!("{}/usr/include/{}", sysroot, android_target_prefix))
+ .clang_arg("-isystem")
+ .clang_arg(format!("{}/usr/include", sysroot))
+ .clang_arg("-include")
+ .clang_arg("stdbool.h")
+ .clang_arg("-include")
+ .clang_arg("stdint.h");
+
+ // Set additional clang args for cargo ndk compatibility
+ if env::var("CARGO_SUBCOMMAND").as_deref() == Ok("ndk") {
+ std::env::set_var(
+ "BINDGEN_EXTRA_CLANG_ARGS",
+ format!("--target={}", target_triple),
+ );
+ }
+ }
+
+ // Fix bindgen header discovery on Windows MSVC
+ // Use cc crate to discover MSVC include paths by compiling a dummy file
+ if matches!(target_os, TargetOs::Windows(WindowsVariant::Msvc)) {
+ // Create a minimal dummy C file to extract compiler flags
+ let out_dir = env::var("OUT_DIR").unwrap();
+ let dummy_c = Path::new(&out_dir).join("dummy.c");
+ std::fs::write(&dummy_c, "int main() { return 0; }").unwrap();
+
+ // Use cc crate to get compiler with proper environment setup
+ let mut build = cc::Build::new();
+ build.file(&dummy_c);
+
+ // Get the actual compiler command cc would use
+ let compiler = build.try_get_compiler().unwrap();
+
+ // Extract include paths by checking compiler's environment
+ // cc crate sets up MSVC environment internally
+ let env_include = compiler
+ .env()
+ .iter()
+ .find(|(k, _)| k.eq_ignore_ascii_case("INCLUDE"))
+ .map(|(_, v)| v);
+
+ if let Some(include_paths) = env_include {
+ for include_path in include_paths
+ .to_string_lossy()
+ .split(';')
+ .filter(|s| !s.is_empty())
+ {
+ bindings_builder = bindings_builder
+ .clang_arg("-isystem")
+ .clang_arg(include_path);
+ debug_log!("Added MSVC include path: {}", include_path);
+ }
+ }
+
+ // Add MSVC compatibility flags
+ bindings_builder = bindings_builder
+ .clang_arg(format!("--target={}", target_triple))
+ .clang_arg("-fms-compatibility")
+ .clang_arg("-fms-extensions");
+
+ debug_log!(
+ "Configured bindgen with MSVC toolchain for target: {}",
+ target_triple
+ );
+ }
+ let bindings = bindings_builder
+ .generate()
+ .expect("Failed to generate bindings");
+
+ // Write the generated bindings to an output file
+ let bindings_path = out_dir.join("bindings.rs");
+ bindings
+ .write_to_file(bindings_path)
+ .expect("Failed to write bindings");
+
+ println!("cargo:rerun-if-changed=wrapper.h");
+ println!("cargo:rerun-if-changed=wrapper_mtmd.h");
+
+ debug_log!("Bindings Created");
+
+ // Build with Cmake
+
+ let mut config = Config::new(&llama_src);
+
+ // Would require extra source files to pointlessly
+ // be included in what's uploaded to and downloaded from
+ // crates.io, so deactivating these instead
+ config.define("LLAMA_BUILD_TESTS", "OFF");
+ config.define("LLAMA_BUILD_EXAMPLES", "OFF");
+ config.define("LLAMA_BUILD_SERVER", "OFF");
+ config.define("LLAMA_BUILD_TOOLS", "OFF");
+ config.define("LLAMA_CURL", "OFF");
+
+ if cfg!(feature = "mtmd") {
+ config.define("LLAMA_BUILD_COMMON", "ON");
+ // mtmd support in llama-cpp is within the tools directory
+ config.define("LLAMA_BUILD_TOOLS", "ON");
+ }
+
+ // Pass CMAKE_ environment variables down to CMake
+ for (key, value) in env::vars() {
+ if key.starts_with("CMAKE_") {
+ config.define(&key, &value);
+ }
+ }
+
+ // extract the target-cpu config value, if specified
+ let target_cpu = std::env::var("CARGO_ENCODED_RUSTFLAGS")
+ .ok()
+ .and_then(|rustflags| {
+ rustflags
+ .split('\x1f')
+ .find(|f| f.contains("target-cpu="))
+ .and_then(|f| f.split("target-cpu=").nth(1))
+ .map(|s| s.to_string())
+ });
+
+ if target_cpu == Some("native".into()) {
+ debug_log!("Detected target-cpu=native, compiling with GGML_NATIVE");
+ config.define("GGML_NATIVE", "ON");
+ }
+ // if native isn't specified, enable specific features for ggml instead
+ else {
+ // rust code isn't using `target-cpu=native`, so llama.cpp shouldn't use GGML_NATIVE either
+ config.define("GGML_NATIVE", "OFF");
+
+ // if `target-cpu` is set set, also set -march for llama.cpp to the same value
+ if let Some(ref cpu) = target_cpu {
+ debug_log!("Setting baseline architecture: -march={}", cpu);
+ config.cflag(&format!("-march={}", cpu));
+ config.cxxflag(&format!("-march={}", cpu));
+ }
+
+ // I expect this env var to always be present
+ let features = std::env::var("CARGO_CFG_TARGET_FEATURE")
+ .expect("Env var CARGO_CFG_TARGET_FEATURE not found.");
+ debug_log!("Compiling with target features: {}", features);
+
+ // list of rust target_features here:
+ // https://doc.rust-lang.org/reference/attributes/codegen.html#the-target_feature-attribute
+ // GGML config flags have been found by looking at:
+ // llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt
+ for feature in features.split(',') {
+ match feature {
+ "avx" => {
+ config.define("GGML_AVX", "ON");
+ }
+ "avx2" => {
+ config.define("GGML_AVX2", "ON");
+ }
+ "avx512bf16" => {
+ config.define("GGML_AVX512_BF16", "ON");
+ }
+ "avx512vbmi" => {
+ config.define("GGML_AVX512_VBMI", "ON");
+ }
+ "avx512vnni" => {
+ config.define("GGML_AVX512_VNNI", "ON");
+ }
+ "avxvnni" => {
+ config.define("GGML_AVX_VNNI", "ON");
+ }
+ "bmi2" => {
+ config.define("GGML_BMI2", "ON");
+ }
+ "f16c" => {
+ config.define("GGML_F16C", "ON");
+ }
+ "fma" => {
+ config.define("GGML_FMA", "ON");
+ }
+ "sse4.2" => {
+ config.define("GGML_SSE42", "ON");
+ }
+ _ => {
+ debug_log!(
+ "Unrecognized cpu feature: '{}' - skipping GGML config for it.",
+ feature
+ );
+ continue;
+ }
+ };
+ }
+ }
+
+ config.define(
+ "BUILD_SHARED_LIBS",
+ if build_shared_libs { "ON" } else { "OFF" },
+ );
+
+ if matches!(target_os, TargetOs::Apple(_)) {
+ config.define("GGML_BLAS", "OFF");
+ }
+
+ if (matches!(target_os, TargetOs::Windows(WindowsVariant::Msvc))
+ && matches!(
+ profile.as_str(),
+ "Release" | "RelWithDebInfo" | "MinSizeRel"
+ ))
+ {
+ // Debug Rust builds under MSVC turn off optimization even though we're ideally building the release profile of llama.cpp.
+ // Looks like an upstream bug:
+ // https://github.com/rust-lang/cmake-rs/issues/240
+ // For now explicitly reinject the optimization flags that a CMake Release build is expected to have on in this scenario.
+ // This fixes CPU inference performance when part of a Rust debug build.
+ for flag in &["/O2", "/DNDEBUG", "/Ob2"] {
+ config.cflag(flag);
+ config.cxxflag(flag);
+ }
+ }
+
+ config.static_crt(static_crt);
+
+ if matches!(target_os, TargetOs::Android) {
+ // Android NDK Build Configuration
+ let android_ndk = env::var("ANDROID_NDK")
+ .or_else(|_| env::var("NDK_ROOT"))
+ .or_else(|_| env::var("ANDROID_NDK_ROOT"))
+ .unwrap_or_else(|_| {
+ panic!(
+ "Android NDK not found. Please set one of: ANDROID_NDK, NDK_ROOT, ANDROID_NDK_ROOT\n\
+ Download from: https://developer.android.com/ndk/downloads"
+ );
+ });
+
+ // Validate NDK installation
+ if let Err(error) = validate_android_ndk(&android_ndk) {
+ panic!("Android NDK validation failed: {}", error);
+ }
+
+ // Rerun build script if NDK environment variables change
+ println!("cargo:rerun-if-env-changed=ANDROID_NDK");
+ println!("cargo:rerun-if-env-changed=NDK_ROOT");
+ println!("cargo:rerun-if-env-changed=ANDROID_NDK_ROOT");
+
+ // Set CMake toolchain file for Android
+ let toolchain_file = format!("{}/build/cmake/android.toolchain.cmake", android_ndk);
+ config.define("CMAKE_TOOLCHAIN_FILE", &toolchain_file);
+
+ // Configure Android platform (API level)
+ let android_platform = env::var("ANDROID_PLATFORM").unwrap_or_else(|_| {
+ env::var("ANDROID_API_LEVEL")
+ .map(|level| format!("android-{}", level))
+ .unwrap_or_else(|_| "android-28".to_string())
+ });
+
+ println!("cargo:rerun-if-env-changed=ANDROID_PLATFORM");
+ println!("cargo:rerun-if-env-changed=ANDROID_API_LEVEL");
+ config.define("ANDROID_PLATFORM", &android_platform);
+
+ // Map Rust target to Android ABI
+ let android_abi = if target_triple.contains("aarch64") {
+ "arm64-v8a"
+ } else if target_triple.contains("armv7") {
+ "armeabi-v7a"
+ } else if target_triple.contains("x86_64") {
+ "x86_64"
+ } else if target_triple.contains("i686") {
+ "x86"
+ } else {
+ panic!(
+ "Unsupported Android target: {}\n\
+ Supported targets: aarch64-linux-android, armv7-linux-androideabi, i686-linux-android, x86_64-linux-android",
+ target_triple
+ );
+ };
+
+ config.define("ANDROID_ABI", android_abi);
+
+ // Configure architecture-specific compiler flags
+ match android_abi {
+ "arm64-v8a" => {
+ config.cflag("-march=armv8-a");
+ config.cxxflag("-march=armv8-a");
+ }
+ "armeabi-v7a" => {
+ config.cflag("-march=armv7-a");
+ config.cxxflag("-march=armv7-a");
+ config.cflag("-mfpu=neon");
+ config.cxxflag("-mfpu=neon");
+ config.cflag("-mthumb");
+ config.cxxflag("-mthumb");
+ }
+ "x86_64" => {
+ config.cflag("-march=x86-64");
+ config.cxxflag("-march=x86-64");
+ }
+ "x86" => {
+ config.cflag("-march=i686");
+ config.cxxflag("-march=i686");
+ }
+ _ => {}
+ }
+
+ // Android-specific CMake configurations
+ config.define("GGML_LLAMAFILE", "OFF");
+
+ // Link Android system libraries
+ println!("cargo:rustc-link-lib=log");
+ println!("cargo:rustc-link-lib=android");
+ }
+
+ if matches!(target_os, TargetOs::Linux)
+ && target_triple.contains("aarch64")
+ && target_cpu != Some("native".into())
+ {
+ // If the target-cpu is not specified as native, we take off the native ARM64 support.
+ // It is useful in docker environments where the native feature is not enabled.
+ config.define("GGML_NATIVE", "OFF");
+ config.define("GGML_CPU_ARM_ARCH", "armv8-a");
+ }
+
+ if cfg!(feature = "vulkan") {
+ config.define("GGML_VULKAN", "ON");
+ match target_os {
+ TargetOs::Windows(_) => {
+ let vulkan_path = env::var("VULKAN_SDK").expect(
+ "Please install Vulkan SDK and ensure that VULKAN_SDK env variable is set",
+ );
+ let vulkan_lib_path = Path::new(&vulkan_path).join("Lib");
+ println!("cargo:rustc-link-search={}", vulkan_lib_path.display());
+ println!("cargo:rustc-link-lib=vulkan-1");
+
+ // workaround for this error: "FileTracker : error FTK1011: could not create the new file tracking log file"
+ // it has to do with MSBuild FileTracker not respecting the path
+ // limit configuration set in the windows registry.
+ // I'm not sure why that's a thing, but this makes my builds work.
+ // (crates that depend on llama-cpp-rs w/ vulkan easily exceed the default PATH_MAX on windows)
+ env::set_var("TrackFileAccess", "false");
+ // since we disabled TrackFileAccess, we can now run into problems with parallel
+ // access to pdb files. /FS solves this.
+ config.cflag("/FS");
+ config.cxxflag("/FS");
+ }
+ TargetOs::Linux => {
+ // If we are not using system provided vulkan SDK, add vulkan libs for linking
+ if let Ok(vulkan_path) = env::var("VULKAN_SDK") {
+ let vulkan_lib_path = Path::new(&vulkan_path).join("lib");
+ println!("cargo:rustc-link-search={}", vulkan_lib_path.display());
+ }
+ println!("cargo:rustc-link-lib=vulkan");
+ }
+ _ => (),
+ }
+ }
+
+ if cfg!(feature = "cuda") {
+ config.define("GGML_CUDA", "ON");
+
+ if cfg!(feature = "cuda-no-vmm") {
+ config.define("GGML_CUDA_NO_VMM", "ON");
+ }
+ }
+
+ // Android doesn't have OpenMP support AFAICT and openmp is a default feature. Do this here
+ // rather than modifying the defaults in Cargo.toml just in case someone enables the OpenMP feature
+ // and tries to build for Android anyway.
+ if cfg!(feature = "openmp") && !matches!(target_os, TargetOs::Android) {
+ config.define("GGML_OPENMP", "ON");
+ } else {
+ config.define("GGML_OPENMP", "OFF");
+ }
+
+ if cfg!(feature = "system-ggml") {
+ config.define("LLAMA_USE_SYSTEM_GGML", "ON");
+ }
+
+ // General
+ config
+ .profile(&profile)
+ .very_verbose(std::env::var("CMAKE_VERBOSE").is_ok()) // Not verbose by default
+ .always_configure(false);
+
+ let build_dir = config.build();
+
+ // Search paths
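+    // Depending on the platform and generator, CMake may install libraries into lib/ or
+    // lib64/, so both are added to the search path alongside the build directory itself.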
+ println!("cargo:rustc-link-search={}", out_dir.join("lib").display());
+ println!(
+ "cargo:rustc-link-search={}",
+ out_dir.join("lib64").display()
+ );
+ println!("cargo:rustc-link-search={}", build_dir.display());
+
+ if cfg!(feature = "system-ggml") {
+ // Extract library directory from CMake's found GGML package
+ let cmake_cache = build_dir.join("build").join("CMakeCache.txt");
+ if let Ok(cache_contents) = std::fs::read_to_string(&cmake_cache) {
+ let mut ggml_lib_dirs = std::collections::HashSet::new();
+
+ // Parse CMakeCache.txt to find where GGML libraries were found
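+            // Cache entries have the form `NAME:TYPE=VALUE`, e.g.
+            // `GGML_LIBRARY:FILEPATH=/usr/lib/libggml.so` (illustrative path);
+            // the parent directory of each such library path is added as a search path.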
+ for line in cache_contents.lines() {
+ if line.starts_with("GGML_LIBRARY:")
+ || line.starts_with("GGML_BASE_LIBRARY:")
+ || line.starts_with("GGML_CPU_LIBRARY:")
+ {
+ if let Some(lib_path) = line.split('=').nth(1) {
+ if let Some(parent) = Path::new(lib_path).parent() {
+ ggml_lib_dirs.insert(parent.to_path_buf());
+ }
+ }
+ }
+ }
+
+ // Add each unique library directory to the search path
+ for lib_dir in ggml_lib_dirs {
+ println!("cargo:rustc-link-search=native={}", lib_dir.display());
+ debug_log!("Added system GGML library path: {}", lib_dir.display());
+ }
+ }
+ }
+
+ if cfg!(feature = "cuda") && !build_shared_libs {
+ // Re-run build script if CUDA_PATH environment variable changes
+ println!("cargo:rerun-if-env-changed=CUDA_PATH");
+
+ // Add CUDA library directories to the linker search path
+ for lib_dir in find_cuda_helper::find_cuda_lib_dirs() {
+ println!("cargo:rustc-link-search=native={}", lib_dir.display());
+ }
+
+ // Platform-specific linking
+ if cfg!(target_os = "windows") {
+ // ✅ On Windows, use dynamic linking.
+ // Static linking is problematic because NVIDIA does not provide culibos.lib,
+ // and static CUDA libraries (like cublas_static.lib) are usually not shipped.
+
+ println!("cargo:rustc-link-lib=cudart"); // Links to cudart64_*.dll
+ println!("cargo:rustc-link-lib=cublas"); // Links to cublas64_*.dll
+ println!("cargo:rustc-link-lib=cublasLt"); // Links to cublasLt64_*.dll
+
+ // Link to CUDA driver API (nvcuda.dll via cuda.lib)
+ if !cfg!(feature = "cuda-no-vmm") {
+ println!("cargo:rustc-link-lib=cuda");
+ }
+ } else {
+ // ✅ On non-Windows platforms (e.g., Linux), static linking is preferred and supported.
+ // Static libraries like cudart_static and cublas_static depend on culibos.
+
+ println!("cargo:rustc-link-lib=static=cudart_static");
+ println!("cargo:rustc-link-lib=static=cublas_static");
+ println!("cargo:rustc-link-lib=static=cublasLt_static");
+
+ // Link to CUDA driver API (libcuda.so)
+ if !cfg!(feature = "cuda-no-vmm") {
+ println!("cargo:rustc-link-lib=cuda");
+ }
+
+ // culibos is required when statically linking cudart_static
+ println!("cargo:rustc-link-lib=static=culibos");
+ }
+ }
+
+ // Link libraries
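+    // When ggml comes from the system or shared libraries were requested, the artifacts
+    // are shared objects and must be linked dynamically; otherwise we link the static
+    // archives produced by the CMake build above.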
+ let llama_libs_kind = if build_shared_libs || cfg!(feature = "system-ggml") {
+ "dylib"
+ } else {
+ "static"
+ };
+ let llama_libs = extract_lib_names(&out_dir, build_shared_libs);
+ assert_ne!(llama_libs.len(), 0);
+
+ if cfg!(feature = "system-ggml") {
+ println!("cargo:rustc-link-lib={llama_libs_kind}=ggml");
+ println!("cargo:rustc-link-lib={llama_libs_kind}=ggml-base");
+ println!("cargo:rustc-link-lib={llama_libs_kind}=ggml-cpu");
+ }
+ for lib in llama_libs {
+ let link = format!("cargo:rustc-link-lib={}={}", llama_libs_kind, lib);
+ debug_log!("LINK {link}",);
+ println!("{link}",);
+ }
+
+ // OpenMP
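+    // GNU toolchains ship the OpenMP runtime as libgomp, which must be linked explicitly
+    // when ggml was built with GGML_OPENMP enabled.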
+ if cfg!(feature = "openmp") && target_triple.contains("gnu") {
+ println!("cargo:rustc-link-lib=gomp");
+ }
+
+ match target_os {
+ TargetOs::Windows(WindowsVariant::Msvc) => {
+ println!("cargo:rustc-link-lib=advapi32");
+ if cfg!(debug_assertions) {
+ println!("cargo:rustc-link-lib=dylib=msvcrtd");
+ }
+ }
+ TargetOs::Linux => {
+ println!("cargo:rustc-link-lib=dylib=stdc++");
+ }
+ TargetOs::Apple(variant) => {
+ println!("cargo:rustc-link-lib=framework=Foundation");
+ println!("cargo:rustc-link-lib=framework=Metal");
+ println!("cargo:rustc-link-lib=framework=MetalKit");
+ println!("cargo:rustc-link-lib=framework=Accelerate");
+ println!("cargo:rustc-link-lib=c++");
+
+ match variant {
+ AppleVariant::MacOS => {
+ // On (older) OSX we need to link against the clang runtime,
+ // which is hidden in some non-default path.
+ //
+ // More details at https://github.com/alexcrichton/curl-rust/issues/279.
+ if let Some(path) = macos_link_search_path() {
+ println!("cargo:rustc-link-lib=clang_rt.osx");
+ println!("cargo:rustc-link-search={}", path);
+ }
+ }
+ AppleVariant::Other => (),
+ }
+ }
+ _ => (),
+ }
+
+ // copy DLLs to target
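+    // Hard links are used instead of copies to avoid duplicating large shared libraries;
+    // existing destination files are left untouched.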
+ if build_shared_libs {
+ let libs_assets = extract_lib_assets(&out_dir);
+ for asset in libs_assets {
+ let asset_clone = asset.clone();
+ let filename = asset_clone.file_name().unwrap();
+ let filename = filename.to_str().unwrap();
+ let dst = target_dir.join(filename);
+ debug_log!("HARD LINK {} TO {}", asset.display(), dst.display());
+ if !dst.exists() {
+ std::fs::hard_link(asset.clone(), dst).unwrap();
+ }
+
+ // Copy DLLs to examples as well
+ if target_dir.join("examples").exists() {
+ let dst = target_dir.join("examples").join(filename);
+ debug_log!("HARD LINK {} TO {}", asset.display(), dst.display());
+ if !dst.exists() {
+ std::fs::hard_link(asset.clone(), dst).unwrap();
+ }
+ }
+
+ // Copy DLLs to target/profile/deps as well for tests
+ let dst = target_dir.join("deps").join(filename);
+ debug_log!("HARD LINK {} TO {}", asset.display(), dst.display());
+ if !dst.exists() {
+ std::fs::hard_link(asset.clone(), dst).unwrap();
+ }
+ }
+ }
+}
diff --git a/patches/llama-cpp-sys-2/llama.cpp/CMakeLists.txt b/patches/llama-cpp-sys-2/llama.cpp/CMakeLists.txt
new file mode 100644
index 0000000..44c2166
--- /dev/null
+++ b/patches/llama-cpp-sys-2/llama.cpp/CMakeLists.txt
@@ -0,0 +1,309 @@
+cmake_minimum_required(VERSION 3.14) # for add_link_options and implicit target directories.
+project("llama.cpp" C CXX)
+include(CheckIncludeFileCXX)
+
+#set(CMAKE_WARN_DEPRECATED YES)
+set(CMAKE_WARN_UNUSED_CLI YES)
+
+set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
+
+if (NOT XCODE AND NOT MSVC AND NOT CMAKE_BUILD_TYPE)
+ set(CMAKE_BUILD_TYPE Release CACHE STRING "Build type" FORCE)
+ set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" "MinSizeRel" "RelWithDebInfo")
+endif()
+
+message("CMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}")
+
+# Add path to modules
+list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/")
+
+set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
+set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
+
+if (CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR)
+ set(LLAMA_STANDALONE ON)
+
+ include(git-vars)
+
+ # configure project version
+ # TODO
+else()
+ set(LLAMA_STANDALONE OFF)
+endif()
+
+option(LLAMA_USE_SYSTEM_GGML "Use system libggml" OFF)
+
+option(LLAMA_WASM_MEM64 "llama: use 64-bit memory in WASM builds" ON)
+
+if (EMSCRIPTEN)
+ set(BUILD_SHARED_LIBS_DEFAULT OFF)
+
+ # Use 64-bit memory to support backend_get_memory queries
+ # TODO: analyze performance impact, see https://spidermonkey.dev/blog/2025/01/15/is-memory64-actually-worth-using
+ if (LLAMA_WASM_MEM64)
+ add_compile_options("-sMEMORY64=1")
+ add_link_options("-sMEMORY64=1")
+ endif()
+ add_link_options("-sALLOW_MEMORY_GROWTH=1")
+
+ option(LLAMA_WASM_SINGLE_FILE "llama: embed WASM inside the generated llama.js" OFF)
+ option(LLAMA_BUILD_HTML "llama: build HTML file" ON)
+ if (LLAMA_BUILD_HTML)
+ set(CMAKE_EXECUTABLE_SUFFIX ".html")
+ endif()
+else()
+ if (MINGW)
+ set(BUILD_SHARED_LIBS_DEFAULT OFF)
+ else()
+ set(BUILD_SHARED_LIBS_DEFAULT ON)
+ endif()
+endif()
+
+option(BUILD_SHARED_LIBS "build shared libraries" ${BUILD_SHARED_LIBS_DEFAULT})
+
+if (WIN32)
+ add_compile_definitions(_CRT_SECURE_NO_WARNINGS)
+endif()
+
+if (MSVC)
+ add_compile_options("$<$:/utf-8>")
+ add_compile_options("$<$:/utf-8>")
+ add_compile_options("$<$:/bigobj>")
+ add_compile_options("$<$:/bigobj>")
+endif()
+
+if (LLAMA_STANDALONE)
+ # enable parallel builds for msbuild
+ list(APPEND CMAKE_VS_GLOBALS UseMultiToolTask=true)
+ list(APPEND CMAKE_VS_GLOBALS EnforceProcessCountAcrossBuilds=true)
+endif()
+
+if (CMAKE_SYSTEM_NAME STREQUAL "iOS")
+ set(LLAMA_TOOLS_INSTALL_DEFAULT OFF)
+else()
+ set(LLAMA_TOOLS_INSTALL_DEFAULT ${LLAMA_STANDALONE})
+endif()
+
+#
+# option list
+#
+
+# debug
+option(LLAMA_ALL_WARNINGS "llama: enable all compiler warnings" ON)
+option(LLAMA_ALL_WARNINGS_3RD_PARTY "llama: enable all compiler warnings in 3rd party libs" OFF)
+
+# build
+option(LLAMA_FATAL_WARNINGS "llama: enable -Werror flag" OFF)
+
+# sanitizers
+option(LLAMA_SANITIZE_THREAD "llama: enable thread sanitizer" OFF)
+option(LLAMA_SANITIZE_ADDRESS "llama: enable address sanitizer" OFF)
+option(LLAMA_SANITIZE_UNDEFINED "llama: enable undefined sanitizer" OFF)
+
+# utils
+option(LLAMA_BUILD_COMMON "llama: build common utils library" ${LLAMA_STANDALONE})
+
+# extra artifacts
+option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE})
+option(LLAMA_BUILD_TOOLS "llama: build tools" ${LLAMA_STANDALONE})
+option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE})
+option(LLAMA_BUILD_SERVER "llama: build server example" ${LLAMA_STANDALONE})
+option(LLAMA_TOOLS_INSTALL "llama: install tools" ${LLAMA_TOOLS_INSTALL_DEFAULT})
+
+# 3rd party libs
+option(LLAMA_CURL "llama: use libcurl to download model from a URL" ON)
+option(LLAMA_HTTPLIB "llama: if libcurl is disabled, use httplib to download model from a URL" ON)
+option(LLAMA_OPENSSL "llama: use openssl to support HTTPS" OFF)
+option(LLAMA_LLGUIDANCE "llama-common: include LLGuidance library for structured output in common utils" OFF)
+
+# Required for relocatable CMake package
+include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/build-info.cmake)
+include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/common.cmake)
+
+if (NOT DEFINED LLAMA_BUILD_NUMBER)
+ set(LLAMA_BUILD_NUMBER ${BUILD_NUMBER})
+endif()
+if (NOT DEFINED LLAMA_BUILD_COMMIT)
+ set(LLAMA_BUILD_COMMIT ${BUILD_COMMIT})
+endif()
+set(LLAMA_INSTALL_VERSION 0.0.${LLAMA_BUILD_NUMBER})
+
+# override ggml options
+set(GGML_ALL_WARNINGS ${LLAMA_ALL_WARNINGS})
+set(GGML_FATAL_WARNINGS ${LLAMA_FATAL_WARNINGS})
+
+# change the default for these ggml options
+if (NOT DEFINED GGML_LLAMAFILE)
+ set(GGML_LLAMAFILE_DEFAULT ON)
+endif()
+
+if (NOT DEFINED GGML_CUDA_GRAPHS)
+ set(GGML_CUDA_GRAPHS_DEFAULT ON)
+endif()
+
+# transition helpers
+function (llama_option_depr TYPE OLD NEW)
+ if (${OLD})
+ message(${TYPE} "${OLD} is deprecated and will be removed in the future.\nUse ${NEW} instead\n")
+ set(${NEW} ON PARENT_SCOPE)
+ endif()
+endfunction()
+
+llama_option_depr(FATAL_ERROR LLAMA_CUBLAS GGML_CUDA)
+llama_option_depr(WARNING LLAMA_CUDA GGML_CUDA)
+llama_option_depr(WARNING LLAMA_METAL GGML_METAL)
+llama_option_depr(WARNING LLAMA_METAL_EMBED_LIBRARY GGML_METAL_EMBED_LIBRARY)
+llama_option_depr(WARNING LLAMA_NATIVE GGML_NATIVE)
+llama_option_depr(WARNING LLAMA_RPC GGML_RPC)
+llama_option_depr(WARNING LLAMA_SYCL GGML_SYCL)
+llama_option_depr(WARNING LLAMA_SYCL_F16 GGML_SYCL_F16)
+llama_option_depr(WARNING LLAMA_CANN GGML_CANN)
+
+if (NOT MSVC)
+ if (LLAMA_SANITIZE_THREAD)
+ message(STATUS "Using -fsanitize=thread")
+
+ add_compile_options(-fsanitize=thread)
+ link_libraries (-fsanitize=thread)
+ endif()
+
+ if (LLAMA_SANITIZE_ADDRESS)
+ message(STATUS "Using -fsanitize=address")
+
+ add_compile_options(-fsanitize=address -fno-omit-frame-pointer)
+ link_libraries (-fsanitize=address)
+ endif()
+
+ if (LLAMA_SANITIZE_UNDEFINED)
+ message(STATUS "Using -fsanitize=undefined")
+
+ add_compile_options(-fsanitize=undefined)
+ link_libraries (-fsanitize=undefined)
+ endif()
+endif()
+
+include("cmake/license.cmake")
+license_add_file("llama.cpp" "LICENSE")
+
+#
+# 3rd-party
+#
+
+if (LLAMA_USE_SYSTEM_GGML)
+ message(STATUS "Using system-provided libggml, skipping ggml build")
+ find_package(ggml REQUIRED)
+ add_library(ggml ALIAS ggml::ggml)
+endif()
+
+if (NOT TARGET ggml AND NOT LLAMA_USE_SYSTEM_GGML)
+ set(GGML_BUILD_NUMBER ${LLAMA_BUILD_NUMBER})
+ set(GGML_BUILD_COMMIT ${LLAMA_BUILD_COMMIT})
+ add_subdirectory(ggml)
+ # ... otherwise assume ggml is added by a parent CMakeLists.txt
+endif()
+
+#
+# build the library
+#
+
+add_subdirectory(src)
+
+#
+# utils, programs, examples and tests
+#
+
+if (NOT LLAMA_BUILD_COMMON)
+ message(STATUS "LLAMA_BUILD_COMMON is OFF, disabling LLAMA_CURL")
+ set(LLAMA_CURL OFF)
+endif()
+
+if (LLAMA_BUILD_COMMON)
+ add_subdirectory(common)
+ if (LLAMA_HTTPLIB)
+ add_subdirectory(vendor/cpp-httplib)
+ endif()
+endif()
+
+if (LLAMA_BUILD_COMMON AND LLAMA_BUILD_TESTS AND NOT CMAKE_JS_VERSION)
+ include(CTest)
+ add_subdirectory(tests)
+endif()
+
+if (LLAMA_BUILD_COMMON AND LLAMA_BUILD_EXAMPLES)
+ add_subdirectory(examples)
+ add_subdirectory(pocs)
+endif()
+
+if (LLAMA_BUILD_COMMON AND LLAMA_BUILD_TOOLS)
+ add_subdirectory(tools)
+endif()
+
+# Automatically add all files from the 'licenses' directory
+file(GLOB EXTRA_LICENSES "${CMAKE_SOURCE_DIR}/licenses/LICENSE-*")
+
+foreach(FILE_PATH ${EXTRA_LICENSES})
+ get_filename_component(FILE_NAME "${FILE_PATH}" NAME)
+ string(REGEX REPLACE "^LICENSE-" "" NAME "${FILE_NAME}")
+ license_add_file("${NAME}" "${FILE_PATH}")
+endforeach()
+
+if (LLAMA_BUILD_COMMON)
+ license_generate(common)
+endif()
+
+#
+# install
+#
+
+include(GNUInstallDirs)
+include(CMakePackageConfigHelpers)
+
+set(LLAMA_INCLUDE_INSTALL_DIR ${CMAKE_INSTALL_INCLUDEDIR} CACHE PATH "Location of header files")
+set(LLAMA_LIB_INSTALL_DIR ${CMAKE_INSTALL_LIBDIR} CACHE PATH "Location of library files")
+set(LLAMA_BIN_INSTALL_DIR ${CMAKE_INSTALL_BINDIR} CACHE PATH "Location of binary files")
+
+set(LLAMA_PUBLIC_HEADERS
+ ${CMAKE_CURRENT_SOURCE_DIR}/include/llama.h
+ ${CMAKE_CURRENT_SOURCE_DIR}/include/llama-cpp.h)
+
+set_target_properties(llama
+ PROPERTIES
+ PUBLIC_HEADER "${LLAMA_PUBLIC_HEADERS}")
+
+install(TARGETS llama LIBRARY PUBLIC_HEADER)
+
+configure_package_config_file(
+ ${CMAKE_CURRENT_SOURCE_DIR}/cmake/llama-config.cmake.in
+ ${CMAKE_CURRENT_BINARY_DIR}/llama-config.cmake
+ INSTALL_DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/llama
+ PATH_VARS LLAMA_INCLUDE_INSTALL_DIR
+ LLAMA_LIB_INSTALL_DIR
+ LLAMA_BIN_INSTALL_DIR )
+
+write_basic_package_version_file(
+ ${CMAKE_CURRENT_BINARY_DIR}/llama-version.cmake
+ VERSION ${LLAMA_INSTALL_VERSION}
+ COMPATIBILITY SameMajorVersion)
+
+install(FILES ${CMAKE_CURRENT_BINARY_DIR}/llama-config.cmake
+ ${CMAKE_CURRENT_BINARY_DIR}/llama-version.cmake
+ DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/llama)
+
+install(
+ FILES convert_hf_to_gguf.py
+ PERMISSIONS
+ OWNER_READ
+ OWNER_WRITE
+ OWNER_EXECUTE
+ GROUP_READ
+ GROUP_EXECUTE
+ WORLD_READ
+ WORLD_EXECUTE
+ DESTINATION ${CMAKE_INSTALL_BINDIR})
+
+configure_file(cmake/llama.pc.in
+ "${CMAKE_CURRENT_BINARY_DIR}/llama.pc"
+ @ONLY)
+
+install(FILES "${CMAKE_CURRENT_BINARY_DIR}/llama.pc"
+ DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig)
diff --git a/patches/llama-cpp-sys-2/llama.cpp/cmake/arm64-apple-clang.cmake b/patches/llama-cpp-sys-2/llama.cpp/cmake/arm64-apple-clang.cmake
new file mode 100644
index 0000000..5fcd288
--- /dev/null
+++ b/patches/llama-cpp-sys-2/llama.cpp/cmake/arm64-apple-clang.cmake
@@ -0,0 +1,16 @@
+set( CMAKE_SYSTEM_NAME Darwin )
+set( CMAKE_SYSTEM_PROCESSOR arm64 )
+
+set( target arm64-apple-darwin-macho )
+
+set( CMAKE_C_COMPILER clang )
+set( CMAKE_CXX_COMPILER clang++ )
+
+set( CMAKE_C_COMPILER_TARGET ${target} )
+set( CMAKE_CXX_COMPILER_TARGET ${target} )
+
+set( arch_c_flags "-march=armv8.4-a -fvectorize -ffp-model=fast -fno-finite-math-only" )
+set( warn_c_flags "-Wno-format -Wno-unused-variable -Wno-unused-function" )
+
+set( CMAKE_C_FLAGS_INIT "${arch_c_flags} ${warn_c_flags}" )
+set( CMAKE_CXX_FLAGS_INIT "${arch_c_flags} ${warn_c_flags}" )
diff --git a/patches/llama-cpp-sys-2/llama.cpp/cmake/arm64-windows-llvm.cmake b/patches/llama-cpp-sys-2/llama.cpp/cmake/arm64-windows-llvm.cmake
new file mode 100644
index 0000000..8023796
--- /dev/null
+++ b/patches/llama-cpp-sys-2/llama.cpp/cmake/arm64-windows-llvm.cmake
@@ -0,0 +1,16 @@
+set( CMAKE_SYSTEM_NAME Windows )
+set( CMAKE_SYSTEM_PROCESSOR arm64 )
+
+set( target arm64-pc-windows-msvc )
+
+set( CMAKE_C_COMPILER clang )
+set( CMAKE_CXX_COMPILER clang++ )
+
+set( CMAKE_C_COMPILER_TARGET ${target} )
+set( CMAKE_CXX_COMPILER_TARGET ${target} )
+
+set( arch_c_flags "-march=armv8.7-a -fvectorize -ffp-model=fast -fno-finite-math-only" )
+set( warn_c_flags "-Wno-format -Wno-unused-variable -Wno-unused-function -Wno-gnu-zero-variadic-macro-arguments" )
+
+set( CMAKE_C_FLAGS_INIT "${arch_c_flags} ${warn_c_flags}" )
+set( CMAKE_CXX_FLAGS_INIT "${arch_c_flags} ${warn_c_flags}" )
diff --git a/patches/llama-cpp-sys-2/llama.cpp/cmake/build-info.cmake b/patches/llama-cpp-sys-2/llama.cpp/cmake/build-info.cmake
new file mode 100644
index 0000000..c700595
--- /dev/null
+++ b/patches/llama-cpp-sys-2/llama.cpp/cmake/build-info.cmake
@@ -0,0 +1,48 @@
+set(BUILD_NUMBER 0)
+set(BUILD_COMMIT "unknown")
+set(BUILD_COMPILER "unknown")
+set(BUILD_TARGET "unknown")
+
+# Look for git
+find_package(Git)
+if(NOT Git_FOUND)
+ find_program(GIT_EXECUTABLE NAMES git git.exe)
+ if(GIT_EXECUTABLE)
+ set(Git_FOUND TRUE)
+ message(STATUS "Found Git: ${GIT_EXECUTABLE}")
+ else()
+ message(WARNING "Git not found. Build info will not be accurate.")
+ endif()
+endif()
+
+# Get the commit count and hash
+if(Git_FOUND)
+ execute_process(
+ COMMAND ${GIT_EXECUTABLE} rev-parse --short HEAD
+ WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
+ OUTPUT_VARIABLE HEAD
+ OUTPUT_STRIP_TRAILING_WHITESPACE
+ RESULT_VARIABLE RES
+ )
+ if (RES EQUAL 0)
+ set(BUILD_COMMIT ${HEAD})
+ endif()
+ execute_process(
+ COMMAND ${GIT_EXECUTABLE} rev-list --count HEAD
+ WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
+ OUTPUT_VARIABLE COUNT
+ OUTPUT_STRIP_TRAILING_WHITESPACE
+ RESULT_VARIABLE RES
+ )
+ if (RES EQUAL 0)
+ set(BUILD_NUMBER ${COUNT})
+ endif()
+endif()
+
+set(BUILD_COMPILER "${CMAKE_C_COMPILER_ID} ${CMAKE_C_COMPILER_VERSION}")
+
+if(CMAKE_VS_PLATFORM_NAME)
+ set(BUILD_TARGET ${CMAKE_VS_PLATFORM_NAME})
+else()
+ set(BUILD_TARGET "${CMAKE_SYSTEM_NAME} ${CMAKE_SYSTEM_PROCESSOR}")
+endif()
diff --git a/patches/llama-cpp-sys-2/llama.cpp/cmake/common.cmake b/patches/llama-cpp-sys-2/llama.cpp/cmake/common.cmake
new file mode 100644
index 0000000..a5bb787
--- /dev/null
+++ b/patches/llama-cpp-sys-2/llama.cpp/cmake/common.cmake
@@ -0,0 +1,35 @@
+include("ggml/cmake/common.cmake")
+
+function(llama_add_compile_flags)
+ if (LLAMA_FATAL_WARNINGS)
+ if (CMAKE_CXX_COMPILER_ID MATCHES "GNU" OR CMAKE_CXX_COMPILER_ID MATCHES "Clang")
+ list(APPEND C_FLAGS -Werror)
+ list(APPEND CXX_FLAGS -Werror)
+ elseif (CMAKE_CXX_COMPILER_ID STREQUAL "MSVC")
+ add_compile_options(/WX)
+ endif()
+ endif()
+
+ if (LLAMA_ALL_WARNINGS)
+ if (NOT MSVC)
+ list(APPEND C_FLAGS -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes
+ -Werror=implicit-int -Werror=implicit-function-declaration)
+
+ list(APPEND CXX_FLAGS -Wmissing-declarations -Wmissing-noreturn)
+
+ list(APPEND WARNING_FLAGS -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function)
+
+ list(APPEND C_FLAGS ${WARNING_FLAGS})
+ list(APPEND CXX_FLAGS ${WARNING_FLAGS})
+
+ ggml_get_flags(${CMAKE_CXX_COMPILER_ID} ${CMAKE_CXX_COMPILER_VERSION})
+
+ add_compile_options("$<$:${C_FLAGS};${GF_C_FLAGS}>"
+ "$<$:${CXX_FLAGS};${GF_CXX_FLAGS}>")
+ else()
+ # todo : msvc
+ set(C_FLAGS "" PARENT_SCOPE)
+ set(CXX_FLAGS "" PARENT_SCOPE)
+ endif()
+ endif()
+endfunction()
diff --git a/patches/llama-cpp-sys-2/llama.cpp/cmake/git-vars.cmake b/patches/llama-cpp-sys-2/llama.cpp/cmake/git-vars.cmake
new file mode 100644
index 0000000..1a4c24e
--- /dev/null
+++ b/patches/llama-cpp-sys-2/llama.cpp/cmake/git-vars.cmake
@@ -0,0 +1,22 @@
+find_package(Git)
+
+# the commit's SHA1
+execute_process(COMMAND
+ "${GIT_EXECUTABLE}" describe --match=NeVeRmAtCh --always --abbrev=8
+ WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}"
+ OUTPUT_VARIABLE GIT_SHA1
+ ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
+
+# the date of the commit
+execute_process(COMMAND
+ "${GIT_EXECUTABLE}" log -1 --format=%ad --date=local
+ WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}"
+ OUTPUT_VARIABLE GIT_DATE
+ ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
+
+# the subject of the commit
+execute_process(COMMAND
+ "${GIT_EXECUTABLE}" log -1 --format=%s
+ WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}"
+ OUTPUT_VARIABLE GIT_COMMIT_SUBJECT
+ ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
diff --git a/patches/llama-cpp-sys-2/llama.cpp/cmake/license.cmake b/patches/llama-cpp-sys-2/llama.cpp/cmake/license.cmake
new file mode 100644
index 0000000..de06660
--- /dev/null
+++ b/patches/llama-cpp-sys-2/llama.cpp/cmake/license.cmake
@@ -0,0 +1,40 @@
+define_property(GLOBAL PROPERTY LICENSE_TEXT
+ BRIEF_DOCS "Embedded licenses"
+ FULL_DOCS "Global string containing all aggregated licenses"
+)
+
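+# license_add_file() appends each license as a C++ raw string literal (the custom "=L="
+# delimiter keeps arbitrary license text from terminating the literal early) to the global
+# LICENSE_TEXT property; license_generate() later writes the accumulated entries into a
+# LICENSES[] array in license.cpp.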
+function(license_add_file NAME FILE)
+ if(NOT IS_ABSOLUTE "${FILE}")
+ set(FILE "${CMAKE_CURRENT_SOURCE_DIR}/${FILE}")
+ endif()
+ if(EXISTS "${FILE}")
+ set(TITLE "License for ${NAME}")
+ string(REGEX REPLACE "." "=" UNDERLINE "${TITLE}")
+ file(READ "${FILE}" TEXT)
+ get_property(TMP GLOBAL PROPERTY LICENSE_TEXT)
+ string(APPEND TMP "R\"=L=(${TITLE}\n${UNDERLINE}\n\n${TEXT})=L=\",\n")
+ set_property(GLOBAL PROPERTY LICENSE_TEXT "${TMP}")
+ else()
+ message(WARNING "License file '${FILE}' not found")
+ endif()
+endfunction()
+
+function(license_generate TARGET_NAME)
+ message(STATUS "Generating embedded license file for target: ${TARGET_NAME}")
+ get_property(TEXT GLOBAL PROPERTY LICENSE_TEXT)
+
+ set(CPP_CONTENT "// Generated by CMake\n\n")
+ string(APPEND CPP_CONTENT "const char* LICENSES[] = {\n")
+ string(APPEND CPP_CONTENT "${TEXT}")
+ string(APPEND CPP_CONTENT "nullptr\n")
+ string(APPEND CPP_CONTENT "};\n")
+
+ set(CPP_FILE "${CMAKE_BINARY_DIR}/license.cpp")
+ file(WRITE "${CPP_FILE}" "${CPP_CONTENT}")
+
+ if(TARGET ${TARGET_NAME})
+ target_sources(${TARGET_NAME} PRIVATE "${CPP_FILE}")
+ else()
+ message(FATAL_ERROR "Target '${TARGET_NAME}' does not exist")
+ endif()
+endfunction()
diff --git a/patches/llama-cpp-sys-2/llama.cpp/cmake/llama-config.cmake.in b/patches/llama-cpp-sys-2/llama.cpp/cmake/llama-config.cmake.in
new file mode 100644
index 0000000..90cbec5
--- /dev/null
+++ b/patches/llama-cpp-sys-2/llama.cpp/cmake/llama-config.cmake.in
@@ -0,0 +1,30 @@
+set(LLAMA_VERSION @LLAMA_INSTALL_VERSION@)
+set(LLAMA_BUILD_COMMIT @LLAMA_BUILD_COMMIT@)
+set(LLAMA_BUILD_NUMBER @LLAMA_BUILD_NUMBER@)
+set(LLAMA_SHARED_LIB @BUILD_SHARED_LIBS@)
+
+@PACKAGE_INIT@
+
+set_and_check(LLAMA_INCLUDE_DIR "@PACKAGE_LLAMA_INCLUDE_INSTALL_DIR@")
+set_and_check(LLAMA_LIB_DIR "@PACKAGE_LLAMA_LIB_INSTALL_DIR@")
+set_and_check(LLAMA_BIN_DIR "@PACKAGE_LLAMA_BIN_INSTALL_DIR@")
+
+find_package(ggml REQUIRED HINTS ${LLAMA_LIB_DIR}/cmake)
+
+find_library(llama_LIBRARY llama
+ REQUIRED
+ HINTS ${LLAMA_LIB_DIR}
+ NO_CMAKE_FIND_ROOT_PATH
+)
+
+add_library(llama UNKNOWN IMPORTED)
+set_target_properties(llama
+ PROPERTIES
+ INTERFACE_INCLUDE_DIRECTORIES "${LLAMA_INCLUDE_DIR}"
+ INTERFACE_LINK_LIBRARIES "ggml::ggml;ggml::ggml-base;"
+ IMPORTED_LINK_INTERFACE_LANGUAGES "CXX"
+ IMPORTED_LOCATION "${llama_LIBRARY}"
+ INTERFACE_COMPILE_FEATURES c_std_90
+ POSITION_INDEPENDENT_CODE ON)
+
+check_required_components(Llama)
diff --git a/patches/llama-cpp-sys-2/llama.cpp/cmake/llama.pc.in b/patches/llama-cpp-sys-2/llama.cpp/cmake/llama.pc.in
new file mode 100644
index 0000000..6fb58b5
--- /dev/null
+++ b/patches/llama-cpp-sys-2/llama.cpp/cmake/llama.pc.in
@@ -0,0 +1,10 @@
+prefix=@CMAKE_INSTALL_PREFIX@
+exec_prefix=@CMAKE_INSTALL_PREFIX@
+libdir=@CMAKE_INSTALL_FULL_LIBDIR@
+includedir=@CMAKE_INSTALL_FULL_INCLUDEDIR@
+
+Name: llama
+Description: Port of Facebook's LLaMA model in C/C++
+Version: @LLAMA_INSTALL_VERSION@
+Libs: -L${libdir} -lggml -lggml-base -lllama
+Cflags: -I${includedir}
diff --git a/patches/llama-cpp-sys-2/llama.cpp/cmake/riscv64-spacemit-linux-gnu-gcc.cmake b/patches/llama-cpp-sys-2/llama.cpp/cmake/riscv64-spacemit-linux-gnu-gcc.cmake
new file mode 100644
index 0000000..08fdbf5
--- /dev/null
+++ b/patches/llama-cpp-sys-2/llama.cpp/cmake/riscv64-spacemit-linux-gnu-gcc.cmake
@@ -0,0 +1,29 @@
+set(CMAKE_SYSTEM_NAME Linux)
+set(CMAKE_SYSTEM_PROCESSOR riscv64)
+set(CMAKE_SYSTEM_VERSION 1)
+
+if (CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "^(riscv)")
+ message(STATUS "HOST SYSTEM ${CMAKE_HOST_SYSTEM_PROCESSOR}")
+else()
+ set(GNU_MACHINE riscv64-unknown-linux-gnu CACHE STRING "GNU compiler triple")
+ if (DEFINED ENV{RISCV_ROOT_PATH})
+ file(TO_CMAKE_PATH $ENV{RISCV_ROOT_PATH} RISCV_ROOT_PATH)
+ else()
+ message(FATAL_ERROR "RISCV_ROOT_PATH env must be defined")
+ endif()
+
+ set(RISCV_ROOT_PATH ${RISCV_ROOT_PATH} CACHE STRING "root path to riscv toolchain")
+ set(CMAKE_C_COMPILER ${RISCV_ROOT_PATH}/bin/riscv64-unknown-linux-gnu-gcc)
+ set(CMAKE_CXX_COMPILER ${RISCV_ROOT_PATH}/bin/riscv64-unknown-linux-gnu-g++)
+ set(CMAKE_STRIP ${RISCV_ROOT_PATH}/bin/riscv64-unknown-linux-gnu-strip)
+ set(CMAKE_FIND_ROOT_PATH "${RISCV_ROOT_PATH}/riscv64-unknown-linux-gnu")
+ set(CMAKE_SYSROOT "${RISCV_ROOT_PATH}/sysroot")
+endif()
+
+set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)
+set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
+set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)
+set(CMAKE_FIND_ROOT_PATH_MODE_PACKAGE ONLY)
+set(CMAKE_C_FLAGS "-march=rv64gcv_zfh_zba_zicbop -mabi=lp64d ${CMAKE_C_FLAGS}")
+set(CMAKE_CXX_FLAGS "-march=rv64gcv_zfh_zba_zicbop -mabi=lp64d ${CMAKE_CXX_FLAGS}")
+set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -latomic")
diff --git a/patches/llama-cpp-sys-2/llama.cpp/cmake/x64-windows-llvm.cmake b/patches/llama-cpp-sys-2/llama.cpp/cmake/x64-windows-llvm.cmake
new file mode 100644
index 0000000..77e7914
--- /dev/null
+++ b/patches/llama-cpp-sys-2/llama.cpp/cmake/x64-windows-llvm.cmake
@@ -0,0 +1,5 @@
+set( CMAKE_SYSTEM_NAME Windows )
+set( CMAKE_SYSTEM_PROCESSOR x86_64 )
+
+set( CMAKE_C_COMPILER clang )
+set( CMAKE_CXX_COMPILER clang++ )
diff --git a/patches/llama-cpp-sys-2/llama.cpp/common/CMakeLists.txt b/patches/llama-cpp-sys-2/llama.cpp/common/CMakeLists.txt
new file mode 100644
index 0000000..55222bd
--- /dev/null
+++ b/patches/llama-cpp-sys-2/llama.cpp/common/CMakeLists.txt
@@ -0,0 +1,157 @@
+# common
+
+find_package(Threads REQUIRED)
+
+llama_add_compile_flags()
+
+# Build info header
+#
+
+if(EXISTS "${PROJECT_SOURCE_DIR}/.git")
+ set(GIT_DIR "${PROJECT_SOURCE_DIR}/.git")
+
+ # Is git submodule
+ if(NOT IS_DIRECTORY "${GIT_DIR}")
+ file(READ ${GIT_DIR} REAL_GIT_DIR_LINK)
+ string(REGEX REPLACE "gitdir: (.*)\n$" "\\1" REAL_GIT_DIR ${REAL_GIT_DIR_LINK})
+ string(FIND "${REAL_GIT_DIR}" "/" SLASH_POS)
+ if (SLASH_POS EQUAL 0)
+ set(GIT_DIR "${REAL_GIT_DIR}")
+ else()
+ set(GIT_DIR "${PROJECT_SOURCE_DIR}/${REAL_GIT_DIR}")
+ endif()
+ endif()
+
+ if(EXISTS "${GIT_DIR}/index")
+ # For build-info.cpp below
+ set_property(DIRECTORY APPEND PROPERTY CMAKE_CONFIGURE_DEPENDS "${GIT_DIR}/index")
+ else()
+ message(WARNING "Git index not found in git repository.")
+ endif()
+else()
+ message(WARNING "Git repository not found; to enable automatic generation of build info, make sure Git is installed and the project is a Git repository.")
+endif()
+
+set(TEMPLATE_FILE "${CMAKE_CURRENT_SOURCE_DIR}/build-info.cpp.in")
+set(OUTPUT_FILE "${CMAKE_CURRENT_BINARY_DIR}/build-info.cpp")
+configure_file(${TEMPLATE_FILE} ${OUTPUT_FILE})
+
+set(TARGET build_info)
+add_library(${TARGET} OBJECT ${OUTPUT_FILE})
+if (BUILD_SHARED_LIBS)
+ set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
+endif()
+
+set(TARGET common)
+
+add_library(${TARGET} STATIC
+ arg.cpp
+ arg.h
+ base64.hpp
+ chat-parser.cpp
+ chat-parser.h
+ chat-parser-xml-toolcall.h
+ chat-parser-xml-toolcall.cpp
+ chat-peg-parser.cpp
+ chat-peg-parser.h
+ chat.cpp
+ chat.h
+ common.cpp
+ common.h
+ console.cpp
+ console.h
+ download.cpp
+ download.h
+ http.h
+ json-partial.cpp
+ json-partial.h
+ json-schema-to-grammar.cpp
+ llguidance.cpp
+ log.cpp
+ log.h
+ ngram-cache.cpp
+ ngram-cache.h
+ peg-parser.cpp
+ peg-parser.h
+ preset.cpp
+ preset.h
+ regex-partial.cpp
+ regex-partial.h
+ sampling.cpp
+ sampling.h
+ speculative.cpp
+ speculative.h
+ unicode.cpp
+ unicode.h
+ )
+
+target_include_directories(${TARGET} PUBLIC . ../vendor)
+target_compile_features (${TARGET} PUBLIC cxx_std_17)
+
+if (BUILD_SHARED_LIBS)
+ set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
+endif()
+
+# TODO: use list(APPEND LLAMA_COMMON_EXTRA_LIBS ...)
+set(LLAMA_COMMON_EXTRA_LIBS build_info)
+
+if (LLAMA_CURL)
+ # Use curl to download model url
+ find_package(CURL)
+ if (NOT CURL_FOUND)
+ message(FATAL_ERROR "Could NOT find CURL. Hint: to disable this feature, set -DLLAMA_CURL=OFF")
+ endif()
+ target_compile_definitions(${TARGET} PUBLIC LLAMA_USE_CURL)
+ include_directories(${CURL_INCLUDE_DIRS})
+ set(LLAMA_COMMON_EXTRA_LIBS ${LLAMA_COMMON_EXTRA_LIBS} ${CURL_LIBRARIES})
+elseif (LLAMA_HTTPLIB)
+ # otherwise, use cpp-httplib
+ target_compile_definitions(${TARGET} PUBLIC LLAMA_USE_HTTPLIB)
+ set(LLAMA_COMMON_EXTRA_LIBS ${LLAMA_COMMON_EXTRA_LIBS} cpp-httplib)
+endif()
+
+if (LLAMA_LLGUIDANCE)
+ include(ExternalProject)
+ set(LLGUIDANCE_SRC ${CMAKE_BINARY_DIR}/llguidance/source)
+ set(LLGUIDANCE_PATH ${LLGUIDANCE_SRC}/target/release)
+
+ # Set the correct library file extension based on platform
+ if (WIN32)
+ set(LLGUIDANCE_LIB_NAME "llguidance.lib")
+ # Add Windows-specific libraries
+ set(LLGUIDANCE_PLATFORM_LIBS
+ ws2_32 # Windows Sockets API
+ userenv # For GetUserProfileDirectoryW
+ ntdll # For NT functions
+ bcrypt # For BCryptGenRandom
+ )
+ else()
+ set(LLGUIDANCE_LIB_NAME "libllguidance.a")
+ set(LLGUIDANCE_PLATFORM_LIBS "")
+ endif()
+
+ ExternalProject_Add(llguidance_ext
+ GIT_REPOSITORY https://github.com/guidance-ai/llguidance
+ # v1.0.1:
+ GIT_TAG d795912fedc7d393de740177ea9ea761e7905774
+ PREFIX ${CMAKE_BINARY_DIR}/llguidance
+ SOURCE_DIR ${LLGUIDANCE_SRC}
+ BUILD_IN_SOURCE TRUE
+ CONFIGURE_COMMAND ""
+ BUILD_COMMAND cargo build --release --package llguidance
+ INSTALL_COMMAND ""
+ BUILD_BYPRODUCTS ${LLGUIDANCE_PATH}/${LLGUIDANCE_LIB_NAME} ${LLGUIDANCE_PATH}/llguidance.h
+ UPDATE_COMMAND ""
+ )
+ target_compile_definitions(${TARGET} PUBLIC LLAMA_USE_LLGUIDANCE)
+
+ add_library(llguidance STATIC IMPORTED)
+ set_target_properties(llguidance PROPERTIES IMPORTED_LOCATION ${LLGUIDANCE_PATH}/${LLGUIDANCE_LIB_NAME})
+ add_dependencies(llguidance llguidance_ext)
+
+ target_include_directories(${TARGET} PRIVATE ${LLGUIDANCE_PATH})
+ # Add platform libraries to the main target
+ set(LLAMA_COMMON_EXTRA_LIBS ${LLAMA_COMMON_EXTRA_LIBS} llguidance ${LLGUIDANCE_PLATFORM_LIBS})
+endif ()
+
+target_link_libraries(${TARGET} PRIVATE ${LLAMA_COMMON_EXTRA_LIBS} PUBLIC llama Threads::Threads)
diff --git a/patches/llama-cpp-sys-2/llama.cpp/common/arg.cpp b/patches/llama-cpp-sys-2/llama.cpp/common/arg.cpp
new file mode 100644
index 0000000..ec0a2f0
--- /dev/null
+++ b/patches/llama-cpp-sys-2/llama.cpp/common/arg.cpp
@@ -0,0 +1,3716 @@
+#include "arg.h"
+
+#include "chat.h"
+#include "common.h"
+#include "download.h"
+#include "json-schema-to-grammar.h"
+#include "log.h"
+#include "sampling.h"
+#include "preset.h"
+
+// fix problem with std::min and std::max
+#if defined(_WIN32)
+#define WIN32_LEAN_AND_MEAN
+#ifndef NOMINMAX
+# define NOMINMAX
+#endif
+#include <windows.h>
+#endif
+
+#define JSON_ASSERT GGML_ASSERT
+#include <nlohmann/json.hpp>
+
+#include <algorithm>
+#include <cstdlib>
+#include <fstream>
+#include <list>
+#include <map>
+#include <set>
+#include <sstream>
+#include <string>
+#include <thread> // for hardware_concurrency
+#include <unordered_map>
+#include <vector>
+
+#ifndef __EMSCRIPTEN__
+#ifdef __linux__
+#include <linux/limits.h>
+#elif defined(_WIN32)
+# if !defined(PATH_MAX)
+# define PATH_MAX MAX_PATH
+# endif
+#elif defined(_AIX)
+#include <sys/limits.h>
+#else
+#include <sys/syslimits.h>
+#endif
+#endif
+
+#define LLAMA_MAX_URL_LENGTH 2084 // Maximum URL Length in Chrome: 2083
+
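+// NULL-terminated list of embedded license texts; the array definition is generated at
+// build time by cmake/license.cmake (see license_generate()).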
+extern const char * LICENSES[];
+
+using json = nlohmann::ordered_json;
+using namespace common_arg_utils;
+
+static std::initializer_list<enum llama_example> mmproj_examples = {
+ LLAMA_EXAMPLE_MTMD,
+ LLAMA_EXAMPLE_SERVER,
+ LLAMA_EXAMPLE_CLI,
+};
+
+static std::string read_file(const std::string & fname) {
+ std::ifstream file(fname);
+ if (!file) {
+ throw std::runtime_error(string_format("error: failed to open file '%s'\n", fname.c_str()));
+ }
+    std::string content((std::istreambuf_iterator<char>(file)), std::istreambuf_iterator<char>());
+ file.close();
+ return content;
+}
+
+static const std::vector<common_arg> & get_common_arg_defs() {
+    static const std::vector<common_arg> options = [] {
+ common_params params;
+ auto ctx = common_params_parser_init(params, LLAMA_EXAMPLE_SERVER, nullptr);
+ return ctx.options;
+ }();
+ return options;
+}
+
+common_arg & common_arg::set_examples(std::initializer_list<enum llama_example> examples) {
+ this->examples = examples;
+ return *this;
+}
+
+common_arg & common_arg::set_excludes(std::initializer_list<enum llama_example> excludes) {
+ this->excludes = excludes;
+ return *this;
+}
+
+common_arg & common_arg::set_env(const char * env) {
+ help = help + "\n(env: " + env + ")";
+ this->env = env;
+ return *this;
+}
+
+common_arg & common_arg::set_sparam() {
+ is_sparam = true;
+ return *this;
+}
+
+common_arg & common_arg::set_preset_only() {
+ is_preset_only = true;
+ return *this;
+}
+
+bool common_arg::in_example(enum llama_example ex) {
+ return examples.find(ex) != examples.end();
+}
+
+bool common_arg::is_exclude(enum llama_example ex) {
+ return excludes.find(ex) != excludes.end();
+}
+
+bool common_arg::get_value_from_env(std::string & output) const {
+ if (env == nullptr) return false;
+ if (!args_neg.empty()) {
+ // for compatibility, we need to check LLAMA_ARG_NO_ env as well
+ std::string neg_env = env;
+ string_replace_all(neg_env, "LLAMA_ARG_", "LLAMA_ARG_NO_");
+ char * neg_value = std::getenv(neg_env.c_str());
+ if (neg_value) {
+ output = "0"; // falsey
+ return true;
+ }
+ }
+ char * value = std::getenv(env);
+ if (value) {
+ output = value;
+ return true;
+ }
+ return false;
+}
+
+bool common_arg::has_value_from_env() const {
+ if (env != nullptr && !args_neg.empty()) {
+ // for compatibility, we need to check LLAMA_ARG_NO_ env as well
+ std::string neg_env = env;
+ string_replace_all(neg_env, "LLAMA_ARG_", "LLAMA_ARG_NO_");
+ if (std::getenv(neg_env.c_str())) {
+ return true;
+ }
+ }
+ return env != nullptr && std::getenv(env);
+}
+
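+// Wrap `input` to at most `max_char_per_line` characters per line, preserving existing
+// newlines and breaking overly long lines at word boundaries.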
+static std::vector<std::string> break_str_into_lines(std::string input, size_t max_char_per_line) {
+    std::vector<std::string> result;
+ std::istringstream iss(input);
+ std::string line;
+ auto add_line = [&](const std::string& l) {
+ if (l.length() <= max_char_per_line) {
+ result.push_back(l);
+ } else {
+ std::istringstream line_stream(l);
+ std::string word, current_line;
+ while (line_stream >> word) {
+ if (current_line.length() + !current_line.empty() + word.length() > max_char_per_line) {
+ if (!current_line.empty()) result.push_back(current_line);
+ current_line = word;
+ } else {
+ current_line += (!current_line.empty() ? " " : "") + word;
+ }
+ }
+ if (!current_line.empty()) result.push_back(current_line);
+ }
+ };
+ while (std::getline(iss, line)) {
+ add_line(line);
+ }
+ return result;
+}
+
+std::string common_arg::to_string() const {
+ // params for printing to console
+ const static int n_leading_spaces = 40;
+ const static int n_char_per_line_help = 70; // TODO: detect this based on current console
+ std::string leading_spaces(n_leading_spaces, ' ');
+
+ std::ostringstream ss;
+ auto all_args = get_args(); // also contains args_neg
+ for (const auto & arg : all_args) {
+ if (arg == all_args.front()) {
+ if (all_args.size() == 1) {
+ ss << arg;
+ } else {
+ // first arg is usually abbreviation, we need padding to make it more beautiful
+ auto tmp = std::string(arg) + ", ";
+ auto spaces = std::string(std::max(0, 7 - (int)tmp.size()), ' ');
+ ss << tmp << spaces;
+ }
+ } else {
+ ss << arg << (arg != all_args.back() ? ", " : "");
+ }
+ }
+ if (value_hint) ss << " " << value_hint;
+ if (value_hint_2) ss << " " << value_hint_2;
+ if (ss.tellp() > n_leading_spaces - 3) {
+ // current line is too long, add new line
+ ss << "\n" << leading_spaces;
+ } else {
+ // padding between arg and help, same line
+ ss << std::string(leading_spaces.size() - ss.tellp(), ' ');
+ }
+ const auto help_lines = break_str_into_lines(help, n_char_per_line_help);
+ for (const auto & line : help_lines) {
+ ss << (&line == &help_lines.front() ? "" : leading_spaces) << line << "\n";
+ }
+ return ss.str();
+}
+
+std::vector<std::string> common_arg::get_args() const {
+    std::vector<std::string> result;
+ for (const auto & arg : args) {
+ result.push_back(std::string(arg));
+ }
+ for (const auto & arg : args_neg) {
+ result.push_back(std::string(arg));
+ }
+ return result;
+}
+
+std::vector<std::string> common_arg::get_env() const {
+    std::vector<std::string> result;
+ if (env) {
+ result.push_back(std::string(env));
+ }
+ if (!args_neg.empty() && env) {
+ // for compatibility, we need to add LLAMA_ARG_NO_ variant
+ std::string neg_env = env;
+ string_replace_all(neg_env, "LLAMA_ARG_", "LLAMA_ARG_NO_");
+ result.push_back(neg_env);
+ }
+ return result;
+}
+
+//
+// utils
+//
+
+// Helper function to parse tensor buffer override strings
+static void parse_tensor_buffer_overrides(const std::string & value, std::vector<llama_model_tensor_buft_override> & overrides) {
+    std::map<std::string, ggml_backend_buffer_type_t> buft_list;
+ for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
+ auto * dev = ggml_backend_dev_get(i);
+ auto * buft = ggml_backend_dev_buffer_type(dev);
+ if (buft) {
+ buft_list[ggml_backend_buft_name(buft)] = buft;
+ }
+ }
+
+    for (const auto & override : string_split<std::string>(value, ',')) {
+ std::string::size_type pos = override.find('=');
+ if (pos == std::string::npos) {
+ throw std::invalid_argument("invalid value");
+ }
+ std::string tensor_name = override.substr(0, pos);
+ std::string buffer_type = override.substr(pos + 1);
+
+ if (buft_list.find(buffer_type) == buft_list.end()) {
+ printf("Available buffer types:\n");
+ for (const auto & it : buft_list) {
+ printf(" %s\n", ggml_backend_buft_name(it.second));
+ }
+ throw std::invalid_argument("unknown buffer type");
+ }
+        // keep strings alive and avoid leaking memory by storing them in a static list
+        static std::list<std::string> buft_overrides;
+ buft_overrides.push_back(tensor_name);
+ overrides.push_back({buft_overrides.back().c_str(), buft_list.at(buffer_type)});
+ }
+}
+
+static std::string clean_file_name(const std::string & fname) {
+ std::string clean_fname = fname;
+ string_replace_all(clean_fname, "\\", "_");
+ string_replace_all(clean_fname, "/", "_");
+ return clean_fname;
+}
+
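+// Fetch the optional preset.ini from the model's HF repo and apply the section matching the
+// repo tag to `params`; returns true if a remote preset was found.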
+static bool common_params_handle_remote_preset(common_params & params, llama_example ex) {
+ GGML_ASSERT(!params.model.hf_repo.empty());
+
+ // the returned hf_repo is without tag
+ auto [hf_repo, hf_tag] = common_download_split_repo_tag(params.model.hf_repo);
+
+ // "latest" tag (default if not specified) is translated to "default" preset
+ if (hf_tag == "latest") {
+ hf_tag = "default";
+ }
+
+ const bool offline = params.offline;
+ std::string model_endpoint = get_model_endpoint();
+ auto preset_url = model_endpoint + hf_repo + "/resolve/main/preset.ini";
+
+ // prepare local path for caching
+ auto preset_fname = clean_file_name(hf_repo + "_preset.ini");
+ auto preset_path = fs_get_cache_file(preset_fname);
+ const int status = common_download_file_single(preset_url, preset_path, params.hf_token, offline);
+ const bool has_preset = status >= 200 && status < 400;
+
+ // remote preset is optional, so we don't error out if not found
+ if (has_preset) {
+ LOG_INF("applying remote preset from %s\n", preset_url.c_str());
+ common_preset_context ctx(ex, /* only_remote_allowed */ true);
+ common_preset global;
+ auto remote_presets = ctx.load_from_ini(preset_path, global);
+ remote_presets = ctx.cascade(global, remote_presets);
+ if (remote_presets.find(hf_tag) != remote_presets.end()) {
+ common_preset preset = remote_presets.at(hf_tag);
+ LOG_INF("\n%s", preset.to_ini().c_str()); // to_ini already added trailing newline
+ preset.apply_to_params(params);
+ } else {
+ throw std::runtime_error("Remote preset.ini does not contain [" + std::string(hf_tag) + "] section");
+ }
+ } else {
+ LOG_INF("%s", "no remote preset found, skipping\n");
+ }
+
+ return has_preset;
+}
+
+struct handle_model_result {
+ bool found_mmproj = false;
+ common_params_model mmproj;
+};
+
+static handle_model_result common_params_handle_model(
+ struct common_params_model & model,
+ const std::string & bearer_token,
+ bool offline) {
+ handle_model_result result;
+ // handle pre-fill default model path and url based on hf_repo and hf_file
+ {
+ if (!model.docker_repo.empty()) { // Handle Docker URLs by resolving them to local paths
+ model.path = common_docker_resolve_model(model.docker_repo);
+ model.name = model.docker_repo; // set name for consistency
+ } else if (!model.hf_repo.empty()) {
+ // short-hand to avoid specifying --hf-file -> default it to --model
+ if (model.hf_file.empty()) {
+ if (model.path.empty()) {
+ auto auto_detected = common_get_hf_file(model.hf_repo, bearer_token, offline);
+ if (auto_detected.repo.empty() || auto_detected.ggufFile.empty()) {
+ exit(1); // built without CURL, error message already printed
+ }
+ model.name = model.hf_repo; // repo name with tag
+ model.hf_repo = auto_detected.repo; // repo name without tag
+ model.hf_file = auto_detected.ggufFile;
+ if (!auto_detected.mmprojFile.empty()) {
+ result.found_mmproj = true;
+ result.mmproj.hf_repo = model.hf_repo;
+ result.mmproj.hf_file = auto_detected.mmprojFile;
+ }
+ } else {
+ model.hf_file = model.path;
+ }
+ }
+
+ std::string model_endpoint = get_model_endpoint();
+ model.url = model_endpoint + model.hf_repo + "/resolve/main/" + model.hf_file;
+ // make sure model path is present (for caching purposes)
+ if (model.path.empty()) {
+ // this is to avoid different repo having same file name, or same file name in different subdirs
+ std::string filename = clean_file_name(model.hf_repo + "_" + model.hf_file);
+ model.path = fs_get_cache_file(filename);
+ }
+
+ } else if (!model.url.empty()) {
+ if (model.path.empty()) {
+                auto f = string_split<std::string>(model.url, '#').front();
+                f = string_split<std::string>(f, '?').front();
+                model.path = fs_get_cache_file(string_split<std::string>(f, '/').back());
+ }
+
+ }
+ }
+
+ // then, download it if needed
+ if (!model.url.empty()) {
+ bool ok = common_download_model(model, bearer_token, offline);
+ if (!ok) {
+ LOG_ERR("error: failed to download model from %s\n", model.url.c_str());
+ exit(1);
+ }
+ }
+
+ return result;
+}
+
+const std::vector<ggml_type> kv_cache_types = {
+ GGML_TYPE_F32,
+ GGML_TYPE_F16,
+ GGML_TYPE_BF16,
+ GGML_TYPE_Q8_0,
+ GGML_TYPE_Q4_0,
+ GGML_TYPE_Q4_1,
+ GGML_TYPE_IQ4_NL,
+ GGML_TYPE_Q5_0,
+ GGML_TYPE_Q5_1,
+};
+
+static ggml_type kv_cache_type_from_str(const std::string & s) {
+ for (const auto & type : kv_cache_types) {
+ if (ggml_type_name(type) == s) {
+ return type;
+ }
+ }
+ throw std::runtime_error("Unsupported cache type: " + s);
+}
+
+static std::string get_all_kv_cache_types() {
+ std::ostringstream msg;
+ for (const auto & type : kv_cache_types) {
+ msg << ggml_type_name(type) << (&type == &kv_cache_types.back() ? "" : ", ");
+ }
+ return msg.str();
+}
+
+static bool parse_bool_value(const std::string & value) {
+ if (is_truthy(value)) {
+ return true;
+ } else if (is_falsey(value)) {
+ return false;
+ } else {
+ throw std::invalid_argument("invalid boolean value");
+ }
+}
+
+//
+// CLI argument parsing functions
+//
+
+static bool common_params_parse_ex(int argc, char ** argv, common_params_context & ctx_arg) {
+ common_params & params = ctx_arg.params;
+
+    std::unordered_map<std::string, std::pair<common_arg *, bool>> arg_to_options;
+ for (auto & opt : ctx_arg.options) {
+ for (const auto & arg : opt.args) {
+ arg_to_options[arg] = {&opt, /* is_positive */ true};
+ }
+ for (const auto & arg : opt.args_neg) {
+ arg_to_options[arg] = {&opt, /* is_positive */ false};
+ }
+ }
+
+ // handle environment variables
+ for (auto & opt : ctx_arg.options) {
+ std::string value;
+ if (opt.get_value_from_env(value)) {
+ try {
+ if (opt.handler_void && is_truthy(value)) {
+ opt.handler_void(params);
+ }
+ if (opt.handler_int) {
+ opt.handler_int(params, std::stoi(value));
+ }
+ if (opt.handler_bool) {
+ opt.handler_bool(params, parse_bool_value(value));
+ }
+ if (opt.handler_string) {
+ opt.handler_string(params, value);
+ continue;
+ }
+ } catch (std::exception & e) {
+ throw std::invalid_argument(string_format(
+ "error while handling environment variable \"%s\": %s\n\n", opt.env, e.what()));
+ }
+ }
+ }
+
+ // handle command line arguments
+ auto check_arg = [&](int i) {
+ if (i+1 >= argc) {
+ throw std::invalid_argument("expected value for argument");
+ }
+ };
+
+ auto parse_cli_args = [&]() {
+        std::set<std::string> seen_args;
+
+ for (int i = 1; i < argc; i++) {
+ const std::string arg_prefix = "--";
+
+ std::string arg = argv[i];
+ if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) {
+ std::replace(arg.begin(), arg.end(), '_', '-');
+ }
+ if (arg_to_options.find(arg) == arg_to_options.end()) {
+ throw std::invalid_argument(string_format("error: invalid argument: %s", arg.c_str()));
+ }
+ if (!seen_args.insert(arg).second) {
+ LOG_WRN("DEPRECATED: argument '%s' specified multiple times, use comma-separated values instead (only last value will be used)\n", arg.c_str());
+ }
+ auto & tmp = arg_to_options[arg];
+ auto opt = *tmp.first;
+ bool is_positive = tmp.second;
+ if (opt.has_value_from_env()) {
+ fprintf(stderr, "warn: %s environment variable is set, but will be overwritten by command line argument %s\n", opt.env, arg.c_str());
+ }
+ try {
+ if (opt.handler_void) {
+ opt.handler_void(params);
+ continue;
+ }
+ if (opt.handler_bool) {
+ opt.handler_bool(params, is_positive);
+ continue;
+ }
+
+ // arg with single value
+ check_arg(i);
+ std::string val = argv[++i];
+ if (opt.handler_int) {
+ opt.handler_int(params, std::stoi(val));
+ continue;
+ }
+ if (opt.handler_string) {
+ opt.handler_string(params, val);
+ continue;
+ }
+
+ // arg with 2 values
+ check_arg(i);
+ std::string val2 = argv[++i];
+ if (opt.handler_str_str) {
+ opt.handler_str_str(params, val, val2);
+ continue;
+ }
+ } catch (std::exception & e) {
+ throw std::invalid_argument(string_format(
+ "error while handling argument \"%s\": %s\n\n"
+ "usage:\n%s\n\nto show complete usage, run with -h",
+ arg.c_str(), e.what(), opt.to_string().c_str()));
+ }
+ }
+ };
+
+ // parse the first time to get -hf option (used for remote preset)
+ parse_cli_args();
+
+ // maybe handle remote preset
+ if (!params.model.hf_repo.empty()) {
+ std::string cli_hf_repo = params.model.hf_repo;
+ bool has_preset = common_params_handle_remote_preset(params, ctx_arg.ex);
+
+ // special case: if hf_repo explicitly set by preset, we need to preserve it (ignore CLI value)
+ // this is useful when we have one HF repo pointing to other HF repos (one model - multiple GGUFs)
+ std::string preset_hf_repo = params.model.hf_repo;
+ bool preset_has_hf_repo = preset_hf_repo != cli_hf_repo;
+
+ if (has_preset) {
+ // re-parse CLI args to override preset values
+ parse_cli_args();
+ }
+
+ // preserve hf_repo from preset if needed
+ if (preset_has_hf_repo) {
+ params.model.hf_repo = preset_hf_repo;
+ }
+ }
+
+ postprocess_cpu_params(params.cpuparams, nullptr);
+ postprocess_cpu_params(params.cpuparams_batch, ¶ms.cpuparams);
+
+ postprocess_cpu_params(params.speculative.cpuparams, ¶ms.cpuparams);
+ postprocess_cpu_params(params.speculative.cpuparams_batch, ¶ms.cpuparams_batch);
+
+ if (params.prompt_cache_all && (params.interactive || params.interactive_first)) {
+ throw std::invalid_argument("error: --prompt-cache-all not supported in interactive mode yet\n");
+ }
+
+ // handle model and download
+ {
+ auto res = common_params_handle_model(params.model, params.hf_token, params.offline);
+ if (params.no_mmproj) {
+ params.mmproj = {};
+ } else if (res.found_mmproj && params.mmproj.path.empty() && params.mmproj.url.empty()) {
+ // optionally, handle mmproj model when -hf is specified
+ params.mmproj = res.mmproj;
+ }
+ // only download mmproj if the current example is using it
+ for (auto & ex : mmproj_examples) {
+ if (ctx_arg.ex == ex) {
+ common_params_handle_model(params.mmproj, params.hf_token, params.offline);
+ break;
+ }
+ }
+ common_params_handle_model(params.speculative.model, params.hf_token, params.offline);
+ common_params_handle_model(params.vocoder.model, params.hf_token, params.offline);
+ }
+
+ // model is required (except for server)
+ // TODO @ngxson : maybe show a list of available models in CLI in this case
+ if (params.model.path.empty() && ctx_arg.ex != LLAMA_EXAMPLE_SERVER && !params.usage && !params.completion) {
+ throw std::invalid_argument("error: --model is required\n");
+ }
+
+ if (params.escape) {
+ string_process_escapes(params.prompt);
+ string_process_escapes(params.input_prefix);
+ string_process_escapes(params.input_suffix);
+ for (auto & antiprompt : params.antiprompt) {
+ string_process_escapes(antiprompt);
+ }
+ for (auto & seq_breaker : params.sampling.dry_sequence_breakers) {
+ string_process_escapes(seq_breaker);
+ }
+ for (auto & pair : params.speculative.replacements) {
+ string_process_escapes(pair.first);
+ string_process_escapes(pair.second);
+ }
+ }
+
+ if (!params.kv_overrides.empty()) {
+ params.kv_overrides.emplace_back();
+ params.kv_overrides.back().key[0] = 0;
+ }
+
+ // pad tensor_buft_overrides for llama_params_fit:
+ const size_t ntbo = llama_max_tensor_buft_overrides();
+ while (params.tensor_buft_overrides.size() < ntbo) {
+ params.tensor_buft_overrides.push_back({nullptr, nullptr});
+ }
+
+ if (!params.speculative.tensor_buft_overrides.empty()) {
+ params.speculative.tensor_buft_overrides.push_back({nullptr, nullptr});
+ }
+
+ if (!params.chat_template.empty() && !common_chat_verify_template(params.chat_template, params.use_jinja)) {
+ throw std::runtime_error(string_format(
+ "error: the supplied chat template is not supported: %s%s\n",
+ params.chat_template.c_str(),
+ params.use_jinja ? "" : "\nnote: llama.cpp was started without --jinja, we only support commonly used templates"
+ ));
+ }
+
+ common_log_set_verbosity_thold(params.verbosity);
+
+ return true;
+}
+
+static void common_params_print_usage(common_params_context & ctx_arg) {
+    auto print_options = [](std::vector<common_arg *> & options) {
+ for (common_arg * opt : options) {
+ printf("%s", opt->to_string().c_str());
+ }
+ };
+
+    std::vector<common_arg *> common_options;
+    std::vector<common_arg *> sparam_options;
+    std::vector<common_arg *> specific_options;
+ for (auto & opt : ctx_arg.options) {
+ // in case multiple LLAMA_EXAMPLE_* are set, we prioritize the LLAMA_EXAMPLE_* matching current example
+ if (opt.is_sparam) {
+ sparam_options.push_back(&opt);
+ } else if (opt.in_example(ctx_arg.ex)) {
+ specific_options.push_back(&opt);
+ } else {
+ common_options.push_back(&opt);
+ }
+ }
+ printf("----- common params -----\n\n");
+ print_options(common_options);
+ printf("\n\n----- sampling params -----\n\n");
+ print_options(sparam_options);
+ // TODO: maybe convert enum llama_example to string
+ printf("\n\n----- example-specific params -----\n\n");
+ print_options(specific_options);
+}
+
+static void common_params_print_completion(common_params_context & ctx_arg) {
+    std::vector<common_arg *> common_options;
+    std::vector<common_arg *> sparam_options;
+    std::vector<common_arg *> specific_options;
+
+ for (auto & opt : ctx_arg.options) {
+ if (opt.is_sparam) {
+ sparam_options.push_back(&opt);
+ } else if (opt.in_example(ctx_arg.ex)) {
+ specific_options.push_back(&opt);
+ } else {
+ common_options.push_back(&opt);
+ }
+ }
+
+ printf("_llama_completions() {\n");
+ printf(" local cur prev opts\n");
+ printf(" COMPREPLY=()\n");
+ printf(" cur=\"${COMP_WORDS[COMP_CWORD]}\"\n");
+ printf(" prev=\"${COMP_WORDS[COMP_CWORD-1]}\"\n\n");
+
+ printf(" opts=\"");
+    auto print_options = [](const std::vector<common_arg *> & options) {
+ for (const common_arg * opt : options) {
+ for (const char * arg : opt->args) {
+ printf("%s ", arg);
+ }
+ }
+ };
+
+ print_options(common_options);
+ print_options(sparam_options);
+ print_options(specific_options);
+ printf("\"\n\n");
+
+ printf(" case \"$prev\" in\n");
+ printf(" --model|-m)\n");
+ printf(" COMPREPLY=( $(compgen -f -X '!*.gguf' -- \"$cur\") $(compgen -d -- \"$cur\") )\n");
+ printf(" return 0\n");
+ printf(" ;;\n");
+ printf(" --grammar-file)\n");
+ printf(" COMPREPLY=( $(compgen -f -X '!*.gbnf' -- \"$cur\") $(compgen -d -- \"$cur\") )\n");
+ printf(" return 0\n");
+ printf(" ;;\n");
+ printf(" --chat-template-file)\n");
+ printf(" COMPREPLY=( $(compgen -f -X '!*.jinja' -- \"$cur\") $(compgen -d -- \"$cur\") )\n");
+ printf(" return 0\n");
+ printf(" ;;\n");
+ printf(" *)\n");
+ printf(" COMPREPLY=( $(compgen -W \"${opts}\" -- \"$cur\") )\n");
+ printf(" return 0\n");
+ printf(" ;;\n");
+ printf(" esac\n");
+ printf("}\n\n");
+
+    std::set<std::string> executables = {
+ "llama-batched",
+ "llama-batched-bench",
+ "llama-bench",
+ "llama-cli",
+ "llama-completion",
+ "llama-convert-llama2c-to-ggml",
+ "llama-cvector-generator",
+ "llama-embedding",
+ "llama-eval-callback",
+ "llama-export-lora",
+ "llama-gen-docs",
+ "llama-gguf",
+ "llama-gguf-hash",
+ "llama-gguf-split",
+ "llama-gritlm",
+ "llama-imatrix",
+ "llama-infill",
+ "llama-mtmd-cli",
+ "llama-llava-clip-quantize-cli",
+ "llama-lookahead",
+ "llama-lookup",
+ "llama-lookup-create",
+ "llama-lookup-merge",
+ "llama-lookup-stats",
+ "llama-parallel",
+ "llama-passkey",
+ "llama-perplexity",
+ "llama-q8dot",
+ "llama-quantize",
+ "llama-qwen2vl-cli",
+ "llama-retrieval",
+ "llama-save-load-state",
+ "llama-server",
+ "llama-simple",
+ "llama-simple-chat",
+ "llama-speculative",
+ "llama-speculative-simple",
+ "llama-tokenize",
+ "llama-tts",
+ "llama-vdot"
+ };
+
+ for (const auto& exe : executables) {
+ printf("complete -F _llama_completions %s\n", exe.c_str());
+ }
+}
+
+static std::vector<ggml_backend_dev_t> parse_device_list(const std::string & value) {
+    std::vector<ggml_backend_dev_t> devices;
+    auto dev_names = string_split<std::string>(value, ',');
+ if (dev_names.empty()) {
+ throw std::invalid_argument("no devices specified");
+ }
+ if (dev_names.size() == 1 && dev_names[0] == "none") {
+ devices.push_back(nullptr);
+ } else {
+ for (const auto & device : dev_names) {
+ auto * dev = ggml_backend_dev_by_name(device.c_str());
+ if (!dev || ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_CPU) {
+ throw std::invalid_argument(string_format("invalid device: %s", device.c_str()));
+ }
+ devices.push_back(dev);
+ }
+ devices.push_back(nullptr);
+ }
+ return devices;
+}
+
+static void add_rpc_devices(const std::string & servers) {
+    auto rpc_servers = string_split<std::string>(servers, ',');
+ if (rpc_servers.empty()) {
+ throw std::invalid_argument("no RPC servers specified");
+ }
+ ggml_backend_reg_t rpc_reg = ggml_backend_reg_by_name("RPC");
+ if (!rpc_reg) {
+ throw std::invalid_argument("failed to find RPC backend");
+ }
+ typedef ggml_backend_reg_t (*ggml_backend_rpc_add_server_t)(const char * endpoint);
+ ggml_backend_rpc_add_server_t ggml_backend_rpc_add_server_fn = (ggml_backend_rpc_add_server_t) ggml_backend_reg_get_proc_address(rpc_reg, "ggml_backend_rpc_add_server");
+ if (!ggml_backend_rpc_add_server_fn) {
+ throw std::invalid_argument("failed to find RPC add server function");
+ }
+ for (const auto & server : rpc_servers) {
+ auto reg = ggml_backend_rpc_add_server_fn(server.c_str());
+ ggml_backend_register(reg);
+ }
+}
+
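+// Parse argv into (option, raw string value) pairs using a dummy common_params; boolean
+// flags are recorded as "1" or "0" depending on whether the positive or negative form was
+// given. Values are not applied to any params object.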
+bool common_params_to_map(int argc, char ** argv, llama_example ex, std::map<common_arg, std::string> & out_map) {
+ common_params dummy_params;
+ common_params_context ctx_arg = common_params_parser_init(dummy_params, ex, nullptr);
+
+    std::unordered_map<std::string, common_arg *> arg_to_options;
+ for (auto & opt : ctx_arg.options) {
+ for (const auto & arg : opt.args) {
+ arg_to_options[arg] = &opt;
+ }
+ for (const auto & arg : opt.args_neg) {
+ arg_to_options[arg] = &opt;
+ }
+ }
+
+ // TODO @ngxson : find a way to deduplicate this code
+
+ // handle command line arguments
+ auto check_arg = [&](int i) {
+ if (i+1 >= argc) {
+ throw std::invalid_argument("expected value for argument");
+ }
+ };
+
+    std::set<std::string> seen_args;
+
+ for (int i = 1; i < argc; i++) {
+ const std::string arg_prefix = "--";
+
+ std::string arg = argv[i];
+ if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) {
+ std::replace(arg.begin(), arg.end(), '_', '-');
+ }
+ if (arg_to_options.find(arg) == arg_to_options.end()) {
+ throw std::invalid_argument(string_format("error: invalid argument: %s", arg.c_str()));
+ }
+ if (!seen_args.insert(arg).second) {
+ LOG_WRN("DEPRECATED: argument '%s' specified multiple times, use comma-separated values instead (only last value will be used)\n", arg.c_str());
+ }
+ auto opt = *arg_to_options[arg];
+ std::string val;
+ if (opt.value_hint == nullptr && opt.value_hint_2 == nullptr) {
+ // bool arg (need to reverse the meaning for negative args)
+ bool is_neg = std::find(opt.args_neg.begin(), opt.args_neg.end(), arg) != opt.args_neg.end();
+ val = is_neg ? "0" : "1";
+ }
+ if (opt.value_hint != nullptr) {
+ // arg with single value
+ check_arg(i);
+ val = argv[++i];
+ }
+ if (opt.value_hint_2 != nullptr) {
+ // TODO: support arg with 2 values
+ throw std::invalid_argument("error: argument with 2 values is not yet supported\n");
+ }
+ out_map[opt] = val;
+ }
+
+ return true;
+}
+
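+// top-level argv parser: fills `params`, prints usage or the bash completion script and
+// exits when requested, and restores the original params and returns false on invalid input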
+bool common_params_parse(int argc, char ** argv, common_params & params, llama_example ex, void(*print_usage)(int, char **)) {
+ auto ctx_arg = common_params_parser_init(params, ex, print_usage);
+ const common_params params_org = ctx_arg.params; // the example can modify the default params
+
+ try {
+ if (!common_params_parse_ex(argc, argv, ctx_arg)) {
+ ctx_arg.params = params_org;
+ return false;
+ }
+ if (ctx_arg.params.usage) {
+ common_params_print_usage(ctx_arg);
+ if (ctx_arg.print_usage) {
+ ctx_arg.print_usage(argc, argv);
+ }
+ exit(0);
+ }
+ if (ctx_arg.params.completion) {
+ common_params_print_completion(ctx_arg);
+ exit(0);
+ }
+ params.lr.init();
+ } catch (const std::invalid_argument & ex) {
+ fprintf(stderr, "%s\n", ex.what());
+ ctx_arg.params = params_org;
+ return false;
+ } catch (std::exception & ex) {
+ fprintf(stderr, "%s\n", ex.what());
+ exit(1); // for other exceptions, we exit with status code 1
+ }
+
+ return true;
+}
+
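+// query libllama for the names of its built-in chat templates and join them with ", "
+// (used for display purposes, e.g. in help text)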
+static std::string list_builtin_chat_templates() {
+    std::vector<const char *> supported_tmpl;
+ int32_t res = llama_chat_builtin_templates(nullptr, 0);
+ supported_tmpl.resize(res);
+ res = llama_chat_builtin_templates(supported_tmpl.data(), supported_tmpl.size());
+ std::ostringstream msg;
+ for (auto & tmpl : supported_tmpl) {
+ msg << tmpl << (&tmpl == &supported_tmpl.back() ? "" : ", ");
+ }
+ return msg.str();
+}
+
+bool common_arg_utils::is_truthy(const std::string & value) {
+ return value == "on" || value == "enabled" || value == "true" || value == "1";
+}
+
+bool common_arg_utils::is_falsey(const std::string & value) {
+ return value == "off" || value == "disabled" || value == "false" || value == "0";
+}
+
+bool common_arg_utils::is_autoy(const std::string & value) {
+ return value == "auto" || value == "-1";
+}
+
+// Simple CSV parser that handles quoted fields and escaped quotes
+// example:
+// input: value1,"value, with, commas","value with ""escaped"" quotes",value4
+// output: [value1] [value, with, commas] [value with "escaped" quotes] [value4]
+static std::vector<std::string> parse_csv_row(const std::string& input) {
+    std::vector<std::string> fields;
+ std::string field;
+ bool in_quotes = false;
+
+ for (size_t i = 0; i < input.length(); ++i) {
+ char ch = input[i];
+
+ if (ch == '"') {
+ if (!in_quotes) {
+ // start of quoted field (only valid if at beginning of field)
+ if (!field.empty()) {
+ // quote appeared in middle of unquoted field, treat as literal
+ field += '"';
+ } else {
+ in_quotes = true; // start
+ }
+ } else {
+ if (i + 1 < input.length() && input[i + 1] == '"') {
+ // escaped quote: ""
+ field += '"';
+ ++i; // skip the next quote
+ } else {
+ in_quotes = false; // end
+ }
+ }
+ } else if (ch == ',') {
+ if (in_quotes) {
+ field += ',';
+ } else {
+ fields.push_back(std::move(field));
+ field.clear();
+ }
+ } else {
+ field += ch;
+ }
+ }
+
+ // Add the last field
+ fields.push_back(std::move(field));
+
+ return fields;
+}
+
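+// build the option table for the given example: apply per-example defaults, load dynamic
+// backends, and register every option that targets `ex` or LLAMA_EXAMPLE_COMMON and is not
+// explicitly excluded for `ex`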
+common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **)) {
+ // per-example default params
+    // they are defined here to make sure they are included in llama-gen-docs
+ if (ex == LLAMA_EXAMPLE_COMPLETION) {
+ params.use_jinja = false; // disable jinja by default
+
+ } else if (ex == LLAMA_EXAMPLE_MTMD) {
+ params.use_jinja = false; // disable jinja by default
+ params.sampling.temp = 0.2; // lower temp by default for better quality
+
+ } else if (ex == LLAMA_EXAMPLE_SERVER) {
+ params.n_parallel = -1; // auto by default
+ }
+
+ params.use_color = tty_can_use_colors();
+
+ // load dynamic backends
+ ggml_backend_load_all();
+
+ common_params_context ctx_arg(params);
+ ctx_arg.print_usage = print_usage;
+ ctx_arg.ex = ex;
+
+ std::string sampler_type_chars;
+ std::string sampler_type_names;
+ for (const auto & sampler : params.sampling.samplers) {
+ sampler_type_chars += common_sampler_type_to_chr(sampler);
+ sampler_type_names += common_sampler_type_to_str(sampler) + ";";
+ }
+ if (!sampler_type_names.empty()) {
+ sampler_type_names.pop_back(); // remove last semicolon
+ }
+
+
+ /**
+ * filter options by example
+ * rules:
+ * - all examples inherit options from LLAMA_EXAMPLE_COMMON
+ * - if LLAMA_EXAMPLE_* is set (other than COMMON), we only show the option in the corresponding example
+   * - if both {LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_*} are set, we will prioritize the LLAMA_EXAMPLE_* matching the current example
+ */
+ auto add_opt = [&](common_arg arg) {
+ if ((arg.in_example(ex) || arg.in_example(LLAMA_EXAMPLE_COMMON)) && !arg.is_exclude(ex)) {
+ ctx_arg.options.push_back(std::move(arg));
+ }
+ };
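+    // example: with ex == LLAMA_EXAMPLE_SERVER, an option tagged {LLAMA_EXAMPLE_COMMON} or
+    // {LLAMA_EXAMPLE_SERVER} is registered, while one tagged only {LLAMA_EXAMPLE_CLI} or one
+    // that lists LLAMA_EXAMPLE_SERVER in set_excludes() is skipped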
+
+
+ add_opt(common_arg(
+ {"-h", "--help", "--usage"},
+ "print usage and exit",
+ [](common_params & params) {
+ params.usage = true;
+ }
+ ));
+ add_opt(common_arg(
+ {"--version"},
+ "show version and build info",
+ [](common_params &) {
+ fprintf(stderr, "version: %d (%s)\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT);
+ fprintf(stderr, "built with %s for %s\n", LLAMA_COMPILER, LLAMA_BUILD_TARGET);
+ exit(0);
+ }
+ ));
+ add_opt(common_arg(
+ {"--license"},
+ "show source code license and dependencies",
+ [](common_params &) {
+ for (int i = 0; LICENSES[i]; ++i) {
+ printf("%s\n", LICENSES[i]);
+ }
+ exit(0);
+ }
+ ));
+ add_opt(common_arg(
+ {"-cl", "--cache-list"},
+ "show list of models in cache",
+ [](common_params &) {
+ printf("model cache directory: %s\n", fs_get_cache_directory().c_str());
+ auto models = common_list_cached_models();
+ printf("number of models in cache: %zu\n", models.size());
+ for (size_t i = 0; i < models.size(); i++) {
+ auto & model = models[i];
+ printf("%4d. %s\n", (int) i + 1, model.to_string().c_str());
+ }
+ exit(0);
+ }
+ ));
+ add_opt(common_arg(
+ {"--completion-bash"},
+ "print source-able bash completion script for llama.cpp",
+ [](common_params & params) {
+ params.completion = true;
+ }
+ ));
+ add_opt(common_arg(
+ {"--verbose-prompt"},
+ string_format("print a verbose prompt before generation (default: %s)", params.verbose_prompt ? "true" : "false"),
+ [](common_params & params) {
+ params.verbose_prompt = true;
+ }
+ ));
+ add_opt(common_arg(
+ {"--display-prompt"},
+ {"--no-display-prompt"},
+ string_format("whether to print prompt at generation (default: %s)", params.display_prompt ? "true" : "false"),
+ [](common_params & params, bool value) {
+ params.display_prompt = value;
+ }
+ ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI}));
+ add_opt(common_arg(
+ {"-co", "--color"}, "[on|off|auto]",
+ "Colorize output to distinguish prompt and user input from generations ('on', 'off', or 'auto', default: 'auto')\n"
+ "'auto' enables colors when output is to a terminal",
+ [](common_params & params, const std::string & value) {
+ if (is_truthy(value)) {
+ params.use_color = true;
+ } else if (is_falsey(value)) {
+ params.use_color = false;
+ } else if (is_autoy(value)) {
+ params.use_color = tty_can_use_colors();
+ } else {
+ throw std::invalid_argument(
+ string_format("error: unknown value for --color: '%s'\n", value.c_str()));
+ }
+ }
+ ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP}));
+ add_opt(common_arg(
+ {"-t", "--threads"}, "N",
+ string_format("number of CPU threads to use during generation (default: %d)", params.cpuparams.n_threads),
+ [](common_params & params, int value) {
+ params.cpuparams.n_threads = value;
+ if (params.cpuparams.n_threads <= 0) {
+ params.cpuparams.n_threads = std::thread::hardware_concurrency();
+ }
+ }
+ ).set_env("LLAMA_ARG_THREADS"));
+ add_opt(common_arg(
+ {"-tb", "--threads-batch"}, "N",
+ "number of threads to use during batch and prompt processing (default: same as --threads)",
+ [](common_params & params, int value) {
+ params.cpuparams_batch.n_threads = value;
+ if (params.cpuparams_batch.n_threads <= 0) {
+ params.cpuparams_batch.n_threads = std::thread::hardware_concurrency();
+ }
+ }
+ ));
+ add_opt(common_arg(
+ {"-C", "--cpu-mask"}, "M",
+ "CPU affinity mask: arbitrarily long hex. Complements cpu-range (default: \"\")",
+ [](common_params & params, const std::string & mask) {
+ params.cpuparams.mask_valid = true;
+ if (!parse_cpu_mask(mask, params.cpuparams.cpumask)) {
+ throw std::invalid_argument("invalid cpumask");
+ }
+ }
+ ));
+ add_opt(common_arg(
+ {"-Cr", "--cpu-range"}, "lo-hi",
+ "range of CPUs for affinity. Complements --cpu-mask",
+ [](common_params & params, const std::string & range) {
+ params.cpuparams.mask_valid = true;
+ if (!parse_cpu_range(range, params.cpuparams.cpumask)) {
+ throw std::invalid_argument("invalid range");
+ }
+ }
+ ));
+ add_opt(common_arg(
+ {"--cpu-strict"}, "<0|1>",
+ string_format("use strict CPU placement (default: %u)\n", (unsigned) params.cpuparams.strict_cpu),
+ [](common_params & params, const std::string & value) {
+ params.cpuparams.strict_cpu = std::stoul(value);
+ }
+ ));
+ add_opt(common_arg(
+ {"--prio"}, "N",
+ string_format("set process/thread priority : low(-1), normal(0), medium(1), high(2), realtime(3) (default: %d)\n", params.cpuparams.priority),
+ [](common_params & params, int prio) {
+ if (prio < GGML_SCHED_PRIO_LOW || prio > GGML_SCHED_PRIO_REALTIME) {
+ throw std::invalid_argument("invalid value");
+ }
+ params.cpuparams.priority = (enum ggml_sched_priority) prio;
+ }
+ ));
+ add_opt(common_arg(
+ {"--poll"}, "<0...100>",
+ string_format("use polling level to wait for work (0 - no polling, default: %u)\n", (unsigned) params.cpuparams.poll),
+ [](common_params & params, const std::string & value) {
+ params.cpuparams.poll = std::stoul(value);
+ }
+ ));
+ add_opt(common_arg(
+ {"-Cb", "--cpu-mask-batch"}, "M",
+ "CPU affinity mask: arbitrarily long hex. Complements cpu-range-batch (default: same as --cpu-mask)",
+ [](common_params & params, const std::string & mask) {
+ params.cpuparams_batch.mask_valid = true;
+ if (!parse_cpu_mask(mask, params.cpuparams_batch.cpumask)) {
+ throw std::invalid_argument("invalid cpumask");
+ }
+ }
+ ));
+ add_opt(common_arg(
+ {"-Crb", "--cpu-range-batch"}, "lo-hi",
+ "ranges of CPUs for affinity. Complements --cpu-mask-batch",
+ [](common_params & params, const std::string & range) {
+ params.cpuparams_batch.mask_valid = true;
+ if (!parse_cpu_range(range, params.cpuparams_batch.cpumask)) {
+ throw std::invalid_argument("invalid range");
+ }
+ }
+ ));
+ add_opt(common_arg(
+ {"--cpu-strict-batch"}, "<0|1>",
+ "use strict CPU placement (default: same as --cpu-strict)",
+ [](common_params & params, int value) {
+ params.cpuparams_batch.strict_cpu = value;
+ }
+ ));
+ add_opt(common_arg(
+ {"--prio-batch"}, "N",
+ string_format("set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.cpuparams_batch.priority),
+ [](common_params & params, int prio) {
+ if (prio < 0 || prio > 3) {
+ throw std::invalid_argument("invalid value");
+ }
+ params.cpuparams_batch.priority = (enum ggml_sched_priority) prio;
+ }
+ ));
+ add_opt(common_arg(
+ {"--poll-batch"}, "<0|1>",
+ "use polling to wait for work (default: same as --poll)",
+ [](common_params & params, int value) {
+ params.cpuparams_batch.poll = value;
+ }
+ ));
+ add_opt(common_arg(
+ {"-lcs", "--lookup-cache-static"}, "FNAME",
+ "path to static lookup cache to use for lookup decoding (not updated by generation)",
+ [](common_params & params, const std::string & value) {
+ params.lookup_cache_static = value;
+ }
+ ).set_examples({LLAMA_EXAMPLE_LOOKUP}));
+ add_opt(common_arg(
+ {"-lcd", "--lookup-cache-dynamic"}, "FNAME",
+ "path to dynamic lookup cache to use for lookup decoding (updated by generation)",
+ [](common_params & params, const std::string & value) {
+ params.lookup_cache_dynamic = value;
+ }
+ ).set_examples({LLAMA_EXAMPLE_LOOKUP}));
+ add_opt(common_arg(
+ {"-c", "--ctx-size"}, "N",
+ string_format("size of the prompt context (default: %d, 0 = loaded from model)", params.n_ctx),
+ [](common_params & params, int value) {
+ params.n_ctx = value;
+ }
+ ).set_env("LLAMA_ARG_CTX_SIZE"));
+ add_opt(common_arg(
+ {"-n", "--predict", "--n-predict"}, "N",
+ string_format(
+ ex == LLAMA_EXAMPLE_COMPLETION
+ ? "number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)"
+ : "number of tokens to predict (default: %d, -1 = infinity)",
+ params.n_predict),
+ [](common_params & params, int value) {
+ params.n_predict = value;
+ }
+ ).set_env("LLAMA_ARG_N_PREDICT"));
+ add_opt(common_arg(
+ {"-b", "--batch-size"}, "N",
+ string_format("logical maximum batch size (default: %d)", params.n_batch),
+ [](common_params & params, int value) {
+ params.n_batch = value;
+ }
+ ).set_env("LLAMA_ARG_BATCH"));
+ add_opt(common_arg(
+ {"-ub", "--ubatch-size"}, "N",
+ string_format("physical maximum batch size (default: %d)", params.n_ubatch),
+ [](common_params & params, int value) {
+ params.n_ubatch = value;
+ }
+ ).set_env("LLAMA_ARG_UBATCH"));
+ add_opt(common_arg(
+ {"--keep"}, "N",
+ string_format("number of tokens to keep from the initial prompt (default: %d, -1 = all)", params.n_keep),
+ [](common_params & params, int value) {
+ params.n_keep = value;
+ }
+ ));
+ add_opt(common_arg(
+ {"--swa-full"},
+ string_format("use full-size SWA cache (default: %s)\n"
+ "[(more info)](https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)", params.swa_full ? "true" : "false"),
+ [](common_params & params) {
+ params.swa_full = true;
+ }
+ ).set_env("LLAMA_ARG_SWA_FULL"));
+ add_opt(common_arg(
+ {"--ctx-checkpoints", "--swa-checkpoints"}, "N",
+ string_format("max number of context checkpoints to create per slot (default: %d)"
+ "[(more info)](https://github.com/ggml-org/llama.cpp/pull/15293)", params.n_ctx_checkpoints),
+ [](common_params & params, int value) {
+ params.n_ctx_checkpoints = value;
+ }
+ ).set_env("LLAMA_ARG_CTX_CHECKPOINTS").set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
+ add_opt(common_arg(
+ {"-cram", "--cache-ram"}, "N",
+ string_format("set the maximum cache size in MiB (default: %d, -1 - no limit, 0 - disable)"
+ "[(more info)](https://github.com/ggml-org/llama.cpp/pull/16391)", params.cache_ram_mib),
+ [](common_params & params, int value) {
+ params.cache_ram_mib = value;
+ }
+ ).set_env("LLAMA_ARG_CACHE_RAM").set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
+ add_opt(common_arg(
+ {"-kvu", "--kv-unified"},
+ "use single unified KV buffer shared across all sequences (default: enabled if number of slots is auto)",
+ [](common_params & params) {
+ params.kv_unified = true;
+ }
+ ).set_env("LLAMA_ARG_KV_UNIFIED").set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_PERPLEXITY}));
+ add_opt(common_arg(
+ {"--context-shift"},
+ {"--no-context-shift"},
+ string_format("whether to use context shift on infinite text generation (default: %s)", params.ctx_shift ? "enabled" : "disabled"),
+ [](common_params & params, bool value) {
+ params.ctx_shift = value;
+ }
+ ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY}).set_env("LLAMA_ARG_CONTEXT_SHIFT"));
+ add_opt(common_arg(
+ {"--chunks"}, "N",
+ string_format("max number of chunks to process (default: %d, -1 = all)", params.n_chunks),
+ [](common_params & params, int value) {
+ params.n_chunks = value;
+ }
+ ).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY, LLAMA_EXAMPLE_RETRIEVAL}));
+ add_opt(common_arg({ "-fa", "--flash-attn" }, "[on|off|auto]",
+ string_format("set Flash Attention use ('on', 'off', or 'auto', default: '%s')",
+ llama_flash_attn_type_name(params.flash_attn_type)),
+ [](common_params & params, const std::string & value) {
+ if (is_truthy(value)) {
+ params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_ENABLED;
+ } else if (is_falsey(value)) {
+ params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_DISABLED;
+ } else if (is_autoy(value)) {
+ params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_AUTO;
+ } else {
+ throw std::runtime_error(
+ string_format("error: unknown value for --flash-attn: '%s'\n", value.c_str()));
+ }
+ }).set_env("LLAMA_ARG_FLASH_ATTN"));
+ add_opt(common_arg(
+ {"-p", "--prompt"}, "PROMPT",
+ "prompt to start generation with; for system message, use -sys",
+ [](common_params & params, const std::string & value) {
+ params.prompt = value;
+ }
+ ).set_excludes({LLAMA_EXAMPLE_SERVER}));
+ add_opt(common_arg(
+ {"-sys", "--system-prompt"}, "PROMPT",
+ "system prompt to use with model (if applicable, depending on chat template)",
+ [](common_params & params, const std::string & value) {
+ params.system_prompt = value;
+ }
+ ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_DIFFUSION, LLAMA_EXAMPLE_MTMD}));
+ add_opt(common_arg(
+ {"--perf"},
+ {"--no-perf"},
+ string_format("whether to enable internal libllama performance timings (default: %s)", params.no_perf ? "true" : "false"),
+ [](common_params & params, bool value) {
+ params.no_perf = !value;
+ params.sampling.no_perf = !value;
+ }
+ ).set_env("LLAMA_ARG_PERF"));
+ add_opt(common_arg(
+ {"--show-timings"},
+ {"--no-show-timings"},
+ string_format("whether to show timing information after each response (default: %s)", params.show_timings ? "true" : "false"),
+ [](common_params & params, bool value) {
+ params.show_timings = value;
+ }
+ ).set_examples({LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_SHOW_TIMINGS"));
+ add_opt(common_arg(
+ {"-f", "--file"}, "FNAME",
+ "a file containing the prompt (default: none)",
+ [](common_params & params, const std::string & value) {
+ params.prompt = read_file(value);
+ // store the external file name in params
+ params.prompt_file = value;
+ if (!params.prompt.empty() && params.prompt.back() == '\n') {
+ params.prompt.pop_back();
+ }
+ }
+ ).set_excludes({LLAMA_EXAMPLE_SERVER}));
+ add_opt(common_arg(
+ {"-sysf", "--system-prompt-file"}, "FNAME",
+ "a file containing the system prompt (default: none)",
+ [](common_params & params, const std::string & value) {
+ params.system_prompt = read_file(value);
+ if (!params.system_prompt.empty() && params.system_prompt.back() == '\n') {
+ params.system_prompt.pop_back();
+ }
+ }
+ ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_DIFFUSION}));
+ add_opt(common_arg(
+ {"--in-file"}, "FNAME",
+ "an input file (use comma-separated values to specify multiple files)",
+ [](common_params & params, const std::string & value) {
+ for (const auto & item : parse_csv_row(value)) {
+ std::ifstream file(item);
+ if (!file) {
+ throw std::runtime_error(string_format("error: failed to open file '%s'\n", item.c_str()));
+ }
+ params.in_files.push_back(item);
+ }
+ }
+ ).set_examples({LLAMA_EXAMPLE_IMATRIX}));
+ add_opt(common_arg(
+ {"-bf", "--binary-file"}, "FNAME",
+ "binary file containing the prompt (default: none)",
+ [](common_params & params, const std::string & value) {
+ std::ifstream file(value, std::ios::binary);
+ if (!file) {
+ throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str()));
+ }
+ // store the external file name in params
+ params.prompt_file = value;
+ std::ostringstream ss;
+ ss << file.rdbuf();
+ params.prompt = ss.str();
+ fprintf(stderr, "Read %zu bytes from binary file %s\n", params.prompt.size(), value.c_str());
+ }
+ ).set_excludes({LLAMA_EXAMPLE_SERVER}));
+ add_opt(common_arg(
+ {"-e", "--escape"},
+ {"--no-escape"},
+ string_format("whether to process escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\) (default: %s)", params.escape ? "true" : "false"),
+ [](common_params & params, bool value) {
+ params.escape = value;
+ }
+ ));
+ add_opt(common_arg(
+ {"-ptc", "--print-token-count"}, "N",
+ string_format("print token count every N tokens (default: %d)", params.n_print),
+ [](common_params & params, int value) {
+ params.n_print = value;
+ }
+ ).set_examples({LLAMA_EXAMPLE_COMPLETION}));
+ add_opt(common_arg(
+ {"--prompt-cache"}, "FNAME",
+ "file to cache prompt state for faster startup (default: none)",
+ [](common_params & params, const std::string & value) {
+ params.path_prompt_cache = value;
+ }
+ ).set_examples({LLAMA_EXAMPLE_COMPLETION}));
+ add_opt(common_arg(
+ {"--prompt-cache-all"},
+ "if specified, saves user input and generations to cache as well\n",
+ [](common_params & params) {
+ params.prompt_cache_all = true;
+ }
+ ).set_examples({LLAMA_EXAMPLE_COMPLETION}));
+ add_opt(common_arg(
+ {"--prompt-cache-ro"},
+ "if specified, uses the prompt cache but does not update it",
+ [](common_params & params) {
+ params.prompt_cache_ro = true;
+ }
+ ).set_examples({LLAMA_EXAMPLE_COMPLETION}));
+ add_opt(common_arg(
+ {"-r", "--reverse-prompt"}, "PROMPT",
+ "halt generation at PROMPT, return control in interactive mode\n",
+ [](common_params & params, const std::string & value) {
+ params.antiprompt.emplace_back(value);
+ }
+ ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER}));
+ add_opt(common_arg(
+ {"-sp", "--special"},
+ string_format("special tokens output enabled (default: %s)", params.special ? "true" : "false"),
+ [](common_params & params) {
+ params.special = true;
+ }
+ ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER}));
+ add_opt(common_arg(
+ {"-cnv", "--conversation"},
+ {"-no-cnv", "--no-conversation"},
+ "whether to run in conversation mode:\n"
+ "- does not print special tokens and suffix/prefix\n"
+ "- interactive mode is also enabled\n"
+ "(default: auto enabled if chat template is available)",
+ [](common_params & params, bool value) {
+ params.conversation_mode = value ? COMMON_CONVERSATION_MODE_ENABLED : COMMON_CONVERSATION_MODE_DISABLED;
+ }
+ ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI}));
+ add_opt(common_arg(
+ {"-st", "--single-turn"},
+ "run conversation for a single turn only, then exit when done\n"
+ "will not be interactive if first turn is predefined with --prompt\n"
+ "(default: false)",
+ [](common_params & params) {
+ params.single_turn = true;
+ }
+ ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI}));
+ add_opt(common_arg(
+ {"-i", "--interactive"},
+ string_format("run in interactive mode (default: %s)", params.interactive ? "true" : "false"),
+ [](common_params & params) {
+ params.interactive = true;
+ }
+ ).set_examples({LLAMA_EXAMPLE_COMPLETION}));
+ add_opt(common_arg(
+ {"-if", "--interactive-first"},
+ string_format("run in interactive mode and wait for input right away (default: %s)", params.interactive_first ? "true" : "false"),
+ [](common_params & params) {
+ params.interactive_first = true;
+ }
+ ).set_examples({LLAMA_EXAMPLE_COMPLETION}));
+ add_opt(common_arg(
+ {"-mli", "--multiline-input"},
+ "allows you to write or paste multiple lines without ending each in '\\'",
+ [](common_params & params) {
+ params.multiline_input = true;
+ }
+ ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI}));
+ add_opt(common_arg(
+ {"--in-prefix-bos"},
+ "prefix BOS to user inputs, preceding the `--in-prefix` string",
+ [](common_params & params) {
+ params.input_prefix_bos = true;
+ params.enable_chat_template = false;
+ }
+ ).set_examples({LLAMA_EXAMPLE_COMPLETION}));
+ add_opt(common_arg(
+ {"--in-prefix"}, "STRING",
+ "string to prefix user inputs with (default: empty)",
+ [](common_params & params, const std::string & value) {
+ params.input_prefix = value;
+ params.enable_chat_template = false;
+ }
+ ).set_examples({LLAMA_EXAMPLE_COMPLETION}));
+ add_opt(common_arg(
+ {"--in-suffix"}, "STRING",
+ "string to suffix after user inputs with (default: empty)",
+ [](common_params & params, const std::string & value) {
+ params.input_suffix = value;
+ params.enable_chat_template = false;
+ }
+ ).set_examples({LLAMA_EXAMPLE_COMPLETION}));
+ add_opt(common_arg(
+ {"--warmup"},
+ {"--no-warmup"},
+ string_format("whether to perform warmup with an empty run (default: %s)", params.warmup ? "enabled" : "disabled"),
+ [](common_params & params, bool value) {
+ params.warmup = value;
+ }
+ ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MTMD, LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_RETRIEVAL, LLAMA_EXAMPLE_PERPLEXITY, LLAMA_EXAMPLE_DEBUG}));
+ add_opt(common_arg(
+ {"--spm-infill"},
+ string_format(
+ "use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. (default: %s)",
+ params.spm_infill ? "enabled" : "disabled"
+ ),
+ [](common_params & params) {
+ params.spm_infill = true;
+ }
+ ).set_examples({LLAMA_EXAMPLE_SERVER}));
+ add_opt(common_arg(
+ {"--samplers"}, "SAMPLERS",
+ string_format("samplers that will be used for generation in the order, separated by \';\'\n(default: %s)", sampler_type_names.c_str()),
+ [](common_params & params, const std::string & value) {
+            const auto sampler_names = string_split<std::string>(value, ';');
+ params.sampling.samplers = common_sampler_types_from_names(sampler_names, true);
+ params.sampling.user_sampling_config |= common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_SAMPLERS;
+ }
+ ).set_sparam());
+ add_opt(common_arg(
+ {"-s", "--seed"}, "SEED",
+ string_format("RNG seed (default: %d, use random seed for %d)", params.sampling.seed, LLAMA_DEFAULT_SEED),
+ [](common_params & params, const std::string & value) {
+ params.sampling.seed = std::stoul(value);
+ }
+ ).set_sparam());
+ add_opt(common_arg(
+ {"--sampler-seq", "--sampling-seq"}, "SEQUENCE",
+ string_format("simplified sequence for samplers that will be used (default: %s)", sampler_type_chars.c_str()),
+ [](common_params & params, const std::string & value) {
+ params.sampling.samplers = common_sampler_types_from_chars(value);
+ }
+ ).set_sparam());
+ add_opt(common_arg(
+ {"--ignore-eos"},
+ "ignore end of stream token and continue generating (implies --logit-bias EOS-inf)",
+ [](common_params & params) {
+ params.sampling.ignore_eos = true;
+ }
+ ).set_sparam());
+ add_opt(common_arg(
+ {"--temp"}, "N",
+ string_format("temperature (default: %.1f)", (double)params.sampling.temp),
+ [](common_params & params, const std::string & value) {
+ params.sampling.temp = std::stof(value);
+ params.sampling.temp = std::max(params.sampling.temp, 0.0f);
+ params.sampling.user_sampling_config |= common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_TEMP;
+ }
+ ).set_sparam());
+ add_opt(common_arg(
+ {"--top-k"}, "N",
+ string_format("top-k sampling (default: %d, 0 = disabled)", params.sampling.top_k),
+ [](common_params & params, int value) {
+ params.sampling.top_k = value;
+ params.sampling.user_sampling_config |= common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_TOP_K;
+ }
+ ).set_sparam().set_env("LLAMA_ARG_TOP_K"));
+ add_opt(common_arg(
+ {"--top-p"}, "N",
+ string_format("top-p sampling (default: %.1f, 1.0 = disabled)", (double)params.sampling.top_p),
+ [](common_params & params, const std::string & value) {
+ params.sampling.top_p = std::stof(value);
+ params.sampling.user_sampling_config |= common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_TOP_P;
+ }
+ ).set_sparam());
+ add_opt(common_arg(
+ {"--min-p"}, "N",
+ string_format("min-p sampling (default: %.1f, 0.0 = disabled)", (double)params.sampling.min_p),
+ [](common_params & params, const std::string & value) {
+ params.sampling.min_p = std::stof(value);
+ params.sampling.user_sampling_config |= common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_MIN_P;
+ }
+ ).set_sparam());
+ add_opt(common_arg(
+ {"--top-nsigma"}, "N",
+ string_format("top-n-sigma sampling (default: %.1f, -1.0 = disabled)", params.sampling.top_n_sigma),
+ [](common_params & params, const std::string & value) {
+ params.sampling.top_n_sigma = std::stof(value);
+ }
+ ).set_sparam());
+ add_opt(common_arg(
+ {"--xtc-probability"}, "N",
+ string_format("xtc probability (default: %.1f, 0.0 = disabled)", (double)params.sampling.xtc_probability),
+ [](common_params & params, const std::string & value) {
+ params.sampling.xtc_probability = std::stof(value);
+ params.sampling.user_sampling_config |= common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_XTC_PROBABILITY;
+ }
+ ).set_sparam());
+ add_opt(common_arg(
+ {"--xtc-threshold"}, "N",
+ string_format("xtc threshold (default: %.1f, 1.0 = disabled)", (double)params.sampling.xtc_threshold),
+ [](common_params & params, const std::string & value) {
+ params.sampling.xtc_threshold = std::stof(value);
+ params.sampling.user_sampling_config |= common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_XTC_THRESHOLD;
+ }
+ ).set_sparam());
+ add_opt(common_arg(
+ {"--typical"}, "N",
+ string_format("locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)", (double)params.sampling.typ_p),
+ [](common_params & params, const std::string & value) {
+ params.sampling.typ_p = std::stof(value);
+ }
+ ).set_sparam());
+ add_opt(common_arg(
+ {"--repeat-last-n"}, "N",
+ string_format("last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)", params.sampling.penalty_last_n),
+ [](common_params & params, int value) {
+ if (value < -1) {
+ throw std::runtime_error(string_format("error: invalid repeat-last-n = %d\n", value));
+ }
+ params.sampling.penalty_last_n = value;
+ params.sampling.n_prev = std::max(params.sampling.n_prev, params.sampling.penalty_last_n);
+ params.sampling.user_sampling_config |= common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_PENALTY_LAST_N;
+ }
+ ).set_sparam());
+ add_opt(common_arg(
+ {"--repeat-penalty"}, "N",
+ string_format("penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)", (double)params.sampling.penalty_repeat),
+ [](common_params & params, const std::string & value) {
+ params.sampling.penalty_repeat = std::stof(value);
+ params.sampling.user_sampling_config |= common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_PENALTY_REPEAT;
+ }
+ ).set_sparam());
+ add_opt(common_arg(
+ {"--presence-penalty"}, "N",
+ string_format("repeat alpha presence penalty (default: %.1f, 0.0 = disabled)", (double)params.sampling.penalty_present),
+ [](common_params & params, const std::string & value) {
+ params.sampling.penalty_present = std::stof(value);
+ }
+ ).set_sparam());
+ add_opt(common_arg(
+ {"--frequency-penalty"}, "N",
+ string_format("repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)", (double)params.sampling.penalty_freq),
+ [](common_params & params, const std::string & value) {
+ params.sampling.penalty_freq = std::stof(value);
+ }
+ ).set_sparam());
+ add_opt(common_arg(
+ {"--dry-multiplier"}, "N",
+ string_format("set DRY sampling multiplier (default: %.1f, 0.0 = disabled)", (double)params.sampling.dry_multiplier),
+ [](common_params & params, const std::string & value) {
+ params.sampling.dry_multiplier = std::stof(value);
+ }
+ ).set_sparam());
+ add_opt(common_arg(
+ {"--dry-base"}, "N",
+ string_format("set DRY sampling base value (default: %.2f)", (double)params.sampling.dry_base),
+ [](common_params & params, const std::string & value) {
+ float potential_base = std::stof(value);
+            if (potential_base >= 1.0f) {
+ params.sampling.dry_base = potential_base;
+ }
+ }
+ ).set_sparam());
+ add_opt(common_arg(
+ {"--dry-allowed-length"}, "N",
+ string_format("set allowed length for DRY sampling (default: %d)", params.sampling.dry_allowed_length),
+ [](common_params & params, int value) {
+ params.sampling.dry_allowed_length = value;
+ }
+ ).set_sparam());
+ add_opt(common_arg(
+ {"--dry-penalty-last-n"}, "N",
+ string_format("set DRY penalty for the last n tokens (default: %d, 0 = disable, -1 = context size)", params.sampling.dry_penalty_last_n),
+ [](common_params & params, int value) {
+ if (value < -1) {
+ throw std::runtime_error(string_format("error: invalid dry-penalty-last-n = %d\n", value));
+ }
+ params.sampling.dry_penalty_last_n = value;
+ }
+ ).set_sparam());
+ add_opt(common_arg(
+ {"--dry-sequence-breaker"}, "STRING",
+ string_format("add sequence breaker for DRY sampling, clearing out default breakers (%s) in the process; use \"none\" to not use any sequence breakers\n",
+ params.sampling.dry_sequence_breakers.empty() ? "none" :
+ std::accumulate(std::next(params.sampling.dry_sequence_breakers.begin()),
+ params.sampling.dry_sequence_breakers.end(),
+ std::string("'") + (params.sampling.dry_sequence_breakers[0] == "\n" ? "\\n" : params.sampling.dry_sequence_breakers[0]) + "'",
+ [](const std::string& a, const std::string& b) {
+ std::string formatted_b = (b == "\n") ? "\\n" : b;
+ return a + ", '" + formatted_b + "'";
+ }).c_str()),
+ [](common_params & params, const std::string & value) {
+ static bool defaults_cleared = false;
+
+ if (!defaults_cleared) {
+ params.sampling.dry_sequence_breakers.clear();
+ defaults_cleared = true;
+ }
+
+ if (value == "none") {
+ params.sampling.dry_sequence_breakers.clear();
+ } else {
+ params.sampling.dry_sequence_breakers.emplace_back(value);
+ }
+ }
+ ).set_sparam());
+ add_opt(common_arg(
+ {"--dynatemp-range"}, "N",
+ string_format("dynamic temperature range (default: %.1f, 0.0 = disabled)", (double)params.sampling.dynatemp_range),
+ [](common_params & params, const std::string & value) {
+ params.sampling.dynatemp_range = std::stof(value);
+ }
+ ).set_sparam());
+ add_opt(common_arg(
+ {"--dynatemp-exp"}, "N",
+ string_format("dynamic temperature exponent (default: %.1f)", (double)params.sampling.dynatemp_exponent),
+ [](common_params & params, const std::string & value) {
+ params.sampling.dynatemp_exponent = std::stof(value);
+ }
+ ).set_sparam());
+ add_opt(common_arg(
+ {"--mirostat"}, "N",
+ string_format("use Mirostat sampling.\nTop K, Nucleus and Locally Typical samplers are ignored if used.\n"
+ "(default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)", params.sampling.mirostat),
+ [](common_params & params, int value) {
+ params.sampling.mirostat = value;
+ params.sampling.user_sampling_config |= common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_MIROSTAT;
+ }
+ ).set_sparam());
+ add_opt(common_arg(
+ {"--mirostat-lr"}, "N",
+ string_format("Mirostat learning rate, parameter eta (default: %.1f)", (double)params.sampling.mirostat_eta),
+ [](common_params & params, const std::string & value) {
+ params.sampling.mirostat_eta = std::stof(value);
+ params.sampling.user_sampling_config |= common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_MIROSTAT_ETA;
+ }
+ ).set_sparam());
+ add_opt(common_arg(
+ {"--mirostat-ent"}, "N",
+ string_format("Mirostat target entropy, parameter tau (default: %.1f)", (double)params.sampling.mirostat_tau),
+ [](common_params & params, const std::string & value) {
+ params.sampling.mirostat_tau = std::stof(value);
+ params.sampling.user_sampling_config |= common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_MIROSTAT_TAU;
+ }
+ ).set_sparam());
+ add_opt(common_arg(
+ {"-l", "--logit-bias"}, "TOKEN_ID(+/-)BIAS",
+ "modifies the likelihood of token appearing in the completion,\n"
+ "i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',\n"
+ "or `--logit-bias 15043-1` to decrease likelihood of token ' Hello'",
+ [](common_params & params, const std::string & value) {
+ std::stringstream ss(value);
+ llama_token key;
+ char sign;
+ std::string value_str;
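+            // expected form: TOKEN_ID immediately followed by '+' or '-' and the bias value,
+            // e.g. "15043+1" or "15043-1.5" (see the option description above)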
+ try {
+ if (ss >> key && ss >> sign && std::getline(ss, value_str) && (sign == '+' || sign == '-')) {
+ const float bias = std::stof(value_str) * ((sign == '-') ? -1.0f : 1.0f);
+ params.sampling.logit_bias.push_back({key, bias});
+ } else {
+ throw std::invalid_argument("invalid input format");
+ }
+ } catch (const std::exception&) {
+ throw std::invalid_argument("invalid input format");
+ }
+ }
+ ).set_sparam());
+ add_opt(common_arg(
+ {"--grammar"}, "GRAMMAR",
+ string_format("BNF-like grammar to constrain generations (see samples in grammars/ dir) (default: '%s')", params.sampling.grammar.c_str()),
+ [](common_params & params, const std::string & value) {
+ params.sampling.grammar = value;
+ }
+ ).set_sparam());
+ add_opt(common_arg(
+ {"--grammar-file"}, "FNAME",
+ "file to read grammar from",
+ [](common_params & params, const std::string & value) {
+ params.sampling.grammar = read_file(value);
+ }
+ ).set_sparam());
+ add_opt(common_arg(
+ {"-j", "--json-schema"}, "SCHEMA",
+ "JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object\nFor schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead",
+ [](common_params & params, const std::string & value) {
+ params.sampling.grammar = json_schema_to_grammar(json::parse(value));
+ }
+ ).set_sparam());
+ add_opt(common_arg(
+ {"-jf", "--json-schema-file"}, "FILE",
+ "File containing a JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object\nFor schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead",
+ [](common_params & params, const std::string & value) {
+ std::ifstream file(value);
+ if (!file) {
+ throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str()));
+ }
+ std::string schema;
+ std::copy(
+                std::istreambuf_iterator<char>(file),
+                std::istreambuf_iterator<char>(),
+ std::back_inserter(schema)
+ );
+ params.sampling.grammar = json_schema_to_grammar(json::parse(schema));
+ }
+ ).set_sparam());
+ add_opt(common_arg(
+ {"-bs", "--backend-sampling"},
+ "enable backend sampling (experimental) (default: disabled)",
+ [](common_params & params) {
+ params.sampling.backend_sampling = true;
+ }
+ ).set_sparam().set_env("LLAMA_ARG_BACKEND_SAMPLING"));
+ add_opt(common_arg(
+ {"--pooling"}, "{none,mean,cls,last,rank}",
+ "pooling type for embeddings, use model default if unspecified",
+ [](common_params & params, const std::string & value) {
+ /**/ if (value == "none") { params.pooling_type = LLAMA_POOLING_TYPE_NONE; }
+ else if (value == "mean") { params.pooling_type = LLAMA_POOLING_TYPE_MEAN; }
+ else if (value == "cls") { params.pooling_type = LLAMA_POOLING_TYPE_CLS; }
+ else if (value == "last") { params.pooling_type = LLAMA_POOLING_TYPE_LAST; }
+ else if (value == "rank") { params.pooling_type = LLAMA_POOLING_TYPE_RANK; }
+ else { throw std::invalid_argument("invalid value"); }
+ }
+ ).set_examples({LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_RETRIEVAL, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_DEBUG}).set_env("LLAMA_ARG_POOLING"));
+ add_opt(common_arg(
+ {"--attention"}, "{causal,non-causal}",
+ "attention type for embeddings, use model default if unspecified",
+ [](common_params & params, const std::string & value) {
+ /**/ if (value == "causal") { params.attention_type = LLAMA_ATTENTION_TYPE_CAUSAL; }
+ else if (value == "non-causal") { params.attention_type = LLAMA_ATTENTION_TYPE_NON_CAUSAL; }
+ else { throw std::invalid_argument("invalid value"); }
+ }
+ ).set_examples({LLAMA_EXAMPLE_EMBEDDING}));
+ add_opt(common_arg(
+ {"--rope-scaling"}, "{none,linear,yarn}",
+ "RoPE frequency scaling method, defaults to linear unless specified by the model",
+ [](common_params & params, const std::string & value) {
+ /**/ if (value == "none") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_NONE; }
+ else if (value == "linear") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_LINEAR; }
+ else if (value == "yarn") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_YARN; }
+ else { throw std::invalid_argument("invalid value"); }
+ }
+ ).set_env("LLAMA_ARG_ROPE_SCALING_TYPE"));
+ add_opt(common_arg(
+ {"--rope-scale"}, "N",
+ "RoPE context scaling factor, expands context by a factor of N",
+ [](common_params & params, const std::string & value) {
+ params.rope_freq_scale = 1.0f / std::stof(value);
+ }
+ ).set_env("LLAMA_ARG_ROPE_SCALE"));
+ add_opt(common_arg(
+ {"--rope-freq-base"}, "N",
+ "RoPE base frequency, used by NTK-aware scaling (default: loaded from model)",
+ [](common_params & params, const std::string & value) {
+ params.rope_freq_base = std::stof(value);
+ }
+ ).set_env("LLAMA_ARG_ROPE_FREQ_BASE"));
+ add_opt(common_arg(
+ {"--rope-freq-scale"}, "N",
+ "RoPE frequency scaling factor, expands context by a factor of 1/N",
+ [](common_params & params, const std::string & value) {
+ params.rope_freq_scale = std::stof(value);
+ }
+ ).set_env("LLAMA_ARG_ROPE_FREQ_SCALE"));
+ add_opt(common_arg(
+ {"--yarn-orig-ctx"}, "N",
+ string_format("YaRN: original context size of model (default: %d = model training context size)", params.yarn_orig_ctx),
+ [](common_params & params, int value) {
+ params.yarn_orig_ctx = value;
+ }
+ ).set_env("LLAMA_ARG_YARN_ORIG_CTX"));
+ add_opt(common_arg(
+ {"--yarn-ext-factor"}, "N",
+ string_format("YaRN: extrapolation mix factor (default: %.1f, 0.0 = full interpolation)", (double)params.yarn_ext_factor),
+ [](common_params & params, const std::string & value) {
+ params.yarn_ext_factor = std::stof(value);
+ }
+ ).set_env("LLAMA_ARG_YARN_EXT_FACTOR"));
+ add_opt(common_arg(
+ {"--yarn-attn-factor"}, "N",
+ string_format("YaRN: scale sqrt(t) or attention magnitude (default: %.1f)", (double)params.yarn_attn_factor),
+ [](common_params & params, const std::string & value) {
+ params.yarn_attn_factor = std::stof(value);
+ }
+ ).set_env("LLAMA_ARG_YARN_ATTN_FACTOR"));
+ add_opt(common_arg(
+ {"--yarn-beta-slow"}, "N",
+ string_format("YaRN: high correction dim or alpha (default: %.1f)", (double)params.yarn_beta_slow),
+ [](common_params & params, const std::string & value) {
+ params.yarn_beta_slow = std::stof(value);
+ }
+ ).set_env("LLAMA_ARG_YARN_BETA_SLOW"));
+ add_opt(common_arg(
+ {"--yarn-beta-fast"}, "N",
+ string_format("YaRN: low correction dim or beta (default: %.1f)", (double)params.yarn_beta_fast),
+ [](common_params & params, const std::string & value) {
+ params.yarn_beta_fast = std::stof(value);
+ }
+ ).set_env("LLAMA_ARG_YARN_BETA_FAST"));
+ add_opt(common_arg(
+ {"-gan", "--grp-attn-n"}, "N",
+ string_format("group-attention factor (default: %d)", params.grp_attn_n),
+ [](common_params & params, int value) {
+ params.grp_attn_n = value;
+ }
+ ).set_env("LLAMA_ARG_GRP_ATTN_N").set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_PASSKEY}));
+ add_opt(common_arg(
+ {"-gaw", "--grp-attn-w"}, "N",
+ string_format("group-attention width (default: %d)", params.grp_attn_w),
+ [](common_params & params, int value) {
+ params.grp_attn_w = value;
+ }
+ ).set_env("LLAMA_ARG_GRP_ATTN_W").set_examples({LLAMA_EXAMPLE_COMPLETION}));
+ add_opt(common_arg(
+ {"-kvo", "--kv-offload"},
+ {"-nkvo", "--no-kv-offload"},
+ string_format("whether to enable KV cache offloading (default: %s)", params.no_kv_offload ? "disabled" : "enabled"),
+ [](common_params & params, bool value) {
+ params.no_kv_offload = !value;
+ }
+ ).set_env("LLAMA_ARG_KV_OFFLOAD"));
+ add_opt(common_arg(
+ {"--repack"},
+ {"-nr", "--no-repack"},
+ string_format("whether to enable weight repacking (default: %s)", params.no_extra_bufts ? "disabled" : "enabled"),
+ [](common_params & params, bool value) {
+ params.no_extra_bufts = !value;
+ }
+ ).set_env("LLAMA_ARG_REPACK"));
+ add_opt(common_arg(
+ {"--no-host"},
+ "bypass host buffer allowing extra buffers to be used",
+ [](common_params & params) {
+ params.no_host = true;
+ }
+ ).set_env("LLAMA_ARG_NO_HOST"));
+ add_opt(common_arg(
+ {"-ctk", "--cache-type-k"}, "TYPE",
+ string_format(
+ "KV cache data type for K\n"
+ "allowed values: %s\n"
+ "(default: %s)",
+ get_all_kv_cache_types().c_str(),
+ ggml_type_name(params.cache_type_k)
+ ),
+ [](common_params & params, const std::string & value) {
+ params.cache_type_k = kv_cache_type_from_str(value);
+ }
+ ).set_env("LLAMA_ARG_CACHE_TYPE_K"));
+ add_opt(common_arg(
+ {"-ctv", "--cache-type-v"}, "TYPE",
+ string_format(
+ "KV cache data type for V\n"
+ "allowed values: %s\n"
+ "(default: %s)",
+ get_all_kv_cache_types().c_str(),
+ ggml_type_name(params.cache_type_v)
+ ),
+ [](common_params & params, const std::string & value) {
+ params.cache_type_v = kv_cache_type_from_str(value);
+ }
+ ).set_env("LLAMA_ARG_CACHE_TYPE_V"));
+ add_opt(common_arg(
+ {"--hellaswag"},
+ "compute HellaSwag score over random tasks from datafile supplied with -f",
+ [](common_params & params) {
+ params.hellaswag = true;
+ }
+ ).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
+ add_opt(common_arg(
+ {"--hellaswag-tasks"}, "N",
+ string_format("number of tasks to use when computing the HellaSwag score (default: %zu)", params.hellaswag_tasks),
+ [](common_params & params, int value) {
+ params.hellaswag_tasks = value;
+ }
+ ).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
+ add_opt(common_arg(
+ {"--winogrande"},
+ "compute Winogrande score over random tasks from datafile supplied with -f",
+ [](common_params & params) {
+ params.winogrande = true;
+ }
+ ).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
+ add_opt(common_arg(
+ {"--winogrande-tasks"}, "N",
+ string_format("number of tasks to use when computing the Winogrande score (default: %zu)", params.winogrande_tasks),
+ [](common_params & params, int value) {
+ params.winogrande_tasks = value;
+ }
+ ).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
+ add_opt(common_arg(
+ {"--multiple-choice"},
+ "compute multiple choice score over random tasks from datafile supplied with -f",
+ [](common_params & params) {
+ params.multiple_choice = true;
+ }
+ ).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
+ add_opt(common_arg(
+ {"--multiple-choice-tasks"}, "N",
+ string_format("number of tasks to use when computing the multiple choice score (default: %zu)", params.multiple_choice_tasks),
+ [](common_params & params, int value) {
+ params.multiple_choice_tasks = value;
+ }
+ ).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
+ add_opt(common_arg(
+ {"--kl-divergence"},
+ "computes KL-divergence to logits provided via --kl-divergence-base",
+ [](common_params & params) {
+ params.kl_divergence = true;
+ }
+ ).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
+ add_opt(common_arg(
+ {"--save-all-logits", "--kl-divergence-base"}, "FNAME",
+ "set logits file",
+ [](common_params & params, const std::string & value) {
+ params.logits_file = value;
+ }
+ ).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
+ add_opt(common_arg(
+ {"--ppl-stride"}, "N",
+ string_format("stride for perplexity calculation (default: %d)", params.ppl_stride),
+ [](common_params & params, int value) {
+ params.ppl_stride = value;
+ }
+ ).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
+ add_opt(common_arg(
+ {"--ppl-output-type"}, "<0|1>",
+ string_format("output type for perplexity calculation (default: %d)", params.ppl_output_type),
+ [](common_params & params, int value) {
+ params.ppl_output_type = value;
+ }
+ ).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
+ add_opt(common_arg(
+ {"-dt", "--defrag-thold"}, "N",
+ string_format("KV cache defragmentation threshold (DEPRECATED)"),
+ [](common_params & params, const std::string & value) {
+ GGML_UNUSED(params);
+ GGML_UNUSED(value);
+ LOG_WRN("DEPRECATED: --defrag-thold is deprecated and no longer necessary to specify\n");
+ }
+ ).set_env("LLAMA_ARG_DEFRAG_THOLD"));
+ if (ex == LLAMA_EXAMPLE_SERVER) {
+ // this is to make sure this option appears in the server-specific section of the help message
+ add_opt(common_arg(
+ {"-np", "--parallel"}, "N",
+ string_format("number of server slots (default: %d, -1 = auto)", params.n_parallel),
+ [](common_params & params, int value) {
+ if (value == 0) {
+ throw std::invalid_argument("error: invalid value for n_parallel\n");
+ }
+ params.n_parallel = value;
+ }
+ ).set_env("LLAMA_ARG_N_PARALLEL").set_examples({LLAMA_EXAMPLE_SERVER}));
+ } else {
+ add_opt(common_arg(
+ {"-np", "--parallel"}, "N",
+ string_format("number of parallel sequences to decode (default: %d)", params.n_parallel),
+ [](common_params & params, int value) {
+ params.n_parallel = value;
+ }
+ ).set_env("LLAMA_ARG_N_PARALLEL"));
+ }
+ add_opt(common_arg(
+ {"-ns", "--sequences"}, "N",
+ string_format("number of sequences to decode (default: %d)", params.n_sequences),
+ [](common_params & params, int value) {
+ params.n_sequences = value;
+ }
+ ).set_examples({LLAMA_EXAMPLE_PARALLEL}));
+ add_opt(common_arg(
+ {"-cb", "--cont-batching"},
+ {"-nocb", "--no-cont-batching"},
+ string_format("whether to enable continuous batching (a.k.a dynamic batching) (default: %s)", params.cont_batching ? "enabled" : "disabled"),
+ [](common_params & params, bool value) {
+ params.cont_batching = value;
+ }
+ ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CONT_BATCHING"));
+ add_opt(common_arg(
+ {"-mm", "--mmproj"}, "FILE",
+ "path to a multimodal projector file. see tools/mtmd/README.md\n"
+ "note: if -hf is used, this argument can be omitted",
+ [](common_params & params, const std::string & value) {
+ params.mmproj.path = value;
+ }
+ ).set_examples(mmproj_examples).set_env("LLAMA_ARG_MMPROJ"));
+ add_opt(common_arg(
+ {"-mmu", "--mmproj-url"}, "URL",
+ "URL to a multimodal projector file. see tools/mtmd/README.md",
+ [](common_params & params, const std::string & value) {
+ params.mmproj.url = value;
+ }
+ ).set_examples(mmproj_examples).set_env("LLAMA_ARG_MMPROJ_URL"));
+ add_opt(common_arg(
+ {"--mmproj-auto"},
+ {"--no-mmproj", "--no-mmproj-auto"},
+ string_format("whether to use multimodal projector file (if available), useful when using -hf (default: %s)", params.no_mmproj ? "disabled" : "enabled"),
+ [](common_params & params, bool value) {
+ params.no_mmproj = !value;
+ }
+ ).set_examples(mmproj_examples).set_env("LLAMA_ARG_MMPROJ_AUTO"));
+ add_opt(common_arg(
+ {"--mmproj-offload"},
+ {"--no-mmproj-offload"},
+ string_format("whether to enable GPU offloading for multimodal projector (default: %s)", params.mmproj_use_gpu ? "enabled" : "disabled"),
+ [](common_params & params, bool value) {
+ params.mmproj_use_gpu = value;
+ }
+ ).set_examples(mmproj_examples).set_env("LLAMA_ARG_MMPROJ_OFFLOAD"));
+ add_opt(common_arg(
+ {"--image", "--audio"}, "FILE",
+ "path to an image or audio file. use with multimodal models, use comma-separated values for multiple files\n",
+ [](common_params & params, const std::string & value) {
+ for (const auto & item : parse_csv_row(value)) {
+ params.image.emplace_back(item);
+ }
+ }
+ ).set_examples({LLAMA_EXAMPLE_MTMD, LLAMA_EXAMPLE_CLI}));
+ add_opt(common_arg(
+ {"--image-min-tokens"}, "N",
+ "minimum number of tokens each image can take, only used by vision models with dynamic resolution (default: read from model)",
+ [](common_params & params, int value) {
+ params.image_min_tokens = value;
+ }
+ ).set_examples(mmproj_examples).set_env("LLAMA_ARG_IMAGE_MIN_TOKENS"));
+ add_opt(common_arg(
+ {"--image-max-tokens"}, "N",
+ "maximum number of tokens each image can take, only used by vision models with dynamic resolution (default: read from model)",
+ [](common_params & params, int value) {
+ params.image_max_tokens = value;
+ }
+ ).set_examples(mmproj_examples).set_env("LLAMA_ARG_IMAGE_MAX_TOKENS"));
+ if (llama_supports_rpc()) {
+ add_opt(common_arg(
+ {"--rpc"}, "SERVERS",
+ "comma separated list of RPC servers (host:port)",
+ [](common_params & params, const std::string & value) {
+ add_rpc_devices(value);
+ GGML_UNUSED(params);
+ }
+ ).set_env("LLAMA_ARG_RPC"));
+ }
+ add_opt(common_arg(
+ {"--mlock"},
+ "force system to keep model in RAM rather than swapping or compressing",
+ [](common_params & params) {
+ params.use_mlock = true;
+ }
+ ).set_env("LLAMA_ARG_MLOCK"));
+ add_opt(common_arg(
+ {"--mmap"},
+ {"--no-mmap"},
+ string_format("whether to memory-map model. Explicitly enabling mmap disables direct-io. (if mmap disabled, slower load but may reduce pageouts if not using mlock) (default: %s)", params.use_mmap ? "enabled" : "disabled"),
+ [](common_params & params, bool value) {
+ params.use_mmap = value;
+ if (value) {
+ params.use_direct_io = false; // disable direct io when mmap is explicitly enabled
+ }
+ }
+ ).set_env("LLAMA_ARG_MMAP"));
+ add_opt(common_arg(
+ {"-dio", "--direct-io"},
+ {"-ndio", "--no-direct-io"},
+ string_format("use DirectIO if available. Takes precedence over --mmap (default: %s)", params.use_direct_io ? "enabled" : "disabled"),
+ [](common_params & params, bool value) {
+ params.use_direct_io = value;
+ }
+ ).set_env("LLAMA_ARG_DIO"));
+ add_opt(common_arg(
+ {"--numa"}, "TYPE",
+ "attempt optimizations that help on some NUMA systems\n"
+ "- distribute: spread execution evenly over all nodes\n"
+ "- isolate: only spawn threads on CPUs on the node that execution started on\n"
+ "- numactl: use the CPU map provided by numactl\n"
+ "if run without this previously, it is recommended to drop the system page cache before using this\n"
+ "see https://github.com/ggml-org/llama.cpp/issues/1437",
+ [](common_params & params, const std::string & value) {
+ /**/ if (value == "distribute" || value == "") { params.numa = GGML_NUMA_STRATEGY_DISTRIBUTE; }
+ else if (value == "isolate") { params.numa = GGML_NUMA_STRATEGY_ISOLATE; }
+ else if (value == "numactl") { params.numa = GGML_NUMA_STRATEGY_NUMACTL; }
+ else { throw std::invalid_argument("invalid value"); }
+ }
+ ).set_env("LLAMA_ARG_NUMA"));
+ add_opt(common_arg(
+ {"-dev", "--device"}, "",
+ "comma-separated list of devices to use for offloading (none = don't offload)\n"
+ "use --list-devices to see a list of available devices",
+ [](common_params & params, const std::string & value) {
+ params.devices = parse_device_list(value);
+ }
+ ).set_env("LLAMA_ARG_DEVICE"));
+ add_opt(common_arg(
+ {"--list-devices"},
+ "print list of available devices and exit",
+ [](common_params &) {
+            std::vector<ggml_backend_dev_t> devices;
+ for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
+ auto * dev = ggml_backend_dev_get(i);
+ if (ggml_backend_dev_type(dev) != GGML_BACKEND_DEVICE_TYPE_CPU) {
+ devices.push_back(dev);
+ }
+ }
+ printf("Available devices:\n");
+ for (auto * dev : devices) {
+ size_t free, total;
+ ggml_backend_dev_memory(dev, &free, &total);
+ printf(" %s: %s (%zu MiB, %zu MiB free)\n", ggml_backend_dev_name(dev), ggml_backend_dev_description(dev), total / 1024 / 1024, free / 1024 / 1024);
+ }
+ exit(0);
+ }
+ ));
+ add_opt(common_arg(
+ {"-ot", "--override-tensor"}, "=,...",
+ "override tensor buffer type", [](common_params & params, const std::string & value) {
+ parse_tensor_buffer_overrides(value, params.tensor_buft_overrides);
+ }
+ ).set_env("LLAMA_ARG_OVERRIDE_TENSOR"));
+ add_opt(common_arg(
+ {"-otd", "--override-tensor-draft"}, "=,...",
+ "override tensor buffer type for draft model", [](common_params & params, const std::string & value) {
+ parse_tensor_buffer_overrides(value, params.speculative.tensor_buft_overrides);
+ }
+ ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
+ add_opt(common_arg(
+ {"-cmoe", "--cpu-moe"},
+ "keep all Mixture of Experts (MoE) weights in the CPU",
+ [](common_params & params) {
+ params.tensor_buft_overrides.push_back(llm_ffn_exps_cpu_override());
+ }
+ ).set_env("LLAMA_ARG_CPU_MOE"));
+ add_opt(common_arg(
+ {"-ncmoe", "--n-cpu-moe"}, "N",
+ "keep the Mixture of Experts (MoE) weights of the first N layers in the CPU",
+ [](common_params & params, int value) {
+ if (value < 0) {
+ throw std::invalid_argument("invalid value");
+ }
+ for (int i = 0; i < value; ++i) {
+                // keep strings alive and avoid leaking memory by storing them in a static list
+                static std::list<std::string> buft_overrides;
+ buft_overrides.push_back(llm_ffn_exps_block_regex(i));
+ params.tensor_buft_overrides.push_back({buft_overrides.back().c_str(), ggml_backend_cpu_buffer_type()});
+ }
+ }
+ ).set_env("LLAMA_ARG_N_CPU_MOE"));
+ add_opt(common_arg(
+ {"-cmoed", "--cpu-moe-draft"},
+ "keep all Mixture of Experts (MoE) weights in the CPU for the draft model",
+ [](common_params & params) {
+ params.speculative.tensor_buft_overrides.push_back(llm_ffn_exps_cpu_override());
+ }
+ ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_CPU_MOE_DRAFT"));
+ add_opt(common_arg(
+ {"-ncmoed", "--n-cpu-moe-draft"}, "N",
+ "keep the Mixture of Experts (MoE) weights of the first N layers in the CPU for the draft model",
+ [](common_params & params, int value) {
+ if (value < 0) {
+ throw std::invalid_argument("invalid value");
+ }
+ for (int i = 0; i < value; ++i) {
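+                // keep strings alive for the lifetime of the process (same rationale as --n-cpu-moe above)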
+                static std::list<std::string> buft_overrides_draft;
+ buft_overrides_draft.push_back(llm_ffn_exps_block_regex(i));
+ params.speculative.tensor_buft_overrides.push_back({buft_overrides_draft.back().c_str(), ggml_backend_cpu_buffer_type()});
+ }
+ }
+ ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_N_CPU_MOE_DRAFT"));
+ GGML_ASSERT(params.n_gpu_layers < 0); // string_format would need to be extended for a default >= 0
+ add_opt(common_arg(
+ {"-ngl", "--gpu-layers", "--n-gpu-layers"}, "N",
+ string_format("max. number of layers to store in VRAM, either an exact number, 'auto', or 'all' (default: %s)", params.n_gpu_layers == -1 ? "auto" : "all"),
+ [](common_params & params, const std::string & value) {
+ if (value == "auto") {
+ params.n_gpu_layers = -1;
+ } else if (value == "all") {
+ params.n_gpu_layers = -2;
+ } else {
+ params.n_gpu_layers = std::stoi(value);
+ }
+ if (!llama_supports_gpu_offload()) {
+ fprintf(stderr, "warning: no usable GPU found, --gpu-layers option will be ignored\n");
+ fprintf(stderr, "warning: one possible reason is that llama.cpp was compiled without GPU support\n");
+ fprintf(stderr, "warning: consult docs/build.md for compilation instructions\n");
+ }
+ }
+ ).set_env("LLAMA_ARG_N_GPU_LAYERS"));
+ add_opt(common_arg(
+ {"-sm", "--split-mode"}, "{none,layer,row}",
+ "how to split the model across multiple GPUs, one of:\n"
+ "- none: use one GPU only\n"
+ "- layer (default): split layers and KV across GPUs\n"
+ "- row: split rows across GPUs",
+ [](common_params & params, const std::string & value) {
+ std::string arg_next = value;
+ if (arg_next == "none") {
+ params.split_mode = LLAMA_SPLIT_MODE_NONE;
+ } else if (arg_next == "layer") {
+ params.split_mode = LLAMA_SPLIT_MODE_LAYER;
+ } else if (arg_next == "row") {
+ params.split_mode = LLAMA_SPLIT_MODE_ROW;
+ } else {
+ throw std::invalid_argument("invalid value");
+ }
+ if (!llama_supports_gpu_offload()) {
+ fprintf(stderr, "warning: llama.cpp was compiled without support for GPU offload. Setting the split mode has no effect.\n");
+ }
+ }
+ ).set_env("LLAMA_ARG_SPLIT_MODE"));
+ add_opt(common_arg(
+ {"-ts", "--tensor-split"}, "N0,N1,N2,...",
+ "fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1",
+ [](common_params & params, const std::string & value) {
+ std::string arg_next = value;
+
+ // split string by , and /
+ const std::regex regex{ R"([,/]+)" };
+ std::sregex_token_iterator it{ arg_next.begin(), arg_next.end(), regex, -1 };
+            std::vector<std::string> split_arg{ it, {} };
+ if (split_arg.size() >= llama_max_devices()) {
+ throw std::invalid_argument(
+ string_format("got %zu input configs, but system only has %zu devices", split_arg.size(), llama_max_devices())
+ );
+ }
+ for (size_t i = 0; i < llama_max_devices(); ++i) {
+ if (i < split_arg.size()) {
+ params.tensor_split[i] = std::stof(split_arg[i]);
+ } else {
+ params.tensor_split[i] = 0.0f;
+ }
+ }
+ if (!llama_supports_gpu_offload()) {
+ fprintf(stderr, "warning: llama.cpp was compiled without support for GPU offload. Setting a tensor split has no effect.\n");
+ }
+ }
+ ).set_env("LLAMA_ARG_TENSOR_SPLIT"));
+ add_opt(common_arg(
+ {"-mg", "--main-gpu"}, "INDEX",
+ string_format("the GPU to use for the model (with split-mode = none), or for intermediate results and KV (with split-mode = row) (default: %d)", params.main_gpu),
+ [](common_params & params, int value) {
+ params.main_gpu = value;
+ if (!llama_supports_gpu_offload()) {
+ fprintf(stderr, "warning: llama.cpp was compiled without support for GPU offload. Setting the main GPU has no effect.\n");
+ }
+ }
+ ).set_env("LLAMA_ARG_MAIN_GPU"));
+ add_opt(common_arg(
+ { "-fit", "--fit" }, "[on|off]",
+ string_format("whether to adjust unset arguments to fit in device memory ('on' or 'off', default: '%s')", params.fit_params ? "on" : "off"),
+ [](common_params & params, const std::string & value) {
+ if (is_truthy(value)) {
+ params.fit_params = true;
+ } else if (is_falsey(value)) {
+ params.fit_params = false;
+ } else {
+ throw std::runtime_error(
+                    string_format("error: unknown value for --fit: '%s'\n", value.c_str()));
+ }
+ }
+ ).set_env("LLAMA_ARG_FIT"));
+ add_opt(common_arg(
+ { "-fitt", "--fit-target" }, "MiB0,MiB1,MiB2,...",
+ string_format("target margin per device for --fit, comma-separated list of values, "
+ "single value is broadcast across all devices, default: %zu", params.fit_params_target[0]/(1024*1024)),
+ [](common_params & params, const std::string & value) {
+ std::string arg_next = value;
+
+ // split string by , and /
+ const std::regex regex{ R"([,/]+)" };
+ std::sregex_token_iterator it{ arg_next.begin(), arg_next.end(), regex, -1 };
+            std::vector<std::string> split_arg{ it, {} };
+ if (split_arg.size() >= llama_max_devices()) {
+ throw std::invalid_argument(
+ string_format("got %zu input configs, but system only has %zu devices", split_arg.size(), llama_max_devices())
+ );
+ }
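+            // a single value is broadcast to every device; otherwise values are assigned per device, in order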
+ if (split_arg.size() == 1) {
+ std::fill(params.fit_params_target.begin(), params.fit_params_target.end(), std::stoul(split_arg[0]) * 1024*1024);
+ return;
+ }
+ for (size_t i = 0; i < split_arg.size(); i++) {
+ params.fit_params_target[i] = std::stoul(split_arg[i]) * 1024*1024;
+ }
+ }
+ ).set_env("LLAMA_ARG_FIT_TARGET"));
+ add_opt(common_arg(
+ { "-fitc", "--fit-ctx" }, "N",
+ string_format("minimum ctx size that can be set by --fit option, default: %" PRIu32, params.fit_params_min_ctx),
+ [](common_params & params, int value) {
+ params.fit_params_min_ctx = value;
+ }
+ ).set_env("LLAMA_ARG_FIT_CTX"));
+ add_opt(common_arg(
+ {"--check-tensors"},
+ string_format("check model tensor data for invalid values (default: %s)", params.check_tensors ? "true" : "false"),
+ [](common_params & params) {
+ params.check_tensors = true;
+ }
+ ));
+ add_opt(common_arg(
+ {"--override-kv"}, "KEY=TYPE:VALUE,...",
+        "advanced option to override model metadata by key. to specify multiple overrides, use comma-separated values.\n"
+ "types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false,tokenizer.ggml.add_eos_token=bool:false",
+ [](common_params & params, const std::string & value) {
+ for (const auto & item : parse_csv_row(value)) {
+ if (!string_parse_kv_override(item.c_str(), params.kv_overrides)) {
+ throw std::runtime_error(string_format("error: Invalid type for KV override: %s\n", item.c_str()));
+ }
+ }
+ }
+ ));
+ add_opt(common_arg(
+ {"--op-offload"},
+ {"--no-op-offload"},
+ string_format("whether to offload host tensor operations to device (default: %s)", params.no_op_offload ? "false" : "true"),
+ [](common_params & params, bool value) {
+ params.no_op_offload = !value;
+ }
+ ));
+ add_opt(common_arg(
+ {"--lora"}, "FNAME",
+ "path to LoRA adapter (use comma-separated values to load multiple adapters)",
+ [](common_params & params, const std::string & value) {
+ for (const auto & item : parse_csv_row(value)) {
+ params.lora_adapters.push_back({ item, 1.0, "", "", nullptr });
+ }
+ }
+ // we define this arg on both COMMON and EXPORT_LORA, so when showing help message of export-lora, it will be categorized as "example-specific" arg
+ ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA}));
+ add_opt(common_arg(
+ {"--lora-scaled"}, "FNAME:SCALE,...",
+ "path to LoRA adapter with user defined scaling (format: FNAME:SCALE,...)\n"
+ "note: use comma-separated values",
+ [](common_params & params, const std::string & value) {
+ for (const auto & item : parse_csv_row(value)) {
+                auto parts = string_split<std::string>(item, ':');
+ if (parts.size() != 2) {
+ throw std::invalid_argument("lora-scaled format: FNAME:SCALE");
+ }
+ params.lora_adapters.push_back({ parts[0], std::stof(parts[1]), "", "", nullptr });
+ }
+ }
+ // we define this arg on both COMMON and EXPORT_LORA, so when showing help message of export-lora, it will be categorized as "example-specific" arg
+ ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA}));
+ add_opt(common_arg(
+ {"--control-vector"}, "FNAME",
+ "add a control vector\nnote: use comma-separated values to add multiple control vectors",
+ [](common_params & params, const std::string & value) {
+ for (const auto & item : parse_csv_row(value)) {
+ params.control_vectors.push_back({ 1.0f, item, });
+ }
+ }
+ ));
+ add_opt(common_arg(
+ {"--control-vector-scaled"}, "FNAME:SCALE,...",
+ "add a control vector with user defined scaling SCALE\n"
+ "note: use comma-separated values (format: FNAME:SCALE,...)",
+ [](common_params & params, const std::string & value) {
+ for (const auto & item : parse_csv_row(value)) {
+                auto parts = string_split<std::string>(item, ':');
+ if (parts.size() != 2) {
+ throw std::invalid_argument("control-vector-scaled format: FNAME:SCALE");
+ }
+ params.control_vectors.push_back({ std::stof(parts[1]), parts[0] });
+ }
+ }
+ ));
+ add_opt(common_arg(
+ {"--control-vector-layer-range"}, "START", "END",
+ "layer range to apply the control vector(s) to, start and end inclusive",
+ [](common_params & params, const std::string & start, const std::string & end) {
+ params.control_vector_layer_start = std::stoi(start);
+ params.control_vector_layer_end = std::stoi(end);
+ }
+ ));
+ add_opt(common_arg(
+ {"-a", "--alias"}, "STRING",
+ "set alias for model name (to be used by REST API)",
+ [](common_params & params, const std::string & value) {
+ params.model_alias = value;
+ }
+ ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ALIAS"));
+ add_opt(common_arg(
+ {"-m", "--model"}, "FNAME",
+ ex == LLAMA_EXAMPLE_EXPORT_LORA
+ ? "model path from which to load base model"
+ : "model path to load",
+ [](common_params & params, const std::string & value) {
+ params.model.path = value;
+ }
+ ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA}).set_env("LLAMA_ARG_MODEL"));
+ add_opt(common_arg(
+ {"-mu", "--model-url"}, "MODEL_URL",
+ "model download url (default: unused)",
+ [](common_params & params, const std::string & value) {
+ params.model.url = value;
+ }
+ ).set_env("LLAMA_ARG_MODEL_URL"));
+ add_opt(common_arg(
+        { "-dr", "--docker-repo" }, "[<repo>/]<model>[:quant]",
+        "Docker Hub model repository. repo is optional, defaults to ai/. quant is optional, defaults to :latest.\n"
+ "example: gemma3\n"
+ "(default: unused)",
+ [](common_params & params, const std::string & value) {
+ params.model.docker_repo = value;
+ }
+ ).set_env("LLAMA_ARG_DOCKER_REPO"));
+ add_opt(common_arg(
+        {"-hf", "-hfr", "--hf-repo"}, "<user>/<model>[:quant]",
+        "Hugging Face model repository; quant is optional, case-insensitive, defaults to Q4_K_M, or falls back to the first file in the repo if Q4_K_M doesn't exist.\n"
+ "mmproj is also downloaded automatically if available. to disable, add --no-mmproj\n"
+ "example: unsloth/phi-4-GGUF:q4_k_m\n"
+ "(default: unused)",
+ [](common_params & params, const std::string & value) {
+ params.model.hf_repo = value;
+ }
+ ).set_env("LLAMA_ARG_HF_REPO"));
+ add_opt(common_arg(
+        {"-hfd", "-hfrd", "--hf-repo-draft"}, "<user>/<model>[:quant]",
+ "Same as --hf-repo, but for the draft model (default: unused)",
+ [](common_params & params, const std::string & value) {
+ params.speculative.model.hf_repo = value;
+ }
+ ).set_env("LLAMA_ARG_HFD_REPO"));
+ add_opt(common_arg(
+ {"-hff", "--hf-file"}, "FILE",
+ "Hugging Face model file. If specified, it will override the quant in --hf-repo (default: unused)",
+ [](common_params & params, const std::string & value) {
+ params.model.hf_file = value;
+ }
+ ).set_env("LLAMA_ARG_HF_FILE"));
+ add_opt(common_arg(
+        {"-hfv", "-hfrv", "--hf-repo-v"}, "<user>/<model>[:quant]",
+ "Hugging Face model repository for the vocoder model (default: unused)",
+ [](common_params & params, const std::string & value) {
+ params.vocoder.model.hf_repo = value;
+ }
+ ).set_env("LLAMA_ARG_HF_REPO_V"));
+ add_opt(common_arg(
+ {"-hffv", "--hf-file-v"}, "FILE",
+ "Hugging Face model file for the vocoder model (default: unused)",
+ [](common_params & params, const std::string & value) {
+ params.vocoder.model.hf_file = value;
+ }
+ ).set_env("LLAMA_ARG_HF_FILE_V"));
+ add_opt(common_arg(
+ {"-hft", "--hf-token"}, "TOKEN",
+ "Hugging Face access token (default: value from HF_TOKEN environment variable)",
+ [](common_params & params, const std::string & value) {
+ params.hf_token = value;
+ }
+ ).set_env("HF_TOKEN"));
+ add_opt(common_arg(
+ {"--context-file"}, "FNAME",
+ "file to load context from (use comma-separated values to specify multiple files)",
+ [](common_params & params, const std::string & value) {
+ for (const auto & item : parse_csv_row(value)) {
+ std::ifstream file(item, std::ios::binary);
+ if (!file) {
+ throw std::runtime_error(string_format("error: failed to open file '%s'\n", item.c_str()));
+ }
+ params.context_files.push_back(item);
+ }
+ }
+ ).set_examples({LLAMA_EXAMPLE_RETRIEVAL}));
+ add_opt(common_arg(
+ {"--chunk-size"}, "N",
+ string_format("minimum length of embedded text chunks (default: %d)", params.chunk_size),
+ [](common_params & params, int value) {
+ params.chunk_size = value;
+ }
+ ).set_examples({LLAMA_EXAMPLE_RETRIEVAL}));
+ add_opt(common_arg(
+ {"--chunk-separator"}, "STRING",
+ string_format("separator between chunks (default: '%s')", params.chunk_separator.c_str()),
+ [](common_params & params, const std::string & value) {
+ params.chunk_separator = value;
+ }
+ ).set_examples({LLAMA_EXAMPLE_RETRIEVAL}));
+ add_opt(common_arg(
+ {"--junk"}, "N",
+ string_format("number of times to repeat the junk text (default: %d)", params.n_junk),
+ [](common_params & params, int value) {
+ params.n_junk = value;
+ }
+ ).set_examples({LLAMA_EXAMPLE_PASSKEY, LLAMA_EXAMPLE_PARALLEL}));
+ add_opt(common_arg(
+ {"--pos"}, "N",
+ string_format("position of the passkey in the junk text (default: %d)", params.i_pos),
+ [](common_params & params, int value) {
+ params.i_pos = value;
+ }
+ ).set_examples({LLAMA_EXAMPLE_PASSKEY}));
+ add_opt(common_arg(
+ {"-o", "--output", "--output-file"}, "FNAME",
+ string_format("output file (default: '%s')", params.out_file.c_str()),
+ [](common_params & params, const std::string & value) {
+ params.out_file = value;
+ }
+ ).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_CVECTOR_GENERATOR, LLAMA_EXAMPLE_EXPORT_LORA, LLAMA_EXAMPLE_TTS, LLAMA_EXAMPLE_FINETUNE}));
+ add_opt(common_arg(
+ {"-ofreq", "--output-frequency"}, "N",
+ string_format("output the imatrix every N iterations (default: %d)", params.n_out_freq),
+ [](common_params & params, int value) {
+ params.n_out_freq = value;
+ }
+ ).set_examples({LLAMA_EXAMPLE_IMATRIX}));
+ add_opt(common_arg(
+ {"--output-format"}, "{gguf,dat}",
+ string_format("output format for imatrix file (default: %s)", params.imat_dat > 0 ? "dat" : "gguf"),
+ [](common_params & params, const std::string & value) {
+ /**/ if (value == "gguf") { params.imat_dat = -1; }
+ else if (value == "dat") { params.imat_dat = 1; }
+ else { throw std::invalid_argument("invalid output format"); }
+ }
+ ).set_examples({LLAMA_EXAMPLE_IMATRIX}));
+ add_opt(common_arg(
+ {"--save-frequency"}, "N",
+ string_format("save an imatrix copy every N iterations (default: %d)", params.n_save_freq),
+ [](common_params & params, int value) {
+ params.n_save_freq = value;
+ }
+ ).set_examples({LLAMA_EXAMPLE_IMATRIX}));
+ add_opt(common_arg(
+ {"--process-output"},
+ string_format("collect data for the output tensor (default: %s)", params.process_output ? "true" : "false"),
+ [](common_params & params) {
+ params.process_output = true;
+ }
+ ).set_examples({LLAMA_EXAMPLE_IMATRIX}));
+ add_opt(common_arg(
+ {"--ppl"},
+ {"--no-ppl"},
+ string_format("whether to compute perplexity (default: %s)", params.compute_ppl ? "true" : "false"),
+ [](common_params & params, bool value) {
+ params.compute_ppl = value;
+ }
+ ).set_examples({LLAMA_EXAMPLE_IMATRIX}));
+ add_opt(common_arg(
+ {"--chunk", "--from-chunk"}, "N",
+ string_format("start processing the input from chunk N (default: %d)", params.i_chunk),
+ [](common_params & params, int value) {
+ params.i_chunk = value;
+ }
+ ).set_examples({LLAMA_EXAMPLE_IMATRIX}));
+ add_opt(common_arg(
+ {"--show-statistics"},
+ string_format("show imatrix statistics and then exit (default: %s)", params.show_statistics ? "true" : "false"),
+ [](common_params & params) {
+ params.show_statistics = true;
+ }
+ ).set_examples({LLAMA_EXAMPLE_IMATRIX}));
+ add_opt(common_arg(
+ {"--parse-special"},
+ string_format("parse special tokens (chat, tool, etc) (default: %s)", params.parse_special ? "true" : "false"),
+ [](common_params & params) {
+ params.parse_special = true;
+ }
+ ).set_examples({LLAMA_EXAMPLE_IMATRIX}));
+ add_opt(common_arg(
+ {"-pps"},
+ string_format("is the prompt shared across parallel sequences (default: %s)", params.is_pp_shared ? "true" : "false"),
+ [](common_params & params) {
+ params.is_pp_shared = true;
+ }
+ ).set_examples({LLAMA_EXAMPLE_BENCH, LLAMA_EXAMPLE_PARALLEL}));
+ add_opt(common_arg(
+ {"-tgs"},
+ string_format("is the text generation separated across the different sequences (default: %s)", params.is_tg_separate ? "true" : "false"),
+ [](common_params & params) {
+ params.is_tg_separate = true;
+ }
+ ).set_examples({LLAMA_EXAMPLE_BENCH, LLAMA_EXAMPLE_PARALLEL}));
+ add_opt(common_arg(
+ {"-npp"}, "n0,n1,...",
+ "number of prompt tokens",
+ [](common_params & params, const std::string & value) {
+            auto p = string_split<int>(value, ',');
+ params.n_pp.insert(params.n_pp.end(), p.begin(), p.end());
+ }
+ ).set_examples({LLAMA_EXAMPLE_BENCH}));
+ add_opt(common_arg(
+ {"-ntg"}, "n0,n1,...",
+ "number of text generation tokens",
+ [](common_params & params, const std::string & value) {
+            auto p = string_split<int>(value, ',');
+ params.n_tg.insert(params.n_tg.end(), p.begin(), p.end());
+ }
+ ).set_examples({LLAMA_EXAMPLE_BENCH}));
+ add_opt(common_arg(
+ {"-npl"}, "n0,n1,...",
+ "number of parallel prompts",
+ [](common_params & params, const std::string & value) {
+            auto p = string_split<int>(value, ',');
+ params.n_pl.insert(params.n_pl.end(), p.begin(), p.end());
+ }
+ ).set_examples({LLAMA_EXAMPLE_BENCH}));
+ add_opt(common_arg(
+ {"--embd-normalize"}, "N",
+ string_format("normalisation for embeddings (default: %d) (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)", params.embd_normalize),
+ [](common_params & params, int value) {
+ params.embd_normalize = value;
+ }
+ ).set_examples({LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_DEBUG}));
+ add_opt(common_arg(
+ {"--embd-output-format"}, "FORMAT",
+ "empty = default, \"array\" = [[],[]...], \"json\" = openai style, \"json+\" = same \"json\" + cosine similarity matrix, \"raw\" = plain whitespace-delimited output (one embedding per line)",
+ [](common_params & params, const std::string & value) {
+ params.embd_out = value;
+ }
+ ).set_examples({LLAMA_EXAMPLE_EMBEDDING}));
+ add_opt(common_arg(
+ {"--embd-separator"}, "STRING",
+ "separator of embeddings (default \\n) for example \"<#sep#>\"",
+ [](common_params & params, const std::string & value) {
+ params.embd_sep = value;
+ }
+ ).set_examples({LLAMA_EXAMPLE_EMBEDDING}));
+ add_opt(common_arg(
+ {"--cls-separator"}, "STRING",
+ "separator of classification sequences (default \\t) for example \"<#seq#>\"",
+ [](common_params & params, const std::string & value) {
+ params.cls_sep = value;
+ }
+ ).set_examples({LLAMA_EXAMPLE_EMBEDDING}));
+ add_opt(common_arg(
+ {"--host"}, "HOST",
+        string_format("IP address to listen on, or bind to a UNIX socket if the address ends with .sock (default: %s)", params.hostname.c_str()),
+ [](common_params & params, const std::string & value) {
+ params.hostname = value;
+ }
+ ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_HOST"));
+ add_opt(common_arg(
+ {"--port"}, "PORT",
+ string_format("port to listen (default: %d)", params.port),
+ [](common_params & params, int value) {
+ params.port = value;
+ }
+ ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_PORT"));
+ add_opt(common_arg(
+ {"--path"}, "PATH",
+ string_format("path to serve static files from (default: %s)", params.public_path.c_str()),
+ [](common_params & params, const std::string & value) {
+ params.public_path = value;
+ }
+ ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_STATIC_PATH"));
+ add_opt(common_arg(
+ {"--api-prefix"}, "PREFIX",
+ string_format("prefix path the server serves from, without the trailing slash (default: %s)", params.api_prefix.c_str()),
+ [](common_params & params, const std::string & value) {
+ params.api_prefix = value;
+ }
+ ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_API_PREFIX"));
+ add_opt(common_arg(
+ {"--webui-config"}, "JSON",
+ "JSON that provides default WebUI settings (overrides WebUI defaults)",
+ [](common_params & params, const std::string & value) {
+ params.webui_config_json = value;
+ }
+ ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_WEBUI_CONFIG"));
+ add_opt(common_arg(
+ {"--webui-config-file"}, "PATH",
+ "JSON file that provides default WebUI settings (overrides WebUI defaults)",
+ [](common_params & params, const std::string & value) {
+ params.webui_config_json = read_file(value);
+ }
+ ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_WEBUI_CONFIG_FILE"));
+ add_opt(common_arg(
+ {"--webui"},
+ {"--no-webui"},
+ string_format("whether to enable the Web UI (default: %s)", params.webui ? "enabled" : "disabled"),
+ [](common_params & params, bool value) {
+ params.webui = value;
+ }
+ ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_WEBUI"));
+ add_opt(common_arg(
+ {"--embedding", "--embeddings"},
+ string_format("restrict to only support embedding use case; use only with dedicated embedding models (default: %s)", params.embedding ? "enabled" : "disabled"),
+ [](common_params & params) {
+ params.embedding = true;
+ }
+ ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_DEBUG}).set_env("LLAMA_ARG_EMBEDDINGS"));
+ add_opt(common_arg(
+ {"--rerank", "--reranking"},
+ string_format("enable reranking endpoint on server (default: %s)", "disabled"),
+ [](common_params & params) {
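+            // reranking is served through the embedding pipeline with RANK pooling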
+ params.embedding = true;
+ params.pooling_type = LLAMA_POOLING_TYPE_RANK;
+ }
+ ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_RERANKING"));
+ add_opt(common_arg(
+ {"--api-key"}, "KEY",
+ "API key to use for authentication, multiple keys can be provided as a comma-separated list (default: none)",
+ [](common_params & params, const std::string & value) {
+ for (const auto & key : parse_csv_row(value)) {
+ if (!key.empty()) {
+ params.api_keys.push_back(key);
+ }
+ }
+ }
+ ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_API_KEY"));
+ add_opt(common_arg(
+ {"--api-key-file"}, "FNAME",
+ "path to file containing API keys (default: none)",
+ [](common_params & params, const std::string & value) {
+ std::ifstream key_file(value);
+ if (!key_file) {
+ throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str()));
+ }
+ std::string key;
+ while (std::getline(key_file, key)) {
+ if (!key.empty()) {
+ params.api_keys.push_back(key);
+ }
+ }
+ key_file.close();
+ }
+ ).set_examples({LLAMA_EXAMPLE_SERVER}));
+ add_opt(common_arg(
+ {"--ssl-key-file"}, "FNAME",
+        "path to file containing a PEM-encoded SSL private key",
+ [](common_params & params, const std::string & value) {
+ params.ssl_file_key = value;
+ }
+ ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_SSL_KEY_FILE"));
+ add_opt(common_arg(
+ {"--ssl-cert-file"}, "FNAME",
+        "path to file containing a PEM-encoded SSL certificate",
+ [](common_params & params, const std::string & value) {
+ params.ssl_file_cert = value;
+ }
+ ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_SSL_CERT_FILE"));
+ add_opt(common_arg(
+ {"--chat-template-kwargs"}, "STRING",
+ "sets additional params for the json template parser, must be a valid json object string, e.g. '{\"key1\":\"value1\",\"key2\":\"value2\"}'",
+ [](common_params & params, const std::string & value) {
+ auto parsed = json::parse(value);
+ for (const auto & item : parsed.items()) {
+ params.default_template_kwargs[item.key()] = item.value().dump();
+ }
+ }
+ ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_CHAT_TEMPLATE_KWARGS"));
+ add_opt(common_arg(
+ {"-to", "--timeout"}, "N",
+ string_format("server read/write timeout in seconds (default: %d)", params.timeout_read),
+ [](common_params & params, int value) {
+ params.timeout_read = value;
+ params.timeout_write = value;
+ }
+ ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_TIMEOUT"));
+ add_opt(common_arg(
+ {"--threads-http"}, "N",
+ string_format("number of threads used to process HTTP requests (default: %d)", params.n_threads_http),
+ [](common_params & params, int value) {
+ params.n_threads_http = value;
+ }
+ ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_THREADS_HTTP"));
+ add_opt(common_arg(
+ {"--cache-reuse"}, "N",
+ string_format(
+ "min chunk size to attempt reusing from the cache via KV shifting (default: %d)\n"
+ "[(card)](https://ggml.ai/f0.png)", params.n_cache_reuse
+ ),
+ [](common_params & params, int value) {
+ params.n_cache_reuse = value;
+ }
+ ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CACHE_REUSE"));
+ add_opt(common_arg(
+ {"--metrics"},
+ string_format("enable prometheus compatible metrics endpoint (default: %s)", params.endpoint_metrics ? "enabled" : "disabled"),
+ [](common_params & params) {
+ params.endpoint_metrics = true;
+ }
+ ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_METRICS"));
+ add_opt(common_arg(
+ {"--props"},
+ string_format("enable changing global properties via POST /props (default: %s)", params.endpoint_props ? "enabled" : "disabled"),
+ [](common_params & params) {
+ params.endpoint_props = true;
+ }
+ ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_PROPS"));
+ add_opt(common_arg(
+ {"--slots"},
+ {"--no-slots"},
+ string_format("expose slots monitoring endpoint (default: %s)", params.endpoint_slots ? "enabled" : "disabled"),
+ [](common_params & params, bool value) {
+ params.endpoint_slots = value;
+ }
+ ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_SLOTS"));
+ add_opt(common_arg(
+ {"--slot-save-path"}, "PATH",
+ "path to save slot kv cache (default: disabled)",
+ [](common_params & params, const std::string & value) {
+ params.slot_save_path = value;
+ if (!fs_is_directory(params.slot_save_path)) {
+ throw std::invalid_argument("not a directory: " + value);
+ }
+ // if doesn't end with DIRECTORY_SEPARATOR, add it
+ if (!params.slot_save_path.empty() && params.slot_save_path[params.slot_save_path.size() - 1] != DIRECTORY_SEPARATOR) {
+ params.slot_save_path += DIRECTORY_SEPARATOR;
+ }
+ }
+ ).set_examples({LLAMA_EXAMPLE_SERVER}));
+ add_opt(common_arg(
+ {"--media-path"}, "PATH",
+ "directory for loading local media files; files can be accessed via file:// URLs using relative paths (default: disabled)",
+ [](common_params & params, const std::string & value) {
+ params.media_path = value;
+ if (!fs_is_directory(params.media_path)) {
+ throw std::invalid_argument("not a directory: " + value);
+ }
+ // if doesn't end with DIRECTORY_SEPARATOR, add it
+ if (!params.media_path.empty() && params.media_path[params.media_path.size() - 1] != DIRECTORY_SEPARATOR) {
+ params.media_path += DIRECTORY_SEPARATOR;
+ }
+ }
+ ).set_examples({LLAMA_EXAMPLE_SERVER}));
+ add_opt(common_arg(
+ {"--models-dir"}, "PATH",
+ "directory containing models for the router server (default: disabled)",
+ [](common_params & params, const std::string & value) {
+ params.models_dir = value;
+ }
+ ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODELS_DIR"));
+ add_opt(common_arg(
+ {"--models-preset"}, "PATH",
+ "path to INI file containing model presets for the router server (default: disabled)",
+ [](common_params & params, const std::string & value) {
+ params.models_preset = value;
+ }
+ ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODELS_PRESET"));
+ add_opt(common_arg(
+ {"--models-max"}, "N",
+ string_format("for router server, maximum number of models to load simultaneously (default: %d, 0 = unlimited)", params.models_max),
+ [](common_params & params, int value) {
+ params.models_max = value;
+ }
+ ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODELS_MAX"));
+ add_opt(common_arg(
+ {"--models-autoload"},
+ {"--no-models-autoload"},
+ string_format("for router server, whether to automatically load models (default: %s)", params.models_autoload ? "enabled" : "disabled"),
+ [](common_params & params, bool value) {
+ params.models_autoload = value;
+ }
+ ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODELS_AUTOLOAD"));
+ add_opt(common_arg(
+ {"--jinja"},
+ {"--no-jinja"},
+ string_format("whether to use jinja template engine for chat (default: %s)", params.use_jinja ? "enabled" : "disabled"),
+ [](common_params & params, bool value) {
+ params.use_jinja = value;
+ }
+ ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_MTMD}).set_env("LLAMA_ARG_JINJA"));
+ add_opt(common_arg(
+ {"--reasoning-format"}, "FORMAT",
+ "controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:\n"
+ "- none: leaves thoughts unparsed in `message.content`\n"
+ "- deepseek: puts thoughts in `message.reasoning_content`\n"
+        "- deepseek-legacy: keeps `<think>` tags in `message.content` while also populating `message.reasoning_content`\n"
+ "(default: auto)",
+ [](common_params & params, const std::string & value) {
+ params.reasoning_format = common_reasoning_format_from_name(value);
+ }
+ ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_THINK"));
+ add_opt(common_arg(
+ {"--reasoning-budget"}, "N",
+ "controls the amount of thinking allowed; currently only one of: -1 for unrestricted thinking budget, or 0 to disable thinking (default: -1)",
+ [](common_params & params, int value) {
+ if (value != 0 && value != -1) { throw std::invalid_argument("invalid value"); }
+ params.reasoning_budget = value;
+ }
+ ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_THINK_BUDGET"));
+ add_opt(common_arg(
+ {"--chat-template"}, "JINJA_TEMPLATE",
+ string_format(
+ "set custom jinja chat template (default: template taken from model's metadata)\n"
+ "if suffix/prefix are specified, template will be disabled\n"
+ "only commonly used templates are accepted (unless --jinja is set before this flag):\n"
+ "list of built-in templates:\n%s", list_builtin_chat_templates().c_str()
+ ),
+ [](common_params & params, const std::string & value) {
+ params.chat_template = value;
+ }
+ ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MTMD}).set_env("LLAMA_ARG_CHAT_TEMPLATE"));
+ add_opt(common_arg(
+ {"--chat-template-file"}, "JINJA_TEMPLATE_FILE",
+ string_format(
+ "set custom jinja chat template file (default: template taken from model's metadata)\n"
+ "if suffix/prefix are specified, template will be disabled\n"
+ "only commonly used templates are accepted (unless --jinja is set before this flag):\n"
+ "list of built-in templates:\n%s", list_builtin_chat_templates().c_str()
+ ),
+ [](common_params & params, const std::string & value) {
+ params.chat_template = read_file(value);
+ }
+ ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CHAT_TEMPLATE_FILE"));
+ add_opt(common_arg(
+ {"--prefill-assistant"},
+ {"--no-prefill-assistant"},
+ string_format(
+ "whether to prefill the assistant's response if the last message is an assistant message (default: prefill enabled)\n"
+ "when this flag is set, if the last message is an assistant message then it will be treated as a full message and not prefilled\n"
+ ),
+ [](common_params & params, bool value) {
+ params.prefill_assistant = value;
+ }
+ ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_PREFILL_ASSISTANT"));
+ add_opt(common_arg(
+ {"-sps", "--slot-prompt-similarity"}, "SIMILARITY",
+ string_format("how much the prompt of a request must match the prompt of a slot in order to use that slot (default: %.2f, 0.0 = disabled)\n", params.slot_prompt_similarity),
+ [](common_params & params, const std::string & value) {
+ params.slot_prompt_similarity = std::stof(value);
+ }
+ ).set_examples({LLAMA_EXAMPLE_SERVER}));
+ add_opt(common_arg(
+ {"--lora-init-without-apply"},
+ string_format("load LoRA adapters without applying them (apply later via POST /lora-adapters) (default: %s)", params.lora_init_without_apply ? "enabled" : "disabled"),
+ [](common_params & params) {
+ params.lora_init_without_apply = true;
+ }
+ ).set_examples({LLAMA_EXAMPLE_SERVER}));
+ add_opt(common_arg(
+ {"--sleep-idle-seconds"}, "SECONDS",
+ string_format("number of seconds of idleness after which the server will sleep (default: %d; -1 = disabled)", params.sleep_idle_seconds),
+ [](common_params & params, int value) {
+ if (value == 0 || value < -1) {
+ throw std::invalid_argument("invalid value: cannot be 0 or less than -1");
+ }
+ params.sleep_idle_seconds = value;
+ }
+ ).set_examples({LLAMA_EXAMPLE_SERVER}));
+ add_opt(common_arg(
+ {"--simple-io"},
+ "use basic IO for better compatibility in subprocesses and limited consoles",
+ [](common_params & params) {
+ params.simple_io = true;
+ }
+ ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI}));
+ add_opt(common_arg(
+ {"--positive-file"}, "FNAME",
+ string_format("positive prompts file, one prompt per line (default: '%s')", params.cvector_positive_file.c_str()),
+ [](common_params & params, const std::string & value) {
+ params.cvector_positive_file = value;
+ }
+ ).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR}));
+ add_opt(common_arg(
+ {"--negative-file"}, "FNAME",
+ string_format("negative prompts file, one prompt per line (default: '%s')", params.cvector_negative_file.c_str()),
+ [](common_params & params, const std::string & value) {
+ params.cvector_negative_file = value;
+ }
+ ).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR}));
+ add_opt(common_arg(
+ {"--pca-batch"}, "N",
+ string_format("batch size used for PCA. Larger batch runs faster, but uses more memory (default: %d)", params.n_pca_batch),
+ [](common_params & params, int value) {
+ params.n_pca_batch = value;
+ }
+ ).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR}));
+ add_opt(common_arg(
+ {"--pca-iter"}, "N",
+ string_format("number of iterations used for PCA (default: %d)", params.n_pca_iterations),
+ [](common_params & params, int value) {
+ params.n_pca_iterations = value;
+ }
+ ).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR}));
+ add_opt(common_arg(
+ {"--method"}, "{pca, mean}",
+ "dimensionality reduction method to be used (default: pca)",
+ [](common_params & params, const std::string & value) {
+ /**/ if (value == "pca") { params.cvector_dimre_method = DIMRE_METHOD_PCA; }
+ else if (value == "mean") { params.cvector_dimre_method = DIMRE_METHOD_MEAN; }
+ else { throw std::invalid_argument("invalid value"); }
+ }
+ ).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR}));
+ add_opt(common_arg(
+ {"--output-format"}, "{md,jsonl}",
+ "output format for batched-bench results (default: md)",
+ [](common_params & params, const std::string & value) {
+ /**/ if (value == "jsonl") { params.batched_bench_output_jsonl = true; }
+ else if (value == "md") { params.batched_bench_output_jsonl = false; }
+ else { throw std::invalid_argument("invalid value"); }
+ }
+ ).set_examples({LLAMA_EXAMPLE_BENCH}));
+ add_opt(common_arg(
+ {"--log-disable"},
+        "Disable logging",
+ [](common_params &) {
+ common_log_pause(common_log_main());
+ }
+ ));
+ add_opt(common_arg(
+ {"--log-file"}, "FNAME",
+ "Log to file",
+ [](common_params &, const std::string & value) {
+ common_log_set_file(common_log_main(), value.c_str());
+ }
+ ).set_env("LLAMA_LOG_FILE"));
+ add_opt(common_arg(
+ {"--log-colors"}, "[on|off|auto]",
+ "Set colored logging ('on', 'off', or 'auto', default: 'auto')\n"
+ "'auto' enables colors when output is to a terminal",
+ [](common_params &, const std::string & value) {
+ if (is_truthy(value)) {
+ common_log_set_colors(common_log_main(), LOG_COLORS_ENABLED);
+ } else if (is_falsey(value)) {
+ common_log_set_colors(common_log_main(), LOG_COLORS_DISABLED);
+ } else if (is_autoy(value)) {
+ common_log_set_colors(common_log_main(), LOG_COLORS_AUTO);
+ } else {
+ throw std::invalid_argument(
+ string_format("error: unknown value for --log-colors: '%s'\n", value.c_str()));
+ }
+ }
+ ).set_env("LLAMA_LOG_COLORS"));
+ add_opt(common_arg(
+ {"-v", "--verbose", "--log-verbose"},
+ "Set verbosity level to infinity (i.e. log all messages, useful for debugging)",
+ [](common_params & params) {
+ params.verbosity = INT_MAX;
+ }
+ ));
+ add_opt(common_arg(
+ {"--offline"},
+ "Offline mode: forces use of cache, prevents network access",
+ [](common_params & params) {
+ params.offline = true;
+ }
+ ).set_env("LLAMA_OFFLINE"));
+ add_opt(common_arg(
+ {"-lv", "--verbosity", "--log-verbosity"}, "N",
+ string_format("Set the verbosity threshold. Messages with a higher verbosity will be ignored. Values:\n"
+ " - 0: generic output\n"
+ " - 1: error\n"
+ " - 2: warning\n"
+ " - 3: info\n"
+ " - 4: debug\n"
+ "(default: %d)\n", params.verbosity),
+ [](common_params & params, int value) {
+ params.verbosity = value;
+ }
+ ).set_env("LLAMA_LOG_VERBOSITY"));
+ add_opt(common_arg(
+ {"--log-prefix"},
+ "Enable prefix in log messages",
+ [](common_params &) {
+ common_log_set_prefix(common_log_main(), true);
+ }
+ ).set_env("LLAMA_LOG_PREFIX"));
+ add_opt(common_arg(
+ {"--log-timestamps"},
+ "Enable timestamps in log messages",
+ [](common_params &) {
+ common_log_set_timestamps(common_log_main(), true);
+ }
+ ).set_env("LLAMA_LOG_TIMESTAMPS"));
+
+ // speculative parameters
+ add_opt(common_arg(
+ {"-td", "--threads-draft"}, "N",
+ "number of threads to use during generation (default: same as --threads)",
+ [](common_params & params, int value) {
+ params.speculative.cpuparams.n_threads = value;
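+            // non-positive values fall back to the number of hardware threads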
+ if (params.speculative.cpuparams.n_threads <= 0) {
+ params.speculative.cpuparams.n_threads = std::thread::hardware_concurrency();
+ }
+ }
+ ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
+ add_opt(common_arg(
+ {"-tbd", "--threads-batch-draft"}, "N",
+ "number of threads to use during batch and prompt processing (default: same as --threads-draft)",
+ [](common_params & params, int value) {
+ params.speculative.cpuparams_batch.n_threads = value;
+ if (params.speculative.cpuparams_batch.n_threads <= 0) {
+ params.speculative.cpuparams_batch.n_threads = std::thread::hardware_concurrency();
+ }
+ }
+ ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
+ add_opt(common_arg(
+ {"-Cd", "--cpu-mask-draft"}, "M",
+ "Draft model CPU affinity mask. Complements cpu-range-draft (default: same as --cpu-mask)",
+ [](common_params & params, const std::string & mask) {
+ params.speculative.cpuparams.mask_valid = true;
+ if (!parse_cpu_mask(mask, params.speculative.cpuparams.cpumask)) {
+ throw std::invalid_argument("invalid cpumask");
+ }
+ }
+ ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
+ add_opt(common_arg(
+ {"-Crd", "--cpu-range-draft"}, "lo-hi",
+ "Ranges of CPUs for affinity. Complements --cpu-mask-draft",
+ [](common_params & params, const std::string & range) {
+ params.speculative.cpuparams.mask_valid = true;
+ if (!parse_cpu_range(range, params.speculative.cpuparams.cpumask)) {
+ throw std::invalid_argument("invalid range");
+ }
+ }
+ ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
+ add_opt(common_arg(
+ {"--cpu-strict-draft"}, "<0|1>",
+ "Use strict CPU placement for draft model (default: same as --cpu-strict)",
+ [](common_params & params, int value) {
+ params.speculative.cpuparams.strict_cpu = value;
+ }
+ ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
+ add_opt(common_arg(
+ {"--prio-draft"}, "N",
+ string_format("set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.speculative.cpuparams.priority),
+ [](common_params & params, int prio) {
+ if (prio < 0 || prio > 3) {
+ throw std::invalid_argument("invalid value");
+ }
+ params.speculative.cpuparams.priority = (enum ggml_sched_priority) prio;
+ }
+ ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
+ add_opt(common_arg(
+ {"--poll-draft"}, "<0|1>",
+        "Use polling to wait for draft model work (default: same as --poll)",
+ [](common_params & params, int value) {
+ params.speculative.cpuparams.poll = value;
+ }
+ ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
+ add_opt(common_arg(
+ {"-Cbd", "--cpu-mask-batch-draft"}, "M",
+        "Draft model CPU affinity mask for batch processing. Complements --cpu-range-batch-draft (default: same as --cpu-mask)",
+ [](common_params & params, const std::string & mask) {
+ params.speculative.cpuparams_batch.mask_valid = true;
+ if (!parse_cpu_mask(mask, params.speculative.cpuparams_batch.cpumask)) {
+ throw std::invalid_argument("invalid cpumask");
+ }
+ }
+ ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
+ add_opt(common_arg(
+ {"-Crbd", "--cpu-range-batch-draft"}, "lo-hi",
+        "Ranges of CPUs for affinity. Complements --cpu-mask-batch-draft",
+ [](common_params & params, const std::string & range) {
+ params.speculative.cpuparams_batch.mask_valid = true;
+ if (!parse_cpu_range(range, params.speculative.cpuparams_batch.cpumask)) {
+                throw std::invalid_argument("invalid range");
+ }
+ }
+ ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
+ add_opt(common_arg(
+ {"--cpu-strict-batch-draft"}, "<0|1>",
+ "Use strict CPU placement for draft model (default: --cpu-strict-draft)",
+ [](common_params & params, int value) {
+ params.speculative.cpuparams_batch.strict_cpu = value;
+ }
+ ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
+ add_opt(common_arg(
+ {"--prio-batch-draft"}, "N",
+ string_format("set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.speculative.cpuparams_batch.priority),
+ [](common_params & params, int prio) {
+ if (prio < 0 || prio > 3) {
+ throw std::invalid_argument("invalid value");
+ }
+ params.speculative.cpuparams_batch.priority = (enum ggml_sched_priority) prio;
+ }
+ ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
+ add_opt(common_arg(
+ {"--poll-batch-draft"}, "<0|1>",
+ "Use polling to wait for draft model work (default: --poll-draft)",
+ [](common_params & params, int value) {
+ params.speculative.cpuparams_batch.poll = value;
+ }
+ ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
+ add_opt(common_arg(
+ {"--draft", "--draft-n", "--draft-max"}, "N",
+ string_format("number of tokens to draft for speculative decoding (default: %d)", params.speculative.n_max),
+ [](common_params & params, int value) {
+ params.speculative.n_max = value;
+ }
+ ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_DRAFT_MAX"));
+ add_opt(common_arg(
+ {"--draft-min", "--draft-n-min"}, "N",
+ string_format("minimum number of draft tokens to use for speculative decoding (default: %d)", params.speculative.n_min),
+ [](common_params & params, int value) {
+ params.speculative.n_min = value;
+ }
+ ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_DRAFT_MIN"));
+ add_opt(common_arg(
+ {"--draft-p-split"}, "P",
+ string_format("speculative decoding split probability (default: %.1f)", (double)params.speculative.p_split),
+ [](common_params & params, const std::string & value) {
+ params.speculative.p_split = std::stof(value);
+ }
+ ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}).set_env("LLAMA_ARG_DRAFT_P_SPLIT"));
+ add_opt(common_arg(
+ {"--draft-p-min"}, "P",
+ string_format("minimum speculative decoding probability (greedy) (default: %.1f)", (double)params.speculative.p_min),
+ [](common_params & params, const std::string & value) {
+ params.speculative.p_min = std::stof(value);
+ }
+ ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_DRAFT_P_MIN"));
+ add_opt(common_arg(
+ {"-cd", "--ctx-size-draft"}, "N",
+ string_format("size of the prompt context for the draft model (default: %d, 0 = loaded from model)", params.speculative.n_ctx),
+ [](common_params & params, int value) {
+ params.speculative.n_ctx = value;
+ }
+ ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_CTX_SIZE_DRAFT"));
+ add_opt(common_arg(
+        {"-devd", "--device-draft"}, "<dev1,dev2,..>",
+ "comma-separated list of devices to use for offloading the draft model (none = don't offload)\n"
+ "use --list-devices to see a list of available devices",
+ [](common_params & params, const std::string & value) {
+ params.speculative.devices = parse_device_list(value);
+ }
+ ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
+ GGML_ASSERT(params.speculative.n_gpu_layers < 0); // string_format would need to be extended for a default >= 0
+ add_opt(common_arg(
+ {"-ngld", "--gpu-layers-draft", "--n-gpu-layers-draft"}, "N",
+ string_format("max. number of draft model layers to store in VRAM, either an exact number, 'auto', or 'all' (default: %s)",
+ params.speculative.n_gpu_layers == -1 ? "auto" : "all"),
+ [](common_params & params, const std::string & value) {
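+            // same sentinel mapping as --gpu-layers: -1 = auto, -2 = all layers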
+ if (value == "auto") {
+ params.speculative.n_gpu_layers = -1;
+ } else if (value == "all") {
+ params.speculative.n_gpu_layers = -2;
+ } else {
+ params.speculative.n_gpu_layers = std::stoi(value);
+ }
+ if (!llama_supports_gpu_offload()) {
+ fprintf(stderr, "warning: no usable GPU found, --gpu-layers-draft option will be ignored\n");
+ fprintf(stderr, "warning: one possible reason is that llama.cpp was compiled without GPU support\n");
+ fprintf(stderr, "warning: consult docs/build.md for compilation instructions\n");
+ }
+ }
+ ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_N_GPU_LAYERS_DRAFT"));
+ add_opt(common_arg(
+ {"-md", "--model-draft"}, "FNAME",
+ "draft model for speculative decoding (default: unused)",
+ [](common_params & params, const std::string & value) {
+ params.speculative.model.path = value;
+ }
+ ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_MODEL_DRAFT"));
+ add_opt(common_arg(
+ {"--spec-replace"}, "TARGET", "DRAFT",
+ "translate the string in TARGET into DRAFT if the draft model and main model are not compatible",
+ [](common_params & params, const std::string & tgt, const std::string & dft) {
+ params.speculative.replacements.push_back({ tgt, dft });
+ }
+ ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
+ add_opt(common_arg(
+ {"-ctkd", "--cache-type-k-draft"}, "TYPE",
+ string_format(
+ "KV cache data type for K for the draft model\n"
+ "allowed values: %s\n"
+ "(default: %s)",
+ get_all_kv_cache_types().c_str(),
+ ggml_type_name(params.speculative.cache_type_k)
+ ),
+ [](common_params & params, const std::string & value) {
+ params.speculative.cache_type_k = kv_cache_type_from_str(value);
+ }
+ ).set_env("LLAMA_ARG_CACHE_TYPE_K_DRAFT"));
+ add_opt(common_arg(
+ {"-ctvd", "--cache-type-v-draft"}, "TYPE",
+ string_format(
+ "KV cache data type for V for the draft model\n"
+ "allowed values: %s\n"
+ "(default: %s)",
+ get_all_kv_cache_types().c_str(),
+ ggml_type_name(params.speculative.cache_type_v)
+ ),
+ [](common_params & params, const std::string & value) {
+ params.speculative.cache_type_v = kv_cache_type_from_str(value);
+ }
+ ).set_env("LLAMA_ARG_CACHE_TYPE_V_DRAFT"));
+
+ add_opt(common_arg(
+ {"-mv", "--model-vocoder"}, "FNAME",
+ "vocoder model for audio generation (default: unused)",
+ [](common_params & params, const std::string & value) {
+ params.vocoder.model.path = value;
+ }
+ ).set_examples({LLAMA_EXAMPLE_TTS, LLAMA_EXAMPLE_SERVER}));
+ add_opt(common_arg(
+ {"--tts-use-guide-tokens"},
+ "Use guide tokens to improve TTS word recall",
+ [](common_params & params) {
+ params.vocoder.use_guide_tokens = true;
+ }
+ ).set_examples({LLAMA_EXAMPLE_TTS, LLAMA_EXAMPLE_SERVER}));
+ add_opt(common_arg(
+ {"--tts-speaker-file"}, "FNAME",
+ "speaker file path for audio generation",
+ [](common_params & params, const std::string & value) {
+ params.vocoder.speaker_file = value;
+ }
+ ).set_examples({LLAMA_EXAMPLE_TTS}));
+
+ add_opt(common_arg(
+ {"--diffusion-steps"}, "N",
+ string_format("number of diffusion steps (default: %d)", params.diffusion.steps),
+ [](common_params & params, int value) { params.diffusion.steps = value; }
+ ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+ add_opt(common_arg(
+ {"--diffusion-visual"},
+ string_format("enable visual diffusion mode (show progressive generation) (default: %s)", params.diffusion.visual_mode ? "true" : "false"),
+ [](common_params & params) { params.diffusion.visual_mode = true; }
+ ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+ add_opt(common_arg(
+ {"--diffusion-eps"}, "F",
+ string_format("epsilon for timesteps (default: %.6f)", (double) params.diffusion.eps),
+ [](common_params & params, const std::string & value) { params.diffusion.eps = std::stof(value); }
+ ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+ add_opt(common_arg(
+ {"--diffusion-algorithm"}, "N",
+ string_format("diffusion algorithm: 0=ORIGIN, 1=ENTROPY_BASED, 2=MARGIN_BASED, 3=RANDOM, 4=LOW_CONFIDENCE (default: %d)", params.diffusion.algorithm),
+ [](common_params & params, int value) { params.diffusion.algorithm = value; }
+ ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+ add_opt(common_arg(
+ {"--diffusion-alg-temp"}, "F",
+ string_format("dream algorithm temperature (default: %.3f)", (double) params.diffusion.alg_temp),
+ [](common_params & params, const std::string & value) { params.diffusion.alg_temp = std::stof(value); }
+ ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+ add_opt(common_arg(
+ {"--diffusion-block-length"}, "N",
+ string_format("llada block length for generation (default: %d)", params.diffusion.block_length),
+ [](common_params & params, int value) { params.diffusion.block_length = value; }
+ ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+ add_opt(common_arg(
+ {"--diffusion-cfg-scale"}, "F",
+ string_format("llada classifier-free guidance scale (default: %.3f)", (double) params.diffusion.cfg_scale),
+ [](common_params & params, const std::string & value) { params.diffusion.cfg_scale = std::stof(value); }
+ ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+ add_opt(common_arg(
+ {"--diffusion-add-gumbel-noise"}, "F",
+ string_format("add gumbel noise to the logits if temp > 0.0 (default: %s)", params.diffusion.add_gumbel_noise ? "true" : "false"),
+ [](common_params & params, const std::string & value) { params.diffusion.add_gumbel_noise = std::stof(value); }
+ ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+ add_opt(common_arg(
+ { "-lr", "--learning-rate" }, "ALPHA",
+ string_format("adamw or sgd optimizer alpha (default: %.2g); note: sgd alpha recommended ~10x (no momentum)", (double) params.lr.lr0),
+ [](common_params & params, const std::string & value) { params.lr.lr0 = std::stof(value); }
+ ).set_examples({ LLAMA_EXAMPLE_FINETUNE }));
+ add_opt(common_arg({ "-lr-min", "--learning-rate-min" }, "ALPHA",
+ string_format("(if >0) final learning rate after decay (if -decay-epochs is set, default=%.2g)",
+ (double) params.lr.lr_min),
+ [](common_params & params, const std::string & value) { params.lr.lr_min = std::stof(value); }
+ ).set_examples({ LLAMA_EXAMPLE_FINETUNE }));
+ add_opt(common_arg(
+ {"-decay-epochs", "--learning-rate-decay-epochs"}, "ALPHA",
+ string_format("(if >0) decay learning rate to -lr-min after this many epochs (exponential decay, default=%.2g)", (double) params.lr.decay_epochs),
+ [](common_params & params, const std::string & value) { params.lr.decay_epochs = std::stof(value); }
+ ).set_examples({ LLAMA_EXAMPLE_FINETUNE }));
+ add_opt(common_arg(
+ {"-wd", "--weight-decay"}, "WD",
+ string_format("adamw or sgd optimizer weight decay (0 is off; recommend very small e.g. 1e-9) (default: %.2g).", (double) params.lr.wd),
+ [](common_params & params, const std::string & value) { params.lr.wd = std::stof(value); }
+ ).set_examples({ LLAMA_EXAMPLE_FINETUNE }));
+ add_opt(common_arg(
+ {"-val-split", "--val-split"}, "FRACTION",
+ string_format("fraction of data to use as validation set for training (default: %.2g).", (double) params.val_split),
+ [](common_params & params, const std::string & value) { params.val_split = std::stof(value); }
+ ).set_examples({ LLAMA_EXAMPLE_FINETUNE }));
+ add_opt(common_arg(
+ {"-epochs", "--epochs"}, "N",
+ string_format("optimizer max # of epochs (default: %d)", params.lr.epochs),
+ [](common_params & params, int epochs) { params.lr.epochs = epochs; }
+ ).set_examples({ LLAMA_EXAMPLE_FINETUNE }));
+ add_opt(common_arg(
+ {"-opt", "--optimizer"}, "sgd|adamw", "adamw or sgd",
+ [](common_params & params, const std::string & name) {
+ params.optimizer = common_opt_get_optimizer(name.c_str());
+ if (params.optimizer == GGML_OPT_OPTIMIZER_TYPE_COUNT) {
+ throw std::invalid_argument("invalid --optimizer, valid options: adamw, sgd");
+ }
+ }
+ ).set_examples({ LLAMA_EXAMPLE_FINETUNE }));
+ add_opt(common_arg(
+ {"--save-logits"},
+ string_format("save final logits to files for verification (default: %s)", params.save_logits ? "true" : "false"),
+ [](common_params & params) {
+ params.save_logits = true;
+ }
+ ).set_examples({LLAMA_EXAMPLE_DEBUG}));
+ add_opt(common_arg(
+ {"--logits-output-dir"}, "PATH",
+ string_format("directory for saving logits output files (default: %s)", params.logits_output_dir.c_str()),
+ [](common_params & params, const std::string & value) {
+ params.logits_output_dir = value;
+ }
+ ).set_examples({LLAMA_EXAMPLE_DEBUG}));
+ add_opt(common_arg(
+ {"--tensor-filter"}, "REGEX",
+ "filter tensor names for debug output (regex pattern, can be specified multiple times)",
+ [](common_params & params, const std::string & value) {
+ params.tensor_filter.push_back(value);
+ }
+ ).set_examples({LLAMA_EXAMPLE_DEBUG}));
+
+ // presets
+ add_opt(common_arg(
+ {"--tts-oute-default"},
+ string_format("use default OuteTTS models (note: can download weights from the internet)"),
+ [](common_params & params) {
+ params.model.hf_repo = "OuteAI/OuteTTS-0.2-500M-GGUF";
+ params.model.hf_file = "OuteTTS-0.2-500M-Q8_0.gguf";
+ params.vocoder.model.hf_repo = "ggml-org/WavTokenizer";
+ params.vocoder.model.hf_file = "WavTokenizer-Large-75-F16.gguf";
+ }
+ ).set_examples({LLAMA_EXAMPLE_TTS}));
+
+ add_opt(common_arg(
+ {"--embd-gemma-default"},
+ string_format("use default EmbeddingGemma model (note: can download weights from the internet)"),
+ [](common_params & params) {
+ params.model.hf_repo = "ggml-org/embeddinggemma-300M-qat-q4_0-GGUF";
+ params.model.hf_file = "embeddinggemma-300M-qat-Q4_0.gguf";
+ params.port = 8011;
+ params.n_ubatch = 2048;
+ params.n_batch = 2048;
+ params.n_parallel = 32;
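+            // n_ctx is shared across slots, i.e. 2048 tokens of context per parallel sequence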
+ params.n_ctx = 2048*params.n_parallel;
+ params.verbose_prompt = true;
+ params.embedding = true;
+ }
+ ).set_examples({LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_SERVER}));
+
+ add_opt(common_arg(
+ {"--fim-qwen-1.5b-default"},
+ string_format("use default Qwen 2.5 Coder 1.5B (note: can download weights from the internet)"),
+ [](common_params & params) {
+ params.model.hf_repo = "ggml-org/Qwen2.5-Coder-1.5B-Q8_0-GGUF";
+ params.model.hf_file = "qwen2.5-coder-1.5b-q8_0.gguf";
+ params.port = 8012;
+ params.n_ubatch = 1024;
+ params.n_batch = 1024;
+ params.n_ctx = 0;
+ params.n_cache_reuse = 256;
+ }
+ ).set_examples({LLAMA_EXAMPLE_SERVER}));
+
+ add_opt(common_arg(
+ {"--fim-qwen-3b-default"},
+ string_format("use default Qwen 2.5 Coder 3B (note: can download weights from the internet)"),
+ [](common_params & params) {
+ params.model.hf_repo = "ggml-org/Qwen2.5-Coder-3B-Q8_0-GGUF";
+ params.model.hf_file = "qwen2.5-coder-3b-q8_0.gguf";
+ params.port = 8012;
+ params.n_ubatch = 1024;
+ params.n_batch = 1024;
+ params.n_ctx = 0;
+ params.n_cache_reuse = 256;
+ }
+ ).set_examples({LLAMA_EXAMPLE_SERVER}));
+
+ add_opt(common_arg(
+ {"--fim-qwen-7b-default"},
+ string_format("use default Qwen 2.5 Coder 7B (note: can download weights from the internet)"),
+ [](common_params & params) {
+ params.model.hf_repo = "ggml-org/Qwen2.5-Coder-7B-Q8_0-GGUF";
+ params.model.hf_file = "qwen2.5-coder-7b-q8_0.gguf";
+ params.port = 8012;
+ params.n_ubatch = 1024;
+ params.n_batch = 1024;
+ params.n_ctx = 0;
+ params.n_cache_reuse = 256;
+ }
+ ).set_examples({LLAMA_EXAMPLE_SERVER}));
+
+ add_opt(common_arg(
+ {"--fim-qwen-7b-spec"},
+ string_format("use Qwen 2.5 Coder 7B + 0.5B draft for speculative decoding (note: can download weights from the internet)"),
+ [](common_params & params) {
+ params.model.hf_repo = "ggml-org/Qwen2.5-Coder-7B-Q8_0-GGUF";
+ params.model.hf_file = "qwen2.5-coder-7b-q8_0.gguf";
+ params.speculative.model.hf_repo = "ggml-org/Qwen2.5-Coder-0.5B-Q8_0-GGUF";
+ params.speculative.model.hf_file = "qwen2.5-coder-0.5b-q8_0.gguf";
+ params.port = 8012;
+ params.n_ubatch = 1024;
+ params.n_batch = 1024;
+ params.n_ctx = 0;
+ params.n_cache_reuse = 256;
+ }
+ ).set_examples({LLAMA_EXAMPLE_SERVER}));
+
+ add_opt(common_arg(
+ {"--fim-qwen-14b-spec"},
+ string_format("use Qwen 2.5 Coder 14B + 0.5B draft for speculative decoding (note: can download weights from the internet)"),
+ [](common_params & params) {
+ params.model.hf_repo = "ggml-org/Qwen2.5-Coder-14B-Q8_0-GGUF";
+ params.model.hf_file = "qwen2.5-coder-14b-q8_0.gguf";
+ params.speculative.model.hf_repo = "ggml-org/Qwen2.5-Coder-0.5B-Q8_0-GGUF";
+ params.speculative.model.hf_file = "qwen2.5-coder-0.5b-q8_0.gguf";
+ params.port = 8012;
+ params.n_ubatch = 1024;
+ params.n_batch = 1024;
+ params.n_ctx = 0;
+ params.n_cache_reuse = 256;
+ }
+ ).set_examples({LLAMA_EXAMPLE_SERVER}));
+
+ add_opt(common_arg(
+ {"--fim-qwen-30b-default"},
+ string_format("use default Qwen 3 Coder 30B A3B Instruct (note: can download weights from the internet)"),
+ [](common_params & params) {
+ params.model.hf_repo = "ggml-org/Qwen3-Coder-30B-A3B-Instruct-Q8_0-GGUF";
+ params.model.hf_file = "qwen3-coder-30b-a3b-instruct-q8_0.gguf";
+ params.port = 8012;
+ params.n_ubatch = 1024;
+ params.n_batch = 1024;
+ params.n_ctx = 0;
+ params.n_cache_reuse = 256;
+ }
+ ).set_examples({LLAMA_EXAMPLE_SERVER}));
+
+ add_opt(common_arg(
+ {"--gpt-oss-20b-default"},
+ string_format("use gpt-oss-20b (note: can download weights from the internet)"),
+ [](common_params & params) {
+ params.model.hf_repo = "ggml-org/gpt-oss-20b-GGUF";
+ params.model.hf_file = "gpt-oss-20b-mxfp4.gguf";
+ params.port = 8013;
+ params.n_ubatch = 2048;
+ params.n_batch = 32768;
+ params.n_parallel = 2;
+ params.n_ctx = 131072*params.n_parallel;
+ params.sampling.temp = 1.0f;
+ params.sampling.top_p = 1.0f;
+ params.sampling.top_k = 0;
+ params.sampling.min_p = 0.01f;
+ params.use_jinja = true;
+ //params.default_template_kwargs["reasoning_effort"] = "\"high\"";
+ }
+ ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
+
+ add_opt(common_arg(
+ {"--gpt-oss-120b-default"},
+ string_format("use gpt-oss-120b (note: can download weights from the internet)"),
+ [](common_params & params) {
+ params.model.hf_repo = "ggml-org/gpt-oss-120b-GGUF";
+ params.port = 8013;
+ params.n_ubatch = 2048;
+ params.n_batch = 32768;
+ params.n_parallel = 2;
+ params.n_ctx = 131072*params.n_parallel;
+ params.sampling.temp = 1.0f;
+ params.sampling.top_p = 1.0f;
+ params.sampling.top_k = 0;
+ params.sampling.min_p = 0.01f;
+ params.use_jinja = true;
+ //params.default_template_kwargs["reasoning_effort"] = "\"high\"";
+ }
+ ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
+
+ add_opt(common_arg(
+ {"--vision-gemma-4b-default"},
+ string_format("use Gemma 3 4B QAT (note: can download weights from the internet)"),
+ [](common_params & params) {
+ params.model.hf_repo = "ggml-org/gemma-3-4b-it-qat-GGUF";
+ params.port = 8014;
+ params.n_ctx = 0;
+ params.use_jinja = true;
+ }
+ ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
+
+ add_opt(common_arg(
+ {"--vision-gemma-12b-default"},
+ string_format("use Gemma 3 12B QAT (note: can download weights from the internet)"),
+ [](common_params & params) {
+ params.model.hf_repo = "ggml-org/gemma-3-12b-it-qat-GGUF";
+ params.port = 8014;
+ params.n_ctx = 0;
+ params.use_jinja = true;
+ }
+ ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
+
+ return ctx_arg;
+}
+
+void common_params_add_preset_options(std::vector<common_arg> & args) {
+ // arguments below won't be treated as CLI args, only preset options
+ args.push_back(common_arg(
+ {"load-on-startup"}, "NAME",
+ "in server router mode, autoload this model on startup",
+ [](common_params &, const std::string &) { /* unused */ }
+ ).set_env(COMMON_ARG_PRESET_LOAD_ON_STARTUP).set_preset_only());
+
+ args.push_back(common_arg(
+ {"stop-timeout"}, "SECONDS",
+ "in server router mode, force-kill model instance after this many seconds of graceful shutdown",
+ [](common_params &, int) { /* unused */ }
+ ).set_env(COMMON_ARG_PRESET_STOP_TIMEOUT).set_preset_only());
+
+ // args.push_back(common_arg(
+ // {"pin"},
+ // "in server router mode, do not unload this model if models_max is exceeded",
+ // [](common_params &) { /* unused */ }
+ // ).set_preset_only());
+}
diff --git a/patches/llama-cpp-sys-2/llama.cpp/common/arg.h b/patches/llama-cpp-sys-2/llama.cpp/common/arg.h
new file mode 100644
index 0000000..55782a1
--- /dev/null
+++ b/patches/llama-cpp-sys-2/llama.cpp/common/arg.h
@@ -0,0 +1,131 @@
+#pragma once
+
+#include "common.h"
+
+#include
+    // remove end_think token from content: delete all end_think tokens
+ if (auto pos = content.rfind(end_think); pos != std::string::npos) {
+ while (pos != std::string::npos) {
+ pos = erase_spaces(content, pos, pos + end_think.size() - 1);
+ pos = content.rfind(end_think, pos);
+ }
+ }
+ // Strip if needed
+    if (content.size() > 0 && std::isspace(static_cast<unsigned char>(content[0]))) {
+ content = string_strip(content);
+ }
+ }
+
+ // remove potential partial suffix
+ if (builder.pos() == builder.input().size()) {
+ if (unclosed_reasoning_content.empty()) {
+ rstrip(content);
+ trim_potential_partial_word(content);
+ rstrip(content);
+ } else {
+ rstrip(unclosed_reasoning_content);
+ trim_potential_partial_word(unclosed_reasoning_content);
+ rstrip(unclosed_reasoning_content);
+ }
+ }
+
+ // consume unclosed_reasoning_content if allow_toolcall_in_think is set
+ if (form.allow_toolcall_in_think && !unclosed_reasoning_content.empty()) {
+ if (builder.syntax().reasoning_format != COMMON_REASONING_FORMAT_NONE && !builder.syntax().reasoning_in_content) {
+ builder.add_reasoning_content(unclosed_reasoning_content);
+ } else {
+ if (content.empty()) {
+ content = start_think + unclosed_reasoning_content;
+ } else {
+ content += "\n\n" + start_think;
+ content += unclosed_reasoning_content;
+ }
+ }
+ unclosed_reasoning_content.clear();
+ }
+
+ // Add content
+ if (!content.empty()) {
+ // If there are multiple content blocks
+ if (builder.syntax().reasoning_format != COMMON_REASONING_FORMAT_NONE && !builder.syntax().reasoning_in_content && builder.result().content.size() != 0) {
+ builder.add_content("\n\n");
+ }
+ builder.add_content(content);
+ }
+
+ // This start is in thinking block and toolcall_in_think not set, skip this tool call
+ if (toolcall_in_think && !form.allow_toolcall_in_think) {
+ continue;
+ }
+
+ // There is no tool call and all content is parsed
+ if (!tc) {
+ GGML_ASSERT(builder.pos() == builder.input().size());
+ GGML_ASSERT(unclosed_reasoning_content.empty());
+ if (!form.allow_toolcall_in_think) GGML_ASSERT(!reasoning_unclosed);
+ break;
+ }
+
+ builder.move_to(tc->groups[0].begin);
+ if (builder.try_consume_xml_tool_calls(form)) {
+ auto end_of_tool = builder.pos();
+ builder.consume_spaces();
+ if (builder.pos() != builder.input().size()) {
+ builder.move_to(end_of_tool);
+ if (!builder.result().content.empty()) {
+ builder.add_content("\n\n");
+ }
+ }
+ } else {
+ static const common_regex next_char_regex(".");
+ auto c = builder.str(builder.consume_regex(next_char_regex).groups[0]);
+ rstrip(c);
+ builder.add_content(c);
+ }
+ }
+}
+
+/**
+ * Parse content uses reasoning and XML-Style tool call
+ */
+void common_chat_msg_parser::consume_reasoning_with_xml_tool_calls(const struct xml_tool_call_format & form, const std::string & start_think, const std::string & end_think) {
+ parse_msg_with_xml_tool_calls(*this, form, start_think, end_think);
+}
diff --git a/patches/llama-cpp-sys-2/llama.cpp/common/chat-parser-xml-toolcall.h b/patches/llama-cpp-sys-2/llama.cpp/common/chat-parser-xml-toolcall.h
new file mode 100644
index 0000000..b309fb6
--- /dev/null
+++ b/patches/llama-cpp-sys-2/llama.cpp/common/chat-parser-xml-toolcall.h
@@ -0,0 +1,45 @@
+#pragma once
+
+#include "chat.h"
+
+#include <nlohmann/json.hpp>
+
+#include <optional>
+#include <string>
+#include <vector>
+
+
+// Sample config:
+// MiniMax-M2 (left): \n\nvalue\n...\n...
+// GLM 4.5 (right): function_name\nkey\nvalue\n
+struct xml_tool_call_format {
+ std::string scope_start; // \n // \n // can be empty
+ std::string tool_start; //
+ std::string tool_sep; // \">\n // \n // can be empty only for parse_xml_tool_calls
+ std::string key_start; //
+ std::string key_val_sep; // \"> // \n
+ std::string val_end; // \n // \n
+ std::string tool_end; // \n // \n
+ std::string scope_end; // // // can be empty
+ // Set this if there can be dynamic spaces inside key_val_sep.
+ // e.g. key_val_sep= key_val_sep2= for GLM4.5
+    std::optional<std::string> key_val_sep2 = std::nullopt;
+ // Set true if argval should only be raw string. e.g. Hello "world" hi
+ // Set false if argval should only be json string. e.g. "Hello \"world\" hi"
+ // Defaults to std::nullopt, both will be allowed.
+    std::optional<bool> raw_argval = std::nullopt;
+    std::optional<std::string> last_val_end = std::nullopt;
+    std::optional<std::string> last_tool_end = std::nullopt;
+ bool trim_raw_argval = false;
+ bool allow_toolcall_in_think = false;
+};
+
+// make a GBNF that accept any strings except those containing any of the forbidden strings.
+std::string make_gbnf_excluding(std::vector<std::string> forbids);
+
+/**
+ * Build grammar for xml-style tool call
+ * form.scope_start and form.scope_end can be empty.
+ * Requires data.format for model-specific hacks.
+ */
+void build_grammar_xml_tool_call(common_chat_params & data, const nlohmann::ordered_json & tools, const struct xml_tool_call_format & form);
diff --git a/patches/llama-cpp-sys-2/llama.cpp/common/chat-parser.cpp b/patches/llama-cpp-sys-2/llama.cpp/common/chat-parser.cpp
new file mode 100644
index 0000000..23e23ca
--- /dev/null
+++ b/patches/llama-cpp-sys-2/llama.cpp/common/chat-parser.cpp
@@ -0,0 +1,1554 @@
+#include "chat-parser.h"
+#include "chat-peg-parser.h"
+#include "common.h"
+#include "log.h"
+#include "peg-parser.h"
+#include "regex-partial.h"
+
+#include <algorithm>
+#include <cctype>
+#include <optional>
+#include <stdexcept>
+#include <string>
+#include <string_view>
+#include <vector>
+
+using json = nlohmann::ordered_json;
+
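+// Consume a JSON array of tool calls introduced by a fixed prefix (e.g. "[TOOL_CALLS]").
+// When the prefix is absent, the remaining text is treated as plain content instead.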
+static void parse_prefixed_json_tool_call_array(common_chat_msg_parser & builder,
+ const common_regex & prefix,
+ size_t rstrip_prefix = 0) {
+    static const std::vector<std::vector<std::string>> args_paths = { { "arguments" } };
+ if (auto res = builder.try_find_regex(prefix)) {
+ builder.move_back(rstrip_prefix);
+ auto tool_calls = builder.consume_json_with_dumped_args(args_paths);
+ if (!builder.add_tool_calls(tool_calls.value) || tool_calls.is_partial) {
+ throw common_chat_msg_partial_exception("incomplete tool call array");
+ }
+ } else {
+ builder.add_content(builder.consume_rest());
+ }
+}
+
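+// Wrap raw (possibly partial) code into a {"code": ...} arguments object, trimming at the
+// healing marker so partial generations do not leak the marker into the dumped arguments.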
+static std::string wrap_code_as_arguments(common_chat_msg_parser & builder, const std::string & code) {
+ std::string arguments;
+ if (builder.is_partial()) {
+ arguments = (json{
+ { "code", code + builder.healing_marker() }
+ })
+ .dump();
+ auto idx = arguments.find(builder.healing_marker());
+ if (idx != std::string::npos) {
+ arguments.resize(idx);
+ }
+ } else {
+ arguments = (json{
+ { "code", code }
+ })
+ .dump();
+ }
+ return arguments;
+}
+
+/**
+ * Takes a prefix regex that must have 1 group to capture the function name, a closing suffix, and expects json parameters in between.
+ * Aggregates the prefix, suffix and in-between text into the content.
+ */
+static void parse_json_tool_calls(
+ common_chat_msg_parser & builder,
+ const std::optional & block_open,
+ const std::optional & function_regex_start_only,
+ const std::optional & function_regex,
+ const common_regex & close_regex,
+ const std::optional & block_close,
+ bool allow_raw_python = false,
+    const std::function<std::string(const common_chat_msg_parser::find_regex_result & res)> & get_function_name =
+ nullptr) {
+ auto parse_tool_calls = [&]() {
+ size_t from = std::string::npos;
+ auto first = true;
+ while (true) {
+ auto start_pos = builder.pos();
+ auto res = function_regex_start_only && first ? builder.try_consume_regex(*function_regex_start_only) :
+ function_regex ? builder.try_find_regex(*function_regex, from) :
+ std::nullopt;
+
+ if (res) {
+ std::string name;
+ if (get_function_name) {
+ name = get_function_name(*res);
+ } else {
+ GGML_ASSERT(res->groups.size() == 2);
+ name = builder.str(res->groups[1]);
+ }
+ first = false;
+ if (name.empty()) {
+ // get_function_name signalled us that we should skip this match and treat it as content.
+ from = res->groups[0].begin + 1;
+ continue;
+ }
+ from = std::string::npos;
+
+ auto maybe_raw_python = name == "python" && allow_raw_python;
+ if (builder.input()[builder.pos()] == '{' || !maybe_raw_python) {
+ if (auto arguments = builder.try_consume_json_with_dumped_args({ {} })) {
+ if (!builder.add_tool_call(name, "", arguments->value) || arguments->is_partial) {
+ throw common_chat_msg_partial_exception("incomplete tool call");
+ }
+ builder.consume_regex(close_regex);
+ }
+ continue;
+ }
+ if (maybe_raw_python) {
+ auto arguments = wrap_code_as_arguments(builder, builder.consume_rest());
+ if (!builder.add_tool_call(name, "", arguments)) {
+ throw common_chat_msg_partial_exception("incomplete tool call");
+ }
+ return;
+ }
+ throw common_chat_msg_partial_exception("incomplete tool call");
+ } else {
+ builder.move_to(start_pos);
+ }
+ break;
+ }
+ if (block_close) {
+ builder.consume_regex(*block_close);
+ }
+ builder.consume_spaces();
+ builder.add_content(builder.consume_rest());
+ };
+ if (block_open) {
+ if (auto res = builder.try_find_regex(*block_open)) {
+ parse_tool_calls();
+ } else {
+ builder.add_content(builder.consume_rest());
+ }
+ } else {
+ parse_tool_calls();
+ }
+}
+
+common_chat_msg_parser::common_chat_msg_parser(const std::string & input, bool is_partial, const common_chat_syntax & syntax)
+ : input_(input), is_partial_(is_partial), syntax_(syntax)
+{
+ result_.role = "assistant";
+
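+    // Choose a random healing marker that does not occur in the input; it is used to
+    // complete truncated JSON so partial values can be located and trimmed after parsing.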
+ while (true) {
+ std::string id = std::to_string(std::rand());
+ if (input.find(id) == std::string::npos) {
+ healing_marker_ = id;
+ break;
+ }
+ }
+}
+
+std::string common_chat_msg_parser::str(const common_string_range & rng) const {
+ GGML_ASSERT(rng.begin <= rng.end);
+ return input_.substr(rng.begin, rng.end - rng.begin);
+}
+
+void common_chat_msg_parser::add_content(const std::string &content) {
+ result_.content += content;
+}
+
+void common_chat_msg_parser::add_reasoning_content(const std::string &reasoning_content) {
+ result_.reasoning_content += reasoning_content;
+}
+
+bool common_chat_msg_parser::add_tool_call(const std::string & name, const std::string & id, const std::string & arguments) {
+ if (name.empty()) {
+ return false;
+ }
+
+ common_chat_tool_call tool_call;
+ tool_call.name = name;
+ tool_call.arguments = arguments;
+ tool_call.id = id;
+
+ // LOG_DBG("Tool call arguments:\n\traw: %s\n\tresult: %s\n", arguments.c_str(), tool_call.arguments.c_str());
+ result_.tool_calls.emplace_back(tool_call);
+
+ return true;
+}
+bool common_chat_msg_parser::add_tool_call(const json & tool_call) {
+ std::string name = tool_call.contains("name") ? tool_call.at("name") : "";
+ std::string id = tool_call.contains("id") ? tool_call.at("id") : "";
+ std::string arguments = "";
+ if (tool_call.contains("arguments")) {
+ if (tool_call.at("arguments").is_object()) {
+ arguments = tool_call.at("arguments").dump();
+ } else {
+ arguments = tool_call.at("arguments");
+ }
+ }
+
+ return add_tool_call(name, id, arguments);
+}
+
+bool common_chat_msg_parser::add_tool_calls(const json & arr) {
+ for (const auto & item : arr) {
+ if (!add_tool_call(item)) {
+ return false;
+ }
+ }
+ return true;
+}
+
+bool common_chat_msg_parser::add_tool_call_short_form(const json & tool_call) {
+ if (!tool_call.is_object() || tool_call.size() != 1) {
+ return false;
+ }
+
+ // Get the tool name (the single key in the object)
+ auto it = tool_call.begin();
+ std::string name = it.key();
+
+ if (name.empty()) {
+ return false;
+ }
+
+ // Get the arguments (the nested object)
+ const json & args_json = it.value();
+ std::string arguments = "";
+
+ if (args_json.is_object()) {
+ arguments = args_json.dump();
+ } else if (args_json.is_string()) {
+ arguments = args_json;
+ } else if (!args_json.is_null()) {
+ // For other types, convert to string representation
+ arguments = args_json.dump();
+ }
+
+ return add_tool_call(name, "", arguments);
+}
+void common_chat_msg_parser::finish() {
+ if (!is_partial_ && pos_ != input_.size()) {
+ throw std::runtime_error("Unexpected content at end of input");// + input_.substr(pos_));
+ }
+}
+
+bool common_chat_msg_parser::consume_spaces() {
+ const auto length = input_.size();
+ auto consumed = false;
+ while (pos_ < length && std::isspace(input_[pos_])) {
+ ++pos_;
+ consumed = true;
+ }
+ return consumed;
+}
+
+bool common_chat_msg_parser::try_consume_literal(const std::string & literal) {
+ auto pos = pos_;
+ for (auto i = 0u; i < literal.size(); ++i) {
+ if (pos >= input_.size()) {
+ return false;
+ }
+ if (input_[pos] != literal[i]) {
+ return false;
+ }
+ ++pos;
+ }
+ pos_ = pos;
+ return true;
+}
+
+std::optional<common_chat_msg_parser::find_regex_result> common_chat_msg_parser::try_find_literal(const std::string & literal) {
+ auto idx = input_.find(literal, pos_);
+ if (idx != std::string::npos) {
+ find_regex_result res;
+ res.prelude = input_.substr(pos_, idx - pos_);
+ auto end = idx + literal.size();
+ res.groups.emplace_back(common_string_range{idx, end});
+ move_to(end);
+ return res;
+ }
+ if (is_partial_) {
+ idx = string_find_partial_stop(input_, literal);
+ if (idx != std::string::npos && idx >= pos_) {
+ find_regex_result res;
+ res.prelude = input_.substr(pos_, idx - pos_);
+ auto end = input_.size();
+ res.groups.emplace_back(common_string_range{idx, end});
+ move_to(end);
+ return res;
+ }
+ }
+ return std::nullopt;
+}
+
+void common_chat_msg_parser::consume_literal(const std::string & literal) {
+ if (!try_consume_literal(literal)) {
+ throw common_chat_msg_partial_exception(literal);
+ }
+}
+
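+// Try to parse a reasoning block delimited by start_think/end_think at the current position.
+// Depending on the syntax settings, the text goes to reasoning_content or stays inline in content.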
+bool common_chat_msg_parser::try_parse_reasoning(const std::string & start_think, const std::string & end_think) {
+ std::string pending_reasoning_prefix;
+
+ if (syntax_.reasoning_format == COMMON_REASONING_FORMAT_NONE) {
+ return false;
+ }
+
+ auto set_reasoning_prefix = [&](size_t prefix_pos) {
+ if (!syntax_.thinking_forced_open || syntax_.reasoning_in_content) {
+ return;
+ }
+ if (prefix_pos + start_think.size() > input_.size()) {
+ pending_reasoning_prefix.clear();
+ return;
+ }
+ // Capture the exact literal that opened the reasoning section so we can
+ // surface it back to callers. This ensures formats that force the
+ // reasoning tag open (e.g. DeepSeek R1) retain their original prefix
+ // instead of dropping it during parsing.
+ pending_reasoning_prefix = input_.substr(prefix_pos, start_think.size());
+ };
+
+ auto handle_reasoning = [&](const std::string & reasoning, bool closed) {
+ auto stripped_reasoning = string_strip(reasoning);
+ if (stripped_reasoning.empty()) {
+ return;
+ }
+ if (syntax_.reasoning_in_content) {
+            add_content(syntax_.reasoning_format == COMMON_REASONING_FORMAT_DEEPSEEK ? "<think>" : start_think);
+ add_content(stripped_reasoning);
+ if (closed) {
+                add_content(syntax_.reasoning_format == COMMON_REASONING_FORMAT_DEEPSEEK ? "</think>" : end_think);
+ }
+ } else {
+ if (!pending_reasoning_prefix.empty()) {
+ add_reasoning_content(pending_reasoning_prefix);
+ pending_reasoning_prefix.clear();
+ }
+ add_reasoning_content(stripped_reasoning);
+ }
+ };
+
+ const size_t saved_pos = pos_;
+ const size_t saved_content_size = result_.content.size();
+ const size_t saved_reasoning_size = result_.reasoning_content.size();
+
+ auto restore_state = [&]() {
+ move_to(saved_pos);
+ result_.content.resize(saved_content_size);
+ result_.reasoning_content.resize(saved_reasoning_size);
+ };
+
+ // Allow leading whitespace to be preserved as content when reasoning is present at the start
+ size_t cursor = pos_;
+ size_t whitespace_end = cursor;
+    while (whitespace_end < input_.size() && std::isspace(static_cast<unsigned char>(input_[whitespace_end]))) {
+ ++whitespace_end;
+ }
+
+ if (whitespace_end >= input_.size()) {
+ restore_state();
+ if (syntax_.thinking_forced_open) {
+ auto rest = input_.substr(saved_pos);
+ if (!rest.empty()) {
+ handle_reasoning(rest, /* closed */ !is_partial());
+ }
+ move_to(input_.size());
+ return true;
+ }
+ return false;
+ }
+
+ cursor = whitespace_end;
+ const size_t remaining = input_.size() - cursor;
+ const size_t start_prefix = std::min(start_think.size(), remaining);
+ const bool has_start_tag = input_.compare(cursor, start_prefix, start_think, 0, start_prefix) == 0;
+
+ if (has_start_tag && start_prefix < start_think.size()) {
+ move_to(input_.size());
+ return true;
+ }
+
+ if (has_start_tag) {
+ if (whitespace_end > pos_) {
+ add_content(input_.substr(pos_, whitespace_end - pos_));
+ }
+ set_reasoning_prefix(cursor);
+ cursor += start_think.size();
+ } else if (syntax_.thinking_forced_open) {
+ cursor = whitespace_end;
+ } else {
+ restore_state();
+ return false;
+ }
+ while (true) {
+ if (cursor >= input_.size()) {
+ move_to(input_.size());
+ return true;
+ }
+
+ size_t end_pos = input_.find(end_think, cursor);
+ if (end_pos == std::string::npos) {
+ std::string_view remaining_view(input_.data() + cursor, input_.size() - cursor);
+ size_t partial_off = string_find_partial_stop(remaining_view, end_think);
+ size_t reasoning_end = partial_off == std::string::npos ? input_.size() : cursor + partial_off;
+ if (reasoning_end > cursor) {
+ handle_reasoning(input_.substr(cursor, reasoning_end - cursor), /* closed */ partial_off == std::string::npos && !is_partial());
+ }
+ move_to(input_.size());
+ return true;
+ }
+
+ if (end_pos > cursor) {
+ handle_reasoning(input_.substr(cursor, end_pos - cursor), /* closed */ true);
+ } else {
+ handle_reasoning("", /* closed */ true);
+ }
+
+ cursor = end_pos + end_think.size();
+
+        while (cursor < input_.size() && std::isspace(static_cast<unsigned char>(input_[cursor]))) {
+ ++cursor;
+ }
+
+ const size_t next_remaining = input_.size() - cursor;
+ if (next_remaining == 0) {
+ move_to(cursor);
+ return true;
+ }
+
+ const size_t next_prefix = std::min(start_think.size(), next_remaining);
+ if (input_.compare(cursor, next_prefix, start_think, 0, next_prefix) == 0) {
+ if (next_prefix < start_think.size()) {
+ move_to(input_.size());
+ return true;
+ }
+ set_reasoning_prefix(cursor);
+ cursor += start_think.size();
+ continue;
+ }
+
+ move_to(cursor);
+ return true;
+ }
+}
+
+std::string common_chat_msg_parser::consume_rest() {
+ auto rest = input_.substr(pos_);
+ pos_ = input_.size();
+ return rest;
+}
+
+// Tries to find the regex, consumes it (pos right after it) and gives the prelude (right before it) and the groups to the callback.
+std::optional<common_chat_msg_parser::find_regex_result> common_chat_msg_parser::try_find_regex(const common_regex & regex, size_t from, bool add_prelude_to_content) {
+ auto m = regex.search(input_, from == std::string::npos ? pos_ : from);
+ if (m.type == COMMON_REGEX_MATCH_TYPE_NONE) {
+ return std::nullopt;
+ }
+ auto prelude = input_.substr(pos_, m.groups[0].begin - pos_);
+ pos_ = m.groups[0].end;
+
+ if (add_prelude_to_content) {
+ add_content(prelude);
+ }
+ if (m.type == COMMON_REGEX_MATCH_TYPE_PARTIAL) {
+ if (is_partial()) {
+ throw common_chat_msg_partial_exception(regex.str());
+ }
+ return std::nullopt;
+ }
+ return find_regex_result{prelude, m.groups};
+}
+
+common_chat_msg_parser::find_regex_result common_chat_msg_parser::consume_regex(const common_regex & regex) {
+ if (auto result = try_consume_regex(regex)) {
+ return *result;
+ }
+ throw common_chat_msg_partial_exception(regex.str());
+}
+
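+// Like try_find_regex, but only succeeds when the match starts exactly at the current position.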
+std::optional<common_chat_msg_parser::find_regex_result> common_chat_msg_parser::try_consume_regex(const common_regex & regex) {
+ auto m = regex.search(input_, pos_);
+ if (m.type == COMMON_REGEX_MATCH_TYPE_NONE) {
+ return std::nullopt;
+ }
+ if (m.type == COMMON_REGEX_MATCH_TYPE_PARTIAL) {
+ if (is_partial()) {
+ throw common_chat_msg_partial_exception(regex.str());
+ }
+ return std::nullopt;
+ }
+ if (m.groups[0].begin != pos_) {
+ // Didn't match at the current position.
+ return std::nullopt;
+ }
+ pos_ = m.groups[0].end;
+
+ return find_regex_result {
+ /* .prelude = */ "",
+ m.groups,
+ };
+}
+
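+// Try to parse a JSON value starting at the current position. For partial input the healing
+// marker is appended internally; a non-empty marker in the result means the JSON was truncated.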
+std::optional<common_json> common_chat_msg_parser::try_consume_json() {
+ auto it = input_.cbegin() + pos_;
+ const auto end = input_.cend();
+ common_json result;
+ if (!common_json_parse(it, end, healing_marker_, result)) {
+ return std::nullopt;
+ }
+ pos_ = std::distance(input_.cbegin(), it);
+ if (result.healing_marker.marker.empty()) {
+ // No healing marker, just return the parsed json
+ return result;
+ }
+ if (!is_partial()) {
+ throw common_chat_msg_partial_exception("JSON");
+ }
+ return result;
+}
+
+common_json common_chat_msg_parser::consume_json() {
+ if (auto result = try_consume_json()) {
+ return *result;
+ }
+ throw common_chat_msg_partial_exception("JSON");
+}
+
+common_chat_msg_parser::consume_json_result common_chat_msg_parser::consume_json_with_dumped_args(
+    const std::vector<std::vector<std::string>> & args_paths,
+    const std::vector<std::vector<std::string>> & content_paths
+) {
+ if (auto result = try_consume_json_with_dumped_args(args_paths, content_paths)) {
+ return *result;
+ }
+ throw common_chat_msg_partial_exception("JSON");
+}
+
+std::optional<common_chat_msg_parser::consume_json_result> common_chat_msg_parser::try_consume_json_with_dumped_args(
+    const std::vector<std::vector<std::string>> & args_paths,
+    const std::vector<std::vector<std::string>> & content_paths
+) {
+ auto partial = try_consume_json();
+ if (!partial) {
+ return std::nullopt;
+ }
+    auto is_arguments_path = [&](const std::vector<std::string> & path) {
+        return std::find(args_paths.begin(), args_paths.end(), path) != args_paths.end();
+    };
+    auto is_content_path = [&](const std::vector<std::string> & path) {
+ return std::find(content_paths.begin(), content_paths.end(), path) != content_paths.end();
+ };
+
+ if (partial->healing_marker.marker.empty()) {
+ if (args_paths.empty()) {
+ // No arguments to dump, and JSON was parsed fully.
+ return consume_json_result {
+ partial->json,
+ /* .is_partial = */ false,
+ };
+ }
+ if (is_arguments_path({})) {
+ // Entire JSON is the arguments and was parsed fully.
+ return consume_json_result {
+ partial->json.dump(/* indent */ -1, /* indent_char */ ' ', /* ensure_ascii */ true),
+ /* .is_partial = */ false,
+ };
+ }
+ }
+
+ LOG_DBG("Parsed partial JSON: %s (json_healing_marker: %s)\n", partial->json.dump().c_str(), partial->healing_marker.json_dump_marker.c_str());
+
+ auto found_healing_marker = false;
+    std::vector<std::string> path;
+    std::function<json(const json &)> remove_unsupported_healings_and_dump_args = [&](const json & j) -> json {
+ if (is_arguments_path(path)) {
+ auto arguments = j.dump(/* indent */ -1, /* indent_char */ ' ', /* ensure_ascii */ true);
+ if (is_partial() && !partial->healing_marker.marker.empty()) {
+ auto idx = arguments.find(partial->healing_marker.json_dump_marker);
+ if (idx != std::string::npos) {
+ arguments.resize(idx);
+ found_healing_marker = true;
+ }
+ if (arguments == "\"") {
+ // This happens because of completing `:"$magic` after `"arguments"`
+ arguments = "";
+ }
+ }
+ return arguments;
+ }
+ if (is_content_path(path)) {
+ if (!j.is_string()) {
+ throw std::runtime_error("Content path must be a string");
+ }
+ std::string str = j;
+ auto idx = str.find(partial->healing_marker.marker); // not using json_dump_marker as we're inside a string
+ if (idx != std::string::npos) {
+ str.resize(idx);
+ found_healing_marker = true;
+ }
+ return str;
+ }
+ if (j.is_object()) {
+ auto obj = json::object();
+ for (const auto & p : j.items()) {
+ const auto & key = p.key();
+ const auto & value = p.value();
+ const std::string key_str = key; // NOLINT
+ auto idx = key_str.find(healing_marker_);
+ if (idx != std::string::npos) {
+ found_healing_marker = true;
+ break;
+ }
+ path.push_back(key_str);
+ if (value.is_string()) {
+ const std::string value_str = value;
+ if (value_str.find(healing_marker_) != std::string::npos) {
+ found_healing_marker = true;
+ if (is_content_path(path)) {
+ if (partial->healing_marker.marker == partial->healing_marker.json_dump_marker) {
+ // The healing occurred inside the string: good. Otherwise we just ditch the entire key/value pair.
+ obj[key] = remove_unsupported_healings_and_dump_args(value);
+ }
+ }
+ break;
+ }
+ obj[key] = value;
+ } else {
+ obj[key] = remove_unsupported_healings_and_dump_args(value);
+ }
+ path.pop_back();
+ }
+ return obj;
+ }
+ if (j.is_array()) {
+ auto arr = json::array();
+ for (const auto & value : j) {
+ if (value.is_string()) {
+ std::string str = value;
+ auto idx = str.find(healing_marker_);
+ if (idx != std::string::npos) {
+ // Don't heal array values that aren't in the arguments.
+ found_healing_marker = true;
+ break;
+ }
+ }
+ arr.push_back(remove_unsupported_healings_and_dump_args(value));
+ }
+ return arr;
+ }
+ return j;
+ };
+
+ auto cleaned = remove_unsupported_healings_and_dump_args(partial->json);
+ LOG_DBG("Cleaned up JSON %s to %s (json_healing_marker : '%s')\n", partial->json.dump().c_str(), cleaned.dump().c_str(), partial->healing_marker.json_dump_marker.c_str());
+ return consume_json_result {
+ cleaned,
+ /* .is_partial = */ found_healing_marker,
+ };
+}
+
+void common_chat_msg_parser::clear_tools() {
+ result_.tool_calls.clear();
+}
+
+/**
+ * All common_chat_parse_* moved from chat.cpp to chat-parser.cpp below
+ * to reduce incremental compile time for parser changes.
+ */
+static void common_chat_parse_generic(common_chat_msg_parser & builder) {
+ if (!builder.syntax().parse_tool_calls) {
+ builder.add_content(builder.consume_rest());
+ return;
+ }
+    static const std::vector<std::vector<std::string>> content_paths = {
+        {"response"},
+    };
+    static const std::vector<std::vector<std::string>> args_paths = {
+ {"tool_call", "arguments"},
+ {"tool_calls", "arguments"},
+ };
+ auto data = builder.consume_json_with_dumped_args(args_paths, content_paths);
+ if (data.value.contains("tool_calls")) {
+ if (!builder.add_tool_calls(data.value.at("tool_calls")) || data.is_partial) {
+ throw common_chat_msg_partial_exception("incomplete tool calls");
+ }
+ } else if (data.value.contains("tool_call")) {
+ if (!builder.add_tool_call(data.value.at("tool_call")) || data.is_partial) {
+ throw common_chat_msg_partial_exception("incomplete tool call");
+ }
+ } else if (data.value.contains("response")) {
+ const auto & response = data.value.at("response");
+        builder.add_content(response.is_string() ? response.template get<std::string>() : response.dump(2));
+ if (data.is_partial) {
+ throw common_chat_msg_partial_exception("incomplete response");
+ }
+ } else {
+ throw common_chat_msg_partial_exception("Expected 'tool_call', 'tool_calls' or 'response' in JSON");
+ }
+}
+
+static void common_chat_parse_mistral_nemo(common_chat_msg_parser & builder) {
+ if (!builder.syntax().parse_tool_calls) {
+ builder.add_content(builder.consume_rest());
+ return;
+ }
+
+ static const common_regex prefix(regex_escape("[TOOL_CALLS]"));
+ parse_prefixed_json_tool_call_array(builder, prefix);
+}
+
+static void common_chat_parse_magistral(common_chat_msg_parser & builder) {
+ builder.try_parse_reasoning("[THINK]", "[/THINK]");
+
+ if (!builder.syntax().parse_tool_calls) {
+ builder.add_content(builder.consume_rest());
+ return;
+ }
+
+ static const common_regex prefix(regex_escape("[TOOL_CALLS]"));
+ parse_prefixed_json_tool_call_array(builder, prefix);
+}
+
+static void common_chat_parse_command_r7b(common_chat_msg_parser & builder) {
+ builder.try_parse_reasoning("<|START_THINKING|>", "<|END_THINKING|>");
+
+ static const common_regex start_action_regex("<\\|START_ACTION\\|>");
+ static const common_regex end_action_regex("<\\|END_ACTION\\|>");
+ static const common_regex start_response_regex("<\\|START_RESPONSE\\|>");
+ static const common_regex end_response_regex("<\\|END_RESPONSE\\|>");
+
+ if (auto res = builder.try_find_regex(start_action_regex)) {
+ // If we didn't extract thoughts, prelude includes them.
+ auto tool_calls = builder.consume_json_with_dumped_args({{"parameters"}});
+ for (const auto & tool_call : tool_calls.value) {
+ std::string name = tool_call.contains("tool_name") ? tool_call.at("tool_name") : "";
+ std::string id = tool_call.contains("tool_call_id") ? tool_call.at("tool_call_id") : "";
+ std::string arguments = tool_call.contains("parameters") ? tool_call.at("parameters") : "";
+ if (!builder.add_tool_call(name, id, arguments) || tool_calls.is_partial) {
+ throw common_chat_msg_partial_exception("incomplete tool call");
+ }
+ }
+ if (tool_calls.is_partial) {
+ throw common_chat_msg_partial_exception("incomplete tool call");
+ }
+ builder.consume_regex(end_action_regex);
+ } else if (auto res = builder.try_find_regex(start_response_regex)) {
+ if (!builder.try_find_regex(end_response_regex)) {
+ builder.add_content(builder.consume_rest());
+ throw common_chat_msg_partial_exception(end_response_regex.str());
+ }
+ } else {
+ builder.add_content(builder.consume_rest());
+ }
+}
+
+static void common_chat_parse_llama_3_1(common_chat_msg_parser & builder, bool with_builtin_tools = false) {
+    builder.try_parse_reasoning("<think>", "</think>");
+
+ if (!builder.syntax().parse_tool_calls) {
+ builder.add_content(builder.consume_rest());
+ return;
+ }
+
+ static const common_regex function_regex(
+ "\\s*\\{\\s*(?:\"type\"\\s*:\\s*\"function\"\\s*,\\s*)?\"name\"\\s*:\\s*\"([^\"]+)\"\\s*,\\s*\"parameters\"\\s*: ");
+ static const common_regex close_regex("\\}\\s*");
+
+ static const common_regex function_name_regex("\\s*(\\w+)\\s*\\.\\s*call\\(");
+ static const common_regex arg_name_regex("\\s*(\\w+)\\s*=\\s*");
+
+ if (with_builtin_tools) {
+ static const common_regex builtin_call_regex("<\\|python_tag\\|>");
+ if (auto res = builder.try_find_regex(builtin_call_regex)) {
+ auto fun_res = builder.consume_regex(function_name_regex);
+ auto function_name = builder.str(fun_res.groups[1]);
+
+ common_healing_marker healing_marker;
+ json args = json::object();
+ while (true) {
+ if (auto arg_res = builder.try_consume_regex(arg_name_regex)) {
+ auto arg_name = builder.str(arg_res->groups[1]);
+ auto partial = builder.consume_json();
+ args[arg_name] = partial.json;
+ healing_marker.marker = partial.healing_marker.marker;
+ healing_marker.json_dump_marker = partial.healing_marker.json_dump_marker;
+ builder.consume_spaces();
+ if (!builder.try_consume_literal(",")) {
+ break;
+ }
+ } else {
+ break;
+ }
+ }
+ builder.consume_literal(")");
+ builder.consume_spaces();
+
+ auto arguments = args.dump();
+ if (!builder.add_tool_call(function_name, "", arguments)) {
+ throw common_chat_msg_partial_exception("Incomplete tool call");
+ }
+ return;
+ }
+ }
+ parse_json_tool_calls(
+ builder,
+ /* block_open= */ std::nullopt,
+ /* function_regex_start_only= */ function_regex,
+ /* function_regex= */ std::nullopt,
+ close_regex,
+ std::nullopt);
+
+}
+
+static void common_chat_parse_deepseek_r1(common_chat_msg_parser & builder) {
+    builder.try_parse_reasoning("<think>", "</think>");
+ if (!builder.syntax().parse_tool_calls) {
+ builder.add_content(builder.consume_rest());
+ return;
+ }
+
+    static const common_regex tool_calls_begin("(?:<｜tool▁calls▁begin｜>|<｜tool_calls_begin｜>|<｜tool calls begin｜>|<｜tool\\\\_calls\\\\_begin｜>|<｜tool▁calls｜>)");
+    static const common_regex tool_calls_end("<｜tool▁calls▁end｜>");
+    static const common_regex function_regex("(?:<｜tool▁call▁begin｜>)?function<｜tool▁sep｜>([^\n]+)\n```json\n");
+    static const common_regex close_regex("```[\\s\\r\\n]*<｜tool▁call▁end｜>");
+
+ parse_json_tool_calls(
+ builder,
+ /* block_open= */ tool_calls_begin,
+ /* function_regex_start_only= */ std::nullopt,
+ function_regex,
+ close_regex,
+ tool_calls_end);
+}
+
+static void common_chat_parse_deepseek_v3_1_content(common_chat_msg_parser & builder) {
+    static const common_regex function_regex("(?:<｜tool▁call▁begin｜>)?([^\\n<]+)(?:<｜tool▁sep｜>)");
+
+    static const common_regex close_regex("(?:[\\s]*)?<｜tool▁call▁end｜>");
+    static const common_regex tool_calls_begin("(?:<｜tool▁calls▁begin｜>|<｜tool_calls_begin｜>|<｜tool calls begin｜>|<｜tool\\\\_calls\\\\_begin｜>|<｜tool▁calls｜>)");
+    static const common_regex tool_calls_end("<｜tool▁calls▁end｜>");
+
+ if (!builder.syntax().parse_tool_calls) {
+ LOG_DBG("%s: not parse_tool_calls\n", __func__);
+ builder.add_content(builder.consume_rest());
+ return;
+ }
+
+ LOG_DBG("%s: parse_tool_calls\n", __func__);
+
+ parse_json_tool_calls(
+ builder,
+ /* block_open= */ tool_calls_begin,
+ /* function_regex_start_only= */ std::nullopt,
+ function_regex,
+ close_regex,
+ tool_calls_end);
+}
+
+static void common_chat_parse_deepseek_v3_1(common_chat_msg_parser & builder) {
+    // DeepSeek V3.1 outputs reasoning content between "<think>" and "</think>" tags, followed by regular content
+ // First try to parse using the standard reasoning parsing method
+ LOG_DBG("%s: thinking_forced_open: %s\n", __func__, std::to_string(builder.syntax().thinking_forced_open).c_str());
+
+ auto start_pos = builder.pos();
+    auto found_end_think = builder.try_find_literal("</think>");
+ builder.move_to(start_pos);
+
+ if (builder.syntax().thinking_forced_open && !builder.is_partial() && !found_end_think) {
+ LOG_DBG("%s: no end_think, not partial, adding content\n", __func__);
+ common_chat_parse_deepseek_v3_1_content(builder);
+    } else if (builder.try_parse_reasoning("<think>", "</think>")) {
+ // If reasoning was parsed successfully, the remaining content is regular content
+ LOG_DBG("%s: parsed reasoning, adding content\n", __func__);
+ // <|tool▁calls▁begin|><|tool▁call▁begin|>function<|tool▁sep|>NAME\n```json\nJSON\n```<|tool▁call▁end|><|tool▁calls▁end|>
+ common_chat_parse_deepseek_v3_1_content(builder);
+ } else {
+ if (builder.syntax().reasoning_format == COMMON_REASONING_FORMAT_NONE) {
+ LOG_DBG("%s: reasoning_format none, adding content\n", __func__);
+ common_chat_parse_deepseek_v3_1_content(builder);
+ return;
+ }
+ // If no reasoning tags found, check if we should treat everything as reasoning
+ if (builder.syntax().thinking_forced_open) {
+ // If thinking is forced open but no tags found, treat everything as reasoning
+ LOG_DBG("%s: thinking_forced_open, adding reasoning content\n", __func__);
+ builder.add_reasoning_content(builder.consume_rest());
+ } else {
+ LOG_DBG("%s: no thinking_forced_open, adding content\n", __func__);
+ // <|tool▁call▁begin|>NAME<|tool▁sep|>JSON<|tool▁call▁end|>
+ common_chat_parse_deepseek_v3_1_content(builder);
+ }
+ }
+}
+
+static void common_chat_parse_minimax_m2(common_chat_msg_parser & builder) {
+ static const xml_tool_call_format form {
+ /* form.scope_start = */ "",
+ /* form.tool_start = */ "",
+ /* form.key_start = */ "",
+ /* form.val_end = */ "",
+ /* form.tool_end = */ "",
+ /* form.scope_end = */ "",
+ };
+    builder.consume_reasoning_with_xml_tool_calls(form, "<think>", "</think>");
+}
+
+static void common_chat_parse_qwen3_coder_xml(common_chat_msg_parser & builder) {
+ static const xml_tool_call_format form = ([]() {
+ xml_tool_call_format form {};
+ form.scope_start = "";
+ form.tool_start = "", "");
+}
+
+static void common_chat_parse_apriel_1_5(common_chat_msg_parser & builder) {
+ static const xml_tool_call_format form = ([]() {
+ xml_tool_call_format form {};
+ form.scope_start = "[";
+ form.tool_start = "{\"name\": \"";
+ form.tool_sep = "\", \"arguments\": {";
+ form.key_start = "\"";
+ form.key_val_sep = "\": ";
+ form.val_end = ", ";
+ form.tool_end = "}, ";
+ form.scope_end = "]";
+ form.raw_argval = false;
+ form.last_val_end = "";
+ form.last_tool_end = "}";
+ return form;
+ })();
+ builder.consume_reasoning_with_xml_tool_calls(form, "", "");
+}
+
+static void common_chat_parse_xiaomi_mimo(common_chat_msg_parser & builder) {
+ static const xml_tool_call_format form = ([]() {
+ xml_tool_call_format form {};
+ form.scope_start = "";
+ form.tool_start = "\n{\"name\": \"";
+ form.tool_sep = "\", \"arguments\": {";
+ form.key_start = "\"";
+ form.key_val_sep = "\": ";
+ form.val_end = ", ";
+ form.tool_end = "}\n";
+ form.scope_end = "";
+ form.raw_argval = false;
+ form.last_val_end = "";
+ return form;
+ })();
+ builder.consume_reasoning_with_xml_tool_calls(form);
+}
+
+static void common_chat_parse_gpt_oss(common_chat_msg_parser & builder) {
+ static const std::string constraint = "(?: (<\\|constrain\\|>)?([a-zA-Z0-9_-]+))";
+ static const std::string recipient("(?: to=functions\\.([^<\\s]+))");
+
+ static const common_regex start_regex("<\\|start\\|>assistant");
+ static const common_regex analysis_regex("<\\|channel\\|>analysis");
+ static const common_regex final_regex("<\\|channel\\|>final" + constraint + "?");
+ static const common_regex preamble_regex("<\\|channel\\|>commentary");
+ static const common_regex tool_call1_regex(recipient + "<\\|channel\\|>(analysis|commentary)" + constraint + "?");
+ static const common_regex tool_call2_regex("<\\|channel\\|>(analysis|commentary)" + recipient + constraint + "?");
+
+ auto consume_end = [&](bool include_end = false) {
+ if (auto res = builder.try_find_literal("<|end|>")) {
+ return res->prelude + (include_end ? builder.str(res->groups[0]) : "");
+ }
+ return builder.consume_rest();
+ };
+
+ auto handle_tool_call = [&](const std::string & name) {
+ if (auto args = builder.try_consume_json_with_dumped_args({{}})) {
+ if (builder.syntax().parse_tool_calls) {
+ if (!builder.add_tool_call(name, "", args->value) || args->is_partial) {
+ throw common_chat_msg_partial_exception("incomplete tool call");
+ }
+ } else if (args->is_partial) {
+ throw common_chat_msg_partial_exception("incomplete tool call");
+ }
+ }
+ };
+
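+    // Helper: succeed only on a full (non-partial) regex match at the start of the header.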
+    auto regex_match = [](const common_regex & regex, const std::string & input) -> std::optional<common_regex_match> {
+ auto match = regex.search(input, 0, true);
+ if (match.type == COMMON_REGEX_MATCH_TYPE_FULL) {
+ return match;
+ }
+ return std::nullopt;
+ };
+
+ do {
+ auto header_start_pos = builder.pos();
+ auto content_start = builder.try_find_literal("<|message|>");
+ if (!content_start) {
+ throw common_chat_msg_partial_exception("incomplete header");
+ }
+
+ auto header = content_start->prelude;
+
+ if (auto match = regex_match(tool_call1_regex, header)) {
+ auto group = match->groups[1];
+ auto name = header.substr(group.begin, group.end - group.begin);
+ handle_tool_call(name);
+ continue;
+ }
+
+ if (auto match = regex_match(tool_call2_regex, header)) {
+ auto group = match->groups[2];
+ auto name = header.substr(group.begin, group.end - group.begin);
+ handle_tool_call(name);
+ continue;
+ }
+
+ if (regex_match(analysis_regex, header)) {
+ builder.move_to(header_start_pos);
+ if (builder.syntax().reasoning_format == COMMON_REASONING_FORMAT_NONE || builder.syntax().reasoning_in_content) {
+ builder.add_content(consume_end(true));
+ } else {
+ builder.try_parse_reasoning("<|channel|>analysis<|message|>", "<|end|>");
+ }
+ continue;
+ }
+
+ if(regex_match(final_regex, header) || regex_match(preamble_regex, header)) {
+ builder.add_content(consume_end());
+ continue;
+ }
+
+ // Possibly a malformed message, attempt to recover by rolling
+ // back to pick up the next <|start|>
+ LOG_DBG("%s: unknown header from message: %s\n", __func__, header.c_str());
+ builder.move_to(header_start_pos);
+ } while (builder.try_find_regex(start_regex, std::string::npos, false));
+
+ auto remaining = builder.consume_rest();
+ if (!remaining.empty()) {
+ LOG_DBG("%s: content after last message: %s\n", __func__, remaining.c_str());
+ }
+}
+
+static void common_chat_parse_glm_4_5(common_chat_msg_parser & builder) {
+ static const xml_tool_call_format form {
+ /* form.scope_start = */ "",
+ /* form.tool_start = */ "",
+ /* form.tool_sep = */ "",
+ /* form.key_start = */ "",
+ /* form.key_val_sep = */ "",
+ /* form.val_end = */ "",
+ /* form.tool_end = */ "",
+ /* form.scope_end = */ "",
+ /* form.key_val_sep2 = */ "",
+ };
+    builder.consume_reasoning_with_xml_tool_calls(form, "<think>", "</think>");
+}
+
+static void common_chat_parse_firefunction_v2(common_chat_msg_parser & builder) {
+ if (!builder.syntax().parse_tool_calls) {
+ builder.add_content(builder.consume_rest());
+ return;
+ }
+ static const common_regex prefix(regex_escape(" functools["));
+ parse_prefixed_json_tool_call_array(builder, prefix, /* rstrip_prefix= */ 1);
+}
+
+static void common_chat_parse_functionary_v3_2(common_chat_msg_parser & builder) {
+ static const common_regex function_regex_start_only(R"((\w+\n\{|python\n|all\n))");
+ static const common_regex function_regex(R"(>>>(\w+\n\{|python\n|all\n))");
+ static const common_regex close_regex(R"(\s*)");
+
+ parse_json_tool_calls(
+ builder,
+ std::nullopt,
+ function_regex_start_only,
+ function_regex,
+ close_regex,
+ std::nullopt,
+ /* allow_raw_python= */ true,
+ /* get_function_name= */ [&](const auto & res) -> std::string {
+ auto at_start = res.groups[0].begin == 0;
+ auto name = builder.str(res.groups[1]);
+ if (!name.empty() && name.back() == '{') {
+ // Unconsume the opening brace '{' to ensure the JSON parsing goes well.
+ builder.move_back(1);
+ }
+ auto idx = name.find_last_not_of("\n{");
+ name = name.substr(0, idx + 1);
+ if (at_start && name == "all") {
+ return "";
+ }
+ return name;
+ });
+}
+
+static void common_chat_parse_functionary_v3_1_llama_3_1(common_chat_msg_parser & builder) {
+ if (!builder.syntax().parse_tool_calls) {
+ builder.add_content(builder.consume_rest());
+ return;
+ }
+ // This version of Functionary still supports the llama 3.1 tool call format for the python tool.
+ static const common_regex python_tag_regex(regex_escape("<|python_tag|>"));
+
+    static const common_regex function_regex(R"(<function=(\w+)>)");
+    static const common_regex close_regex(R"(</function>)");
+
+ parse_json_tool_calls(
+ builder,
+ /* block_open= */ std::nullopt,
+ /* function_regex_start_only= */ std::nullopt,
+ function_regex,
+ close_regex,
+ std::nullopt);
+
+ if (auto res = builder.try_find_regex(python_tag_regex)) {
+ auto arguments = wrap_code_as_arguments(builder, builder.consume_rest());
+ builder.add_tool_call("python", "", arguments);
+ return;
+ }
+}
+
+static void common_chat_parse_hermes_2_pro(common_chat_msg_parser & builder) {
+    builder.try_parse_reasoning("<think>", "</think>");
+ if (!builder.syntax().parse_tool_calls) {
+ builder.add_content(builder.consume_rest());
+ return;
+ }
+
+ static const common_regex open_regex(
+ "(?:"
+ "(```(?:xml|json)?\\n\\s*)?" // match 1 (block_start)
+ "(" // match 2 (open_tag)
+                "<tool_call>"
+                "|<function_call>"
+                "|<tool>"
+                "|<tools>"
+                "|<response>"
+                "|<json>"
+                "|<xml>"
+                "|<JSON>"
+ ")?"
+ "(\\s*\\{\\s*\"name\")" // match 3 (named tool call)
+ ")"
+        "|<function=([^>]+)>" // match 4 (function name)
+        "|<function name=\"([^\"]+)\">" // match 5 (function name again)
+ );
+
+ while (auto res = builder.try_find_regex(open_regex)) {
+ const auto & block_start = res->groups[1];
+ std::string block_end = block_start.empty() ? "" : "```";
+
+ const auto & open_tag = res->groups[2];
+ std::string close_tag;
+
+ if (!res->groups[3].empty()) {
+ builder.move_to(res->groups[3].begin);
+            close_tag = open_tag.empty() ? "" : "</" + builder.str(open_tag).substr(1);
+
+ if (auto tool_call = builder.try_consume_json_with_dumped_args({{"arguments"}})) {
+ if (!builder.add_tool_call(tool_call->value) || tool_call->is_partial) {
+ throw common_chat_msg_partial_exception("incomplete tool call");
+ }
+ builder.consume_spaces();
+ builder.consume_literal(close_tag);
+ builder.consume_spaces();
+ if (!block_end.empty()) {
+ builder.consume_literal(block_end);
+ builder.consume_spaces();
+ }
+ } else {
+ throw common_chat_msg_partial_exception("failed to parse tool call");
+ }
+ } else {
+ auto function_name = builder.str(res->groups[4]);
+ if (function_name.empty()) {
+ function_name = builder.str(res->groups[5]);
+ }
+ GGML_ASSERT(!function_name.empty());
+
+            close_tag = "</function>";
+
+ if (auto arguments = builder.try_consume_json_with_dumped_args({{}})) {
+ if (!builder.add_tool_call(function_name, "", arguments->value) || arguments->is_partial) {
+ throw common_chat_msg_partial_exception("incomplete tool call");
+ }
+ builder.consume_spaces();
+ builder.consume_literal(close_tag);
+ builder.consume_spaces();
+ if (!block_end.empty()) {
+ builder.consume_literal(block_end);
+ builder.consume_spaces();
+ }
+ }
+ }
+ }
+
+ builder.add_content(builder.consume_rest());
+}
+
+static void common_chat_parse_granite(common_chat_msg_parser & builder) {
+ // Parse thinking tags
+    static const common_regex start_think_regex(regex_escape("<think>"));
+    static const common_regex end_think_regex(regex_escape("</think>"));
+    // Granite models stream partial tokens such as "<" and "<think", so locate the complete
+    // tags first, then restore the position before handing off to try_parse_reasoning().
+    if (auto res = builder.try_find_regex(start_think_regex, std::string::npos, false)) {
+        builder.move_to(res->groups[0].begin);
+        builder.try_find_regex(end_think_regex, std::string::npos, false);
+        // Restore position for try_parse_reasoning()
+        builder.move_to(res->groups[0].begin);
+    }
+    builder.try_parse_reasoning("<think>", "</think>");
+
+ // Parse response tags
+    static const common_regex start_response_regex(regex_escape("<response>"));
+    static const common_regex end_response_regex(regex_escape("</response>"));
+
+    // Look for tool calls
+    static const common_regex tool_call_regex(regex_escape("<|tool_call|>"));
+ if (auto res = builder.try_find_regex(tool_call_regex)) {
+ builder.move_to(res->groups[0].end);
+
+ // Expect JSON array of tool calls
+ if (auto tool_call = builder.try_consume_json_with_dumped_args({{{"arguments"}}})) {
+ if (!builder.add_tool_calls(tool_call->value) || tool_call->is_partial) {
+ throw common_chat_msg_partial_exception("incomplete tool call");
+ }
+ }
+ } else {
+ builder.add_content(builder.consume_rest());
+ }
+}
+
+static void common_chat_parse_nemotron_v2(common_chat_msg_parser & builder) {
+ // Parse thinking tags
+    builder.try_parse_reasoning("<think>", "</think>");
+ if (!builder.syntax().parse_tool_calls) {
+ builder.add_content(builder.consume_rest());
+ return;
+ }
+
+ // Look for tool calls
+    static const common_regex tool_call_regex(regex_escape("<TOOLCALL>"));
+ if (auto res = builder.try_find_regex(tool_call_regex)) {
+ builder.move_to(res->groups[0].end);
+
+ // Expect JSON array of tool calls
+ auto tool_calls_data = builder.consume_json();
+ if (tool_calls_data.json.is_array()) {
+            if (!builder.try_consume_literal("</TOOLCALL>")) {
+ throw common_chat_msg_partial_exception("Incomplete tool call");
+ }
+ builder.add_tool_calls(tool_calls_data.json);
+ } else {
+ throw common_chat_msg_partial_exception("Incomplete tool call");
+ }
+ }
+ builder.add_content(builder.consume_rest());
+}
+
+static void common_chat_parse_apertus(common_chat_msg_parser & builder) {
+ // Parse thinking tags
+ builder.try_parse_reasoning("<|inner_prefix|>", "<|inner_suffix|>");
+ if (!builder.syntax().parse_tool_calls) {
+ builder.add_content(builder.consume_rest());
+ return;
+ }
+
+ // Look for tool calls
+ static const common_regex tool_call_regex(regex_escape("<|tools_prefix|>"));
+ if (auto res = builder.try_find_regex(tool_call_regex)) {
+ builder.move_to(res->groups[0].end);
+
+ auto tool_calls_data = builder.consume_json();
+ if (tool_calls_data.json.is_array()) {
+ builder.consume_spaces();
+ if (!builder.try_consume_literal("<|tools_suffix|>")) {
+ throw common_chat_msg_partial_exception("Incomplete tool call");
+ }
+ for (const auto & value : tool_calls_data.json) {
+ if (value.is_object()) {
+ builder.add_tool_call_short_form(value);
+ }
+ }
+ } else {
+ throw common_chat_msg_partial_exception("Incomplete tool call");
+ }
+ }
+ builder.add_content(builder.consume_rest());
+}
+
+
+static void common_chat_parse_lfm2(common_chat_msg_parser & builder) {
+ if (!builder.syntax().parse_tool_calls) {
+ builder.add_content(builder.consume_rest());
+ return;
+ }
+
+ // LFM2 format: <|tool_call_start|>[{"name": "get_current_time", "arguments": {"location": "Paris"}}]<|tool_call_end|>
+ static const common_regex tool_call_start_regex(regex_escape("<|tool_call_start|>"));
+ static const common_regex tool_call_end_regex(regex_escape("<|tool_call_end|>"));
+
+ // Loop through all tool calls
+ while (auto res = builder.try_find_regex(tool_call_start_regex, std::string::npos, /* add_prelude_to_content= */ true)) {
+ builder.move_to(res->groups[0].end);
+
+ // Parse JSON array format: [{"name": "...", "arguments": {...}}]
+ auto tool_calls_data = builder.consume_json();
+
+ // Consume end marker
+ builder.consume_spaces();
+ if (!builder.try_consume_regex(tool_call_end_regex)) {
+ throw common_chat_msg_partial_exception("Expected <|tool_call_end|>");
+ }
+
+ // Process each tool call in the array
+ if (tool_calls_data.json.is_array()) {
+ for (const auto & tool_call : tool_calls_data.json) {
+ if (!tool_call.is_object()) {
+ throw common_chat_msg_partial_exception("Tool call must be an object");
+ }
+
+ if (!tool_call.contains("name")) {
+ throw common_chat_msg_partial_exception("Tool call missing 'name' field");
+ }
+
+ std::string function_name = tool_call.at("name");
+ std::string arguments = "{}";
+
+ if (tool_call.contains("arguments")) {
+ if (tool_call.at("arguments").is_object()) {
+ arguments = tool_call.at("arguments").dump();
+ } else if (tool_call.at("arguments").is_string()) {
+ arguments = tool_call.at("arguments");
+ }
+ }
+
+ if (!builder.add_tool_call(function_name, "", arguments)) {
+ throw common_chat_msg_partial_exception("Incomplete tool call");
+ }
+ }
+ } else {
+ throw common_chat_msg_partial_exception("Expected JSON array for tool calls");
+ }
+
+ // Consume any trailing whitespace after this tool call
+ builder.consume_spaces();
+ }
+
+ // Consume any remaining content after all tool calls
+ auto remaining = builder.consume_rest();
+ if (!string_strip(remaining).empty()) {
+ builder.add_content(remaining);
+ }
+}
+
+static void common_chat_parse_seed_oss(common_chat_msg_parser & builder) {
+ static const xml_tool_call_format form {
+ /* form.scope_start = */ "",
+ /* form.tool_start = */ "",
+ /* form.key_start = */ "",
+ /* form.val_end = */ "",
+ /* form.tool_end = */ "",
+ /* form.scope_end = */ "",
+ };
+    builder.consume_reasoning_with_xml_tool_calls(form, "<seed:think>", "</seed:think>");
+}
+
+static void common_chat_parse_solar_open(common_chat_msg_parser & builder) {
+ builder.try_parse_reasoning("<|think|>", "<|end|><|begin|>assistant<|content|>");
+
+ // TODO: Tool calling
+
+ builder.add_content(builder.consume_rest());
+}
+
+static void common_chat_parse_content_only(common_chat_msg_parser & builder) {
+    builder.try_parse_reasoning("<think>", "</think>");
+ builder.add_content(builder.consume_rest());
+}
+
+static void common_chat_parse(common_chat_msg_parser & builder) {
+ LOG_DBG("Parsing input with format %s: %s\n", common_chat_format_name(builder.syntax().format), builder.input().c_str());
+
+ switch (builder.syntax().format) {
+ case COMMON_CHAT_FORMAT_CONTENT_ONLY:
+ common_chat_parse_content_only(builder);
+ break;
+ case COMMON_CHAT_FORMAT_GENERIC:
+ common_chat_parse_generic(builder);
+ break;
+ case COMMON_CHAT_FORMAT_MISTRAL_NEMO:
+ common_chat_parse_mistral_nemo(builder);
+ break;
+ case COMMON_CHAT_FORMAT_MAGISTRAL:
+ common_chat_parse_magistral(builder);
+ break;
+ case COMMON_CHAT_FORMAT_LLAMA_3_X:
+ common_chat_parse_llama_3_1(builder);
+ break;
+ case COMMON_CHAT_FORMAT_LLAMA_3_X_WITH_BUILTIN_TOOLS:
+ common_chat_parse_llama_3_1(builder, /* with_builtin_tools= */ true);
+ break;
+ case COMMON_CHAT_FORMAT_DEEPSEEK_R1:
+ common_chat_parse_deepseek_r1(builder);
+ break;
+ case COMMON_CHAT_FORMAT_DEEPSEEK_V3_1:
+ common_chat_parse_deepseek_v3_1(builder);
+ break;
+ case COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2:
+ common_chat_parse_functionary_v3_2(builder);
+ break;
+ case COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1:
+ common_chat_parse_functionary_v3_1_llama_3_1(builder);
+ break;
+ case COMMON_CHAT_FORMAT_HERMES_2_PRO:
+ common_chat_parse_hermes_2_pro(builder);
+ break;
+ case COMMON_CHAT_FORMAT_FIREFUNCTION_V2:
+ common_chat_parse_firefunction_v2(builder);
+ break;
+ case COMMON_CHAT_FORMAT_COMMAND_R7B:
+ common_chat_parse_command_r7b(builder);
+ break;
+ case COMMON_CHAT_FORMAT_GRANITE:
+ common_chat_parse_granite(builder);
+ break;
+ case COMMON_CHAT_FORMAT_GPT_OSS:
+ common_chat_parse_gpt_oss(builder);
+ break;
+ case COMMON_CHAT_FORMAT_SEED_OSS:
+ common_chat_parse_seed_oss(builder);
+ break;
+ case COMMON_CHAT_FORMAT_NEMOTRON_V2:
+ common_chat_parse_nemotron_v2(builder);
+ break;
+ case COMMON_CHAT_FORMAT_APERTUS:
+ common_chat_parse_apertus(builder);
+ break;
+ case COMMON_CHAT_FORMAT_LFM2_WITH_JSON_TOOLS:
+ common_chat_parse_lfm2(builder);
+ break;
+ case COMMON_CHAT_FORMAT_MINIMAX_M2:
+ common_chat_parse_minimax_m2(builder);
+ break;
+ case COMMON_CHAT_FORMAT_GLM_4_5:
+ common_chat_parse_glm_4_5(builder);
+ break;
+ case COMMON_CHAT_FORMAT_KIMI_K2:
+ common_chat_parse_kimi_k2(builder);
+ break;
+ case COMMON_CHAT_FORMAT_QWEN3_CODER_XML:
+ common_chat_parse_qwen3_coder_xml(builder);
+ break;
+ case COMMON_CHAT_FORMAT_APRIEL_1_5:
+ common_chat_parse_apriel_1_5(builder);
+ break;
+ case COMMON_CHAT_FORMAT_XIAOMI_MIMO:
+ common_chat_parse_xiaomi_mimo(builder);
+ break;
+ case COMMON_CHAT_FORMAT_SOLAR_OPEN:
+ common_chat_parse_solar_open(builder);
+ break;
+ default:
+ throw std::runtime_error(std::string("Unsupported format: ") + common_chat_format_name(builder.syntax().format));
+ }
+ builder.finish();
+}
+
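+// Public entry point: parse a (possibly partial) assistant generation into a structured message.
+// Illustrative use, with names as declared in this file:
+//   common_chat_syntax syntax;
+//   syntax.format = COMMON_CHAT_FORMAT_HERMES_2_PRO;
+//   common_chat_msg msg = common_chat_parse(text_so_far, /* is_partial= */ true, syntax);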
+common_chat_msg common_chat_parse(const std::string & input, bool is_partial, const common_chat_syntax & syntax) {
+ if (syntax.format == COMMON_CHAT_FORMAT_PEG_SIMPLE ||
+ syntax.format == COMMON_CHAT_FORMAT_PEG_NATIVE ||
+ syntax.format == COMMON_CHAT_FORMAT_PEG_CONSTRUCTED) {
+ return common_chat_peg_parse(syntax.parser, input, is_partial, syntax);
+ }
+ common_chat_msg_parser builder(input, is_partial, syntax);
+ try {
+ common_chat_parse(builder);
+ } catch (const common_chat_msg_partial_exception & ex) {
+ LOG_DBG("Partial parse: %s\n", ex.what());
+ if (!is_partial) {
+ builder.clear_tools();
+ builder.move_to(0);
+ common_chat_parse_content_only(builder);
+ }
+ }
+ auto msg = builder.result();
+ if (!is_partial) {
+ LOG_DBG("Parsed message: %s\n", common_chat_msgs_to_json_oaicompat({msg}).at(0).dump().c_str());
+ }
+ return msg;
+}
+
+common_chat_msg common_chat_peg_parse(const common_peg_arena & parser, const std::string & input, bool is_partial, const common_chat_syntax & syntax) {
+ if (parser.empty()) {
+ throw std::runtime_error("Failed to parse due to missing parser definition.");
+ }
+
+ LOG_DBG("Parsing input with format %s: %s\n", common_chat_format_name(syntax.format), input.c_str());
+
+ common_peg_parse_context ctx(input, is_partial);
+ auto result = parser.parse(ctx);
+ if (result.fail()) {
+ throw std::runtime_error(std::string("Failed to parse input at pos ") + std::to_string(result.end));
+ }
+
+ common_chat_msg msg;
+ msg.role = "assistant";
+
+ if (syntax.format == COMMON_CHAT_FORMAT_PEG_NATIVE) {
+ auto mapper = common_chat_peg_native_mapper(msg);
+ mapper.from_ast(ctx.ast, result);
+ } else if (syntax.format == COMMON_CHAT_FORMAT_PEG_CONSTRUCTED) {
+ auto mapper = common_chat_peg_constructed_mapper(msg);
+ mapper.from_ast(ctx.ast, result);
+ } else {
+ // Generic mapper
+ auto mapper = common_chat_peg_mapper(msg);
+ mapper.from_ast(ctx.ast, result);
+ }
+ if (!is_partial) {
+ LOG_DBG("Parsed message: %s\n", common_chat_msgs_to_json_oaicompat({msg}).at(0).dump().c_str());
+ }
+ return msg;
+}
diff --git a/patches/llama-cpp-sys-2/llama.cpp/common/chat-parser.h b/patches/llama-cpp-sys-2/llama.cpp/common/chat-parser.h
new file mode 100644
index 0000000..78c4b74
--- /dev/null
+++ b/patches/llama-cpp-sys-2/llama.cpp/common/chat-parser.h
@@ -0,0 +1,133 @@
+#pragma once
+
+#include "chat.h"
+#include "chat-parser-xml-toolcall.h"
+#include "json-partial.h"
+#include "regex-partial.h"
+
+#include <nlohmann/json.hpp>
+
+#include <optional>
+#include <string>
+#include <vector>
+
+class common_chat_msg_partial_exception : public std::runtime_error {
+ public:
+ common_chat_msg_partial_exception(const std::string & message) : std::runtime_error(message) {}
+};
+
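+// Incremental parser over a (possibly partial) model response. The consume_*/try_* helpers
+// advance pos_ through the input and accumulate content, reasoning and tool calls into result_.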
+class common_chat_msg_parser {
+ std::string input_;
+ bool is_partial_;
+ common_chat_syntax syntax_;
+ std::string healing_marker_;
+
+ size_t pos_ = 0;
+ common_chat_msg result_;
+
+ public:
+ common_chat_msg_parser(const std::string & input, bool is_partial, const common_chat_syntax & syntax);
+ const std::string & input() const { return input_; }
+ size_t pos() const { return pos_; }
+ const std::string & healing_marker() const { return healing_marker_; }
+ const bool & is_partial() const { return is_partial_; }
+ const common_chat_msg & result() const { return result_; }
+ const common_chat_syntax & syntax() const { return syntax_; }
+
+ void move_to(size_t pos) {
+ if (pos > input_.size()) {
+ throw std::runtime_error("Invalid position!");
+ }
+ pos_ = pos;
+ }
+ void move_back(size_t n) {
+ if (pos_ < n) {
+ throw std::runtime_error("Can't move back that far!");
+ }
+ pos_ -= n;
+ }
+
+ // Get the substring of the input at the given range
+ std::string str(const common_string_range & rng) const;
+
+ // Appends to the result.content field
+ void add_content(const std::string & content);
+
+ // Appends to the result.reasoning_content field
+ void add_reasoning_content(const std::string & reasoning_content);
+
+ // Adds a tool call to the result. If the tool call is too incomplete (e.g. name empty), it won't add anything.
+ bool add_tool_call(const std::string & name, const std::string & id, const std::string & arguments);
+
+ // Adds a tool call using the "name", "id" and "arguments" fields of the json object
+ bool add_tool_call(const nlohmann::ordered_json & tool_call);
+
+ // Adds an array of tool calls using their "name", "id" and "arguments" fields.
+ bool add_tool_calls(const nlohmann::ordered_json & arr);
+
+ // Adds a tool call using the short form: { "tool_name": { "arg1": val, "arg2": val } }
+ bool add_tool_call_short_form(const nlohmann::ordered_json & tool_call);
+
+ void finish();
+
+ bool consume_spaces();
+
+ void consume_literal(const std::string & literal);
+
+ bool try_parse_reasoning(const std::string & start_think, const std::string & end_think);
+
+ std::string consume_rest();
+
+ struct find_regex_result {
+ std::string prelude;
+ std::vector<common_string_range> groups;
+ };
+
+ std::optional<find_regex_result> try_find_regex(const common_regex & regex, size_t from = std::string::npos, bool add_prelude_to_content = true);
+
+ bool try_consume_literal(const std::string & literal);
+
+ std::optional<find_regex_result> try_find_literal(const std::string & literal);
+
+ find_regex_result consume_regex(const common_regex & regex);
+
+ std::optional<find_regex_result> try_consume_regex(const common_regex & regex);
+
+ std::optional<common_json> try_consume_json();
+ common_json consume_json();
+
+ struct consume_json_result {
+ nlohmann::ordered_json value;
+ bool is_partial;
+ };
+
+ /*
+ Consumes (possibly partial) JSON and converts specific subtrees to (possibly truncated) JSON strings.
+
+ By default, object keys can't be truncated, nor can string values (their corresponding key is removed),
+ e.g. `{"foo": "bar", "baz": "b` -> `{"foo": "bar"}`.
+
+ But subpaths can be allowed to stay truncated, and possibly be JSON-dumped to truncated JSON strings:
+ - with `content_paths={{"foo"}}`: `{"foo": "b` -> `{"foo": "b"}`
+ - with `args_paths={{"foo"}}`: `{"foo": {"b` -> `{"foo": "{b"}`
+ */
+ consume_json_result consume_json_with_dumped_args(
+ const std::vector<std::vector<std::string>> & args_paths = {},
+ const std::vector<std::vector<std::string>> & content_paths = {}
+ );
+ std::optional<consume_json_result> try_consume_json_with_dumped_args(
+ const std::vector<std::vector<std::string>> & args_paths = {},
+ const std::vector<std::vector<std::string>> & content_paths = {}
+ );
+
+ /**
+ * Parse XML-Style tool call for given xml_tool_call_format. Return false for invalid syntax and get the position untouched.
+ * form.scope_start, form.tool_sep and form.scope_end can be empty.
+ */
+ bool try_consume_xml_tool_calls(const struct xml_tool_call_format & form);
+
+ // Parse content that uses reasoning and XML-style tool calls
+ void consume_reasoning_with_xml_tool_calls(const struct xml_tool_call_format & form, const std::string & start_think = "", const std::string & end_think = "");
+
+ void clear_tools();
+};
diff --git a/patches/llama-cpp-sys-2/llama.cpp/common/chat-peg-parser.cpp b/patches/llama-cpp-sys-2/llama.cpp/common/chat-peg-parser.cpp
new file mode 100644
index 0000000..1bcba9c
--- /dev/null
+++ b/patches/llama-cpp-sys-2/llama.cpp/common/chat-peg-parser.cpp
@@ -0,0 +1,124 @@
+#include "chat-peg-parser.h"
+
+#include <cctype>
+#include <nlohmann/json.hpp>
+
+using json = nlohmann::json;
+
+static std::string_view trim_trailing_space(std::string_view sv, int max = -1) {
+ int count = 0;
+ while (!sv.empty() && std::isspace(static_cast<unsigned char>(sv.back()))) {
+ if (max != -1 && count <= max) {
+ break;
+ }
+ sv.remove_suffix(1);
+ count++;
+ }
+ return sv;
+}
+
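+// Visit every tagged node of the parse result and let map() translate it into message fields.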
+void common_chat_peg_mapper::from_ast(const common_peg_ast_arena & arena, const common_peg_parse_result & result) {
+ arena.visit(result, [this](const common_peg_ast_node & node) {
+ map(node);
+ });
+}
+
+void common_chat_peg_mapper::map(const common_peg_ast_node & node) {
+ bool is_reasoning = node.tag == common_chat_peg_builder::REASONING;
+ bool is_content = node.tag == common_chat_peg_builder::CONTENT;
+
+ if (is_reasoning) {
+ result.reasoning_content = std::string(trim_trailing_space(node.text));
+ }
+
+ if (is_content) {
+ result.content = std::string(trim_trailing_space(node.text));
+ }
+}
+
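+// Native mapper: the model emits tool name/id/arguments verbatim, so each tagged node is
+// copied directly into the tool call opened by the most recent TOOL_OPEN node.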
+void common_chat_peg_native_mapper::map(const common_peg_ast_node & node) {
+ common_chat_peg_mapper::map(node);
+
+ bool is_tool_open = node.tag == common_chat_peg_native_builder::TOOL_OPEN;
+ bool is_tool_name = node.tag == common_chat_peg_native_builder::TOOL_NAME;
+ bool is_tool_id = node.tag == common_chat_peg_native_builder::TOOL_ID;
+ bool is_tool_args = node.tag == common_chat_peg_native_builder::TOOL_ARGS;
+
+ if (is_tool_open) {
+ result.tool_calls.emplace_back();
+ current_tool = &result.tool_calls.back();
+ }
+
+ if (is_tool_id && current_tool) {
+ current_tool->id = std::string(trim_trailing_space(node.text));
+ }
+
+ if (is_tool_name && current_tool) {
+ current_tool->name = std::string(trim_trailing_space(node.text));
+ }
+
+ if (is_tool_args && current_tool) {
+ current_tool->arguments = std::string(trim_trailing_space(node.text));
+ }
+}
+
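+// Constructed mapper: tool calls arrive as separate name/argument fragments, so the JSON
+// arguments object is assembled incrementally as the tagged nodes are visited.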
+void common_chat_peg_constructed_mapper::map(const common_peg_ast_node & node) {
+ common_chat_peg_mapper::map(node);
+
+ bool is_tool_open = node.tag == common_chat_peg_constructed_builder::TOOL_OPEN;
+ bool is_tool_name = node.tag == common_chat_peg_constructed_builder::TOOL_NAME;
+ bool is_tool_close = node.tag == common_chat_peg_constructed_builder::TOOL_CLOSE;
+ bool is_arg_open = node.tag == common_chat_peg_constructed_builder::TOOL_ARG_OPEN;
+ bool is_arg_close = node.tag == common_chat_peg_constructed_builder::TOOL_ARG_CLOSE;
+ bool is_arg_name = node.tag == common_chat_peg_constructed_builder::TOOL_ARG_NAME;
+ bool is_arg_string = node.tag == common_chat_peg_constructed_builder::TOOL_ARG_STRING_VALUE;
+ bool is_arg_json = node.tag == common_chat_peg_constructed_builder::TOOL_ARG_JSON_VALUE;
+
+ if (is_tool_open) {
+ result.tool_calls.emplace_back();
+ current_tool = &result.tool_calls.back();
+ arg_count = 0;
+ }
+
+ if (is_tool_name) {
+ current_tool->name = std::string(node.text);
+ current_tool->arguments = "{";
+ }
+
+ if (is_arg_open) {
+ needs_closing_quote = false;
+ }
+
+ if (is_arg_name && current_tool) {
+ if (arg_count > 0) {
+ current_tool->arguments += ",";
+ }
+ current_tool->arguments += json(trim_trailing_space(node.text)).dump() + ":";
+ ++arg_count;
+ }
+
+ if (is_arg_string && current_tool) {
+ // Serialize to JSON, but exclude the end quote
+ std::string dumped = json(trim_trailing_space(node.text)).dump();
+ current_tool->arguments += dumped.substr(0, dumped.size() - 1);
+ needs_closing_quote = true;
+ }
+
+ if (is_arg_close && current_tool) {
+ if (needs_closing_quote) {
+ current_tool->arguments += "\"";
+ needs_closing_quote = false;
+ }
+ }
+
+ if (is_arg_json && current_tool) {
+ current_tool->arguments += std::string(trim_trailing_space(node.text));
+ }
+
+ if (is_tool_close && current_tool) {
+ if (needs_closing_quote) {
+ current_tool->arguments += "\"";
+ needs_closing_quote = false;
+ }
+ current_tool->arguments += "}";
+ }
+}
diff --git a/patches/llama-cpp-sys-2/llama.cpp/common/chat-peg-parser.h b/patches/llama-cpp-sys-2/llama.cpp/common/chat-peg-parser.h
new file mode 100644
index 0000000..b84cbed
--- /dev/null
+++ b/patches/llama-cpp-sys-2/llama.cpp/common/chat-peg-parser.h
@@ -0,0 +1,105 @@
+#pragma once
+
+#include "chat.h"
+#include "peg-parser.h"
+
+class common_chat_peg_builder : public common_peg_parser_builder {
+ public:
+ static constexpr const char * REASONING_BLOCK = "reasoning-block";
+ static constexpr const char * REASONING = "reasoning";
+ static constexpr const char * CONTENT = "content";
+
+ common_peg_parser reasoning_block(const common_peg_parser & p) { return tag(REASONING_BLOCK, p); }
+ common_peg_parser reasoning(const common_peg_parser & p) { return tag(REASONING, p); }
+ common_peg_parser content(const common_peg_parser & p) { return tag(CONTENT, p); }
+};
+
+inline common_peg_arena build_chat_peg_parser(const std::function<common_peg_parser(common_chat_peg_builder &)> & fn) {
+ common_chat_peg_builder builder;
+ builder.set_root(fn(builder));
+ return builder.build();
+}
+
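+// Base mapper: copies REASONING/CONTENT nodes of the AST into a common_chat_msg.
+// Derived mappers add handling for tool-call tags.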
+class common_chat_peg_mapper {
+ public:
+ common_chat_msg & result;
+
+ common_chat_peg_mapper(common_chat_msg & msg) : result(msg) {}
+
+ virtual void from_ast(const common_peg_ast_arena & arena, const common_peg_parse_result & result);
+ virtual void map(const common_peg_ast_node & node);
+};
+
+class common_chat_peg_native_builder : public common_chat_peg_builder {
+ public:
+ static constexpr const char * TOOL = "tool";
+ static constexpr const char * TOOL_OPEN = "tool-open";
+ static constexpr const char * TOOL_CLOSE = "tool-close";
+ static constexpr const char * TOOL_ID = "tool-id";
+ static constexpr const char * TOOL_NAME = "tool-name";
+ static constexpr const char * TOOL_ARGS = "tool-args";
+
+ common_peg_parser tool(const common_peg_parser & p) { return tag(TOOL, p); }
+ common_peg_parser tool_open(const common_peg_parser & p) { return atomic(tag(TOOL_OPEN, p)); }
+ common_peg_parser tool_close(const common_peg_parser & p) { return atomic(tag(TOOL_CLOSE, p)); }
+ common_peg_parser tool_id(const common_peg_parser & p) { return atomic(tag(TOOL_ID, p)); }
+ common_peg_parser tool_name(const common_peg_parser & p) { return atomic(tag(TOOL_NAME, p)); }
+ common_peg_parser tool_args(const common_peg_parser & p) { return tag(TOOL_ARGS, p); }
+};
+
+class common_chat_peg_native_mapper : public common_chat_peg_mapper {
+ common_chat_tool_call * current_tool = nullptr;
+
+ public:
+ common_chat_peg_native_mapper(common_chat_msg & msg) : common_chat_peg_mapper(msg) {}
+
+ void map(const common_peg_ast_node & node) override;
+};
+
+inline common_peg_arena build_chat_peg_native_parser(const std::function<common_peg_parser(common_chat_peg_native_builder &)> & fn) {
+ common_chat_peg_native_builder builder;
+ builder.set_root(fn(builder));
+ return builder.build();
+}
+
+class common_chat_peg_constructed_builder : public common_chat_peg_builder {
+ public:
+ static constexpr const char * TOOL = "tool";
+ static constexpr const char * TOOL_OPEN = "tool-open";
+ static constexpr const char * TOOL_CLOSE = "tool-close";
+ static constexpr const char * TOOL_NAME = "tool-name";
+ static constexpr const char * TOOL_ARG = "tool-arg";
+ static constexpr const char * TOOL_ARG_OPEN = "tool-arg-open";
+ static constexpr const char * TOOL_ARG_CLOSE = "tool-arg-close";
+ static constexpr const char * TOOL_ARG_NAME = "tool-arg-name";
+ static constexpr const char * TOOL_ARG_STRING_VALUE = "tool-arg-string-value";
+ static constexpr const char * TOOL_ARG_JSON_VALUE = "tool-arg-json-value";
+
+ common_peg_parser tool(const common_peg_parser & p) { return tag(TOOL, p); }
+ common_peg_parser tool_open(const common_peg_parser & p) { return atomic(tag(TOOL_OPEN, p)); }
+ common_peg_parser tool_close(const common_peg_parser & p) { return atomic(tag(TOOL_CLOSE, p)); }
+ common_peg_parser tool_name(const common_peg_parser & p) { return atomic(tag(TOOL_NAME, p)); }
+ common_peg_parser tool_arg(const common_peg_parser & p) { return tag(TOOL_ARG, p); }
+ common_peg_parser tool_arg_open(const common_peg_parser & p) { return atomic(tag(TOOL_ARG_OPEN, p)); }
+ common_peg_parser tool_arg_close(const common_peg_parser & p) { return atomic(tag(TOOL_ARG_CLOSE, p)); }
+ common_peg_parser tool_arg_name(const common_peg_parser & p) { return atomic(tag(TOOL_ARG_NAME, p)); }
+ common_peg_parser tool_arg_string_value(const common_peg_parser & p) { return tag(TOOL_ARG_STRING_VALUE, p); }
+ common_peg_parser tool_arg_json_value(const common_peg_parser & p) { return tag(TOOL_ARG_JSON_VALUE, p); }
+};
+
+class common_chat_peg_constructed_mapper : public common_chat_peg_mapper {
+ common_chat_tool_call * current_tool = nullptr;
+ int arg_count = 0;
+ bool needs_closing_quote = false;
+
+ public:
+ common_chat_peg_constructed_mapper(common_chat_msg & msg) : common_chat_peg_mapper(msg) {}
+
+ void map(const common_peg_ast_node & node) override;
+};
+
+inline common_peg_arena build_chat_peg_constructed_parser(const std::function<common_peg_parser(common_chat_peg_constructed_builder &)> & fn) {
+ common_chat_peg_constructed_builder builder;
+ builder.set_root(fn(builder));
+ return builder.build();
+}
diff --git a/patches/llama-cpp-sys-2/llama.cpp/common/chat.cpp b/patches/llama-cpp-sys-2/llama.cpp/common/chat.cpp
new file mode 100644
index 0000000..22e527b
--- /dev/null
+++ b/patches/llama-cpp-sys-2/llama.cpp/common/chat.cpp
@@ -0,0 +1,2899 @@
+#include "chat.h"
+#include "chat-parser.h"
+#include "chat-peg-parser.h"
+#include "common.h"
+#include "json-partial.h"
+#include "json-schema-to-grammar.h"
+#include "log.h"
+#include "regex-partial.h"
+
+#include <minja/chat-template.hpp>
+#include <minja/minja.hpp>
+
+#include <algorithm>
+#include <cctype>
+#include <chrono>
+#include <cstring>
+#include <ctime>
+#include <iomanip>
+#include <map>
+#include <optional>
+#include <set>
+#include <sstream>
+#include <stdexcept>
+#include <string>
+#include <vector>
+
+using json = nlohmann::ordered_json;
+
+static std::string format_time(const std::chrono::system_clock::time_point & now, const std::string & format) {
+ auto time = std::chrono::system_clock::to_time_t(now);
+ auto local_time = *std::localtime(&time);
+ std::ostringstream ss;
+ ss << std::put_time(&local_time, format.c_str());
+ auto res = ss.str();
+ return res;
+}
+
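+// Returns the suffix that `current` appends to `last` (used to stream deltas between parses).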
+static std::string string_diff(const std::string & last, const std::string & current) {
+ if (last.empty()) {
+ return current;
+ }
+ if (!string_starts_with(current, last)) {
+ if (string_starts_with(last, current)) {
+ // This happens if the last generation ended on a partial stop word (not erased),
+ // and the current ended on a stop word (erased).
+ return "";
+ }
+ throw std::runtime_error("Invalid diff: '" + last + "' not found at start of '" + current + "'");
+ }
+ return current.substr(last.size());
+}
+
+static bool has_content_or_tool_calls(const common_chat_msg & msg) {
+ return !msg.content.empty() || !msg.tool_calls.empty();
+}
+
+template <>
+json common_chat_msg::to_json_oaicompat() const
+{
+ json message {
+ {"role", "assistant"},
+ };
+ if (!reasoning_content.empty()) {
+ message["reasoning_content"] = reasoning_content;
+ }
+ if (content.empty() && !tool_calls.empty()) {
+ message["content"] = json();
+ } else {
+ message["content"] = content;
+ }
+ if (!tool_calls.empty()) {
+ auto arr = json::array();
+ for (const auto & tc : tool_calls) {
+ arr.push_back({
+ {"type", "function"},
+ {"function", {
+ {"name", tc.name},
+ {"arguments", tc.arguments},
+ }},
+ {"id", tc.id},
+ // // Some templates generate and require an id (sometimes in a very specific format, e.g. Mistral Nemo).
+ // // We only generate a random id for the ones that don't generate one by themselves
+ // // (they also won't get to see it as their template likely doesn't use it, so it's all for the client)
+ // {"id", tc.id.empty() ? gen_tool_call_id() : tc.id},
+ });
+ }
+ message["tool_calls"] = arr;
+ }
+ return message;
+}
+
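+// Compute the streaming deltas between two successive parses of the same generation:
+// reasoning/content deltas plus per-tool-call argument deltas.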
+std::vector<common_chat_msg_diff> common_chat_msg_diff::compute_diffs(const common_chat_msg & msg_prv, const common_chat_msg & msg_new) {
+ std::vector<common_chat_msg_diff> diffs;
+ if (msg_new.tool_calls.size() > msg_prv.tool_calls.size()) {
+ diffs.reserve(msg_new.tool_calls.size() - msg_prv.tool_calls.size() + 3);
+ } else {
+ diffs.reserve(3);
+ }
+
+ // TODO: these can become expensive for long messages - how to optimize?
+ if (msg_prv.reasoning_content != msg_new.reasoning_content) {
+ auto & diff = diffs.emplace_back();
+ diff.reasoning_content_delta = string_diff(msg_prv.reasoning_content, msg_new.reasoning_content);
+ }
+ if (msg_prv.content != msg_new.content) {
+ auto & diff = diffs.emplace_back();
+ diff.content_delta = string_diff(msg_prv.content, msg_new.content);
+ }
+
+ if (msg_new.tool_calls.size() < msg_prv.tool_calls.size()) {
+ throw std::runtime_error("Invalid diff: now finding less tool calls!");
+ }
+
+ if (!msg_prv.tool_calls.empty()) {
+ const auto idx = msg_prv.tool_calls.size() - 1;
+ const auto & pref = msg_prv.tool_calls[idx];
+ const auto & newf = msg_new.tool_calls[idx];
+ if (pref.name != newf.name) {
+ throw std::runtime_error("Invalid diff: tool call mismatch!");
+ }
+ const auto args_diff = string_diff(pref.arguments, newf.arguments);
+ if (!args_diff.empty() || pref.id != newf.id) {
+ auto & diff = diffs.emplace_back();
+ diff.tool_call_index = idx;
+ if (pref.id != newf.id) {
+ diff.tool_call_delta.id = newf.id;
+ diff.tool_call_delta.name = newf.name;
+ }
+ diff.tool_call_delta.arguments = args_diff;
+ }
+ }
+ for (size_t idx = msg_prv.tool_calls.size(); idx < msg_new.tool_calls.size(); ++idx) {
+ auto & diff = diffs.emplace_back();
+ diff.tool_call_index = idx;
+ diff.tool_call_delta = msg_new.tool_calls[idx];
+ }
+
+ return diffs;
+}
+
+typedef minja::chat_template common_chat_template;
+
+struct common_chat_templates {
+ bool add_bos;
+ bool add_eos;
+ bool has_explicit_template; // Model had a builtin template, or a template override was specified.
+ std::unique_ptr<common_chat_template> template_default; // always set (defaults to chatml)
+ std::unique_ptr<common_chat_template> template_tool_use;
+};
+
+struct templates_params {
+ json messages;
+ json tools;
+ common_chat_tool_choice tool_choice;
+ json json_schema;
+ bool parallel_tool_calls;
+ common_reasoning_format reasoning_format;
+ bool stream;
+ std::string grammar;
+ bool add_generation_prompt = true;
+ bool enable_thinking = true;
+ std::chrono::system_clock::time_point now = std::chrono::system_clock::now();
+ json extra_context;
+ bool add_bos;
+ bool add_eos;
+ bool is_inference = true;
+};
+
+common_chat_tool_choice common_chat_tool_choice_parse_oaicompat(const std::string & tool_choice) {
+ if (tool_choice == "auto") {
+ return COMMON_CHAT_TOOL_CHOICE_AUTO;
+ }
+ if (tool_choice == "none") {
+ return COMMON_CHAT_TOOL_CHOICE_NONE;
+ }
+ if (tool_choice == "required") {
+ return COMMON_CHAT_TOOL_CHOICE_REQUIRED;
+ }
+ throw std::invalid_argument("Invalid tool_choice: " + tool_choice);
+}
+
+bool common_chat_templates_support_enable_thinking(const common_chat_templates * chat_templates) {
+ common_chat_templates_inputs dummy_inputs;
+ common_chat_msg msg;
+ msg.role = "user";
+ msg.content = "test";
+ dummy_inputs.messages = {msg};
+ dummy_inputs.enable_thinking = false;
+ const auto rendered_no_thinking = common_chat_templates_apply(chat_templates, dummy_inputs);
+ dummy_inputs.enable_thinking = true;
+ const auto rendered_with_thinking = common_chat_templates_apply(chat_templates, dummy_inputs);
+ return rendered_no_thinking.prompt != rendered_with_thinking.prompt;
+}
+
+template <>
+std::vector<common_chat_msg> common_chat_msgs_parse_oaicompat(const json & messages) {
+ std::vector<common_chat_msg> msgs;
+
+ try {
+
+ if (!messages.is_array()) {
+ throw std::invalid_argument("Expected 'messages' to be an array, got " + messages.dump());
+ }
+
+ for (const auto & message : messages) {
+ if (!message.is_object()) {
+ throw std::invalid_argument("Expected 'message' to be an object, got " + message.dump());
+ }
+
+ common_chat_msg msg;
+ if (!message.contains("role")) {
+ throw std::invalid_argument("Missing 'role' in message: " + message.dump());
+ }
+ msg.role = message.at("role");
+
+ auto has_content = message.contains("content");
+ auto has_tool_calls = message.contains("tool_calls");
+ if (has_content) {
+ const auto & content = message.at("content");
+ if (content.is_string()) {
+ msg.content = content;
+ } else if (content.is_array()) {
+ for (const auto & part : content) {
+ if (!part.contains("type")) {
+ throw std::invalid_argument("Missing content part type: " + part.dump());
+ }
+ const auto & type = part.at("type");
+ if (type != "text") {
+ throw std::invalid_argument("Unsupported content part type: " + type.dump());
+ }
+ common_chat_msg_content_part msg_part;
+ msg_part.type = type;
+ msg_part.text = part.at("text");
+ msg.content_parts.push_back(msg_part);
+ }
+ } else if (!content.is_null()) {
+ throw std::invalid_argument("Invalid 'content' type: expected string or array, got " + content.dump() + " (ref: https://github.com/ggml-org/llama.cpp/issues/8367)");
+ }
+ }
+ if (has_tool_calls) {
+ for (const auto & tool_call : message.at("tool_calls")) {
+ common_chat_tool_call tc;
+ if (!tool_call.contains("type")) {
+ throw std::invalid_argument("Missing tool call type: " + tool_call.dump());
+ }
+ const auto & type = tool_call.at("type");
+ if (type != "function") {
+ throw std::invalid_argument("Unsupported tool call type: " + tool_call.dump());
+ }
+ if (!tool_call.contains("function")) {
+ throw std::invalid_argument("Missing tool call function: " + tool_call.dump());
+ }
+ const auto & fc = tool_call.at("function");
+ if (!fc.contains("name")) {
+ throw std::invalid_argument("Missing tool call name: " + tool_call.dump());
+ }
+ tc.name = fc.at("name");
+ tc.arguments = fc.at("arguments");
+ if (tool_call.contains("id")) {
+ tc.id = tool_call.at("id");
+ }
+ msg.tool_calls.push_back(tc);
+ }
+ }
+ if (!has_content && !has_tool_calls) {
+ throw std::invalid_argument("Expected 'content' or 'tool_calls' (ref: https://github.com/ggml-org/llama.cpp/issues/8367 & https://github.com/ggml-org/llama.cpp/issues/12279)");
+ }
+ if (message.contains("reasoning_content")) {
+ msg.reasoning_content = message.at("reasoning_content");
+ }
+ if (message.contains("name")) {
+ msg.tool_name = message.at("name");
+ }
+ if (message.contains("tool_call_id")) {
+ msg.tool_call_id = message.at("tool_call_id");
+ }
+
+ msgs.push_back(msg);
+ }
+ } catch (const std::exception & e) {
+ // @ngxson : disable otherwise it's bloating the API response
+ // printf("%s\n", std::string("; messages = ") + messages.dump(2));
+ throw std::runtime_error("Failed to parse messages: " + std::string(e.what()));
+ }
+
+ return msgs;
+}
+
+template <>
+json common_chat_msgs_to_json_oaicompat(const std::vector<common_chat_msg> & msgs, bool concat_typed_text) {
+ json messages = json::array();
+ for (const auto & msg : msgs) {
+ if (!msg.content.empty() && !msg.content_parts.empty()) {
+ throw std::runtime_error("Cannot specify both content and content_parts");
+ }
+ json jmsg {
+ {"role", msg.role},
+ };
+ if (!msg.content.empty()) {
+ jmsg["content"] = msg.content;
+ } else if (!msg.content_parts.empty()) {
+ if (concat_typed_text) {
+ std::string text;
+ for (const auto & part : msg.content_parts) {
+ if (part.type != "text") {
+ LOG_WRN("Ignoring content part type: %s\n", part.type.c_str());
+ continue;
+ }
+ if (!text.empty()) {
+ text += '\n';
+ }
+ text += part.text;
+ }
+ jmsg["content"] = text;
+ } else {
+ auto & parts = jmsg["content"] = json::array();
+ for (const auto & part : msg.content_parts) {
+ parts.push_back({
+ {"type", part.type},
+ {"text", part.text},
+ });
+ }
+ }
+ } else {
+ jmsg["content"] = "";
+ }
+ if (!msg.reasoning_content.empty()) {
+ jmsg["reasoning_content"] = msg.reasoning_content;
+ }
+ if (!msg.tool_name.empty()) {
+ jmsg["name"] = msg.tool_name;
+ }
+ if (!msg.tool_call_id.empty()) {
+ jmsg["tool_call_id"] = msg.tool_call_id;
+ }
+ if (!msg.tool_calls.empty()) {
+ auto & tool_calls = jmsg["tool_calls"] = json::array();
+ for (const auto & tool_call : msg.tool_calls) {
+ json tc {
+ {"type", "function"},
+ {"function", {
+ {"name", tool_call.name},
+ {"arguments", tool_call.arguments},
+ }},
+ };
+ if (!tool_call.id.empty()) {
+ tc["id"] = tool_call.id;
+ }
+ tool_calls.push_back(tc);
+ }
+ }
+ messages.push_back(jmsg);
+ }
+ return messages;
+}
+
+template <>
+std::vector<common_chat_msg> common_chat_msgs_parse_oaicompat(const std::string & messages) {
+ return common_chat_msgs_parse_oaicompat(json::parse(messages));
+}
+
+template <>
+std::vector<common_chat_tool> common_chat_tools_parse_oaicompat(const json & tools) {
+ std::vector<common_chat_tool> result;
+
+ try {
+ if (!tools.is_null()) {
+ if (!tools.is_array()) {
+ throw std::invalid_argument("Expected 'tools' to be an array, got " + tools.dump());
+ }
+ for (const auto & tool : tools) {
+ if (!tool.contains("type")) {
+ throw std::invalid_argument("Missing tool type: " + tool.dump());
+ }
+ const auto & type = tool.at("type");
+ if (!type.is_string() || type != "function") {
+ throw std::invalid_argument("Unsupported tool type: " + tool.dump());
+ }
+ if (!tool.contains("function")) {
+ throw std::invalid_argument("Missing tool function: " + tool.dump());
+ }
+
+ const auto & function = tool.at("function");
+ result.push_back({
+ /* .name = */ function.at("name"),
+ /* .description = */ function.value("description", ""),
+ /* .parameters = */ function.value("parameters", json::object()).dump(),
+ });
+ }
+ }
+ } catch (const std::exception & e) {
+ throw std::runtime_error("Failed to parse tools: " + std::string(e.what()) + "; tools = " + tools.dump(2));
+ }
+
+ return result;
+}
+
+template <>
+std::vector<common_chat_tool> common_chat_tools_parse_oaicompat(const std::string & tools) {
+ return common_chat_tools_parse_oaicompat(json::parse(tools));
+}
+
+template <>
+json common_chat_tools_to_json_oaicompat(const std::vector<common_chat_tool> & tools) {
+ if (tools.empty()) {
+ return json();
+ }
+
+ auto result = json::array();
+ for (const auto & tool : tools) {
+ result.push_back({
+ {"type", "function"},
+ {"function", {
+ {"name", tool.name},
+ {"description", tool.description},
+ {"parameters", json::parse(tool.parameters)},
+ }},
+ });
+ }
+ return result;
+}
+
+template <> json common_chat_msg_diff_to_json_oaicompat(const common_chat_msg_diff & diff) {
+ json delta = json::object();
+ if (!diff.reasoning_content_delta.empty()) {
+ delta["reasoning_content"] = diff.reasoning_content_delta;
+ }
+ if (!diff.content_delta.empty()) {
+ delta["content"] = diff.content_delta;
+ }
+ if (diff.tool_call_index != std::string::npos) {
+ json tool_call;
+ tool_call["index"] = diff.tool_call_index;
+ if (!diff.tool_call_delta.id.empty()) {
+ tool_call["id"] = diff.tool_call_delta.id;
+ tool_call["type"] = "function";
+ }
+ json function = json::object();
+ if (!diff.tool_call_delta.name.empty()) {
+ function["name"] = diff.tool_call_delta.name;
+ }
+ function["arguments"] = diff.tool_call_delta.arguments;
+ tool_call["function"] = function;
+ delta["tool_calls"] = json::array({tool_call});
+ }
+ return delta;
+}
+
+bool common_chat_verify_template(const std::string & tmpl, bool use_jinja) {
+ if (use_jinja) {
+ try {
+ common_chat_msg msg;
+ msg.role = "user";
+ msg.content = "test";
+
+ auto tmpls = common_chat_templates_init(/* model= */ nullptr, tmpl);
+
+ common_chat_templates_inputs inputs;
+ inputs.messages = {msg};
+
+ common_chat_templates_apply(tmpls.get(), inputs);
+ return true;
+ } catch (const std::exception & e) {
+ LOG_ERR("%s: failed to apply template: %s\n", __func__, e.what());
+ return false;
+ }
+ }
+ llama_chat_message chat[] = {{"user", "test"}};
+ const int res = llama_chat_apply_template(tmpl.c_str(), chat, 1, true, nullptr, 0);
+ return res >= 0;
+}
+
+std::string common_chat_format_single(
+ const struct common_chat_templates * tmpls,
+ const std::vector<common_chat_msg> & past_msg,
+ const common_chat_msg & new_msg,
+ bool add_ass,
+ bool use_jinja) {
+
+ common_chat_templates_inputs inputs;
+ inputs.use_jinja = use_jinja;
+ inputs.add_bos = tmpls->add_bos;
+ inputs.add_eos = tmpls->add_eos;
+
+ std::string fmt_past_msg;
+ if (!past_msg.empty()) {
+ inputs.messages = past_msg;
+ inputs.add_generation_prompt = false;
+ fmt_past_msg = common_chat_templates_apply(tmpls, inputs).prompt;
+ }
+ std::ostringstream ss;
+ // if the past_msg ends with a newline, we must preserve it in the formatted version
+ if (add_ass && !fmt_past_msg.empty() && fmt_past_msg.back() == '\n') {
+ ss << "\n";
+ };
+ // format chat with new_msg
+ inputs.messages.push_back(new_msg);
+ inputs.add_generation_prompt = add_ass;
+ auto fmt_new_msg = common_chat_templates_apply(tmpls, inputs).prompt;
+ // get the diff part
+ ss << fmt_new_msg.substr(fmt_past_msg.size(), fmt_new_msg.size() - fmt_past_msg.size());
+ return ss.str();
+}
+
+std::string common_chat_format_example(const struct common_chat_templates * tmpls, bool use_jinja, const std::map<std::string, std::string> & chat_template_kwargs) {
+ common_chat_templates_inputs inputs;
+ inputs.use_jinja = use_jinja;
+ inputs.add_bos = tmpls->add_bos;
+ inputs.add_eos = tmpls->add_eos;
+ inputs.chat_template_kwargs = chat_template_kwargs;
+ auto add_simple_msg = [&](auto role, auto content) {
+ common_chat_msg msg;
+ msg.role = role;
+ msg.content = content;
+ inputs.messages.push_back(msg);
+ };
+ add_simple_msg("system", "You are a helpful assistant");
+ add_simple_msg("user", "Hello");
+ add_simple_msg("assistant", "Hi there");
+ add_simple_msg("user", "How are you?");
+ return common_chat_templates_apply(tmpls, inputs).prompt;
+}
+
+#define CHATML_TEMPLATE_SRC \
+ "{%- for message in messages -%}\n" \
+ " {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>\n' -}}\n" \
+ "{%- endfor -%}\n" \
+ "{%- if add_generation_prompt -%}\n" \
+ " {{- '<|im_start|>assistant\n' -}}\n" \
+ "{%- endif -%}"
+
+void common_chat_templates_free(struct common_chat_templates * tmpls) {
+ delete tmpls;
+}
+
+bool common_chat_templates_was_explicit(const struct common_chat_templates * tmpls) {
+ return tmpls->has_explicit_template;
+}
+
+const char * common_chat_templates_source(const struct common_chat_templates * tmpls, const char * variant) {
+ if (variant != nullptr) {
+ if (strcmp(variant, "tool_use") == 0) {
+ if (tmpls->template_tool_use) {
+ return tmpls->template_tool_use->source().c_str();
+ }
+ return nullptr;
+ } else {
+ LOG_DBG("%s: unknown template variant: %s\n", __func__, variant);
+ }
+ }
+ return tmpls->template_default->source().c_str();
+}
+
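+// Build the template set from the model (or an explicit override): picks the default and
+// optional "tool_use" templates, patches known-problematic template constructs, and resolves
+// BOS/EOS tokens from the vocabulary.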
+common_chat_templates_ptr common_chat_templates_init(
+ const struct llama_model * model,
+ const std::string & chat_template_override,
+ const std::string & bos_token_override,
+ const std::string & eos_token_override)
+{
+ std::string default_template_src;
+ std::string template_tool_use_src;
+
+ bool has_explicit_template = !chat_template_override.empty();
+ if (chat_template_override.empty()) {
+ GGML_ASSERT(model != nullptr);
+ const auto * str = llama_model_chat_template(model, /* name */ nullptr);
+ if (str) {
+ default_template_src = str;
+ has_explicit_template = true;
+ }
+ str = llama_model_chat_template(model, /* name */ "tool_use");
+ if (str) {
+ template_tool_use_src = str;
+ has_explicit_template = true;
+ }
+ } else {
+ default_template_src = chat_template_override;
+ }
+ if (default_template_src.empty() || default_template_src == "chatml") {
+ if (!template_tool_use_src.empty()) {
+ default_template_src = template_tool_use_src;
+ } else {
+ default_template_src = CHATML_TEMPLATE_SRC;
+ }
+ }
+
+ // TODO @ngxson : this is a temporary hack to prevent chat template from throwing an error
+ // Ref: https://github.com/ggml-org/llama.cpp/pull/15230#issuecomment-3173959633
+ if (default_template_src.find("<|channel|>") != std::string::npos
+ // search for the error message and patch it
+ && default_template_src.find("in message.content or") != std::string::npos) {
+ string_replace_all(default_template_src,
+ "{%- if \"<|channel|>analysis<|message|>\" in message.content or \"<|channel|>final<|message|>\" in message.content %}",
+ "{%- if false %}");
+ }
+
+ // TODO @aldehir : this is a temporary fix, pending Minja changes
+ // Ref: https://github.com/ggml-org/llama.cpp/pull/17713#issuecomment-3631342664
+ if (default_template_src.find("[TOOL_CALLS]") != std::string::npos
+ // search for the error message and patch it
+ && default_template_src.find("if (message['content'] is none or") != std::string::npos) {
+ string_replace_all(default_template_src,
+ "{%- if (message['content'] is none or message['content'] == '' or message['content']|length == 0) and (message['tool_calls'] is not defined or message['tool_calls'] is none or message['tool_calls']|length == 0) %}",
+ "{%- if false %}");
+ }
+
+ std::string token_bos = bos_token_override;
+ std::string token_eos = eos_token_override;
+ bool add_bos = false;
+ bool add_eos = false;
+ if (model) {
+ const auto * vocab = llama_model_get_vocab(model);
+ const auto get_token = [&](llama_token token, const char * name, const char * jinja_variable_name) {
+ if (token == LLAMA_TOKEN_NULL) {
+ if (default_template_src.find(jinja_variable_name) != std::string::npos
+ || template_tool_use_src.find(jinja_variable_name) != std::string::npos) {
+ LOG_WRN("common_chat_templates_init: warning: vocab does not have a %s token, jinja template won't work as intended.\n", name);
+ }
+ return std::string();
+ }
+ return common_token_to_piece(vocab, token, true);
+ };
+ token_bos = get_token(llama_vocab_bos(vocab), "BOS", "bos_token");
+ token_eos = get_token(llama_vocab_eos(vocab), "EOS", "eos_token");
+ add_bos = llama_vocab_get_add_bos(vocab);
+ add_eos = llama_vocab_get_add_eos(vocab);
+ }
+ common_chat_templates_ptr tmpls(new common_chat_templates());
+ tmpls->has_explicit_template = has_explicit_template;
+ tmpls->add_bos = add_bos;
+ tmpls->add_eos = add_eos;
+ try {
+ tmpls->template_default = std::make_unique<minja::chat_template>(default_template_src, token_bos, token_eos);
+ } catch (const std::exception & e) {
+ LOG_ERR("%s: failed to parse chat template (defaulting to chatml): %s \n", __func__, e.what());
+ tmpls->template_default = std::make_unique<minja::chat_template>(CHATML_TEMPLATE_SRC, token_bos, token_eos);
+ }
+ if (!template_tool_use_src.empty()) {
+ try {
+ tmpls->template_tool_use = std::make_unique<minja::chat_template>(template_tool_use_src, token_bos, token_eos);
+ } catch (const std::exception & e) {
+ LOG_ERR("%s: failed to parse tool use chat template (ignoring it): %s\n", __func__, e.what());
+ }
+ }
+ return tmpls;
+}
+
+const char * common_chat_format_name(common_chat_format format) {
+ switch (format) {
+ case COMMON_CHAT_FORMAT_CONTENT_ONLY: return "Content-only";
+ case COMMON_CHAT_FORMAT_GENERIC: return "Generic";
+ case COMMON_CHAT_FORMAT_MISTRAL_NEMO: return "Mistral Nemo";
+ case COMMON_CHAT_FORMAT_MAGISTRAL: return "Magistral";
+ case COMMON_CHAT_FORMAT_LLAMA_3_X: return "Llama 3.x";
+ case COMMON_CHAT_FORMAT_LLAMA_3_X_WITH_BUILTIN_TOOLS: return "Llama 3.x with builtin tools";
+ case COMMON_CHAT_FORMAT_DEEPSEEK_R1: return "DeepSeek R1";
+ case COMMON_CHAT_FORMAT_FIREFUNCTION_V2: return "FireFunction v2";
+ case COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2: return "Functionary v3.2";
+ case COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1: return "Functionary v3.1 Llama 3.1";
+ case COMMON_CHAT_FORMAT_DEEPSEEK_V3_1: return "DeepSeek V3.1";
+ case COMMON_CHAT_FORMAT_HERMES_2_PRO: return "Hermes 2 Pro";
+ case COMMON_CHAT_FORMAT_COMMAND_R7B: return "Command R7B";
+ case COMMON_CHAT_FORMAT_GRANITE: return "Granite";
+ case COMMON_CHAT_FORMAT_GPT_OSS: return "GPT-OSS";
+ case COMMON_CHAT_FORMAT_SEED_OSS: return "Seed-OSS";
+ case COMMON_CHAT_FORMAT_NEMOTRON_V2: return "Nemotron V2";
+ case COMMON_CHAT_FORMAT_APERTUS: return "Apertus";
+ case COMMON_CHAT_FORMAT_LFM2_WITH_JSON_TOOLS: return "LFM2 with JSON tools";
+ case COMMON_CHAT_FORMAT_MINIMAX_M2: return "MiniMax-M2";
+ case COMMON_CHAT_FORMAT_GLM_4_5: return "GLM 4.5";
+ case COMMON_CHAT_FORMAT_KIMI_K2: return "Kimi K2";
+ case COMMON_CHAT_FORMAT_QWEN3_CODER_XML: return "Qwen3 Coder";
+ case COMMON_CHAT_FORMAT_APRIEL_1_5: return "Apriel 1.5";
+ case COMMON_CHAT_FORMAT_XIAOMI_MIMO: return "Xiaomi MiMo";
+ case COMMON_CHAT_FORMAT_SOLAR_OPEN: return "Solar Open";
+ case COMMON_CHAT_FORMAT_PEG_SIMPLE: return "peg-simple";
+ case COMMON_CHAT_FORMAT_PEG_NATIVE: return "peg-native";
+ case COMMON_CHAT_FORMAT_PEG_CONSTRUCTED: return "peg-constructed";
+ default:
+ throw std::runtime_error("Unknown chat format");
+ }
+}
+
+const char * common_reasoning_format_name(common_reasoning_format format) {
+ switch (format) {
+ case COMMON_REASONING_FORMAT_NONE: return "none";
+ case COMMON_REASONING_FORMAT_AUTO: return "auto";
+ case COMMON_REASONING_FORMAT_DEEPSEEK: return "deepseek";
+ case COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY: return "deepseek-legacy";
+ default:
+ throw std::runtime_error("Unknown reasoning format");
+ }
+}
+
+common_reasoning_format common_reasoning_format_from_name(const std::string & format) {
+ if (format == "none") {
+ return COMMON_REASONING_FORMAT_NONE;
+ } else if (format == "auto") {
+ return COMMON_REASONING_FORMAT_AUTO;
+ } else if (format == "deepseek") {
+ return COMMON_REASONING_FORMAT_DEEPSEEK;
+ } else if (format == "deepseek-legacy") {
+ return COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY;
+ }
+ throw std::runtime_error("Unknown reasoning format: " + format);
+}
+
+static void foreach_function(const json & tools, const std::function<void(const json &)> & fn) {
+ for (const auto & tool : tools) {
+ if (!tool.contains("type") || tool.at("type") != "function" || !tool.contains("function")) {
+ LOG_INF("Skipping tool without function: %s", tool.dump(2).c_str());
+ continue;
+ }
+ fn(tool);
+ }
+}
+
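+// Invoke fn(name, schema, is_required) for every property declared in a tool's JSON-schema parameters.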
+static void foreach_parameter(const json & function, const std::function<void(const std::string &, const json &, bool)> & fn) {
+ if (!function.contains("parameters") || !function.at("parameters").is_object()) {
+ return;
+ }
+ const auto & params = function.at("parameters");
+ if (!params.contains("properties") || !params.at("properties").is_object()) {
+ return;
+ }
+ const auto & props = params.at("properties");
+ std::set<std::string> required;
+ if (params.contains("required") && params.at("required").is_array()) {
+ params.at("required").get_to(required);
+ }
+ for (const auto & [name, prop] : props.items()) {
+ bool is_required = (required.find(name) != required.end());
+ fn(name, prop, is_required);
+ }
+}
+
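+// Render the chat template with optional message/tool overrides and extra context, then strip
+// a leading BOS / trailing EOS token that would otherwise be duplicated during tokenization.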
+static std::string apply(
+ const common_chat_template & tmpl,
+ const struct templates_params & inputs,
+ const std::optional<json> & messages_override = std::nullopt,
+ const std::optional<json> & tools_override = std::nullopt,
+ const std::optional<json> & additional_context = std::nullopt)
+{
+ minja::chat_template_inputs tmpl_inputs;
+ tmpl_inputs.messages = messages_override ? *messages_override : inputs.messages;
+ if (tools_override) {
+ tmpl_inputs.tools = *tools_override;
+ } else {
+ tmpl_inputs.tools = inputs.tools.empty() ? json() : inputs.tools;
+ }
+ tmpl_inputs.add_generation_prompt = inputs.add_generation_prompt;
+ tmpl_inputs.extra_context = inputs.extra_context;
+ tmpl_inputs.extra_context["enable_thinking"] = inputs.enable_thinking;
+ if (additional_context) {
+ tmpl_inputs.extra_context.merge_patch(*additional_context);
+ }
+ // TODO: add flag to control date/time, if only for testing purposes.
+ // tmpl_inputs.now = std::chrono::system_clock::now();
+
+ minja::chat_template_options tmpl_opts;
+ // To avoid double BOS / EOS tokens, we're manually removing beginning / trailing tokens
+ // instead of using `chat_template_options.use_bos_token = false`, since these tokens
+ // may be needed inside the template / between messages too.
+ auto result = tmpl.apply(tmpl_inputs, tmpl_opts);
+ if (inputs.add_bos && string_starts_with(result, tmpl.bos_token())) {
+ result = result.substr(tmpl.bos_token().size());
+ }
+ if (inputs.add_eos && string_ends_with(result, tmpl.eos_token())) {
+ result = result.substr(0, result.size() - tmpl.eos_token().size());
+ }
+ return result;
+}
+
+static common_chat_params common_chat_params_init_generic(const common_chat_template & tmpl, const struct templates_params & inputs) {
+ common_chat_params data;
+
+ auto tool_call_schemas = json::array();
+ foreach_function(inputs.tools, [&](const json & tool) {
+ const auto & function = tool.at("function");
+ auto tool_schema = json {
+ {"type", "object"},
+ {"properties", {
+ {"name", {
+ {"type", "string"},
+ {"const", function.at("name")},
+ }},
+ {"arguments", function.at("parameters")},
+ }},
+ {"required", json::array({"name", "arguments"})},
+ };
+ if (function.contains("description")) {
+ tool_schema["description"] = function.at("description");
+ }
+ if (inputs.parallel_tool_calls) {
+ tool_schema.at("properties")["id"] = {
+ {"type", "string"},
+ {"minLength", 4},
+ };
+ tool_schema.at("required").push_back("id");
+ }
+ tool_call_schemas.emplace_back(tool_schema);
+ });
+ const auto tool_call =
+ inputs.parallel_tool_calls
+ ? json {
+ {"type", "object"},
+ {"properties", {
+ {"tool_calls", {
+ {"type", "array"},
+ {"items", tool_call_schemas.size() == 1 ? tool_call_schemas[0] : json {
+ {"anyOf", tool_call_schemas},
+ }},
+ {"minItems", 1},
+ }},
+ }},
+ {"required", json::array({"tool_calls"})},
+ }
+ : json {
+ {"type", "object"},
+ {"properties", {
+ {"tool_call", tool_call_schemas.size() == 1 ? tool_call_schemas[0] : json {
+ {"anyOf", tool_call_schemas},
+ }},
+ }},
+ {"required", json::array({"tool_call"})},
+ };
+ const auto schema =
+ inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED
+ ? json {
+ {"anyOf", json::array({
+ tool_call,
+ {
+ {"type", "object"},
+ {"properties", {
+ {"response", inputs.json_schema.is_null()
+ ? json {{"type", "string"}}
+ : inputs.json_schema
+ },
+ }},
+ {"required", json::array({"response"})},
+ },
+ })}
+ }
+ : tool_call;
+
+ data.grammar_lazy = false;
+ data.grammar = build_grammar([&](const common_grammar_builder & builder) {
+ builder.add_schema("root", schema);
+ });
+
+ auto tweaked_messages = common_chat_template::add_system(
+ inputs.messages,
+ "Respond in JSON format, either with `tool_call` (a request to call tools) or with `response` reply to the user's request");
+
+ data.prompt = apply(tmpl, inputs, /* messages_override= */ tweaked_messages);
+ data.format = COMMON_CHAT_FORMAT_GENERIC;
+ return data;
+}
+
+static common_chat_params common_chat_params_init_mistral_nemo(const common_chat_template & tmpl, const struct templates_params & inputs) {
+ common_chat_params data;
+ data.grammar_lazy = inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED;
+ data.grammar = build_grammar([&](const common_grammar_builder & builder) {
+ auto schemas = json::array();
+ foreach_function(inputs.tools, [&](const json & tool) {
+ const auto & function = tool.at("function");
+ schemas.push_back({
+ {"type", "object"},
+ {"properties", {
+ // Important note: the model is probably trained to take a JSON stringified arguments value.
+ // It's hard to constrain that for now (while reusing the JSON schema conversion), so we're just expecting a plain object.
+ {"name", {
+ {"type", "string"},
+ {"const", function.at("name")},
+ }},
+ {"arguments", function.at("parameters")},
+ {"id", {
+ {"type", "string"},
+ // Nemo's template expects a 9-character alphanumeric ID.
+ {"pattern", "^[a-zA-Z0-9]{9}$"},
+ }},
+ }},
+ {"required", json::array({"name", "arguments", "id"})},
+ });
+ });
+ auto schema = json {
+ {"type", "array"},
+ {"items", schemas.size() == 1 ? schemas[0] : json {{"anyOf", schemas}}},
+ {"minItems", 1},
+ };
+ if (!inputs.parallel_tool_calls) {
+ schema["maxItems"] = 1;
+ }
+ builder.add_rule("root", "\"[TOOL_CALLS]\" " + builder.add_schema("tool_calls", schema));
+ });
+ data.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, "[TOOL_CALLS]"});
+ data.preserved_tokens = {
+ "[TOOL_CALLS]",
+ };
+ data.prompt = apply(tmpl, inputs);
+ data.format = COMMON_CHAT_FORMAT_MISTRAL_NEMO;
+ return data;
+}
+
+
+// Case-insensitive find
+static size_t ifind_string(const std::string & haystack, const std::string & needle, size_t pos = 0) {
+ auto it = std::search(
+ haystack.begin() + pos, haystack.end(),
+ needle.begin(), needle.end(),
+ [](char a, char b) { return std::tolower(a) == std::tolower(b); }
+ );
+ return (it == haystack.end()) ? std::string::npos : std::distance(haystack.begin(), it);
+}
+
+static common_chat_params common_chat_params_init_lfm2(const common_chat_template & tmpl, const struct templates_params & inputs) {
+ common_chat_params data;
+ const auto is_json_schema_provided = !inputs.json_schema.is_null();
+ const auto is_grammar_provided = !inputs.grammar.empty();
+ const auto are_tools_provided = inputs.tools.is_array() && !inputs.tools.empty();
+
+ // the logic requires potentially modifying the messages
+ auto tweaked_messages = inputs.messages;
+
+ auto replace_json_schema_marker = [](json & messages) -> bool {
+ static std::string marker1 = "force json schema.\n";
+ static std::string marker2 = "force json schema.";
+
+ if (messages.empty() || messages.at(0).at("role") != "system") {
+ return false;
+ }
+
+ std::string content = messages.at(0).at("content");
+
+ for (const auto & marker : {marker1, marker2}) {
+ const auto pos = ifind_string(content, marker);
+ if (pos != std::string::npos) {
+ content.replace(pos, marker.length(), "");
+ // inject modified content back into the messages
+ messages.at(0).at("content") = content;
+ return true;
+ }
+ }
+
+ return false;
+ };
+
+ // Lfm2 model does not natively work with json, but can generally understand the tools structure
+ //
+ // Example of the pytorch dialog structure:
+ // <|startoftext|><|im_start|>system
+ // List of tools: <|tool_list_start|>[{"name": "get_candidate_status", "description": "Retrieves the current status of a candidate in the recruitment process", "parameters": {"type": "object", "properties": {"candidate_id": {"type": "string", "description": "Unique identifier for the candidate"}}, "required": ["candidate_id"]}}]<|tool_list_end|><|im_end|>
+ // <|im_start|>user
+ // What is the current status of candidate ID 12345?<|im_end|>
+ // <|im_start|>assistant
+ // <|tool_call_start|>[get_candidate_status(candidate_id="12345")]<|tool_call_end|>Checking the current status of candidate ID 12345.<|im_end|>
+ // <|im_start|>tool
+ // <|tool_response_start|>{"candidate_id": "12345", "status": "Interview Scheduled", "position": "Clinical Research Associate", "date": "2023-11-20"}<|tool_response_end|><|im_end|>
+ // <|im_start|>assistant
+ // The candidate with ID 12345 is currently in the "Interview Scheduled" stage for the position of Clinical Research Associate, with an interview date set for 2023-11-20.<|im_end|>
+ //
+ // For compatibility with the llama server's JSON tools semantics,
+ // the client can add a "Follow json schema." line to the system message prompt to force JSON output.
+ //
+ if (are_tools_provided && (is_json_schema_provided || is_grammar_provided)) {
+ // server/utils.hpp prohibits that branch for the custom grammar anyways
+ throw std::runtime_error("Tools call must not use \"json_schema\" or \"grammar\", use non-tool invocation if you want to use custom grammar");
+ } else if (are_tools_provided && replace_json_schema_marker(tweaked_messages)) {
+ LOG_INF("%s: Using tools to build a grammar\n", __func__);
+
+ data.grammar = build_grammar([&](const common_grammar_builder & builder) {
+ auto schemas = json::array();
+ foreach_function(inputs.tools, [&](const json & tool) {
+ const auto & function = tool.at("function");
+ schemas.push_back({
+ {"type", "object"},
+ {"properties", {
+ {"name", {
+ {"type", "string"},
+ {"const", function.at("name")},
+ }},
+ {"arguments", function.at("parameters")},
+ }},
+ {"required", json::array({"name", "arguments", "id"})},
+ });
+ });
+ auto schema = json {
+ {"type", "array"},
+ {"items", schemas.size() == 1 ? schemas[0] : json {{"anyOf", schemas}}},
+ {"minItems", 1},
+ };
+ if (!inputs.parallel_tool_calls) {
+ schema["maxItems"] = 1;
+ }
+
+ builder.add_rule("root", "\"<|tool_call_start|>\"" + builder.add_schema("tool_calls", schema) + "\"<|tool_call_end|>\"");
+ });
+ // model has no concept of tool selection mode choice,
+ // if the system prompt rendered correctly it will produce a tool call
+ // the grammar goes inside the tool call body
+ data.grammar_lazy = true;
+ data.grammar_triggers = {{COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_FULL, "\\s*<\\|tool_call_start\\|>\\s*\\["}};
+ data.preserved_tokens = {"<|tool_call_start|>", "<|tool_call_end|>"};
+ data.format = COMMON_CHAT_FORMAT_LFM2_WITH_JSON_TOOLS;
+ } else if (are_tools_provided && (!is_json_schema_provided && !is_grammar_provided)) {
+ LOG_INF("%s: Using tools without json schema or grammar\n", __func__);
+ // output those tokens
+ data.preserved_tokens = {"<|tool_call_start|>", "<|tool_call_end|>"};
+ } else if (is_json_schema_provided) {
+ LOG_INF("%s: Using provided json schema to build a grammar\n", __func__);
+ data.grammar = json_schema_to_grammar(inputs.json_schema);
+ } else if (is_grammar_provided) {
+ LOG_INF("%s: Using provided grammar\n", __func__);
+ data.grammar = inputs.grammar;
+ } else {
+ LOG_INF("%s: Using content relying on the template\n", __func__);
+ }
+
+ data.prompt = apply(tmpl, inputs, /* messages_override= */ tweaked_messages);
+ LOG_DBG("%s: Prompt: %s\n", __func__, data.prompt.c_str());
+
+ return data;
+}
+
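+// Ministral 3: messages are rewritten into typed content blocks (thinking/text), and a PEG
+// native parser is built that recovers reasoning, [TOOL_CALLS] tool calls, or a schema-constrained
+// JSON response, with a matching (lazy) grammar when tools are involved.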
+static common_chat_params common_chat_params_init_ministral_3(const common_chat_template & tmpl, const struct templates_params & inputs) {
+ common_chat_params data;
+
+ // Build up messages to follow the format: https://huggingface.co/mistralai/Ministral-3-14B-Reasoning-2512/blob/main/chat_template.jinja
+ auto adjusted_messages = json::array();
+ for (const auto & msg : inputs.messages) {
+ auto role = msg.value("role", "");
+ if (role != "system" && role != "assistant") {
+ // Only adjust system and assistant messages. Interestingly, the system message may contain thinking.
+ adjusted_messages.push_back(msg);
+ continue;
+ }
+
+ auto content = json::array();
+
+ // If message contains `reasoning_content`, add it as a block of type `thinking`
+ if (msg.contains("reasoning_content") && msg.at("reasoning_content").is_string()) {
+ content.push_back({
+ {"type", "thinking"},
+ {"thinking", msg.at("reasoning_content").get()},
+ });
+ }
+
+ // If message contains `content`, add it as a block of type `text`
+ if (msg.contains("content")) {
+ if (msg.at("content").is_string()) {
+ content.push_back({
+ {"type", "text"},
+ {"text", msg.at("content").get()},
+ });
+ } else if (msg.at("content").is_array()) {
+ auto blocks = msg.at("content");
+ content.insert(content.end(), blocks.begin(), blocks.end());
+ }
+ }
+
+ auto adjusted = msg;
+ adjusted["content"] = content;
+ adjusted.erase("reasoning_content");
+ adjusted_messages.push_back(adjusted);
+ }
+
+ auto has_tools = inputs.tools.is_array() && !inputs.tools.empty();
+ auto extract_reasoning = inputs.reasoning_format != COMMON_REASONING_FORMAT_NONE;
+ auto include_grammar = true;
+
+ data.prompt = apply(tmpl, inputs, /* messages_override = */ adjusted_messages);
+ data.format = COMMON_CHAT_FORMAT_PEG_NATIVE;
+ data.preserved_tokens = {
+ "[THINK]",
+ "[/THINK]",
+ "[TOOL_CALLS]",
+ "[ARGS]",
+ };
+
+ auto parser = build_chat_peg_native_parser([&](common_chat_peg_native_builder & p) {
+ auto reasoning = extract_reasoning ? p.optional("[THINK]" + p.reasoning(p.until("[/THINK]")) + "[/THINK]") : p.eps();
+
+ // Response format parser
+ if (inputs.json_schema.is_object() && !inputs.json_schema.empty()) {
+ // Ministral wants to emit json surrounded by code fences
+ return reasoning << "```json" << p.content(p.schema(p.json(), "response-format", inputs.json_schema)) << "```";
+ }
+
+ // Tool call parser
+ if (has_tools && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE) {
+ auto tool_choice = p.choice();
+ foreach_function(inputs.tools, [&](const json & tool) {
+ const auto & function = tool.at("function");
+ std::string name = function.at("name");
+ const auto & schema = function.at("parameters");
+
+ tool_choice |= p.rule("tool-" + name,
+ p.tool_open(p.tool_name(p.literal(name)) + "[ARGS]")
+ + p.tool_args(p.schema(p.json(), "tool-" + name + "-schema", schema))
+ );
+ });
+
+ auto min_calls = inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_REQUIRED ? 1 : 0;
+ auto max_calls = inputs.parallel_tool_calls ? -1 : 1;
+ auto tool_calls = p.trigger_rule("tool-call", p.repeat("[TOOL_CALLS]" + tool_choice, min_calls, max_calls));
+
+ return reasoning << p.content(p.until("[TOOL_CALLS]")) << tool_calls;
+ }
+
+ // Content only parser
+ include_grammar = false;
+ return reasoning << p.content(p.rest());
+ });
+
+ data.parser = parser.save();
+
+ if (include_grammar) {
+ data.grammar_lazy = has_tools && inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_AUTO;
+
+ data.grammar = build_grammar([&](const common_grammar_builder & builder) {
+ foreach_function(inputs.tools, [&](const json & tool) {
+ const auto & function = tool.at("function");
+ auto schema = function.at("parameters");
+ builder.resolve_refs(schema);
+ });
+ parser.build_grammar(builder, data.grammar_lazy);
+ });
+
+ data.grammar_triggers = {
+ {COMMON_GRAMMAR_TRIGGER_TYPE_WORD, "[TOOL_CALLS]"}
+ };
+ }
+
+ return data;
+}
+
+static common_chat_params common_chat_params_init_magistral(const common_chat_template & tmpl, const struct templates_params & inputs) {
+ common_chat_params data;
+ data.prompt = apply(tmpl, inputs);
+ data.format = COMMON_CHAT_FORMAT_MAGISTRAL;
+ data.preserved_tokens = {
+ "[THINK]",
+ "[/THINK]",
+ };
+
+ if (inputs.tools.is_array() && !inputs.tools.empty()) {
+ data.grammar_lazy = inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED;
+ data.grammar = build_grammar([&](const common_grammar_builder & builder) {
+ auto schemas = json::array();
+ foreach_function(inputs.tools, [&](const json & tool) {
+ const auto & function = tool.at("function");
+ schemas.push_back({
+ {"type", "object"},
+ {"properties", {
+ {"name", {
+ {"type", "string"},
+ {"const", function.at("name")},
+ }},
+ {"arguments", function.at("parameters")},
+ {"id", {
+ {"type", "string"},
+ {"pattern", "^[a-zA-Z0-9]{9}$"},
+ }},
+ }},
+ {"required", json::array({"name", "arguments", "id"})},
+ });
+ });
+ auto schema = json {
+ {"type", "array"},
+ {"items", schemas.size() == 1 ? schemas[0] : json {{"anyOf", schemas}}},
+ {"minItems", 1},
+ };
+ if (!inputs.parallel_tool_calls) {
+ schema["maxItems"] = 1;
+ }
+ builder.add_rule("root", "\"[TOOL_CALLS]\" " + builder.add_schema("tool_calls", schema));
+ });
+ data.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, "[TOOL_CALLS]"});
+ data.preserved_tokens.push_back("[TOOL_CALLS]");
+ } else {
+ data.grammar_lazy = false;
+ if (!inputs.json_schema.is_null()) {
+ if (!inputs.grammar.empty()) {
+ throw std::runtime_error("Either \"json_schema\" or \"grammar\" can be specified, but not both");
+ }
+ data.grammar = json_schema_to_grammar(inputs.json_schema);
+ } else {
+ data.grammar = inputs.grammar;
+ }
+ }
+
+ return data;
+}
+
+static common_chat_params common_chat_params_init_command_r7b(const common_chat_template & tmpl, const struct templates_params & inputs) {
+ common_chat_params data;
+
+ auto adjusted_messages = json::array();
+ for (const auto & msg : inputs.messages) {
+ auto has_reasoning_content = msg.contains("reasoning_content") && msg.at("reasoning_content").is_string();
+ auto has_tool_calls = msg.contains("tool_calls") && msg.at("tool_calls").is_array();
+ if (has_reasoning_content && has_tool_calls) {
+ auto adjusted_message = msg;
+ adjusted_message["tool_plan"] = msg.at("reasoning_content");
+ adjusted_message.erase("reasoning_content");
+ adjusted_messages.push_back(adjusted_message);
+ } else {
+ adjusted_messages.push_back(msg);
+ }
+ }
+ data.prompt = apply(tmpl, inputs, /* messages_override= */ adjusted_messages);
+ data.format = COMMON_CHAT_FORMAT_COMMAND_R7B;
+ if (string_ends_with(data.prompt, "<|START_THINKING|>")) {
+ if (!inputs.enable_thinking) {
+ data.prompt += "<|END_THINKING|>";
+ } else {
+ data.thinking_forced_open = true;
+ }
+ } else if (!inputs.enable_thinking && string_ends_with(data.prompt, "<|CHATBOT_TOKEN|>")) {
+ data.prompt += "<|START_THINKING|><|END_THINKING|>";
+ }
+
+ data.grammar_lazy = inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED;
+ data.grammar = build_grammar([&](const common_grammar_builder & builder) {
+ auto schemas = json::array();
+ foreach_function(inputs.tools, [&](const json & tool) {
+ const auto & function = tool.at("function");
+ schemas.push_back({
+ {"type", "object"},
+ {"properties", {
+ {"tool_call_id", {
+ {"type", "string"},
+ // Command-R's template expects an integer string.
+ {"pattern", "^[0-9]{1,10}$"},
+ }},
+ {"tool_name", {
+ {"type", "string"},
+ {"const", function.at("name")},
+ }},
+ {"parameters", function.at("parameters")},
+ }},
+ {"required", json::array({"tool_call_id", "tool_name", "parameters"})},
+ });
+ });
+ auto schema = json {
+ {"type", "array"},
+ {"items", schemas.size() == 1 ? schemas[0] : json {{"anyOf", schemas}}},
+ {"minItems", 1},
+ };
+ if (!inputs.parallel_tool_calls) {
+ schema["maxItems"] = 1;
+ }
+ builder.add_rule("root",
+ std::string(data.thinking_forced_open ? "( \"<|END_THINKING|>\" space )? " : "") +
+ "\"<|START_ACTION|>\" " + builder.add_schema("tool_calls", schema) + " \"<|END_ACTION|>\"");
+ });
+ data.grammar_triggers.push_back({
+ COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_FULL,
+ // If thinking_forced_open, then we capture the tag in the grammar,
+ // (important for required tool choice) and in the trigger's first capture (decides what is sent to the grammar)
+ std::string(data.thinking_forced_open ? "[\\s\\S]*?(<\\|END_THINKING\\|>\\s*)" : "(?:<\\|START_THINKING\\|>[\\s\\S]*?<\\|END_THINKING\\|>\\s*)?") +
+ "(<\\|START_ACTION\\|>)[\\s\\S]*"
+ });
+ data.preserved_tokens = {
+ "<|START_ACTION|>",
+ "<|END_ACTION|>",
+ "<|START_RESPONSE|>",
+ "<|END_RESPONSE|>",
+ "<|START_THINKING|>",
+ "<|END_THINKING|>",
+ };
+ return data;
+}
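+// Illustrative only (hypothetical "get_weather" tool): the Command R7B grammar above accepts an action block such as
+//   <|START_THINKING|>...<|END_THINKING|><|START_ACTION|>[{"tool_call_id": "0", "tool_name": "get_weather", "parameters": {"city": "Paris"}}]<|END_ACTION|>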
+
+static void expect_tool_parameters(const std::string & name, const json & parameters, const std::vector<std::string> & expected_properties) {
+ if (!parameters.is_object() || !parameters.contains("type") || parameters.at("type") != "object" || !parameters.contains("properties") || !parameters.contains("required")) {
+ throw std::runtime_error("Parameters of tool " + name + " must be an object w/ required properties");
+ }
+ const auto & parameters_properties = parameters.at("properties");
+ const auto & parameters_required = parameters.at("required");
+ for (const auto & prop : expected_properties) {
+ if (!parameters_properties.contains(prop)) {
+ throw std::runtime_error("Parameters of tool " + name + " is missing property: " + prop); // NOLINT
+ }
+ if (std::find(parameters_required.begin(), parameters_required.end(), json(prop)) == parameters_required.end()) {
+ throw std::runtime_error("Parameters of tool " + name + " must have property marked as required: " + prop); // NOLINT
+ }
+ }
+ if (parameters_properties.size() != expected_properties.size()) {
+ throw std::runtime_error("Parameters of tool " + name + " must only have these properties:" + string_join(expected_properties, ", "));
+ }
+}
+
+static common_chat_params common_chat_params_init_llama_3_x(const common_chat_template & tmpl, const struct templates_params & inputs, bool allow_python_tag_builtin_tools) {
+ auto builtin_tools = json::array();
+ common_chat_params data;
+ if (!inputs.tools.is_null()) {
+ data.grammar_lazy = inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED;
+ data.grammar = build_grammar([&](const common_grammar_builder & builder) {
+            std::vector<std::string> tool_rules;
+
+ auto handle_builtin_tool = [&](const std::string & name, const json & parameters) {
+ if (name == "wolfram_alpha" || name == "web_search" || name == "brave_search") {
+ // https://github.com/meta-llama/llama-stack/blob/main/llama_stack/providers/remote/tool_runtime/wolfram_alpha/wolfram_alpha.py
+ // https://github.com/meta-llama/llama-stack/blob/main/llama_stack/providers/remote/tool_runtime/brave_search/brave_search.py
+ expect_tool_parameters(name, parameters, {"query"});
+ } else if (name == "python" || name == "code_interpreter") {
+ // https://github.com/meta-llama/llama-stack/blob/main/llama_stack/providers/inline/tool_runtime/code_interpreter/code_interpreter.py
+ expect_tool_parameters(name, parameters, {"code"});
+ } else {
+ return false;
+ }
+
+                std::vector<std::string> kvs;
+ for (const auto & [key, value] : parameters.at("properties").items()) {
+ kvs.push_back("\"" + key + "=\" " + builder.add_schema(name + "-args-" + key, value)); // NOLINT
+ }
+
+ tool_rules.push_back(
+ builder.add_rule(
+ name + "-call",
+ "\"<|python_tag|>" + name + ".call(\" " + string_join(kvs, " \", \" ") + " \")\""));
+ builtin_tools.push_back(name);
+
+ return true;
+ };
+
+ foreach_function(inputs.tools, [&](const json & tool) {
+ const auto & function = tool.at("function");
+ std::string name = function.at("name");
+ auto parameters = function.at("parameters");
+ builder.resolve_refs(parameters);
+
+ // https://github.com/meta-llama/llama-stack/tree/main/llama_stack/providers/remote/tool_runtime
+ if (allow_python_tag_builtin_tools) {
+ handle_builtin_tool(name, parameters);
+ }
+ tool_rules.push_back(
+ builder.add_rule(
+ name + "-call",
+ "\"{\" space "
+ "( \"\\\"type\\\"\" space \":\" space \"\\\"function\\\"\" space \",\" space )? "
+ " \"\\\"name\\\"\" space \":\" space \"\\\"" + name + "\\\"\" space \",\" space "
+ " \"\\\"parameters\\\"\" space \":\" space " + builder.add_schema(name + "-args", parameters) + " "
+ "\"}\" space"));
+ });
+ // Small models may hallucinate function names so we match anything (*at the start*) that looks like the JSON of a function call, regardless of the name.
+ data.grammar_triggers.push_back({
+ COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_FULL,
+ "(\\{\\s*(?:\"type\"\\s*:\\s*\"function\"\\s*,\\s*)?\"name\"\\s*:\\s*\")[\\s\\S]*", // + name + "\"[\\s\\S]*",
+ });
+ if (!builtin_tools.empty()) {
+ data.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, "<|python_tag|>"});
+ data.preserved_tokens.push_back("<|python_tag|>");
+ }
+ // Allow a few empty lines on top of the usual constrained json schema space rule.
+ builder.add_rule("root", string_join(tool_rules, " | "));
+ data.additional_stops.push_back("<|eom_id|>");
+ });
+ data.format = allow_python_tag_builtin_tools && !builtin_tools.empty()
+ ? COMMON_CHAT_FORMAT_LLAMA_3_X_WITH_BUILTIN_TOOLS
+ : COMMON_CHAT_FORMAT_LLAMA_3_X;
+ } else {
+ data.format = COMMON_CHAT_FORMAT_CONTENT_ONLY;
+ }
+ data.prompt = apply(tmpl, inputs, /* messages_override =*/ std::nullopt, /* tools_override= */ std::nullopt, json {
+ {"date_string", format_time(inputs.now, "%d %b %Y")},
+ {"tools_in_user_message", false},
+ {"builtin_tools", builtin_tools.empty() ? json() : builtin_tools},
+ });
+ return data;
+}
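+// Illustrative only (hypothetical "get_weather" tool): the rules above accept JSON calls such as
+//   {"type": "function", "name": "get_weather", "parameters": {"city": "Paris"}}
+// and, for the recognized builtin tools, the python-tag form
+//   <|python_tag|>brave_search.call(query="latest llama.cpp release")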
+
+static common_chat_params common_chat_params_init_nemotron_v2(const common_chat_template & tmpl, const struct templates_params & inputs) {
+ common_chat_params data;
+
+ // Generate the prompt using the apply() function with the template
+ data.prompt = apply(tmpl, inputs);
+ data.format = COMMON_CHAT_FORMAT_NEMOTRON_V2;
+
+ // Handle thinking tags appropriately based on inputs.enable_thinking
+    if (string_ends_with(data.prompt, "<think>\n")) {
+        if (!inputs.enable_thinking) {
+            data.prompt += "</think>";
+ } else {
+ data.thinking_forced_open = true;
+ }
+ }
+
+    // When tools are present, build grammar for the <TOOLCALL> format, similar to CommandR, but without tool call ID
+ if (!inputs.tools.is_null() && inputs.tools.is_array() && !inputs.tools.empty()) {
+ data.grammar_lazy = true;
+ data.grammar = build_grammar([&](const common_grammar_builder & builder) {
+ auto schemas = json::array();
+ foreach_function(inputs.tools, [&](const json & tool) {
+ const auto & function = tool.at("function");
+ schemas.push_back({
+ { "type", "object" },
+ { "properties",
+ {
+ { "name",
+ {
+ { "type", "string" },
+ { "const", function.at("name") },
+ } },
+ { "arguments", function.at("parameters") },
+ } },
+ { "required", json::array({ "name", "arguments" }) },
+ });
+ });
+ auto schema = json{
+ { "type", "array" },
+ { "items", schemas.size() == 1 ? schemas[0] : json{ { "anyOf", schemas } } },
+ { "minItems", 1 },
+ };
+ if (!inputs.parallel_tool_calls) {
+ schema["maxItems"] = 1;
+ }
+ builder.add_rule("root",
+                std::string(data.thinking_forced_open ? "( \"</think>\" space )? " : "") +
+                "\"<TOOLCALL>\" " + builder.add_schema("tool_calls", schema) +
+                " \"</TOOLCALL>\"");
+        });
+        data.grammar_triggers.push_back({ COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_FULL,
+            // If thinking_forced_open, then we capture the </think> tag in the grammar,
+            // (important for required tool choice) and in the trigger's first capture (decides what is sent to the grammar)
+            std::string(data.thinking_forced_open ?
+                "[\\s\\S]*?(</think>\\s*)" :
+                "(?:<think>[\\s\\S]*?</think>\\s*)?") +
+            "(<TOOLCALL>)[\\s\\S]*" });
+ }
+ return data;
+}
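+// Illustrative only (hypothetical "get_weather" tool): the grammar above constrains Nemotron v2 tool calls to
+//   <TOOLCALL>[{"name": "get_weather", "arguments": {"city": "Paris"}}]</TOOLCALL>
+// optionally preceded by the "</think>" that closes a forced-open thinking block.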
+
+static common_chat_params common_chat_params_init_nemotron_v3(const common_chat_template & tmpl, const struct templates_params & inputs) {
+ common_chat_params data;
+
+ data.prompt = apply(tmpl, inputs);
+ data.format = COMMON_CHAT_FORMAT_PEG_CONSTRUCTED;
+
+ // Handle thinking tags appropriately based on inputs.enable_thinking
+    if (string_ends_with(data.prompt, "<think>\n")) {
+        if (!inputs.enable_thinking) {
+            data.prompt += "</think>";
+ } else {
+ data.thinking_forced_open = true;
+ }
+ }
+
+ data.preserved_tokens = {
+ "",
+ "",
+ "",
+ "",
+ };
+
+ auto has_tools = inputs.tools.is_array() && !inputs.tools.empty();
+ auto extract_reasoning = inputs.reasoning_format != COMMON_REASONING_FORMAT_NONE;
+ auto include_grammar = true;
+
+ auto parser = build_chat_peg_constructed_parser([&](auto & p) {
+ auto reasoning = p.eps();
+ if (inputs.enable_thinking && extract_reasoning) {
+            auto reasoning_content = p.reasoning(p.until("</think>")) + ("</think>" | p.end());
+ if (data.thinking_forced_open) {
+ reasoning = reasoning_content;
+ }
+ }
+
+ // Response format parser
+ if (inputs.json_schema.is_object() && !inputs.json_schema.empty()) {
+ return reasoning << p.content(p.schema(p.json(), "response-format", inputs.json_schema));
+ }
+
+ // Tool call parser
+ if (has_tools && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE) {
+ auto tool_choice = p.choice();
+ foreach_function(inputs.tools, [&](const json & tool) {
+ const auto & function = tool.at("function");
+ std::string name = function.at("name");
+ auto parameters = function.at("parameters");
+
+ auto schema_info = common_schema_info();
+ schema_info.resolve_refs(parameters);
+
+ auto tool_open = "\n";
+ auto tool_close = p.literal("\n");
+ auto args = p.sequence();
+ auto arg_string = p.rule("xml-arg-string", p.until_one_of({
+ "\n",
+ "\n"
+ }));
+
+ foreach_parameter(function, [&](const auto & param_name, const json & param_schema, bool is_required) {
+ auto rule_name = "tool-" + name + "-arg-" + param_name;
+
+ auto arg_open = "\n";
+ auto arg_close = p.literal("\n");
+ auto arg_value = p.eps();
+
+ if (schema_info.resolves_to_string(param_schema)) {
+ arg_value = p.tool_arg_string_value(arg_string) + "\n";
+ } else {
+ arg_value = p.tool_arg_json_value(p.schema(p.json(), rule_name + "-schema", param_schema));
+ }
+
+                // The model may or may not close the argument with its closing tag
+ auto arg_rule = p.rule(rule_name, p.tool_arg_open(arg_open) + arg_value + p.optional(p.tool_arg_close(arg_close)));
+ args += p.repeat(arg_rule, /* min = */ is_required ? 1 : 0, /* max = */ 1);
+ });
+
+ tool_choice |= p.rule("tool-" + name, p.tool_open(tool_open) + args + p.tool_close(tool_close));
+ });
+
+ auto min_calls = inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_REQUIRED ? 1 : 0;
+ auto max_calls = inputs.parallel_tool_calls ? -1 : 1;
+ auto tool_call = p.rule("tool-call", "\n" + tool_choice + "" + p.space());
+ auto tool_calls = p.trigger_rule("tool-call-root", p.repeat(tool_call, /* min = */ min_calls, /* max = */ max_calls));
+
+ return reasoning << p.content(p.until("")) << tool_calls;
+ }
+
+ // Content only parser
+ include_grammar = false;
+ return reasoning << p.content(p.rest());
+ });
+
+ data.parser = parser.save();
+
+ if (include_grammar) {
+ data.grammar_lazy = has_tools && inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_AUTO;
+
+ data.grammar = build_grammar([&](const common_grammar_builder & builder) {
+ foreach_function(inputs.tools, [&](const json & tool) {
+ const auto & function = tool.at("function");
+ auto schema = function.at("parameters");
+ builder.resolve_refs(schema);
+ });
+ parser.build_grammar(builder, data.grammar_lazy);
+ });
+
+ data.grammar_triggers = {
+ {COMMON_GRAMMAR_TRIGGER_TYPE_WORD, ""}
+ };
+ }
+
+ return data;
+}
+
+
+static common_chat_params common_chat_params_init_apertus(const common_chat_template & tmpl, const struct templates_params & inputs) {
+ common_chat_params data;
+
+ // Generate the prompt using the apply() function with the template
+ data.prompt = apply(tmpl, inputs);
+ data.format = COMMON_CHAT_FORMAT_APERTUS;
+
+ // Handle thinking tags appropriately based on inputs.enable_thinking
+ if (string_ends_with(data.prompt, "<|inner_prefix|>")) {
+ if (!inputs.enable_thinking) {
+ data.prompt += "<|inner_suffix|>";
+ } else {
+ data.thinking_forced_open = true;
+ }
+ }
+
+ // When tools are present, build grammar for the <|tools_prefix|> format
+ if (!inputs.tools.is_null() && inputs.tools.is_array() && !inputs.tools.empty()) {
+ data.grammar_lazy = true;
+ data.grammar = build_grammar([&](const common_grammar_builder & builder) {
+ auto schemas = json::array();
+ foreach_function(inputs.tools, [&](const json & tool) {
+ const auto & function = tool.at("function");
+ schemas.push_back({
+ { "type", "object" },
+ { "properties",
+ {
+ { function.at("name"), function.at("parameters") }
+ } },
+ { "required", json::array({ function.at("name") }) },
+ });
+ });
+ auto schema = json{
+ { "type", "array" },
+ { "items", schemas.size() == 1 ? schemas[0] : json{ { "anyOf", schemas } } },
+ { "minItems", 1 },
+ };
+ if (!inputs.parallel_tool_calls) {
+ schema["maxItems"] = 1;
+ }
+ builder.add_rule("root",
+ std::string(data.thinking_forced_open ? "( \"<|inner_suffix|>\" space )? " : "") +
+ "\"<|tools_prefix|>\"" + builder.add_schema("tool_calls", schema) + "\"<|tools_suffix|>\"");
+ });
+ data.grammar_triggers.push_back({ COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_FULL,
+ // If thinking_forced_open, then we capture the <|inner_suffix|> tag in the grammar,
+ // (important for required tool choice) and in the trigger's first capture (decides what is sent to the grammar)
+ std::string(data.thinking_forced_open ?
+ "[\\s\\S]*?(<\\|inner_suffix\\|>\\s*)" :
+ "(?:<\\|inner_prefix\\|>[\\s\\S]*?<\\|inner_suffix\\|>\\s*)?") +
+ "(<\\|tools_prefix\\|>)[\\s\\S]*" });
+ data.preserved_tokens = {
+ "<|system_start|>",
+ "<|system_end|>",
+ "<|developer_start|>",
+ "<|developer_end|>",
+ "<|user_start|>",
+ "<|user_end|>",
+ "<|assistant_start|>",
+ "<|assistant_end|>",
+ "<|inner_prefix|>",
+ "<|inner_suffix|>",
+ "<|tools_prefix|>",
+ "<|tools_suffix|>",
+ };
+ }
+ return data;
+}
+
+static common_chat_params common_chat_params_init_deepseek_r1(const common_chat_template & tmpl, const struct templates_params & inputs) {
+ common_chat_params data;
+ auto prompt = apply(tmpl, inputs);
+
+ // Hacks to fix the official (broken) prompt.
+ // It is advisable to use --chat-template-file models/templates/llama-cpp-deepseek-r1.jinja instead,
+ // until the official template is fixed.
+ if (tmpl.source().find("{% if ns.is_tool %}{{'<|tool▁outputs▁end|>'}}") != std::string::npos) {
+ // Don't leave the chat dangling after tool results
+ if (string_ends_with(prompt, "<|tool▁outputs▁end|>")) {
+ prompt += "<|end▁of▁sentence|>";
+ if (inputs.add_generation_prompt) {
+ prompt += "<|Assistant|>";
+ }
+ }
+ // Fix up tool call delta example added by Minja
+ prompt = std::regex_replace(
+ prompt,
+ std::regex("(<|tool▁call▁end|>)[\\s\\r\\n]*(<|tool▁outputs▁begin|>|<|User|>)"),
+ "$1<|tool▁calls▁end|><|end▁of▁sentence|>$2");
+ }
+ data.prompt = prompt;
+ data.format = COMMON_CHAT_FORMAT_DEEPSEEK_R1;
+    if (string_ends_with(data.prompt, "<think>\n")) {
+        if (!inputs.enable_thinking) {
+            data.prompt += "</think>";
+ } else {
+ data.thinking_forced_open = true;
+ }
+ }
+
+ if (inputs.tools.is_array() && !inputs.tools.empty()) {
+ data.grammar_lazy = inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED && inputs.json_schema.is_null();
+ data.grammar = build_grammar([&](const common_grammar_builder & builder) {
+            std::vector<std::string> tool_rules;
+ foreach_function(inputs.tools, [&](const json & tool) {
+ const auto & function = tool.at("function");
+ std::string name = function.at("name");
+ auto parameters = function.at("parameters");
+ builder.resolve_refs(parameters);
+ tool_rules.push_back(builder.add_rule(name + "-call",
+ "( \"<|tool▁call▁begin|>\" )? \"function<|tool▁sep|>" + name + "\\n"
+ "```json\\n\" " + builder.add_schema(name + "-args", parameters) + " "
+ "\"```<|tool▁call▁end|>\""));
+ });
+ // Distill Qwen 7B & 32B models seem confused re/ syntax of their tool call opening tag,
+ // so we accept common variants (then it's all constrained)
+ builder.add_rule("root",
+                std::string(data.thinking_forced_open ? "( \"</think>\" space )? " : "") +
+ "( \"<|tool▁calls▁begin|>\" | \"<|tool_calls_begin|>\" | \"<|tool calls begin|>\" | \"<|tool\\\\_calls\\\\_begin|>\" | \"<|tool▁calls|>\" ) "
+ "(" + string_join(tool_rules, " | ") + ")" + (inputs.parallel_tool_calls ? "*" : "") + " "
+ "\"<|tool▁calls▁end|>\""
+ " space");
+ data.grammar_triggers.push_back({
+ COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_FULL,
+                // If thinking_forced_open, then we capture the </think> tag in the grammar,
+                // (important for required tool choice) and in the trigger's first capture (decides what is sent to the grammar)
+                std::string(data.thinking_forced_open ? "[\\s\\S]*?(</think>\\s*)" : "(?:<think>[\\s\\S]*?</think>\\s*)?") +
+ "(<|tool▁calls▁begin|>|<|tool_calls_begin|>|<|tool calls begin|>|<|tool\\\\_calls\\\\_begin|>|<|tool▁calls|>)[\\s\\S]*"
+ });
+ data.preserved_tokens = {
+                "<think>",
+                "</think>",
+ "<|tool▁calls▁begin|>",
+ "<|tool▁call▁begin|>",
+ "<|tool▁sep|>",
+ "<|tool▁call▁end|>",
+                "<|tool▁calls▁end|>",
+ };
+ });
+ }
+ return data;
+}
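+// Illustrative only (hypothetical "get_weather" tool): the DeepSeek R1 grammar above accepts blocks like
+//   <|tool▁calls▁begin|><|tool▁call▁begin|>function<|tool▁sep|>get_weather
+//   ```json
+//   {"city": "Paris"}
+//   ```<|tool▁call▁end|><|tool▁calls▁end|>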
+
+static common_chat_params common_chat_params_init_deepseek_v3_1(const common_chat_template & tmpl, const struct templates_params & inputs) {
+ common_chat_params data;
+
+ // Pass thinking context for DeepSeek V3.1 template
+ json additional_context = {
+ {"thinking", inputs.enable_thinking},
+ };
+
+ auto prompt = apply(tmpl, inputs,
+ /* messages_override= */ inputs.messages,
+ /* tools_override= */ std::nullopt,
+ additional_context);
+ data.prompt = prompt;
+ data.format = COMMON_CHAT_FORMAT_DEEPSEEK_V3_1;
+    if (string_ends_with(data.prompt, "<think>")) {
+        if (!inputs.enable_thinking) {
+            data.prompt += "</think>";
+ } else {
+ data.thinking_forced_open = true;
+ }
+ }
+ if (inputs.tools.is_array() && !inputs.tools.empty()) {
+ data.grammar_lazy = inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED && inputs.json_schema.is_null();
+ data.grammar = build_grammar([&](const common_grammar_builder & builder) {
+            std::vector<std::string> tool_rules;
+ foreach_function(inputs.tools, [&](const json & tool) {
+ const auto & function = tool.at("function");
+ std::string name = function.at("name");
+ auto parameters = function.at("parameters");
+ builder.resolve_refs(parameters);
+ tool_rules.push_back(builder.add_rule(name + "-call",
+ "( \"<|tool▁call▁begin|>\" )? \"" + name + "<|tool▁sep|>"
+ "\" " + builder.add_schema(name + "-args", parameters) + " "
+ "\"<|tool▁call▁end|>\""));
+ });
+ // Distill Qwen 7B & 32B models seem confused re/ syntax of their tool call opening tag,
+ // so we accept common variants (then it's all constrained)
+ builder.add_rule("root",
+                std::string(data.thinking_forced_open ? "( \"</think>\" space )? " : "") +
+ "( \"<|tool▁calls▁begin|>\" | \"<|tool_calls_begin|>\" | \"<|tool calls begin|>\" | \"<|tool\\\\_calls\\\\_begin|>\" | \"<|tool▁calls|>\" ) "
+ "(" + string_join(tool_rules, " | ") + ")" + (inputs.parallel_tool_calls ? "*" : "") + " "
+ "\"<|tool▁calls▁end|>\""
+ " space");
+ data.grammar_triggers.push_back({
+ COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_FULL,
+                // If thinking_forced_open, then we capture the </think> tag in the grammar,
+                // (important for required tool choice) and in the trigger's first capture (decides what is sent to the grammar)
+                std::string(data.thinking_forced_open ? "[\\s\\S]*?(</think>\\s*)" : "(?:<think>[\\s\\S]*?</think>\\s*)?") +
+ "(<|tool▁calls▁begin|>|<|tool_calls_begin|>|<|tool calls begin|>|<|tool\\\\_calls\\\\_begin|>|<|tool▁calls|>)[\\s\\S]*"
+ });
+ data.preserved_tokens = {
+                "<think>",
+                "</think>",
+ "<|tool▁calls▁begin|>",
+ "<|tool▁call▁begin|>",
+ "<|tool▁sep|>",
+ "<|tool▁call▁end|>",
+ "<|tool▁calls▁end|>",
+ };
+ });
+ }
+ return data;
+}
+
+static common_chat_params common_chat_params_init_minimax_m2(const common_chat_template & tmpl, const struct templates_params & params) {
+ common_chat_params data;
+ data.grammar_lazy = params.tools.is_array() && !params.tools.empty() && params.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED;
+
+ data.prompt = apply(tmpl, params);
+ data.format = COMMON_CHAT_FORMAT_MINIMAX_M2;
+
+ // Handle thinking tags based on prompt ending
+    if (string_ends_with(data.prompt, "<think>\n")) {
+        if (!params.enable_thinking) {
+            // Close the thinking tag immediately if thinking is disabled
+            data.prompt += "</think>\n\n";
+        } else {
+            // Mark thinking as forced open (the prompt ends with an open <think> tag)
+            data.thinking_forced_open = true;
+ }
+ }
+
+ // Preserve MiniMax-M2 special tokens
+ data.preserved_tokens = {
+        "<think>",
+        "</think>",
+        "<minimax:tool_call>",
+        "</minimax:tool_call>",
+ };
+
+ // build grammar for tool call
+ static const xml_tool_call_format form {
+ /* form.scope_start = */ "\n",
+ /* form.tool_start = */ "\n",
+ /* form.key_start = */ "",
+ /* form.val_end = */ "\n",
+ /* form.tool_end = */ "\n",
+ /* form.scope_end = */ "",
+ };
+ build_grammar_xml_tool_call(data, params.tools, form);
+
+ return data;
+}
+
+static common_chat_params common_chat_params_init_qwen3_coder_xml(const common_chat_template & tmpl, const struct templates_params & params) {
+ common_chat_params data;
+ data.grammar_lazy = params.tools.is_array() && !params.tools.empty() && params.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED;
+
+ data.prompt = apply(tmpl, params);
+ data.format = COMMON_CHAT_FORMAT_QWEN3_CODER_XML;
+
+ data.preserved_tokens = {
+ "",
+ "",
+ "",
+ "",
+ };
+
+ // build grammar for tool call
+ static const xml_tool_call_format form {
+ /* form.scope_start = */ "\n",
+ /* form.tool_start = */ "\n",
+ /* form.key_start = */ "\n",
+ /* form.val_end = */ "\n\n",
+ /* form.tool_end = */ "\n",
+ /* form.scope_end = */ "",
+ };
+ build_grammar_xml_tool_call(data, params.tools, form);
+
+ return data;
+}
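+// Illustrative only (hypothetical "get_weather" tool): Qwen3-Coder emits XML-style calls along the lines of
+//   <tool_call>
+//   <function=get_weather>
+//   <parameter=city>
+//   Paris
+//   </parameter>
+//   </function>
+//   </tool_call>
+// which is the shape the xml_tool_call_format above is meant to describe.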
+
+static common_chat_params common_chat_params_init_kimi_k2(const common_chat_template & tmpl, const struct templates_params & params) {
+ common_chat_params data;
+ data.grammar_lazy = params.tools.is_array() && !params.tools.empty() && params.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED;
+
+ data.prompt = apply(tmpl, params);
+ data.format = COMMON_CHAT_FORMAT_KIMI_K2;
+
+ data.preserved_tokens = {
+ "",
+ "",
+ "<|tool_calls_section_begin|>",
+ "<|tool_call_begin|>",
+ "<|tool_call_argument_begin|>",
+ "<|tool_call_end|>",
+ "<|tool_calls_section_end|>",
+ "<|im_end|>",
+ "<|im_system|>",
+ "<|im_middle|>",
+ };
+
+ data.additional_stops.insert(data.additional_stops.end(), {
+ "<|im_end|>",
+ "<|im_middle|>"
+ });
+ // build grammar for tool call
+ static const xml_tool_call_format form = ([]() {
+ xml_tool_call_format form {};
+ form.scope_start = "<|tool_calls_section_begin|>";
+ form.tool_start = "<|tool_call_begin|>";
+ form.tool_sep = "<|tool_call_argument_begin|>{";
+ form.key_start = "\"";
+ form.key_val_sep = "\": ";
+ form.val_end = ", ";
+ form.tool_end = "}<|tool_call_end|>";
+ form.scope_end = "<|tool_calls_section_end|>";
+ form.raw_argval = false;
+ form.last_val_end = "";
+ return form;
+ })();
+ build_grammar_xml_tool_call(data, params.tools, form);
+
+ return data;
+}
+
+static common_chat_params common_chat_params_init_apriel_1_5(const common_chat_template & tmpl, const struct templates_params & params) {
+ common_chat_params data;
+ data.grammar_lazy = params.tools.is_array() && !params.tools.empty() && params.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED;
+
+ data.prompt = apply(tmpl, params);
+ data.format = COMMON_CHAT_FORMAT_APRIEL_1_5;
+
+ data.preserved_tokens = {
+ "",
+ "",
+ "",
+ "",
+ };
+
+ // build grammar for tool call
+ static const xml_tool_call_format form = ([]() {
+ xml_tool_call_format form {};
+ form.scope_start = "[";
+ form.tool_start = "{\"name\": \"";
+ form.tool_sep = "\", \"arguments\": {";
+ form.key_start = "\"";
+ form.key_val_sep = "\": ";
+ form.val_end = ", ";
+ form.tool_end = "}, ";
+ form.scope_end = "]";
+ form.raw_argval = false;
+ form.last_val_end = "";
+ form.last_tool_end = "}";
+ return form;
+ })();
+ build_grammar_xml_tool_call(data, params.tools, form);
+
+ return data;
+}
+
+static common_chat_params common_chat_params_init_xiaomi_mimo(const common_chat_template & tmpl, const struct templates_params & params) {
+ common_chat_params data;
+ data.grammar_lazy = params.tools.is_array() && !params.tools.empty() && params.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED;
+
+ data.prompt = apply(tmpl, params);
+ data.format = COMMON_CHAT_FORMAT_XIAOMI_MIMO;
+
+ data.preserved_tokens = {
+ "",
+ "",
+ };
+
+ // build grammar for tool call
+ static const xml_tool_call_format form = ([]() {
+ xml_tool_call_format form {};
+ form.scope_start = "\n";
+ form.tool_start = "\n{\"name\": \"";
+ form.tool_sep = "\", \"arguments\": {";
+ form.key_start = "\"";
+ form.key_val_sep = "\": ";
+ form.val_end = ", ";
+ form.tool_end = "}\n";
+ form.scope_end = "";
+ form.raw_argval = false;
+ form.last_val_end = "";
+ return form;
+ })();
+ build_grammar_xml_tool_call(data, params.tools, form);
+
+ return data;
+}
+
+static common_chat_params common_chat_params_init_gpt_oss(const common_chat_template & tmpl, const struct templates_params & inputs) {
+ common_chat_params data;
+
+ // Copy reasoning to the "thinking" field as expected by the gpt-oss template
+ auto adjusted_messages = json::array();
+ for (const auto & msg : inputs.messages) {
+ auto has_reasoning_content = msg.contains("reasoning_content") && msg.at("reasoning_content").is_string();
+ auto has_tool_calls = msg.contains("tool_calls") && msg.at("tool_calls").is_array();
+
+ if (has_reasoning_content && has_tool_calls) {
+ auto adjusted_message = msg;
+ adjusted_message["thinking"] = msg.at("reasoning_content");
+ adjusted_messages.push_back(adjusted_message);
+ } else {
+ adjusted_messages.push_back(msg);
+ }
+ }
+
+ auto prompt = apply(tmpl, inputs, /* messages_override= */ adjusted_messages);
+
+ // Check if we need to replace the return token with end token during
+ // inference and without generation prompt. For more details see:
+ // https://github.com/ggml-org/llama.cpp/issues/15417
+ if (inputs.is_inference && !inputs.add_generation_prompt) {
+ static constexpr std::string_view return_token = "<|return|>";
+ static constexpr std::string_view end_token = "<|end|>";
+ if (size_t pos = prompt.rfind(return_token); pos != std::string::npos) {
+ prompt.replace(pos, return_token.length(), end_token);
+ }
+ }
+
+ data.prompt = prompt;
+ data.format = COMMON_CHAT_FORMAT_GPT_OSS;
+
+ // These special tokens are required to parse properly, so we include them
+ // even if parse_tool_calls is false.
+ data.preserved_tokens = {
+ "<|channel|>",
+ "<|constrain|>",
+ "<|message|>",
+ "<|start|>",
+ "<|end|>",
+ };
+
+ if (!inputs.json_schema.is_null()) {
+ data.grammar_lazy = false;
+ data.grammar = build_grammar([&](const common_grammar_builder & builder) {
+ auto schema = inputs.json_schema;
+ builder.resolve_refs(schema);
+
+ auto not_end = builder.add_rule("not-end",
+ "[^<] | \"<\" [^|] | \"<|\" [^e] | \"<|e\" [^n] | \"<|en\" [^d] | \"<|end\" [^|] | \"<|end|\" [^>]");
+ auto analysis = builder.add_rule("analysis",
+ "\"<|channel|>analysis<|message|>\" ( " + not_end + " )* \"<|end|>\"");
+ auto constraint = builder.add_rule("constraint", "\"<|constrain|>\"? [a-zA-Z0-9_-]+");
+ auto final = builder.add_rule("final",
+ "\"<|channel|>final\" ( \" \" " + constraint + " )? \"<|message|>\" " +
+ builder.add_schema("response", schema)
+ );
+
+ builder.add_rule("root", "( " + analysis + " \"<|start|>assistant\" )? " + final);
+ });
+ }
+
+ if (inputs.tools.is_array() && !inputs.tools.empty()) {
+ data.grammar_lazy = inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED;
+ data.grammar = build_grammar([&](const common_grammar_builder & builder) {
+ // tool calls can appear in commentary or analysis channels
+ auto channel = builder.add_rule("channel", "\"<|channel|>\" ( \"commentary\" | \"analysis\" )");
+
+            std::vector<std::string> tool_rules_recipient_in_role;
+            std::vector<std::string> tool_rules_recipient_in_channel;
+ foreach_function(inputs.tools, [&](const json & tool) {
+ const auto & function = tool.at("function");
+ std::string name = function.at("name");
+ auto parameters = function.at("parameters");
+ builder.resolve_refs(parameters);
+
+ tool_rules_recipient_in_role.push_back(
+ builder.add_rule(name + "-call",
+ "\"" + name + "\"" + channel + " \" <|constrain|>json\"? \"<|message|>\" " +
+ builder.add_schema(name + "-args", parameters)
+ )
+ );
+
+ tool_rules_recipient_in_channel.push_back(
+ builder.add_rule(name + "-call",
+ "\"" + name + "\"" + " \" <|constrain|>json\"? \"<|message|>\" " +
+ builder.add_schema(name + "-args", parameters)
+ )
+ );
+ });
+
+ auto recipient_in_channel = builder.add_rule("recipient_in_channel",
+ channel + " \" to=functions.\" ( " +
+ string_join(tool_rules_recipient_in_channel, " | ") + " )"
+ );
+
+ if (data.grammar_lazy) {
+ auto recipient_in_role = builder.add_rule("recipient_in_role",
+ "\"<|start|>assistant\"? \" to=functions.\" ( " +
+ string_join(tool_rules_recipient_in_role, " | ") + " )"
+ );
+
+ builder.add_rule("root", recipient_in_role + " | " + recipient_in_channel);
+ } else {
+ auto not_end = builder.add_rule("not-end",
+ "[^<] | \"<\" [^|] | \"<|\" [^e] | \"<|e\" [^n] | \"<|en\" [^d] | \"<|end\" [^|] | \"<|end|\" [^>]");
+ auto analysis = builder.add_rule("analysis",
+ "\"<|channel|>analysis<|message|>\" ( " + not_end + " )* \"<|end|>\"");
+ auto commentary = builder.add_rule("commentary",
+ "\"<|channel|>commentary<|message|>\" ( " + not_end + " )* \"<|end|>\"");
+
+ auto recipient_in_role = builder.add_rule("recipient_in_role",
+ "\" to=functions.\" ( " + string_join(tool_rules_recipient_in_role, " | ") + " )"
+ );
+
+ builder.add_rule("root",
+ "( " + analysis + " \"<|start|>assistant\" )? " +
+ "( " + commentary + " \"<|start|>assistant\" )? " +
+ "( " + recipient_in_role + " | " + recipient_in_channel + " )"
+ );
+ }
+
+ // Trigger on tool calls that appear in the commentary channel
+ data.grammar_triggers.push_back({
+ COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN,
+ "<\\|channel\\|>(?:commentary|analysis) to"
+ });
+
+ // Trigger tool calls that appear in the role section, either at the
+ // start or in the middle.
+ data.grammar_triggers.push_back({
+ COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_FULL,
+ "^ to"
+ });
+
+ data.grammar_triggers.push_back({
+ COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN,
+ "<\\|start\\|>assistant to"
+ });
+ });
+ }
+
+ return data;
+}
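+// Illustrative only (hypothetical "get_weather" tool): a call accepted by the grammar above, on the commentary channel:
+//   <|channel|>commentary to=functions.get_weather <|constrain|>json<|message|>{"city": "Paris"}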
+
+static common_chat_params common_chat_params_init_glm_4_5(const common_chat_template & tmpl, const struct templates_params & inputs) {
+ common_chat_params data;
+ data.grammar_lazy = inputs.tools.is_array() && !inputs.tools.empty() && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED;
+
+ std::string prompt = apply(tmpl, inputs);
+
+ // match the existing trimming behavior
+ if (inputs.add_bos && string_starts_with(prompt, tmpl.bos_token())) {
+ prompt.erase(0, tmpl.bos_token().size());
+ }
+ if (inputs.add_eos && string_ends_with(prompt, tmpl.eos_token())) {
+ prompt.erase(prompt.size() - tmpl.eos_token().size());
+ }
+    if (string_ends_with(prompt, "<think>")) {
+        if (!inputs.enable_thinking) {
+            prompt += "</think>";
+ } else {
+ data.thinking_forced_open = true;
+ }
+ }
+
+ // add GLM preserved tokens
+ data.preserved_tokens = {
+ "<|endoftext|>",
+ "[MASK]",
+ "[gMASK]",
+ "[sMASK]",
+        "<sop>",
+        "<eop>",
+ "<|system|>",
+ "<|user|>",
+ "<|assistant|>",
+ "<|observation|>",
+ "<|begin_of_image|>",
+ "<|end_of_image|>",
+ "<|begin_of_video|>",
+ "<|end_of_video|>",
+ "<|begin_of_audio|>",
+ "<|end_of_audio|>",
+ "<|begin_of_transcription|>",
+ "<|end_of_transcription|>",
+ "<|code_prefix|>",
+ "<|code_middle|>",
+ "<|code_suffix|>",
+ "/nothink",
+        "<think>",
+        "</think>",
+        "<tool_call>",
+        "</tool_call>",
+        "<arg_key>",
+        "</arg_key>",
+        "<arg_value>",
+        "</arg_value>"
+ };
+
+ // extra GLM 4.5 stop word
+ data.additional_stops.insert(data.additional_stops.end(), {
+ "<|user|>",
+ "<|observation|>"
+ });
+
+ // build grammar for tool call
+ static const xml_tool_call_format form {
+ /* form.scope_start = */ "",
+ /* form.tool_start = */ "\n",
+ /* form.tool_sep = */ "\n",
+ /* form.key_start = */ "",
+ /* form.key_val_sep = */ "\n",
+ /* form.val_end = */ "\n",
+ /* form.tool_end = */ "\n",
+ /* form.scope_end = */ "",
+ };
+ build_grammar_xml_tool_call(data, inputs.tools, form);
+
+ data.prompt = prompt;
+ data.format = COMMON_CHAT_FORMAT_GLM_4_5;
+ return data;
+}
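+// Illustrative only (hypothetical "get_weather" tool): GLM 4.5 tool calls use the arg_key/arg_value form, e.g.
+//   <tool_call>get_weather
+//   <arg_key>city</arg_key>
+//   <arg_value>Paris</arg_value>
+//   </tool_call>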
+
+static common_chat_params common_chat_params_init_firefunction_v2(const common_chat_template & tmpl, const struct templates_params & inputs) {
+ LOG_DBG("%s\n", __func__);
+ common_chat_params data;
+    const std::optional<json> tools_override = json();
+    const std::optional<json> additional_context = json {
+ {"datetime", format_time(inputs.now, "%b %d %Y %H:%M:%S GMT")},
+ {"functions", json(inputs.tools.empty() ? "" : inputs.tools.dump(2))},
+ };
+ data.prompt = apply(tmpl, inputs, /* messages_override =*/ std::nullopt, tools_override, additional_context);
+ if (inputs.tools.is_array() && !inputs.tools.empty()) {
+ data.grammar_lazy = inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED;
+ data.grammar = build_grammar([&](const common_grammar_builder & builder) {
+ auto schemas = json::array();
+ foreach_function(inputs.tools, [&](const json & tool) {
+ const auto & function = tool.at("function");
+ schemas.push_back({
+ {"type", "object"},
+ {"properties", {
+ {"name", {
+ {"type", "string"},
+ {"const", function.at("name")},
+ }},
+ {"arguments", function.at("parameters")},
+ }},
+ {"required", json::array({"name", "arguments", "id"})},
+ });
+ });
+ auto schema = json {
+ {"type", "array"},
+ {"items", schemas.size() == 1 ? schemas[0] : json {{"anyOf", schemas}}},
+ {"minItems", 1},
+ };
+ if (!inputs.parallel_tool_calls) {
+ schema["maxItems"] = 1;
+ }
+ builder.add_rule("root", "\" functools\"? " + builder.add_schema("tool_calls", schema));
+ });
+ data.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, " functools["});
+ data.preserved_tokens = {
+ " functools[",
+ };
+ data.format = COMMON_CHAT_FORMAT_FIREFUNCTION_V2;
+ } else {
+ data.format = COMMON_CHAT_FORMAT_CONTENT_ONLY;
+ }
+ return data;
+}
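+// Illustrative only (hypothetical "get_weather" tool): FireFunction v2 emits its calls as
+//    functools[{"name": "get_weather", "arguments": {"city": "Paris"}, "id": "1"}]
+// which is what the " functools[" trigger above latches onto.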
+
+static common_chat_params common_chat_params_init_functionary_v3_2(const common_chat_template & tmpl, const struct templates_params & inputs) {
+ // >>>all\nlet's call functions>>>fn1\n{"arg1": 1...}\n>>>fn2\n{"arg1": 1...}...
+ // Using ">>>f1\n", ">>>f2\n"... as trigger words for the grammar
+ // If the function is python, we also allow raw python code (if the line after `python\n` doesn't start w/ opening `{`), which the model seems to prefer for multiline code.
+ common_chat_params data;
+ data.prompt = apply(tmpl, inputs);
+ data.format = COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2;
+ if (inputs.tools.is_array() && !inputs.tools.empty()) {
+ data.grammar_lazy = inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED;
+ data.grammar = build_grammar([&](const common_grammar_builder & builder) {
+            std::vector<std::string> first_tool_rules;
+            std::vector<std::string> subsequent_tool_rules;
+ foreach_function(inputs.tools, [&](const json & tool) {
+ const auto & function = tool.at("function");
+ std::string name = function.at("name");
+ auto parameters = function.at("parameters");
+ builder.resolve_refs(parameters);
+ std::string args_pattern = "[\\s\\S]*";
+ auto args_rule = builder.add_schema(name + "-args", parameters);
+ if (name == "python") {
+ args_rule = builder.add_rule(name + "-maybe-raw-args", args_rule + " | [^{] .*");
+ } else {
+ args_pattern = "\\{" + args_pattern;
+ }
+ auto call_rule = builder.add_rule(name + "-call", "\"" + name + "\\n\" " + args_rule);
+ first_tool_rules.push_back(call_rule);
+ if (inputs.parallel_tool_calls) {
+ subsequent_tool_rules.push_back(builder.add_rule(name + "-call2", "\">>>\" " + call_rule));
+ }
+ data.grammar_triggers.push_back({
+ COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_FULL,
+ "((?:[\\s\\S]+?>>>)?" + regex_escape(name) + "\n)" + args_pattern,
+ });
+ });
+ data.preserved_tokens = {
+ "<|end_header_id|>",
+ };
+ auto first_rule = first_tool_rules.empty() ? "" : builder.add_rule("first_tool_call", string_join(first_tool_rules, " | ")) + " space";
+ if (inputs.parallel_tool_calls) {
+ auto subsequent_rule = builder.add_rule("subsequent_tool_call", string_join(subsequent_tool_rules, " | ")) + " space";
+ builder.add_rule("root", first_rule + " (" + subsequent_rule + ")*");
+ } else {
+ builder.add_rule("root", first_rule);
+ }
+
+ });
+ }
+ return data;
+}
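+// Illustrative only (hypothetical tools): with parallel calls enabled, the grammar above accepts output such as
+//   get_weather
+//   {"city": "Paris"}
+//   >>>python
+//   print("hello")
+// i.e. the first call follows the ">>>all" content section directly and later calls are prefixed with ">>>".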
+
+static common_chat_params common_chat_params_init_functionary_v3_1_llama_3_1(const common_chat_template & tmpl, const struct templates_params & inputs) {
+ // https://github.com/MeetKai/functionary/blob/main/tests/prompt_test_v3-llama3.1.txt
+ common_chat_params data;
+
+ if (!inputs.tools.is_null()) {
+ std::string python_code_argument_name;
+ auto has_raw_python = false;
+
+ data.grammar_lazy = inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED;
+ data.grammar = build_grammar([&](const common_grammar_builder & builder) {
+            std::vector<std::string> tool_rules;
+ foreach_function(inputs.tools, [&](const json & tool) {
+ const auto & function = tool.at("function");
+ const auto & parameters = function.at("parameters");
+ std::string name = function.at("name");
+ if (name == "python" || name == "ipython") {
+ if (!parameters.contains("type")) {
+ throw std::runtime_error("Missing type in python tool");
+ }
+ has_raw_python = true;
+ const auto & type = parameters.at("type");
+ if (type == "object") {
+ auto properties = parameters.at("properties");
+ for (auto it = properties.begin(); it != properties.end(); ++it) {
+ if (it.value().at("type") == "string") {
+ if (!python_code_argument_name.empty()) {
+ throw std::runtime_error("Multiple string arguments found in python tool");
+ }
+ python_code_argument_name = it.key();
+ }
+ }
+ if (python_code_argument_name.empty()) {
+ throw std::runtime_error("No string argument found in python tool");
+ }
+ } else if (type != "string") {
+ throw std::runtime_error("Invalid type in python tool: " + type.dump());
+ }
+ }
+                tool_rules.push_back(builder.add_rule(name + "-call", "\"<function=" + name + ">\" " + builder.add_schema(name + "-args", parameters) + " \"</function>\" space"));
+ });
+ if (has_raw_python) {
+ tool_rules.push_back(builder.add_rule("python-call", "\"<|python_tag|>\" .*"));
+ data.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, "<|python_tag|>"});
+ data.preserved_tokens.push_back("<|python_tag|>");
+ }
+ auto tool_call = builder.add_rule("tool_call", string_join(tool_rules, " | ")) + " space";
+ builder.add_rule("root", inputs.parallel_tool_calls ? "(" + tool_call + ")+" : tool_call);
+            data.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, "<function="});
+        });
+        data.format = COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1;
+    } else {
+        data.format = COMMON_CHAT_FORMAT_CONTENT_ONLY;
+    }
+
+    data.prompt = apply(tmpl, inputs);
+    return data;
+}
+
+static common_chat_params common_chat_params_init_hermes_2_pro(const common_chat_template & tmpl, const struct templates_params & inputs) {
+    common_chat_params data;
+    json extra_context = json {
+        {"enable_thinking", inputs.enable_thinking},
+    };
+    extra_context.update(inputs.extra_context);
+
+    data.prompt = apply(tmpl, inputs, /* messages_override =*/ std::nullopt, /* tools_override= */ std::nullopt, extra_context);
+    data.format = COMMON_CHAT_FORMAT_HERMES_2_PRO;
+    if (string_ends_with(data.prompt, "<think>\n")) {
+        if (!extra_context["enable_thinking"]) {
+            data.prompt += "</think>";
+ } else {
+ data.thinking_forced_open = true;
+ }
+ }
+
+ if (!inputs.tools.is_null()) {
+        // (content)?(<tool_call>{"name": "foo", "arguments": {"a": 1}}</tool_call>)*
+ data.grammar_lazy = inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED;
+ data.grammar = build_grammar([&](const common_grammar_builder & builder) {
+            std::vector<std::string> tool_rules;
+            std::vector<std::string> tool_call_alts;
+            std::vector<std::string> escaped_names;
+ foreach_function(inputs.tools, [&](const json & tool) {
+ const auto & function = tool.at("function");
+ std::string name = function.at("name");
+ auto parameters = function.at("parameters");
+ builder.resolve_refs(parameters);
+ tool_rules.push_back(builder.add_schema(name + "-call", {
+ {"type", "object"},
+ {"properties", json {
+ {"name", json {{"const", name}}},
+ {"arguments", parameters},
+ }},
+ {"required", json::array({"name", "arguments"})},
+ }));
+ tool_call_alts.push_back(builder.add_rule(
+ name + "-function-tag",
+                    "\"<function\" ( \"=" + name + "\" | \" name=\\\"" + name + "\\\"\" ) \">\" space " +
+                    builder.add_schema(name + "-args", parameters) + " "
+                    "\"</function>\" space"));
+
+                data.grammar_triggers.push_back({
+                    COMMON_GRAMMAR_TRIGGER_TYPE_WORD,
+                    "<function=" + name + ">",
+ });
+ auto escaped_name = regex_escape(name);
+ data.grammar_triggers.push_back({
+ COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN,
+                    "<function\\s+name\\s*=\\s*\"" + escaped_name + "\"",
+                });
+                escaped_names.push_back(escaped_name);
+            });
+            auto any_tool_call = builder.add_rule("any_tool_call", "( " + string_join(tool_rules, " | ") + " ) space");
+            std::vector<std::string> alt_tags {
+                any_tool_call,
+                "\"<tool_call>\" space "     + any_tool_call + " \"</tool_call>\"",
+                // The rest is just to accommodate common "good bad" outputs.
+                "\"<function_call>\" space " + any_tool_call + " \"</function_call>\"",
+                "\"<response>\"  space "     + any_tool_call + " \"</response>\"",
+                "\"<tools>\"     space "     + any_tool_call + " \"</tools>\"",
+                "\"<json>\"      space "     + any_tool_call + " \"</json>\"",
+                "\"<xml>\"      space "      + any_tool_call + " \"</xml>\"",
+                "\"<JSON>\"     space "      + any_tool_call + " \"</JSON>\"",
+ };
+ auto wrappable_tool_call = builder.add_rule("wrappable_tool_call", "( " + string_join(alt_tags, " | ") + " ) space");
+ tool_call_alts.push_back(wrappable_tool_call);
+ tool_call_alts.push_back(
+ "( \"```\\n\" | \"```json\\n\" | \"```xml\\n\" ) space " + wrappable_tool_call + " space \"```\" space ");
+ auto tool_call = builder.add_rule("tool_call", string_join(tool_call_alts, " | "));
+ builder.add_rule("root",
+ std::string(data.thinking_forced_open ? "( \"\" space )? " : "") +
+ (inputs.parallel_tool_calls ? "(" + tool_call + ")+" : tool_call));
+ // Trigger on some common known "good bad" outputs (only from the start and with a json that's about a specific argument name to avoid false positives)
+ data.grammar_triggers.push_back({
+ COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN,
+                // If thinking_forced_open, then we capture the </think> tag in the grammar,
+                // (important for required tool choice) and in the trigger's first capture (decides what is sent to the grammar)
+                std::string(data.thinking_forced_open ? "(</think>\\s*)" : "") + (
+                    "\\s*("
+                    "(?:<tool_call>"
+                    "|<function_call>|<tools>|<response>|<json>)?"
+ "\\s*\\{\\s*\"name\"\\s*:\\s*\"(?:" + string_join(escaped_names, "|") + ")\""
+ ")"
+ ")"
+ ),
+ });
+ data.preserved_tokens = {
+                "<think>",
+                "</think>",
+                "<tool_call>",
+                "</tool_call>",
+                "<function_call>",
+                "</function_call>",
+                "<response>",
+                "</response>",
+                "<tools>",
+                "</tools>",
+                "<json>",
+                "</json>",
+                "<xml>",
+                "</xml>",
+ "```",
+ "```json",
+ "```xml",
+ };
+ });
+ }
+
+ return data;
+}
+
+static common_chat_params common_chat_params_init_granite(const common_chat_template & tmpl, const struct templates_params & inputs) {
+ common_chat_params data;
+
+ // Pass thinking context for Granite template
+ json additional_context = {
+ {"thinking", inputs.enable_thinking},
+ };
+
+ data.prompt = apply(tmpl, inputs, /* messages_override= */ std::nullopt, /* tools_override= */ std::nullopt, additional_context);
+ data.format = COMMON_CHAT_FORMAT_GRANITE;
+
+    if (string_ends_with(data.prompt, "<think>\n") || string_ends_with(data.prompt, "<think>")) {
+        if (!inputs.enable_thinking) {
+            data.prompt += "</think>";
+ } else {
+ data.thinking_forced_open = true;
+ }
+ }
+
+ if (!inputs.tools.is_null()) {
+ // Granite uses <|tool_call|> followed by JSON list
+ data.grammar_lazy = inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED;
+ data.grammar = build_grammar([&](const common_grammar_builder & builder) {
+            std::vector<std::string> tool_rules;
+ foreach_function(inputs.tools, [&](const json & tool) {
+ const auto & function = tool.at("function");
+ std::string name = function.at("name");
+ auto parameters = function.at("parameters");
+ builder.resolve_refs(parameters);
+ tool_rules.push_back(builder.add_rule(name + "-call", builder.add_schema(name +
+"-args", {
+ {"type", "object"},
+ {"properties", {
+ {"name", {{"const", name}}},
+ {"arguments", parameters},
+ }},
+ {"required", json::array({"name", "arguments"})},
+ })));
+ });
+
+ auto tool_call = builder.add_rule("tool_call", string_join(tool_rules, " | "));
+ auto tool_list = builder.add_rule("tool_list", "\"[\" space " + tool_call + " (\",\" space " + tool_call + ")* space \"]\"");
+
+ if (data.thinking_forced_open) {
+                builder.add_rule("root", "\"</think>\" space \"<response>\" space [^<]* \"</response>\" space \"<|tool_call|>\" space " + tool_list);
+ } else {
+ builder.add_rule("root", "\"<|tool_call|>\" space " + tool_list);
+ }
+
+ data.grammar_triggers.push_back({
+ COMMON_GRAMMAR_TRIGGER_TYPE_WORD,
+ "<|tool_call|>"
+ });
+
+ data.preserved_tokens = {
+                "<think>",
+                "</think>",
+                "<response>",
+                "</response>",
+ "<|tool_call|>",
+ };
+ });
+ } else {
+ // Handle thinking tags for non-tool responses
+ if (data.thinking_forced_open && inputs.enable_thinking) {
+ data.grammar_lazy = false;
+ data.grammar = build_grammar([&](const common_grammar_builder & builder) {
+                builder.add_rule("root", "\"</think>\" space \"<response>\" space .* \"</response>\" space");
+ });
+ data.preserved_tokens = {
+                "<think>",
+                "</think>",
+                "<response>",
+                "</response>",
+ };
+ }
+ }
+
+ return data;
+}
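+// Illustrative only (hypothetical "get_weather" tool): Granite tool calls are a JSON list behind <|tool_call|>, e.g.
+//   <|tool_call|>[{"name": "get_weather", "arguments": {"city": "Paris"}}]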
+
+static common_chat_params common_chat_params_init_solar_open(const common_chat_template & tmpl, const struct templates_params & inputs) {
+ common_chat_params data;
+
+ // TODO: Reasoning effort
+ json additional_context = {};
+
+ data.prompt = apply(tmpl, inputs, std::nullopt, std::nullopt, additional_context);
+ data.format = COMMON_CHAT_FORMAT_SOLAR_OPEN;
+
+ data.preserved_tokens = {
+ "<|think|>",
+ "<|content|>",
+ "<|begin|>",
+ "<|end|>",
+ };
+
+ // TODO: Tool calling
+
+ return data;
+}
+
+static common_chat_params common_chat_params_init_without_tools(const common_chat_template & tmpl, const struct templates_params & inputs) {
+ common_chat_params data;
+ data.prompt = apply(tmpl, inputs);
+ data.format = COMMON_CHAT_FORMAT_CONTENT_ONLY;
+ data.grammar_lazy = false;
+ if (!inputs.json_schema.is_null()) {
+ if (!inputs.grammar.empty()) {
+ throw std::runtime_error("Either \"json_schema\" or \"grammar\" can be specified, but not both");
+ }
+ data.grammar = json_schema_to_grammar(inputs.json_schema);
+ } else {
+ data.grammar = inputs.grammar;
+ }
+ return data;
+}
+
+static common_chat_params common_chat_params_init_seed_oss(
+ const common_chat_template & tmpl,
+ templates_params & params,
+ const common_chat_templates_inputs & inputs)
+{
+ common_chat_params data;
+ data.prompt = apply(tmpl, params);
+ data.format = COMMON_CHAT_FORMAT_SEED_OSS;
+    if (string_ends_with(data.prompt, "<seed:think>")) {
+        if (!inputs.enable_thinking) {
+            data.prompt += "</seed:think>";
+ } else {
+ data.thinking_forced_open = true;
+ }
+ }
+
+ if (params.tools.is_array() && !params.tools.empty()) {
+ data.grammar_lazy = inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED;
+ data.grammar = build_grammar([&](const common_grammar_builder & builder) {
+            std::vector<std::string> tool_rules;
+ foreach_function(params.tools, [&](const json & tool) {
+ const auto & function = tool.at("function");
+ std::string name = function.at("name");
+ auto parameters = function.at("parameters");
+ builder.resolve_refs(parameters);
+
+ // Create rule for Seed-OSS function call format
+ std::string param_rules;
+ if (parameters.contains("properties")) {
+ for (const auto & [key, value] : parameters.at("properties").items()) {
+                        param_rules += "\"<parameter=" + key + ">\"" + builder.add_schema(name + "-arg-" + key, value) +
+                            "\"</parameter>\"";
+ }
+ }
+
+                tool_rules.push_back(builder.add_rule(name + "-call",
+                    "\"<seed:tool_call>\" space \"<function=" + name + ">\" space " +
+                    param_rules +
+                    " \"</function>\" space \"</seed:tool_call>\""));
+ });
+
+            data.grammar_triggers.push_back({ COMMON_GRAMMAR_TRIGGER_TYPE_WORD, "<seed:tool_call>" });
+
+ data.preserved_tokens = {
+                "<seed:think>", "</seed:think>", "<seed:cot_budget_reflect>", "</seed:cot_budget_reflect>",
+                "<seed:tool_call>", "</seed:tool_call>",
+ };
+
+ builder.add_rule("root", string_join(tool_rules, " | "));
+ });
+ }
+ return data;
+}
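+// Illustrative only (hypothetical "get_weather" tool): the Seed-OSS rules above accept calls of the form
+//   <seed:tool_call><function=get_weather><parameter=city>"Paris"</parameter></function></seed:tool_call>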
+
+static common_chat_params common_chat_templates_apply_jinja(
+ const struct common_chat_templates * tmpls,
+ const struct common_chat_templates_inputs & inputs)
+{
+ templates_params params;
+ params.tools = common_chat_tools_to_json_oaicompat(inputs.tools);
+ const auto & tmpl = params.tools.is_array() && tmpls->template_tool_use
+ ? *tmpls->template_tool_use
+ : *tmpls->template_default;
+ const auto & src = tmpl.source();
+ const auto & caps = tmpl.original_caps();
+ params.messages = common_chat_msgs_to_json_oaicompat(inputs.messages, /* concat_text= */ !tmpl.original_caps().requires_typed_content);
+ params.add_generation_prompt = inputs.add_generation_prompt;
+ params.tool_choice = inputs.tool_choice;
+ params.reasoning_format = inputs.reasoning_format;
+ params.enable_thinking = inputs.enable_thinking;
+ params.grammar = inputs.grammar;
+ params.now = inputs.now;
+ params.add_bos = tmpls->add_bos;
+ params.add_eos = tmpls->add_eos;
+
+ params.extra_context = json::object();
+ for (auto el : inputs.chat_template_kwargs) {
+ params.extra_context[el.first] = json::parse(el.second);
+ }
+
+ if (!inputs.json_schema.empty()) {
+ params.json_schema = json::parse(inputs.json_schema);
+ }
+
+ if (inputs.parallel_tool_calls && !tmpl.original_caps().supports_parallel_tool_calls) {
+ LOG_DBG("Disabling parallel_tool_calls because the template does not support it\n");
+ params.parallel_tool_calls = false;
+ } else {
+ params.parallel_tool_calls = inputs.parallel_tool_calls;
+ }
+
+ if (params.tools.is_array()) {
+ if (params.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE && !params.grammar.empty()) {
+ throw std::runtime_error("Cannot specify grammar with tools");
+ }
+ if (caps.supports_tool_calls && !caps.supports_tools) {
+ LOG_WRN("Template supports tool calls but does not natively describe tools. The fallback behaviour used may produce bad results, inspect prompt w/ --verbose & consider overriding the template.\n");
+ }
+ }
+
+ // DeepSeek V3.1: detect based on specific patterns in the template
+ if (src.find("message['prefix'] is defined and message['prefix'] and thinking") != std::string::npos &&
+ params.json_schema.is_null()) {
+ return common_chat_params_init_deepseek_v3_1(tmpl, params);
+ }
+
+ // DeepSeek R1: use handler in all cases except json schema (thinking / tools).
+ if (src.find("<|tool▁calls▁begin|>") != std::string::npos && params.json_schema.is_null()) {
+ return common_chat_params_init_deepseek_r1(tmpl, params);
+ }
+
+    // Command R7B: use handler in all cases except json schema (thinking / tools).
+ if (src.find("<|END_THINKING|><|START_ACTION|>") != std::string::npos && params.json_schema.is_null()) {
+ return common_chat_params_init_command_r7b(tmpl, params);
+ }
+
+ // Granite (IBM) - detects thinking / tools support
+ if (src.find("elif thinking") != std::string::npos && src.find("<|tool_call|>") != std::string::npos) {
+ return common_chat_params_init_granite(tmpl, params);
+ }
+
+    // GLM 4.5: detect by <arg_key> and <arg_value> tags (check before Hermes since both use <tool_call>)
+    if (src.find("[gMASK]") != std::string::npos &&
+        src.find("<arg_key>") != std::string::npos &&
+        src.find("<arg_value>") != std::string::npos &&
+ params.json_schema.is_null()) {
+ return common_chat_params_init_glm_4_5(tmpl, params);
+ }
+
+ // Qwen3-Coder XML format detection (must come before Hermes 2 Pro)
+ // Detect via explicit XML markers unique to Qwen3-Coder to avoid false positives in other templates.
+    // Require presence of <tool_call>, <function>, and <parameter=> blocks.
+    if (src.find("<tool_call>") != std::string::npos &&
+        src.find("<function") != std::string::npos &&
+        src.find("<parameter=") != std::string::npos) {
+        if (src.find("<think>") != std::string::npos) {
+            return common_chat_params_init_nemotron_v3(tmpl, params);
+        }
+ return common_chat_params_init_qwen3_coder_xml(tmpl, params);
+ }
+
+ // Xiaomi MiMo format detection (must come before Hermes 2 Pro)
+ if (src.find("") != std::string::npos &&
+ src.find("# Tools") != std::string::npos &&
+ src.find("") != std::string::npos &&
+ src.find("") != std::string::npos &&
+ src.find("") != std::string::npos &&
+ src.find("") != std::string::npos) {
+ return common_chat_params_init_xiaomi_mimo(tmpl, params);
+ }
+
+ // Hermes 2/3 Pro, Qwen 2.5 Instruct (w/ tools)
+    if (src.find("<tool_call>") != std::string::npos && params.json_schema.is_null()) {
+ return common_chat_params_init_hermes_2_pro(tmpl, params);
+ }
+
+ // GPT-OSS
+ if (src.find("<|channel|>") != std::string::npos) {
+ return common_chat_params_init_gpt_oss(tmpl, params);
+ }
+
+ // Seed-OSS
+    if (src.find("<seed:think>") != std::string::npos) {
+ return common_chat_params_init_seed_oss(tmpl, params, inputs);
+ }
+
+ // Nemotron v2
+    if (src.find("<SPECIAL_10>") != std::string::npos) {
+ return common_chat_params_init_nemotron_v2(tmpl, params);
+ }
+
+ // Apertus format detection
+ if (src.find("<|system_start|>") != std::string::npos && src.find("<|tools_prefix|>") != std::string::npos) {
+ return common_chat_params_init_apertus(tmpl, params);
+ }
+
+ // LFM2 (w/ tools)
+ if (src.find("List of tools: <|tool_list_start|>[") != std::string::npos &&
+ src.find("]<|tool_list_end|>") != std::string::npos) {
+ return common_chat_params_init_lfm2(tmpl, params);
+ }
+
+ // MiniMax-M2 format detection
+ if (src.find("]~!b[") != std::string::npos && src.find("]~b]") != std::string::npos) {
+ return common_chat_params_init_minimax_m2(tmpl, params);
+ }
+
+ // Kimi K2 format detection
+ if (src.find("<|im_system|>tool_declare<|im_middle|>") != std::string::npos &&
+ src.find("<|tool_calls_section_begin|>") != std::string::npos &&
+ src.find("## Return of") != std::string::npos) {
+ return common_chat_params_init_kimi_k2(tmpl, params);
+ }
+
+ // Apriel 1.5 format detection
+ if (src.find("") != std::string::npos &&
+ src.find("") != std::string::npos &&
+ src.find("") != std::string::npos &&
+ src.find("<|assistant|>") != std::string::npos &&
+ src.find("<|tool_result|>") != std::string::npos &&
+ src.find("[") != std::string::npos &&
+ src.find("]") != std::string::npos) {
+ return common_chat_params_init_apriel_1_5(tmpl, params);
+ }
+
+ // Use generic handler when mixing tools + JSON schema.
+ // TODO: support that mix in handlers below.
+ if ((params.tools.is_array() && params.json_schema.is_object())) {
+ return common_chat_params_init_generic(tmpl, params);
+ }
+
+ // Functionary prepends "all\n" to plain content outputs, so we use its handler in all cases.
+ if (src.find(">>>all") != std::string::npos) {
+ return common_chat_params_init_functionary_v3_2(tmpl, params);
+ }
+
+ // Firefunction v2 requires datetime and functions in the context even w/o tools, so we also use its handler in all cases.
+ if (src.find(" functools[") != std::string::npos) {
+ return common_chat_params_init_firefunction_v2(tmpl, params);
+ }
+
+ // Functionary v3.1 (w/ tools)
+ if (src.find("<|start_header_id|>") != std::string::npos
+ && src.find("ipython<|end_header_id|>") != std::string::npos) {
+ auto allow_python_tag_builtin_tools = src.find("<|python_tag|>") != std::string::npos;
+ return common_chat_params_init_llama_3_x(tmpl, params, allow_python_tag_builtin_tools);
+ }
+
+ // Ministral/Mistral Large 3
+ if (src.find("[SYSTEM_PROMPT]") != std::string::npos &&
+ src.find("[TOOL_CALLS]") != std::string::npos &&
+ src.find("[ARGS]") != std::string::npos) {
+ return common_chat_params_init_ministral_3(tmpl, params);
+ }
+
+ if (src.find("[THINK]") != std::string::npos && src.find("[/THINK]") != std::string::npos) {
+ return common_chat_params_init_magistral(tmpl, params);
+ }
+
+ // Solar Open
+ if (src.find("<|tool_response:begin|>") != std::string::npos &&
+ src.find("<|tool_response:name|>") != std::string::npos &&
+ src.find("<|tool_response:result|>") != std::string::npos) {
+ return common_chat_params_init_solar_open(tmpl, params);
+ }
+
+ // Plain handler (no tools)
+ if (params.tools.is_null() || inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_NONE) {
+ return common_chat_params_init_without_tools(tmpl, params);
+ }
+
+ // Mistral Nemo (w/ tools)
+ if (src.find("[TOOL_CALLS]") != std::string::npos) {
+ return common_chat_params_init_mistral_nemo(tmpl, params);
+ }
+
+ // Generic fallback
+ return common_chat_params_init_generic(tmpl, params);
+}
+
+// Legacy template route (adhoc C++ implementation of known templates), forward to llama_chat_apply_template.
+static common_chat_params common_chat_templates_apply_legacy(
+ const struct common_chat_templates * tmpls,
+ const struct common_chat_templates_inputs & inputs)
+{
+ size_t alloc_size = 0;
+    std::vector<llama_chat_message> chat;
+    std::vector<std::string> contents;
+
+ for (const auto & msg : inputs.messages) {
+ auto content = msg.content;
+ for (const auto & part : msg.content_parts) {
+ if (part.type != "text") {
+ LOG_WRN("Ignoring non-text content part: %s\n", part.type.c_str());
+ continue;
+ }
+ if (!content.empty()) {
+                content += "\n";
+ }
+ content += part.text;
+ }
+ contents.emplace_back(std::move(content));
+ }
+ for (size_t i = 0; i < contents.size(); ++i) {
+ const auto & msg = inputs.messages[i];
+ const auto & content = contents[i];
+ chat.push_back({msg.role.c_str(), content.c_str()});
+ size_t msg_size = msg.role.size() + content.size();
+ alloc_size += msg_size + (msg_size / 4); // == msg_size * 1.25 but avoiding float ops
+ }
+
+    std::vector<char> buf(alloc_size);
+
+ // run the first time to get the total output length
+ const auto & src = tmpls->template_default->source();
+ int32_t res = llama_chat_apply_template(src.c_str(), chat.data(), chat.size(), inputs.add_generation_prompt, buf.data(), buf.size());
+
+ // error: chat template is not supported
+ if (res < 0) {
+ // if the custom "tmpl" is not supported, we throw an error
+ // this is a bit redundant (for good), since we're not sure if user validated the custom template with llama_chat_verify_template()
+ throw std::runtime_error("this custom template is not supported, try using --jinja");
+ }
+
+ // if it turns out that our buffer is too small, we resize it
+ if ((size_t) res > buf.size()) {
+ buf.resize(res);
+ res = llama_chat_apply_template(src.c_str(), chat.data(), chat.size(), inputs.add_generation_prompt, buf.data(), buf.size());
+ }
+
+ // for safety, we check the result again
+ if (res < 0 || (size_t) res > buf.size()) {
+ throw std::runtime_error("failed to apply chat template, try using --jinja");
+ }
+
+ common_chat_params params;
+ params.prompt = std::string(buf.data(), res);
+ if (!inputs.json_schema.empty()) {
+ params.grammar = json_schema_to_grammar(json::parse(inputs.json_schema));
+ } else {
+ params.grammar = inputs.grammar;
+ }
+ return params;
+}
+
+common_chat_params common_chat_templates_apply(
+ const struct common_chat_templates * tmpls,
+ const struct common_chat_templates_inputs & inputs)
+{
+ GGML_ASSERT(tmpls != nullptr);
+ return inputs.use_jinja
+ ? common_chat_templates_apply_jinja(tmpls, inputs)
+ : common_chat_templates_apply_legacy(tmpls, inputs);
+}
diff --git a/patches/llama-cpp-sys-2/llama.cpp/common/chat.h b/patches/llama-cpp-sys-2/llama.cpp/common/chat.h
new file mode 100644
index 0000000..8bd4a32
--- /dev/null
+++ b/patches/llama-cpp-sys-2/llama.cpp/common/chat.h
@@ -0,0 +1,234 @@
+// Chat support (incl. tool call grammar constraining & output parsing) w/ generic & custom template handlers.
+
+#pragma once
+
+#include "common.h"
+#include "peg-parser.h"
+#include <chrono>
+#include <map>
+#include <memory>
+#include <string>
+#include <vector>