generated from nhcarrigan/template
feat: we successfully have the installer working for windows!
Models are downloaded at runtime instead of build.
This commit is contained in:
@@ -0,0 +1,121 @@
|
||||
#pragma once
|
||||
|
||||
#include "ggml.h"
|
||||
#include "ggml-cpp.h"
|
||||
#include "clip.h"
|
||||
#include "clip-impl.h"
|
||||
#include "clip-model.h"
|
||||
|
||||
#include <vector>
|
||||
#include <functional>
|
||||
|
||||
#define DEFAULT_INTERPOLATION_MODE (GGML_SCALE_MODE_BILINEAR | GGML_SCALE_FLAG_ANTIALIAS)
|
||||
|
||||
struct clip_graph {
|
||||
const clip_model & model;
|
||||
const clip_hparams & hparams;
|
||||
projector_type proj_type;
|
||||
|
||||
// we only support single image per batch
|
||||
const clip_image_f32 & img;
|
||||
|
||||
const int patch_size;
|
||||
const int n_patches_x;
|
||||
const int n_patches_y;
|
||||
const int n_patches;
|
||||
const int n_embd;
|
||||
const int n_head;
|
||||
const int d_head;
|
||||
const int n_layer;
|
||||
const int n_mmproj_embd;
|
||||
const float eps;
|
||||
const float kq_scale;
|
||||
const clip_flash_attn_type flash_attn_type;
|
||||
|
||||
// for debugging
|
||||
const bool debug_graph;
|
||||
std::vector<ggml_tensor *> & debug_print_tensors;
|
||||
|
||||
ggml_context_ptr ctx0_ptr;
|
||||
ggml_context * ctx0;
|
||||
ggml_cgraph * gf;
|
||||
|
||||
clip_graph(clip_ctx * ctx, const clip_image_f32 & img);
|
||||
|
||||
virtual ~clip_graph() = default;
|
||||
virtual ggml_cgraph * build() = 0;
|
||||
|
||||
//
|
||||
// utility functions
|
||||
//
|
||||
void cb(ggml_tensor * cur0, const char * name, int il) const;
|
||||
|
||||
// siglip2 naflex
|
||||
ggml_tensor * resize_position_embeddings(uint32_t interpolation_mode = DEFAULT_INTERPOLATION_MODE);
|
||||
|
||||
// build vision transformer (ViT) cgraph
|
||||
// this function should cover most of the models
|
||||
// if your model has specific features, you should probably duplicate this function
|
||||
ggml_tensor * build_vit(
|
||||
ggml_tensor * inp,
|
||||
int64_t n_pos,
|
||||
norm_type norm_t,
|
||||
ffn_op_type ffn_t,
|
||||
ggml_tensor * learned_pos_embd,
|
||||
std::function<ggml_tensor *(ggml_tensor *, const clip_layer &)> add_pos);
|
||||
|
||||
// build the input after conv2d (inp_raw --> patches)
|
||||
// returns tensor with shape [n_embd, n_patches]
|
||||
ggml_tensor * build_inp();
|
||||
|
||||
ggml_tensor * build_inp_raw(int channels = 3);
|
||||
|
||||
ggml_tensor * build_norm(
|
||||
ggml_tensor * cur,
|
||||
ggml_tensor * mw,
|
||||
ggml_tensor * mb,
|
||||
norm_type type,
|
||||
float norm_eps,
|
||||
int il) const;
|
||||
|
||||
ggml_tensor * build_ffn(
|
||||
ggml_tensor * cur,
|
||||
ggml_tensor * up,
|
||||
ggml_tensor * up_b,
|
||||
ggml_tensor * gate,
|
||||
ggml_tensor * gate_b,
|
||||
ggml_tensor * down,
|
||||
ggml_tensor * down_b,
|
||||
ffn_op_type type_op,
|
||||
int il) const;
|
||||
|
||||
ggml_tensor * build_attn(
|
||||
ggml_tensor * wo,
|
||||
ggml_tensor * wo_b,
|
||||
ggml_tensor * q_cur,
|
||||
ggml_tensor * k_cur,
|
||||
ggml_tensor * v_cur,
|
||||
ggml_tensor * kq_mask,
|
||||
float kq_scale,
|
||||
int il) const;
|
||||
|
||||
// implementation of the 2D RoPE without adding a new op in ggml
|
||||
// this is not efficient (use double the memory), but works on all backends
|
||||
// TODO: there was a more efficient which relies on ggml_view and ggml_rope_ext_inplace, but the rope inplace does not work well with non-contiguous tensors ; we should fix that and revert back to the original implementation in https://github.com/ggml-org/llama.cpp/pull/13065
|
||||
ggml_tensor * build_rope_2d(
|
||||
ggml_context * ctx0,
|
||||
ggml_tensor * cur,
|
||||
ggml_tensor * pos_a, // first half
|
||||
ggml_tensor * pos_b, // second half
|
||||
const float freq_base,
|
||||
const bool interleave_freq
|
||||
);
|
||||
|
||||
// aka pixel_shuffle / pixel_unshuffle / patch_merger (Kimi-VL)
|
||||
// support dynamic resolution
|
||||
ggml_tensor * build_patch_merge_permute(ggml_tensor * cur, int scale_factor);
|
||||
|
||||
// Generic function to stack frames for audio processing
|
||||
// Abstracts out the StackAudioFrames logic used by ultravox
|
||||
ggml_tensor * build_stack(ggml_tensor * cur, int32_t stack_factor, int32_t n_embed);
|
||||
};
|
||||
@@ -0,0 +1,578 @@
|
||||
#pragma once
|
||||
|
||||
#include "ggml.h"
|
||||
#include "gguf.h"
|
||||
#include "clip.h"
|
||||
|
||||
#include <climits>
|
||||
#include <cstdarg>
|
||||
#include <cinttypes>
|
||||
#include <string>
|
||||
#include <map>
|
||||
#include <sstream>
|
||||
#include <vector>
|
||||
#include <memory>
|
||||
|
||||
// Internal header for clip.cpp
|
||||
|
||||
#define MTMD_INTERNAL_HEADER
|
||||
|
||||
#define KEY_FTYPE "general.file_type"
|
||||
#define KEY_NAME "general.name"
|
||||
#define KEY_DESCRIPTION "general.description"
|
||||
#define KEY_PROJ_TYPE "clip.projector_type"
|
||||
#define KEY_HAS_AUDIO_ENC "clip.has_audio_encoder"
|
||||
#define KEY_HAS_VISION_ENC "clip.has_vision_encoder"
|
||||
#define KEY_USE_GELU "clip.use_gelu"
|
||||
#define KEY_USE_SILU "clip.use_silu"
|
||||
|
||||
#define KEY_N_EMBD "clip.%s.embedding_length"
|
||||
#define KEY_N_FF "clip.%s.feed_forward_length"
|
||||
#define KEY_N_BLOCK "clip.%s.block_count"
|
||||
#define KEY_PROJ_DIM "clip.%s.projection_dim"
|
||||
#define KEY_N_HEAD "clip.%s.attention.head_count"
|
||||
#define KEY_LAYER_NORM_EPS "clip.%s.attention.layer_norm_epsilon"
|
||||
|
||||
// vision-specific
|
||||
#define KEY_VISION_PROJ_TYPE "clip.vision.projector_type" // for models with mixed modalities
|
||||
#define KEY_IMAGE_SIZE "clip.vision.image_size"
|
||||
#define KEY_PREPROC_IMAGE_SIZE "clip.vision.preproc_image_size"
|
||||
#define KEY_PATCH_SIZE "clip.vision.patch_size"
|
||||
#define KEY_IMAGE_MEAN "clip.vision.image_mean"
|
||||
#define KEY_IMAGE_STD "clip.vision.image_std"
|
||||
#define KEY_FEATURE_LAYER "clip.vision.feature_layer"
|
||||
#define KEY_PROJ_SCALE_FACTOR "clip.vision.projector.scale_factor"
|
||||
#define KEY_SPATIAL_MERGE_SIZE "clip.vision.spatial_merge_size"
|
||||
#define KEY_IS_DEEPSTACK_LAYERS "clip.vision.is_deepstack_layers"
|
||||
|
||||
#define KEY_MM_PATCH_MERGE_TYPE "clip.vision.mm_patch_merge_type"
|
||||
#define KEY_IMAGE_GRID_PINPOINTS "clip.vision.image_grid_pinpoints"
|
||||
#define KEY_IMAGE_CROP_RESOLUTION "clip.vision.image_crop_resolution"
|
||||
#define KEY_WIN_ATTN_PATTERN "clip.vision.n_wa_pattern"
|
||||
#define KEY_WIN_ATTN_LAYER_INDEXES "clip.vision.wa_layer_indexes"
|
||||
#define KEY_ATTN_WINDOW_SIZE "clip.vision.window_size"
|
||||
#define KEY_MINICPMV_VERSION "clip.minicpmv_version"
|
||||
#define KEY_MINICPMV_QUERY_NUM "clip.minicpmv_query_num"
|
||||
|
||||
// audio-specific
|
||||
#define KEY_AUDIO_PROJ_TYPE "clip.audio.projector_type" // for models with mixed modalities
|
||||
#define KEY_A_NUM_MEL_BINS "clip.audio.num_mel_bins"
|
||||
#define KEY_A_PROJ_STACK_FACTOR "clip.audio.projector.stack_factor"
|
||||
|
||||
|
||||
//
|
||||
// tensor name constants
|
||||
//
|
||||
|
||||
#define TN_POS_EMBD "%s.position_embd.weight"
|
||||
#define TN_CLASS_EMBD "v.class_embd"
|
||||
#define TN_PATCH_EMBD "v.patch_embd.weight" // not rename tensor with ".0" postfix for backwrad compat
|
||||
#define TN_PATCH_EMBD_1 "v.patch_embd.weight.1"
|
||||
#define TN_PATCH_BIAS "v.patch_embd.bias"
|
||||
#define TN_NORM_EMBD "v.norm_embd.%s"
|
||||
#define TN_ATTN_QKV "%s.blk.%d.attn_qkv.%s"
|
||||
#define TN_ATTN_K "%s.blk.%d.attn_k.%s"
|
||||
#define TN_ATTN_Q "%s.blk.%d.attn_q.%s"
|
||||
#define TN_ATTN_V "%s.blk.%d.attn_v.%s"
|
||||
#define TN_ATTN_OUTPUT "%s.blk.%d.attn_out.%s"
|
||||
#define TN_ATTN_K_NORM "%s.blk.%d.attn_k_norm.%s"
|
||||
#define TN_ATTN_Q_NORM "%s.blk.%d.attn_q_norm.%s"
|
||||
#define TN_FFN_DOWN "%s.blk.%d.ffn_down.%s"
|
||||
#define TN_FFN_GATE "%s.blk.%d.ffn_gate.%s"
|
||||
#define TN_FFN_UP "%s.blk.%d.ffn_up.%s"
|
||||
#define TN_FFN_GATE "%s.blk.%d.ffn_gate.%s"
|
||||
#define TN_LN_1 "%s.blk.%d.ln1.%s" // layer norm
|
||||
#define TN_LN_2 "%s.blk.%d.ln2.%s" // layer norm
|
||||
#define TN_LS_1 "%s.blk.%d.ls1.%s" // layer scale
|
||||
#define TN_LS_2 "%s.blk.%d.ls2.%s" // layer scale
|
||||
#define TN_LN_PRE "%s.pre_ln.%s"
|
||||
#define TN_LN_POST "%s.post_ln.%s"
|
||||
#define TN_LLAVA_PROJ "mm.%d.%s"
|
||||
#define TN_MM_UP "mm.up.%s"
|
||||
#define TN_MM_GATE "mm.gate.%s"
|
||||
#define TN_MM_DOWN "mm.down.%s"
|
||||
#define TN_MM_POST_NORM "mm.post_norm.%s"
|
||||
#define TN_MVLM_PROJ_MLP "mm.model.mlp.%d.%s"
|
||||
#define TN_MVLM_PROJ_BLOCK "mm.model.mb_block.%d.block.%d.%s"
|
||||
#define TN_MVLM_PROJ_PEG "mm.model.peg.%d.%s"
|
||||
#define TN_IMAGE_NEWLINE "model.image_newline"
|
||||
#define TN_MM_INP_NORM "mm.input_norm.weight"
|
||||
#define TN_MM_INP_NORM_B "mm.input_norm.bias"
|
||||
#define TN_MM_INP_PROJ "mm.input_projection.weight" // gemma3
|
||||
#define TN_MM_SOFT_EMB_N "mm.soft_emb_norm.weight" // gemma3
|
||||
#define TN_MM_PROJECTOR "mm.model.fc.weight" // idefics3
|
||||
#define TN_MM_PATCH_MERGER "mm.patch_merger.%s" // mistral small 3.1, glm4v
|
||||
#define TN_TOK_IMG_BREAK "v.token_embd.img_break" // pixtral
|
||||
#define TN_TOK_GLM_BOI "adapter.boi" // glm-edge (these embeddings are not in text model)
|
||||
#define TN_TOK_GLM_EOI "adapter.eoi" // glm-edge (these embeddings are not in text model)
|
||||
#define TN_DEEPSTACK_NORM "v.deepstack.%d.norm.%s" // qwen3vl deepstack
|
||||
#define TN_DEEPSTACK_FC1 "v.deepstack.%d.fc1.%s" // qwen3vl deepstack
|
||||
#define TN_DEEPSTACK_FC2 "v.deepstack.%d.fc2.%s" // qwen3vl deepstack
|
||||
|
||||
// mimicpmv
|
||||
#define TN_MINICPMV_POS_EMBD_K "resampler.pos_embed_k"
|
||||
#define TN_MINICPMV_QUERY "resampler.query"
|
||||
#define TN_MINICPMV_PROJ "resampler.proj.weight"
|
||||
#define TN_MINICPMV_KV_PROJ "resampler.kv.weight"
|
||||
#define TN_MINICPMV_ATTN "resampler.attn.%s.%s"
|
||||
#define TN_MINICPMV_LN "resampler.ln_%s.%s"
|
||||
|
||||
#define TN_GLM_ADAPER_CONV "adapter.conv.%s"
|
||||
#define TN_GLM_ADAPTER_LINEAR "adapter.linear.linear.%s"
|
||||
#define TN_GLM_ADAPTER_NORM_1 "adapter.linear.norm1.%s"
|
||||
#define TN_GLM_ADAPTER_D_H_2_4H "adapter.linear.dense_h_to_4h.%s"
|
||||
#define TN_GLM_ADAPTER_GATE "adapter.linear.gate.%s"
|
||||
#define TN_GLM_ADAPTER_D_4H_2_H "adapter.linear.dense_4h_to_h.%s"
|
||||
|
||||
// ultravox
|
||||
#define TN_CONV1D "a.conv1d.%d.%s"
|
||||
#define TN_MM_AUDIO_MLP "mm.a.mlp.%d.%s"
|
||||
#define TN_MM_AUDIO_FC "mm.a.fc.%s" // fully connected layer
|
||||
#define TN_MM_NORM_PRE "mm.a.norm_pre.%s"
|
||||
#define TN_MM_NORM_MID "mm.a.norm_mid.%s"
|
||||
|
||||
// cogvlm
|
||||
#define TN_MM_POST_FC_NORM "mm.post_fc_norm.%s"
|
||||
#define TN_MM_H_TO_4H "mm.up.%s"
|
||||
#define TN_MM_GATE "mm.gate.%s"
|
||||
#define TN_MM_4H_TO_H "mm.down.%s"
|
||||
#define TN_TOK_BOI "v.boi"
|
||||
#define TN_TOK_EOI "v.eoi"
|
||||
|
||||
// (conformer) lfm2
|
||||
#define TN_PRE_ENCODE_OUT "a.pre_encode.out.%s"
|
||||
#define TN_FFN_NORM "%s.blk.%d.ffn_norm.%s"
|
||||
#define TN_FFN_NORM_1 "%s.blk.%d.ffn_norm_1.%s"
|
||||
#define TN_FFN_UP_1 "%s.blk.%d.ffn_up_1.%s"
|
||||
#define TN_FFN_DOWN_1 "%s.blk.%d.ffn_down_1.%s"
|
||||
#define TN_POS_BIAS_U "%s.blk.%d.pos_bias_u"
|
||||
#define TN_POS_BIAS_V "%s.blk.%d.pos_bias_v"
|
||||
#define TN_NORM_CONV "%s.blk.%d.norm_conv.%s"
|
||||
#define TN_LINEAR_POS "%s.blk.%d.linear_pos.%s"
|
||||
#define TN_CONV_DW "%s.blk.%d.conv_dw.%s"
|
||||
#define TN_CONV_NORM "%s.blk.%d.conv_norm.%s"
|
||||
#define TN_CONV_PW1 "%s.blk.%d.conv_pw1.%s"
|
||||
#define TN_CONV_PW2 "%s.blk.%d.conv_pw2.%s"
|
||||
|
||||
// mobilenetv5 (gemma3n) definitions
|
||||
#define TN_MNV5_STEM_CONV "v.conv_stem.conv.weight"
|
||||
#define TN_MNV5_STEM_BIAS "v.conv_stem.conv.bias"
|
||||
#define TN_MNV5_STEM_BN "v.conv_stem.bn.weight"
|
||||
|
||||
// Stage 0 Block (Edge Residual)
|
||||
#define TN_MNV5_BLK_S0_EXP_W "v.blk.%d.%d.conv_exp.weight"
|
||||
#define TN_MNV5_BLK_S0_BN1_W "v.blk.%d.%d.bn1.weight"
|
||||
#define TN_MNV5_BLK_S0_PWL_W "v.blk.%d.%d.conv_pwl.weight"
|
||||
#define TN_MNV5_BLK_S0_BN2_W "v.blk.%d.%d.bn2.weight"
|
||||
|
||||
// Stage 1+ Block (Universal Inverted Residual)
|
||||
#define TN_MNV5_BLK_DW_START_W "v.blk.%d.%d.dw_start.conv.weight"
|
||||
#define TN_MNV5_BLK_DW_START_BN "v.blk.%d.%d.dw_start.bn.weight"
|
||||
#define TN_MNV5_BLK_DW_MID_W "v.blk.%d.%d.dw_mid.conv.weight"
|
||||
#define TN_MNV5_BLK_DW_MID_BN "v.blk.%d.%d.dw_mid.bn.weight"
|
||||
#define TN_MNV5_BLK_PW_EXP_W "v.blk.%d.%d.pw_exp.conv.weight"
|
||||
#define TN_MNV5_BLK_PW_EXP_BN "v.blk.%d.%d.pw_exp.bn.weight"
|
||||
#define TN_MNV5_BLK_PW_PROJ_W "v.blk.%d.%d.pw_proj.conv.weight"
|
||||
#define TN_MNV5_BLK_PW_PROJ_BN "v.blk.%d.%d.pw_proj.bn.weight"
|
||||
#define TN_MNV5_BLK_LAYER_SCALE "v.blk.%d.%d.layer_scale.gamma"
|
||||
|
||||
// Attention Components
|
||||
#define TN_MNV5_ATTN_Q_W "v.blk.%d.%d.attn.query.proj.weight"
|
||||
#define TN_MNV5_ATTN_K_W "v.blk.%d.%d.attn.key.proj.weight"
|
||||
#define TN_MNV5_ATTN_V_W "v.blk.%d.%d.attn.value.proj.weight"
|
||||
#define TN_MNV5_ATTN_O_W "v.blk.%d.%d.attn.output.proj.weight"
|
||||
#define TN_MNV5_ATTN_K_DW "v.blk.%d.%d.attn.key.down_conv.weight"
|
||||
#define TN_MNV5_ATTN_K_NORM "v.blk.%d.%d.attn.key.norm.weight"
|
||||
#define TN_MNV5_ATTN_V_DW "v.blk.%d.%d.attn.value.down_conv.weight"
|
||||
#define TN_MNV5_ATTN_V_NORM "v.blk.%d.%d.attn.value.norm.weight"
|
||||
#define TN_MNV5_ATTN_NORM "v.blk.%d.%d.norm.weight" // Block norm used in attn blocks
|
||||
|
||||
// MSFA
|
||||
#define TN_MNV5_MSFA_FFN_EXP_W "v.msfa.ffn.pw_exp.conv.weight"
|
||||
#define TN_MNV5_MSFA_FFN_EXP_BN "v.msfa.ffn.pw_exp.bn.weight"
|
||||
#define TN_MNV5_MSFA_FFN_PROJ_W "v.msfa.ffn.pw_proj.conv.weight"
|
||||
#define TN_MNV5_MSFA_FFN_PROJ_BN "v.msfa.ffn.pw_proj.bn.weight"
|
||||
#define TN_MNV5_MSFA_NORM "v.msfa.norm.weight"
|
||||
|
||||
|
||||
// align x to upper multiple of n
|
||||
#define CLIP_ALIGN(x, n) ((((x) + (n) - 1) / (n)) * (n))
|
||||
|
||||
// forward declaration
|
||||
// TODO: improve this later
|
||||
struct clip_ctx;
|
||||
|
||||
enum projector_type {
|
||||
PROJECTOR_TYPE_MLP,
|
||||
PROJECTOR_TYPE_MLP_NORM,
|
||||
PROJECTOR_TYPE_LDP,
|
||||
PROJECTOR_TYPE_LDPV2,
|
||||
PROJECTOR_TYPE_MINICPMV,
|
||||
PROJECTOR_TYPE_GLM_EDGE,
|
||||
PROJECTOR_TYPE_QWEN2VL,
|
||||
PROJECTOR_TYPE_QWEN3VL,
|
||||
PROJECTOR_TYPE_GEMMA3,
|
||||
PROJECTOR_TYPE_GEMMA3NV,
|
||||
PROJECTOR_TYPE_GEMMA3NA,
|
||||
PROJECTOR_TYPE_IDEFICS3,
|
||||
PROJECTOR_TYPE_PIXTRAL,
|
||||
PROJECTOR_TYPE_QWEN25VL,
|
||||
PROJECTOR_TYPE_ULTRAVOX,
|
||||
PROJECTOR_TYPE_INTERNVL,
|
||||
PROJECTOR_TYPE_LLAMA4,
|
||||
PROJECTOR_TYPE_QWEN2A,
|
||||
PROJECTOR_TYPE_GLMA,
|
||||
PROJECTOR_TYPE_QWEN25O, // will be replaced by QWEN2A or QWEN25VL depending on clip_ctx
|
||||
PROJECTOR_TYPE_VOXTRAL,
|
||||
PROJECTOR_TYPE_MUSIC_FLAMINGO,
|
||||
PROJECTOR_TYPE_LFM2,
|
||||
PROJECTOR_TYPE_KIMIVL,
|
||||
PROJECTOR_TYPE_LIGHTONOCR,
|
||||
PROJECTOR_TYPE_COGVLM,
|
||||
PROJECTOR_TYPE_JANUS_PRO,
|
||||
PROJECTOR_TYPE_LFM2A,
|
||||
PROJECTOR_TYPE_GLM4V,
|
||||
PROJECTOR_TYPE_YOUTUVL,
|
||||
PROJECTOR_TYPE_UNKNOWN,
|
||||
};
|
||||
|
||||
static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
|
||||
{ PROJECTOR_TYPE_MLP, "mlp" },
|
||||
{ PROJECTOR_TYPE_LDP, "ldp" },
|
||||
{ PROJECTOR_TYPE_LDPV2, "ldpv2"},
|
||||
{ PROJECTOR_TYPE_MINICPMV, "resampler"},
|
||||
{ PROJECTOR_TYPE_GLM_EDGE, "adapter"},
|
||||
{ PROJECTOR_TYPE_QWEN2VL, "qwen2vl_merger"},
|
||||
{ PROJECTOR_TYPE_QWEN25VL, "qwen2.5vl_merger"},
|
||||
{ PROJECTOR_TYPE_QWEN3VL, "qwen3vl_merger"},
|
||||
{ PROJECTOR_TYPE_GEMMA3, "gemma3"},
|
||||
{ PROJECTOR_TYPE_GEMMA3NV, "gemma3nv"},
|
||||
{ PROJECTOR_TYPE_GEMMA3NA, "gemma3na"},
|
||||
{ PROJECTOR_TYPE_IDEFICS3, "idefics3"},
|
||||
{ PROJECTOR_TYPE_PIXTRAL, "pixtral"},
|
||||
{ PROJECTOR_TYPE_ULTRAVOX, "ultravox"},
|
||||
{ PROJECTOR_TYPE_INTERNVL, "internvl"},
|
||||
{ PROJECTOR_TYPE_LLAMA4, "llama4"},
|
||||
{ PROJECTOR_TYPE_QWEN2A, "qwen2a"},
|
||||
{ PROJECTOR_TYPE_GLMA, "glma"},
|
||||
{ PROJECTOR_TYPE_QWEN25O, "qwen2.5o"},
|
||||
{ PROJECTOR_TYPE_VOXTRAL, "voxtral"},
|
||||
{ PROJECTOR_TYPE_MUSIC_FLAMINGO, "musicflamingo"},
|
||||
{ PROJECTOR_TYPE_LFM2, "lfm2"},
|
||||
{ PROJECTOR_TYPE_KIMIVL, "kimivl"},
|
||||
{ PROJECTOR_TYPE_LIGHTONOCR,"lightonocr"},
|
||||
{ PROJECTOR_TYPE_COGVLM, "cogvlm"},
|
||||
{ PROJECTOR_TYPE_JANUS_PRO, "janus_pro"},
|
||||
{ PROJECTOR_TYPE_LFM2A, "lfm2a"},
|
||||
{ PROJECTOR_TYPE_GLM4V, "glm4v"},
|
||||
{ PROJECTOR_TYPE_YOUTUVL, "youtuvl"},
|
||||
};
|
||||
|
||||
static projector_type clip_projector_type_from_string(const std::string & str) {
|
||||
for (const auto & pair : PROJECTOR_TYPE_NAMES) {
|
||||
if (pair.second == str) {
|
||||
return pair.first;
|
||||
}
|
||||
}
|
||||
return PROJECTOR_TYPE_UNKNOWN;
|
||||
}
|
||||
|
||||
// RGB uint8 image
|
||||
struct clip_image_u8 {
|
||||
int nx;
|
||||
int ny;
|
||||
|
||||
std::vector<uint8_t> buf;
|
||||
};
|
||||
|
||||
// For images, buf.size() == nx*ny*3
|
||||
// Memory layout: RGBRGBRGB...
|
||||
// For audio, only one channel is used, buf.size() == nx*ny
|
||||
// nx will be n_frames and ny will be n_mel
|
||||
struct clip_image_f32 {
|
||||
int nx;
|
||||
int ny;
|
||||
|
||||
std::vector<float> buf;
|
||||
};
|
||||
|
||||
//
|
||||
// logging
|
||||
//
|
||||
|
||||
static void clip_log_callback_default(enum ggml_log_level level, const char * text, void * user_data) {
|
||||
(void) level;
|
||||
(void) user_data;
|
||||
fputs(text, stderr);
|
||||
fflush(stderr);
|
||||
}
|
||||
|
||||
struct clip_logger_state {
|
||||
ggml_log_callback log_callback;
|
||||
void * log_callback_user_data;
|
||||
};
|
||||
|
||||
extern struct clip_logger_state g_logger_state;
|
||||
|
||||
static void clip_log_internal_v(enum ggml_log_level level, const char * format, va_list args) {
|
||||
if (format == NULL) {
|
||||
return;
|
||||
}
|
||||
va_list args_copy;
|
||||
va_copy(args_copy, args);
|
||||
char buffer[128];
|
||||
int len = vsnprintf(buffer, 128, format, args);
|
||||
if (len < 128) {
|
||||
g_logger_state.log_callback(level, buffer, g_logger_state.log_callback_user_data);
|
||||
} else {
|
||||
char * buffer2 = (char *) calloc(len + 1, sizeof(char));
|
||||
vsnprintf(buffer2, len + 1, format, args_copy);
|
||||
buffer2[len] = 0;
|
||||
g_logger_state.log_callback(level, buffer2, g_logger_state.log_callback_user_data);
|
||||
free(buffer2);
|
||||
}
|
||||
va_end(args_copy);
|
||||
}
|
||||
|
||||
static void clip_log_internal(enum ggml_log_level level, const char * format, ...) {
|
||||
va_list args;
|
||||
va_start(args, format);
|
||||
clip_log_internal_v(level, format, args);
|
||||
va_end(args);
|
||||
}
|
||||
|
||||
#define LOG_INF(...) clip_log_internal(GGML_LOG_LEVEL_INFO, __VA_ARGS__)
|
||||
#define LOG_WRN(...) clip_log_internal(GGML_LOG_LEVEL_WARN, __VA_ARGS__)
|
||||
#define LOG_ERR(...) clip_log_internal(GGML_LOG_LEVEL_ERROR, __VA_ARGS__)
|
||||
#define LOG_DBG(...) clip_log_internal(GGML_LOG_LEVEL_DEBUG, __VA_ARGS__)
|
||||
#define LOG_CNT(...) clip_log_internal(GGML_LOG_LEVEL_CONT, __VA_ARGS__)
|
||||
|
||||
//
|
||||
// cpp wrappers
|
||||
//
|
||||
|
||||
// wrapper for clip_image_size
|
||||
struct clip_image_size_deleter {
|
||||
void operator()(clip_image_size * val) { clip_image_size_free(val); }
|
||||
};
|
||||
typedef std::unique_ptr<clip_image_size, clip_image_size_deleter> clip_image_size_ptr;
|
||||
|
||||
// wrapper for clip_image_u8
|
||||
struct clip_image_u8_deleter {
|
||||
void operator()(clip_image_u8 * val) { clip_image_u8_free(val); }
|
||||
};
|
||||
typedef std::unique_ptr<clip_image_u8, clip_image_u8_deleter> clip_image_u8_ptr;
|
||||
|
||||
// wrapper for clip_image_f32
|
||||
struct clip_image_f32_deleter {
|
||||
void operator()(clip_image_f32 * val) { clip_image_f32_free(val); }
|
||||
};
|
||||
typedef std::unique_ptr<clip_image_f32, clip_image_f32_deleter> clip_image_f32_ptr;
|
||||
|
||||
struct clip_image_u8_batch {
|
||||
std::vector<clip_image_u8_ptr> entries;
|
||||
};
|
||||
|
||||
struct clip_image_f32_batch {
|
||||
std::vector<clip_image_f32_ptr> entries;
|
||||
bool is_audio = false;
|
||||
|
||||
// for llava-uhd style models, we need to know the grid size
|
||||
// note: entries.size() == grid_x * grid_y + 1 (one overview image)
|
||||
int grid_x = 0;
|
||||
int grid_y = 0;
|
||||
|
||||
clip_image_f32_batch clone() const {
|
||||
clip_image_f32_batch new_batch{
|
||||
/* entries */ {},
|
||||
/* is_audio */ is_audio,
|
||||
/* grid_x */ grid_x,
|
||||
/* grid_y */ grid_y,
|
||||
};
|
||||
new_batch.entries.reserve(entries.size());
|
||||
for (const auto & entry : entries) {
|
||||
new_batch.entries.emplace_back(new clip_image_f32(*entry));
|
||||
}
|
||||
return new_batch;
|
||||
}
|
||||
};
|
||||
|
||||
//
|
||||
// common utils
|
||||
//
|
||||
|
||||
static std::string string_format(const char * fmt, ...) {
|
||||
va_list ap;
|
||||
va_list ap2;
|
||||
va_start(ap, fmt);
|
||||
va_copy(ap2, ap);
|
||||
int size = vsnprintf(NULL, 0, fmt, ap);
|
||||
GGML_ASSERT(size >= 0 && size < INT_MAX); // NOLINT
|
||||
std::vector<char> buf(size + 1);
|
||||
int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2);
|
||||
GGML_ASSERT(size2 == size);
|
||||
va_end(ap2);
|
||||
va_end(ap);
|
||||
return std::string(buf.data(), buf.size());
|
||||
}
|
||||
|
||||
static void string_replace_all(std::string & s, const std::string & search, const std::string & replace) {
|
||||
if (search.empty()) {
|
||||
return;
|
||||
}
|
||||
std::string builder;
|
||||
builder.reserve(s.length());
|
||||
size_t pos = 0;
|
||||
size_t last_pos = 0;
|
||||
while ((pos = s.find(search, last_pos)) != std::string::npos) {
|
||||
builder.append(s, last_pos, pos - last_pos);
|
||||
builder.append(replace);
|
||||
last_pos = pos + search.length();
|
||||
}
|
||||
builder.append(s, last_pos, std::string::npos);
|
||||
s = std::move(builder);
|
||||
}
|
||||
|
||||
// split string by a `std::string delim` instead of `char delim`
|
||||
static std::vector<std::string> string_split_str(std::string s, const std::string & delimiter) {
|
||||
std::vector<std::string> tokens;
|
||||
size_t pos = 0;
|
||||
std::string token;
|
||||
while ((pos = s.find(delimiter)) != std::string::npos) {
|
||||
token = s.substr(0, pos);
|
||||
tokens.push_back(token);
|
||||
s.erase(0, pos + delimiter.length());
|
||||
}
|
||||
tokens.push_back(s);
|
||||
return tokens;
|
||||
}
|
||||
|
||||
//
|
||||
// gguf utils
|
||||
//
|
||||
|
||||
static std::string gguf_data_to_str(enum gguf_type type, const void * data, int i) {
|
||||
switch (type) {
|
||||
case GGUF_TYPE_UINT8: return std::to_string(((const uint8_t *)data)[i]);
|
||||
case GGUF_TYPE_INT8: return std::to_string(((const int8_t *)data)[i]);
|
||||
case GGUF_TYPE_UINT16: return std::to_string(((const uint16_t *)data)[i]);
|
||||
case GGUF_TYPE_INT16: return std::to_string(((const int16_t *)data)[i]);
|
||||
case GGUF_TYPE_UINT32: return std::to_string(((const uint32_t *)data)[i]);
|
||||
case GGUF_TYPE_INT32: return std::to_string(((const int32_t *)data)[i]);
|
||||
case GGUF_TYPE_UINT64: return std::to_string(((const uint64_t *)data)[i]);
|
||||
case GGUF_TYPE_INT64: return std::to_string(((const int64_t *)data)[i]);
|
||||
case GGUF_TYPE_FLOAT32: return std::to_string(((const float *)data)[i]);
|
||||
case GGUF_TYPE_FLOAT64: return std::to_string(((const double *)data)[i]);
|
||||
case GGUF_TYPE_BOOL: return ((const bool *)data)[i] ? "true" : "false";
|
||||
default: return string_format("unknown type %d", type);
|
||||
}
|
||||
}
|
||||
|
||||
static std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i) {
|
||||
const enum gguf_type type = gguf_get_kv_type(ctx_gguf, i);
|
||||
|
||||
switch (type) {
|
||||
case GGUF_TYPE_STRING:
|
||||
return gguf_get_val_str(ctx_gguf, i);
|
||||
case GGUF_TYPE_ARRAY:
|
||||
{
|
||||
const enum gguf_type arr_type = gguf_get_arr_type(ctx_gguf, i);
|
||||
int arr_n = gguf_get_arr_n(ctx_gguf, i);
|
||||
const void * data = arr_type == GGUF_TYPE_STRING ? nullptr : gguf_get_arr_data(ctx_gguf, i);
|
||||
std::stringstream ss;
|
||||
ss << "[";
|
||||
for (int j = 0; j < arr_n; j++) {
|
||||
if (arr_type == GGUF_TYPE_STRING) {
|
||||
std::string val = gguf_get_arr_str(ctx_gguf, i, j);
|
||||
// escape quotes
|
||||
string_replace_all(val, "\\", "\\\\");
|
||||
string_replace_all(val, "\"", "\\\"");
|
||||
ss << '"' << val << '"';
|
||||
} else if (arr_type == GGUF_TYPE_ARRAY) {
|
||||
ss << "???";
|
||||
} else {
|
||||
ss << gguf_data_to_str(arr_type, data, j);
|
||||
}
|
||||
if (j < arr_n - 1) {
|
||||
ss << ", ";
|
||||
}
|
||||
}
|
||||
ss << "]";
|
||||
return ss.str();
|
||||
}
|
||||
default:
|
||||
return gguf_data_to_str(type, gguf_get_val_data(ctx_gguf, i), 0);
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
// debugging
|
||||
//
|
||||
|
||||
static void print_tensor_shape(ggml_tensor * t) {
|
||||
printf("%s.shape = [", t->name);
|
||||
for (int i = 0; i < ggml_n_dims(t); ++i) {
|
||||
printf("%" PRId64, t->ne[i]);
|
||||
if (i < ggml_n_dims(t) - 1) {
|
||||
printf(", ");
|
||||
}
|
||||
}
|
||||
printf("]\n");
|
||||
}
|
||||
|
||||
static void print_tensor_data(ggml_tensor * t, uint8_t * data, int64_t n) {
|
||||
ggml_type type = t->type;
|
||||
int64_t * ne = t->ne;
|
||||
size_t * nb = t->nb;
|
||||
for (int64_t i3 = 0; i3 < ne[3]; i3++) {
|
||||
printf("%s.data: [\n", t->name);
|
||||
for (int64_t i2 = 0; i2 < ne[2]; i2++) {
|
||||
if (i2 == n && ne[2] > 2*n) {
|
||||
printf(" ..., \n");
|
||||
i2 = ne[2] - n;
|
||||
}
|
||||
printf(" [\n");
|
||||
for (int64_t i1 = 0; i1 < ne[1]; i1++) {
|
||||
if (i1 == n && ne[1] > 2*n) {
|
||||
printf(" ..., \n");
|
||||
i1 = ne[1] - n;
|
||||
}
|
||||
printf(" [");
|
||||
for (int64_t i0 = 0; i0 < ne[0]; i0++) {
|
||||
if (i0 == n && ne[0] > 2*n) {
|
||||
printf("..., ");
|
||||
i0 = ne[0] - n;
|
||||
}
|
||||
size_t i = i3 * nb[3] + i2 * nb[2] + i1 * nb[1] + i0 * nb[0];
|
||||
float v;
|
||||
if (type == GGML_TYPE_F16) {
|
||||
v = ggml_fp16_to_fp32(*(ggml_fp16_t *) &data[i]);
|
||||
} else if (type == GGML_TYPE_F32) {
|
||||
v = *(float *) &data[i];
|
||||
} else if (type == GGML_TYPE_I32) {
|
||||
v = (float) *(int32_t *) &data[i];
|
||||
} else if (type == GGML_TYPE_I16) {
|
||||
v = (float) *(int16_t *) &data[i];
|
||||
} else if (type == GGML_TYPE_I8) {
|
||||
v = (float) *(int8_t *) &data[i];
|
||||
} else {
|
||||
GGML_ABORT("fatal error");
|
||||
}
|
||||
printf("%8.4f", v);
|
||||
if (i0 < ne[0] - 1) printf(", ");
|
||||
}
|
||||
printf("],\n");
|
||||
}
|
||||
printf(" ],\n");
|
||||
}
|
||||
printf(" ]\n");
|
||||
}
|
||||
}
|
||||
|
||||
void clip_debug_encode(clip_ctx * ctx, int h, int w, float fill_value);
|
||||
|
||||
//
|
||||
// API used internally with mtmd
|
||||
//
|
||||
|
||||
projector_type clip_get_projector_type(const struct clip_ctx * ctx);
|
||||
@@ -0,0 +1,389 @@
|
||||
#pragma once
|
||||
|
||||
#include "ggml.h"
|
||||
#include "clip.h"
|
||||
#include "clip-impl.h"
|
||||
|
||||
#include <array>
|
||||
#include <vector>
|
||||
#include <unordered_set>
|
||||
#include <cstdint>
|
||||
#include <cmath>
|
||||
|
||||
enum ffn_op_type {
|
||||
FFN_GELU,
|
||||
FFN_GELU_ERF,
|
||||
FFN_SILU,
|
||||
FFN_GELU_QUICK,
|
||||
};
|
||||
|
||||
enum norm_type {
|
||||
NORM_TYPE_NORMAL,
|
||||
NORM_TYPE_RMS,
|
||||
};
|
||||
|
||||
enum patch_merge_type {
|
||||
PATCH_MERGE_FLAT,
|
||||
PATCH_MERGE_SPATIAL_UNPAD,
|
||||
};
|
||||
|
||||
struct clip_hparams {
|
||||
int32_t image_size = 0;
|
||||
int32_t patch_size = 0;
|
||||
int32_t n_embd = 0;
|
||||
int32_t n_ff = 0;
|
||||
int32_t projection_dim = 0;
|
||||
int32_t n_head = 0;
|
||||
int32_t n_layer = 0;
|
||||
// idefics3
|
||||
int32_t image_longest_edge = 0;
|
||||
int32_t image_min_pixels = -1;
|
||||
int32_t image_max_pixels = -1;
|
||||
int32_t n_merge = 0; // number of patch merges **per-side**
|
||||
|
||||
float image_mean[3];
|
||||
float image_std[3];
|
||||
|
||||
// for models using dynamic image size, we need to have a smaller image size to warmup
|
||||
// otherwise, user will get OOM everytime they load the model
|
||||
int32_t warmup_image_size = 0;
|
||||
int32_t warmup_audio_size = 3000;
|
||||
|
||||
ffn_op_type ffn_op = FFN_GELU;
|
||||
|
||||
patch_merge_type mm_patch_merge_type = PATCH_MERGE_FLAT;
|
||||
|
||||
float eps = 1e-6;
|
||||
float rope_theta = 0.0;
|
||||
|
||||
std::vector<clip_image_size> image_res_candidates; // for llava-uhd style models
|
||||
int32_t image_crop_resolution;
|
||||
std::unordered_set<int32_t> vision_feature_layer;
|
||||
int32_t attn_window_size = 0;
|
||||
int32_t n_wa_pattern = 0;
|
||||
std::unordered_set<int32_t> wa_layer_indexes; // explicit layer indexes that use full attention (for irregular patterns like YoutuVL)
|
||||
|
||||
// audio
|
||||
int32_t n_mel_bins = 0; // whisper preprocessor
|
||||
int32_t proj_stack_factor = 0; // ultravox
|
||||
|
||||
// audio-to-mel preprocessor params
|
||||
int32_t audio_chunk_len = -1; // in seconds
|
||||
int32_t audio_sample_rate = -1;
|
||||
int32_t audio_n_fft = -1;
|
||||
int32_t audio_window_len = -1;
|
||||
int32_t audio_hop_len = -1;
|
||||
|
||||
// legacy
|
||||
bool has_llava_projector = false;
|
||||
int minicpmv_version = 0;
|
||||
int32_t minicpmv_query_num = 0; // MiniCPM-V query number
|
||||
|
||||
// custom value provided by user, can be undefined if not set
|
||||
int32_t custom_image_min_tokens = -1;
|
||||
int32_t custom_image_max_tokens = -1;
|
||||
|
||||
void set_limit_image_tokens(int n_tokens_min, int n_tokens_max) {
|
||||
const int cur_merge = n_merge == 0 ? 1 : n_merge;
|
||||
const int patch_area = patch_size * patch_size * cur_merge * cur_merge;
|
||||
image_min_pixels = (custom_image_min_tokens > 0 ? custom_image_min_tokens : n_tokens_min) * patch_area;
|
||||
image_max_pixels = (custom_image_max_tokens > 0 ? custom_image_max_tokens : n_tokens_max) * patch_area;
|
||||
warmup_image_size = static_cast<int>(std::sqrt(image_max_pixels));
|
||||
}
|
||||
|
||||
void set_warmup_n_tokens(int n_tokens) {
|
||||
int n_tok_per_side = static_cast<int>(std::sqrt(n_tokens));
|
||||
GGML_ASSERT(n_tok_per_side * n_tok_per_side == n_tokens && "n_tokens must be n*n");
|
||||
const int cur_merge = n_merge == 0 ? 1 : n_merge;
|
||||
warmup_image_size = n_tok_per_side * patch_size * cur_merge;
|
||||
// TODO: support warmup size for custom token numbers
|
||||
}
|
||||
};
|
||||
|
||||
struct clip_layer {
|
||||
// attention
|
||||
ggml_tensor * k_w = nullptr;
|
||||
ggml_tensor * k_b = nullptr;
|
||||
ggml_tensor * q_w = nullptr;
|
||||
ggml_tensor * q_b = nullptr;
|
||||
ggml_tensor * v_w = nullptr;
|
||||
ggml_tensor * v_b = nullptr;
|
||||
ggml_tensor * qkv_w = nullptr;
|
||||
ggml_tensor * qkv_b = nullptr;
|
||||
|
||||
ggml_tensor * o_w = nullptr;
|
||||
ggml_tensor * o_b = nullptr;
|
||||
|
||||
ggml_tensor * k_norm = nullptr;
|
||||
ggml_tensor * q_norm = nullptr;
|
||||
|
||||
// layernorm 1
|
||||
ggml_tensor * ln_1_w = nullptr;
|
||||
ggml_tensor * ln_1_b = nullptr;
|
||||
|
||||
ggml_tensor * ff_up_w = nullptr;
|
||||
ggml_tensor * ff_up_b = nullptr;
|
||||
ggml_tensor * ff_gate_w = nullptr;
|
||||
ggml_tensor * ff_gate_b = nullptr;
|
||||
ggml_tensor * ff_down_w = nullptr;
|
||||
ggml_tensor * ff_down_b = nullptr;
|
||||
|
||||
// layernorm 2
|
||||
ggml_tensor * ln_2_w = nullptr;
|
||||
ggml_tensor * ln_2_b = nullptr;
|
||||
|
||||
// layer scale (no bias)
|
||||
ggml_tensor * ls_1_w = nullptr;
|
||||
ggml_tensor * ls_2_w = nullptr;
|
||||
|
||||
// qwen3vl deepstack merger
|
||||
ggml_tensor * deepstack_norm_w = nullptr;
|
||||
ggml_tensor * deepstack_norm_b = nullptr;
|
||||
ggml_tensor * deepstack_fc1_w = nullptr;
|
||||
ggml_tensor * deepstack_fc1_b = nullptr;
|
||||
ggml_tensor * deepstack_fc2_w = nullptr;
|
||||
ggml_tensor * deepstack_fc2_b = nullptr;
|
||||
|
||||
// lfm2
|
||||
ggml_tensor * ff_norm_w = nullptr;
|
||||
ggml_tensor * ff_norm_b = nullptr;
|
||||
ggml_tensor * ff_norm_1_w = nullptr;
|
||||
ggml_tensor * ff_norm_1_b = nullptr;
|
||||
ggml_tensor * ff_up_1_w = nullptr;
|
||||
ggml_tensor * ff_up_1_b = nullptr;
|
||||
ggml_tensor * ff_down_1_w = nullptr;
|
||||
ggml_tensor * ff_down_1_b = nullptr;
|
||||
ggml_tensor * pos_bias_u = nullptr;
|
||||
ggml_tensor * pos_bias_v = nullptr;
|
||||
ggml_tensor * norm_conv_w = nullptr;
|
||||
ggml_tensor * norm_conv_b = nullptr;
|
||||
ggml_tensor * linear_pos_w = nullptr;
|
||||
|
||||
ggml_tensor * conv_norm_w = nullptr;
|
||||
ggml_tensor * conv_norm_b = nullptr;
|
||||
ggml_tensor * conv_dw_w = nullptr;
|
||||
ggml_tensor * conv_dw_b = nullptr;
|
||||
ggml_tensor * conv_pw1_w = nullptr;
|
||||
ggml_tensor * conv_pw1_b = nullptr;
|
||||
ggml_tensor * conv_pw2_w = nullptr;
|
||||
ggml_tensor * conv_pw2_b = nullptr;
|
||||
|
||||
bool has_deepstack() const {
|
||||
return deepstack_fc1_w != nullptr;
|
||||
}
|
||||
};
|
||||
|
||||
// Expanded MobileNetV5 block structure for Gemma3n vision encoder
|
||||
struct mobilenetv5_block {
|
||||
// Stage 0 (Edge Residual)
|
||||
ggml_tensor * s0_conv_exp_w = nullptr;
|
||||
ggml_tensor * s0_bn1_w = nullptr;
|
||||
ggml_tensor * s0_conv_pwl_w = nullptr;
|
||||
ggml_tensor * s0_bn2_w = nullptr;
|
||||
|
||||
// Stage 1+ (Universal Inverted Residual)
|
||||
ggml_tensor * dw_start_w = nullptr;
|
||||
ggml_tensor * dw_start_bn_w = nullptr;
|
||||
|
||||
ggml_tensor * pw_exp_w = nullptr;
|
||||
ggml_tensor * pw_exp_bn_w = nullptr;
|
||||
|
||||
ggml_tensor * dw_mid_w = nullptr;
|
||||
ggml_tensor * dw_mid_bn_w = nullptr;
|
||||
|
||||
ggml_tensor * pw_proj_w = nullptr;
|
||||
ggml_tensor * pw_proj_bn_w = nullptr;
|
||||
|
||||
ggml_tensor * layer_scale_w = nullptr;
|
||||
|
||||
// Attention (MQA) components
|
||||
ggml_tensor * attn_q_w = nullptr;
|
||||
ggml_tensor * attn_k_w = nullptr;
|
||||
ggml_tensor * attn_v_w = nullptr;
|
||||
ggml_tensor * attn_o_w = nullptr;
|
||||
|
||||
// Optional downsampling/norm in attention
|
||||
ggml_tensor * attn_k_dw_w = nullptr;
|
||||
ggml_tensor * attn_k_norm_w = nullptr;
|
||||
ggml_tensor * attn_v_dw_w = nullptr;
|
||||
ggml_tensor * attn_v_norm_w = nullptr;
|
||||
|
||||
// Block norm (often present in attention blocks)
|
||||
ggml_tensor * attn_norm_w = nullptr;
|
||||
};
|
||||
|
||||
struct clip_model {
|
||||
clip_modality modality = CLIP_MODALITY_VISION;
|
||||
projector_type proj_type = PROJECTOR_TYPE_MLP;
|
||||
clip_hparams hparams;
|
||||
|
||||
// embeddings
|
||||
ggml_tensor * class_embedding = nullptr;
|
||||
ggml_tensor * patch_embeddings_0 = nullptr;
|
||||
ggml_tensor * patch_embeddings_1 = nullptr; // second Conv2D kernel when we decouple Conv3D along temproal dimension (Qwen2VL)
|
||||
ggml_tensor * patch_bias = nullptr;
|
||||
ggml_tensor * position_embeddings = nullptr;
|
||||
ggml_tensor * norm_embd_w = nullptr;
|
||||
ggml_tensor * norm_embd_b = nullptr;
|
||||
|
||||
ggml_tensor * pre_ln_w = nullptr;
|
||||
ggml_tensor * pre_ln_b = nullptr;
|
||||
|
||||
std::vector<clip_layer> layers;
|
||||
|
||||
int32_t n_deepstack_layers = 0; // used by Qwen3-VL, calculated from clip_layer
|
||||
|
||||
ggml_tensor * post_ln_w;
|
||||
ggml_tensor * post_ln_b;
|
||||
|
||||
ggml_tensor * projection; // TODO: rename it to fc (fully connected layer)
|
||||
ggml_tensor * mm_fc_w;
|
||||
ggml_tensor * mm_fc_b;
|
||||
ggml_tensor * mm_ffn_up_w = nullptr;
|
||||
ggml_tensor * mm_ffn_up_b = nullptr;
|
||||
ggml_tensor * mm_ffn_gate_w = nullptr;
|
||||
ggml_tensor * mm_ffn_gate_b = nullptr;
|
||||
ggml_tensor * mm_ffn_down_w = nullptr;
|
||||
ggml_tensor * mm_ffn_down_b = nullptr;
|
||||
ggml_tensor * mm_post_norm_w = nullptr;
|
||||
ggml_tensor * mm_post_norm_b = nullptr;
|
||||
|
||||
// LLaVA projection
|
||||
ggml_tensor * mm_input_norm_w = nullptr;
|
||||
ggml_tensor * mm_input_norm_b = nullptr;
|
||||
ggml_tensor * mm_0_w = nullptr;
|
||||
ggml_tensor * mm_0_b = nullptr;
|
||||
ggml_tensor * mm_2_w = nullptr;
|
||||
ggml_tensor * mm_2_b = nullptr;
|
||||
|
||||
ggml_tensor * image_newline = nullptr;
|
||||
|
||||
// Yi type models with mlp+normalization projection
|
||||
ggml_tensor * mm_1_w = nullptr; // Yi type models have 0, 1, 3, 4
|
||||
ggml_tensor * mm_1_b = nullptr;
|
||||
ggml_tensor * mm_3_w = nullptr;
|
||||
ggml_tensor * mm_3_b = nullptr;
|
||||
ggml_tensor * mm_4_w = nullptr;
|
||||
ggml_tensor * mm_4_b = nullptr;
|
||||
|
||||
// GLMV-Edge projection
|
||||
ggml_tensor * mm_model_adapter_conv_w = nullptr;
|
||||
ggml_tensor * mm_model_adapter_conv_b = nullptr;
|
||||
|
||||
// MobileVLM projection
|
||||
ggml_tensor * mm_model_mlp_1_w = nullptr;
|
||||
ggml_tensor * mm_model_mlp_1_b = nullptr;
|
||||
ggml_tensor * mm_model_mlp_3_w = nullptr;
|
||||
ggml_tensor * mm_model_mlp_3_b = nullptr;
|
||||
ggml_tensor * mm_model_block_1_block_0_0_w = nullptr;
|
||||
ggml_tensor * mm_model_block_1_block_0_1_w = nullptr;
|
||||
ggml_tensor * mm_model_block_1_block_0_1_b = nullptr;
|
||||
ggml_tensor * mm_model_block_1_block_1_fc1_w = nullptr;
|
||||
ggml_tensor * mm_model_block_1_block_1_fc1_b = nullptr;
|
||||
ggml_tensor * mm_model_block_1_block_1_fc2_w = nullptr;
|
||||
ggml_tensor * mm_model_block_1_block_1_fc2_b = nullptr;
|
||||
ggml_tensor * mm_model_block_1_block_2_0_w = nullptr;
|
||||
ggml_tensor * mm_model_block_1_block_2_1_w = nullptr;
|
||||
ggml_tensor * mm_model_block_1_block_2_1_b = nullptr;
|
||||
ggml_tensor * mm_model_block_2_block_0_0_w = nullptr;
|
||||
ggml_tensor * mm_model_block_2_block_0_1_w = nullptr;
|
||||
ggml_tensor * mm_model_block_2_block_0_1_b = nullptr;
|
||||
ggml_tensor * mm_model_block_2_block_1_fc1_w = nullptr;
|
||||
ggml_tensor * mm_model_block_2_block_1_fc1_b = nullptr;
|
||||
ggml_tensor * mm_model_block_2_block_1_fc2_w = nullptr;
|
||||
ggml_tensor * mm_model_block_2_block_1_fc2_b = nullptr;
|
||||
ggml_tensor * mm_model_block_2_block_2_0_w = nullptr;
|
||||
ggml_tensor * mm_model_block_2_block_2_1_w = nullptr;
|
||||
ggml_tensor * mm_model_block_2_block_2_1_b = nullptr;
|
||||
|
||||
// MobileVLM_V2 projection
|
||||
ggml_tensor * mm_model_mlp_0_w = nullptr;
|
||||
ggml_tensor * mm_model_mlp_0_b = nullptr;
|
||||
ggml_tensor * mm_model_mlp_2_w = nullptr;
|
||||
ggml_tensor * mm_model_mlp_2_b = nullptr;
|
||||
ggml_tensor * mm_model_peg_0_w = nullptr;
|
||||
ggml_tensor * mm_model_peg_0_b = nullptr;
|
||||
|
||||
// MINICPMV projection
|
||||
ggml_tensor * mm_model_pos_embed_k = nullptr;
|
||||
ggml_tensor * mm_model_query = nullptr;
|
||||
ggml_tensor * mm_model_proj = nullptr;
|
||||
ggml_tensor * mm_model_kv_proj = nullptr;
|
||||
ggml_tensor * mm_model_attn_q_w = nullptr;
|
||||
ggml_tensor * mm_model_attn_q_b = nullptr;
|
||||
ggml_tensor * mm_model_attn_k_w = nullptr;
|
||||
ggml_tensor * mm_model_attn_k_b = nullptr;
|
||||
ggml_tensor * mm_model_attn_v_w = nullptr;
|
||||
ggml_tensor * mm_model_attn_v_b = nullptr;
|
||||
ggml_tensor * mm_model_attn_o_w = nullptr;
|
||||
ggml_tensor * mm_model_attn_o_b = nullptr;
|
||||
ggml_tensor * mm_model_ln_q_w = nullptr;
|
||||
ggml_tensor * mm_model_ln_q_b = nullptr;
|
||||
ggml_tensor * mm_model_ln_kv_w = nullptr;
|
||||
ggml_tensor * mm_model_ln_kv_b = nullptr;
|
||||
ggml_tensor * mm_model_ln_post_w = nullptr;
|
||||
ggml_tensor * mm_model_ln_post_b = nullptr;
|
||||
|
||||
// gemma3
|
||||
ggml_tensor * mm_input_proj_w = nullptr;
|
||||
ggml_tensor * mm_soft_emb_norm_w = nullptr;
|
||||
|
||||
// mobilenetv5 for gemma3n
|
||||
std::vector<mobilenetv5_block> mobilenet_blocks;
|
||||
std::vector<int> mobilenet_stage_ends;
|
||||
ggml_tensor * mobilenet_stem_conv_w = nullptr;
|
||||
ggml_tensor * mobilenet_stem_conv_b = nullptr;
|
||||
ggml_tensor * mobilenet_stem_norm_w = nullptr;
|
||||
ggml_tensor * mm_post_proj_norm_w = nullptr;
|
||||
|
||||
// Multi-Scale Fusion Adapter (MSFA) components
|
||||
ggml_tensor * msfa_concat_conv_w = nullptr;
|
||||
ggml_tensor * msfa_concat_norm_w = nullptr;
|
||||
ggml_tensor * msfa_ffn_expand_w = nullptr;
|
||||
ggml_tensor * msfa_ffn_project_w = nullptr;
|
||||
ggml_tensor * msfa_ffn_expand_bn = nullptr;
|
||||
ggml_tensor * msfa_ffn_project_bn = nullptr;
|
||||
|
||||
|
||||
// pixtral, glm4v
|
||||
ggml_tensor * token_embd_img_break = nullptr;
|
||||
ggml_tensor * mm_patch_merger_w = nullptr;
|
||||
ggml_tensor * mm_patch_merger_b = nullptr;
|
||||
|
||||
// ultravox / whisper encoder
|
||||
ggml_tensor * conv1d_1_w = nullptr;
|
||||
ggml_tensor * conv1d_1_b = nullptr;
|
||||
ggml_tensor * conv1d_2_w = nullptr;
|
||||
ggml_tensor * conv1d_2_b = nullptr;
|
||||
ggml_tensor * mm_norm_pre_w = nullptr;
|
||||
ggml_tensor * mm_norm_pre_b = nullptr;
|
||||
ggml_tensor * mm_norm_mid_w = nullptr;
|
||||
|
||||
// cogvlm
|
||||
ggml_tensor * mm_post_fc_norm_w = nullptr;
|
||||
ggml_tensor * mm_post_fc_norm_b = nullptr;
|
||||
ggml_tensor * mm_h_to_4h_w = nullptr;
|
||||
ggml_tensor * mm_gate_w = nullptr;
|
||||
ggml_tensor * mm_4h_to_h_w = nullptr;
|
||||
ggml_tensor * mm_boi = nullptr;
|
||||
ggml_tensor * mm_eoi = nullptr;
|
||||
|
||||
// lfm2 audio
|
||||
std::array<ggml_tensor *, 7> pre_encode_conv_X_w = {nullptr};
|
||||
std::array<ggml_tensor *, 7> pre_encode_conv_X_b = {nullptr};
|
||||
ggml_tensor * pre_encode_out_w = nullptr;
|
||||
ggml_tensor * pre_encode_out_b = nullptr;
|
||||
|
||||
bool audio_has_avgpool() const {
|
||||
return proj_type == PROJECTOR_TYPE_QWEN2A
|
||||
|| proj_type == PROJECTOR_TYPE_VOXTRAL
|
||||
|| proj_type == PROJECTOR_TYPE_MUSIC_FLAMINGO;
|
||||
}
|
||||
|
||||
bool audio_has_stack_frames() const {
|
||||
return proj_type == PROJECTOR_TYPE_ULTRAVOX
|
||||
|| proj_type == PROJECTOR_TYPE_VOXTRAL;
|
||||
}
|
||||
};
|
||||
|
||||
const clip_hparams * clip_get_hparams(const struct clip_ctx * ctx);
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,119 @@
|
||||
#pragma once
|
||||
|
||||
#include "ggml.h"
|
||||
|
||||
#include <stddef.h>
|
||||
#include <stdint.h>
|
||||
|
||||
// !!! Internal header, to be used by mtmd only !!!
|
||||
|
||||
#define MTMD_INTERNAL_HEADER
|
||||
|
||||
struct clip_ctx;
|
||||
|
||||
struct clip_image_size {
|
||||
int width;
|
||||
int height;
|
||||
};
|
||||
|
||||
struct clip_image_f32;
|
||||
struct clip_image_u8_batch;
|
||||
struct clip_image_f32_batch;
|
||||
|
||||
enum clip_modality {
|
||||
CLIP_MODALITY_VISION,
|
||||
CLIP_MODALITY_AUDIO,
|
||||
};
|
||||
|
||||
enum clip_flash_attn_type {
|
||||
CLIP_FLASH_ATTN_TYPE_AUTO = -1,
|
||||
CLIP_FLASH_ATTN_TYPE_DISABLED = 0,
|
||||
CLIP_FLASH_ATTN_TYPE_ENABLED = 1,
|
||||
};
|
||||
|
||||
struct clip_context_params {
|
||||
bool use_gpu;
|
||||
enum clip_flash_attn_type flash_attn_type;
|
||||
int image_min_tokens;
|
||||
int image_max_tokens;
|
||||
bool warmup;
|
||||
};
|
||||
|
||||
struct clip_init_result {
|
||||
struct clip_ctx * ctx_v; // vision context
|
||||
struct clip_ctx * ctx_a; // audio context
|
||||
};
|
||||
|
||||
struct clip_init_result clip_init(const char * fname, struct clip_context_params ctx_params);
|
||||
|
||||
void clip_free(struct clip_ctx * ctx);
|
||||
|
||||
size_t clip_embd_nbytes(const struct clip_ctx * ctx);
|
||||
size_t clip_embd_nbytes_by_img(const struct clip_ctx * ctx, int img_w, int img_h);
|
||||
|
||||
int32_t clip_get_image_size (const struct clip_ctx * ctx);
|
||||
int32_t clip_get_patch_size (const struct clip_ctx * ctx);
|
||||
int32_t clip_get_hidden_size(const struct clip_ctx * ctx);
|
||||
|
||||
// TODO: should be enum, not string
|
||||
const char * clip_patch_merge_type(const struct clip_ctx * ctx);
|
||||
|
||||
int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * img);
|
||||
|
||||
// for M-RoPE, this will be the number of token positions in X and Y directions
|
||||
// for other models, X will be the total number of tokens and Y will be 1
|
||||
int clip_n_output_tokens_x(const struct clip_ctx * ctx, struct clip_image_f32 * img);
|
||||
int clip_n_output_tokens_y(const struct clip_ctx * ctx, struct clip_image_f32 * img);
|
||||
|
||||
// this should be equal to the embedding dimension of the text model
|
||||
int clip_n_mmproj_embd(const struct clip_ctx * ctx);
|
||||
|
||||
struct clip_image_size * clip_image_size_init(void);
|
||||
struct clip_image_u8 * clip_image_u8_init (void);
|
||||
struct clip_image_f32 * clip_image_f32_init(void);
|
||||
struct clip_image_f32_batch * clip_image_f32_batch_init(void); // only used by libllava
|
||||
|
||||
// nx, ny are the output image dimensions
|
||||
unsigned char * clip_image_u8_get_data(struct clip_image_u8 * img, uint32_t * nx, uint32_t * ny);
|
||||
|
||||
void clip_image_size_free (struct clip_image_size * img_size);
|
||||
void clip_image_u8_free (struct clip_image_u8 * img);
|
||||
void clip_image_f32_free(struct clip_image_f32 * img);
|
||||
void clip_image_u8_batch_free (struct clip_image_u8_batch * batch);
|
||||
void clip_image_f32_batch_free(struct clip_image_f32_batch * batch);
|
||||
|
||||
// use for accessing underlay data of clip_image_f32_batch
|
||||
size_t clip_image_f32_batch_n_images(const struct clip_image_f32_batch * batch); // equivalent to batch->size()
|
||||
size_t clip_image_f32_batch_nx(const struct clip_image_f32_batch * batch, int idx); // equivalent to batch[idx]->nx
|
||||
size_t clip_image_f32_batch_ny(const struct clip_image_f32_batch * batch, int idx); // equivalent to batch[idx]->ny
|
||||
struct clip_image_f32 * clip_image_f32_get_img(const struct clip_image_f32_batch * batch, int idx); // equivalent to batch[idx]->data
|
||||
|
||||
/**
|
||||
* Build image from pixels decoded by other libraries instead of stb_image.h for better performance.
|
||||
* The memory layout is RGBRGBRGB..., input buffer length must be 3*nx*ny bytes
|
||||
*/
|
||||
void clip_build_img_from_pixels(const unsigned char * rgb_pixels, int nx, int ny, struct clip_image_u8 * img);
|
||||
|
||||
/** preprocess img and store the result in res_imgs, pad_to_square may be overridden to false depending on model configuration */
|
||||
bool clip_image_preprocess(struct clip_ctx * ctx, const struct clip_image_u8 * img, struct clip_image_f32_batch * res_imgs );
|
||||
|
||||
struct ggml_tensor * clip_get_newline_tensor(const struct clip_ctx * ctx);
|
||||
|
||||
bool clip_image_encode (struct clip_ctx * ctx, int n_threads, struct clip_image_f32 * img, float * vec);
|
||||
bool clip_image_batch_encode(struct clip_ctx * ctx, int n_threads, const struct clip_image_f32_batch * imgs, float * vec);
|
||||
|
||||
int clip_is_minicpmv(const struct clip_ctx * ctx);
|
||||
bool clip_is_glm(const struct clip_ctx * ctx);
|
||||
bool clip_is_mrope(const struct clip_ctx * ctx);
|
||||
bool clip_is_llava(const struct clip_ctx * ctx);
|
||||
// note for contributor: this clip_is_(model) pattern is deprecated
|
||||
// do NOT add new functions like this
|
||||
|
||||
bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec);
|
||||
|
||||
// use by audio input
|
||||
void clip_image_f32_batch_add_mel(struct clip_image_f32_batch * batch, int n_mel, int n_frames, float * mel);
|
||||
|
||||
bool clip_has_vision_encoder(const struct clip_ctx * ctx);
|
||||
bool clip_has_audio_encoder(const struct clip_ctx * ctx);
|
||||
bool clip_has_whisper_encoder(const struct clip_ctx * ctx);
|
||||
@@ -0,0 +1,22 @@
|
||||
#include <cstdio>
|
||||
#include <string>
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
std::string filename = "main";
|
||||
if (argc >= 1) {
|
||||
filename = argv[0];
|
||||
}
|
||||
|
||||
// Get only the program name from the full path
|
||||
size_t pos = filename.find_last_of("/\\");
|
||||
if (pos != std::string::npos) {
|
||||
filename = filename.substr(pos+1);
|
||||
}
|
||||
|
||||
fprintf(stdout, "\n");
|
||||
fprintf(stdout, "WARNING: The binary '%s' is deprecated.\n", filename.c_str());
|
||||
fprintf(stdout, "Please use 'llama-mtmd-cli' instead.\n");
|
||||
fprintf(stdout, "\n");
|
||||
|
||||
return EXIT_FAILURE;
|
||||
}
|
||||
@@ -0,0 +1,730 @@
|
||||
#include "mtmd-audio.h"
|
||||
|
||||
#define _USE_MATH_DEFINES // for M_PI
|
||||
#include <cmath>
|
||||
#include <cstdint>
|
||||
#include <cstring>
|
||||
#include <thread>
|
||||
#include <vector>
|
||||
#include <fstream>
|
||||
#include <algorithm>
|
||||
|
||||
// some of the code here is copied from whisper.cpp
|
||||
|
||||
constexpr bool DEBUG = false;
|
||||
|
||||
void mtmd_audio_cache::fill_sin_cos_table(int n) {
|
||||
sin_vals.resize(n);
|
||||
cos_vals.resize(n);
|
||||
for (int i = 0; i < n; i++) {
|
||||
double theta = (2 * M_PI * i) / n;
|
||||
sin_vals[i] = sinf(theta);
|
||||
cos_vals[i] = cosf(theta);
|
||||
}
|
||||
}
|
||||
|
||||
void mtmd_audio_cache::fill_hann_window(int length, bool periodic) {
|
||||
hann_window.resize(length);
|
||||
int offset = -1;
|
||||
if (periodic) {
|
||||
offset = 0;
|
||||
}
|
||||
for (int i = 0; i < length; i++) {
|
||||
hann_window[i] = 0.5 * (1.0 - cosf((2.0 * M_PI * i) / (length + offset)));
|
||||
}
|
||||
}
|
||||
|
||||
void mtmd_audio_cache::fill_mel_filterbank_matrix(int n_mel,
|
||||
int n_fft,
|
||||
int sample_rate,
|
||||
float fmin,
|
||||
float fmax,
|
||||
bool slaney_area_norm,
|
||||
float scale) {
|
||||
GGML_ASSERT(n_mel > 0 && n_fft > 1);
|
||||
if (fmax <= 0.0f) {
|
||||
fmax = 0.5f * sample_rate;
|
||||
}
|
||||
|
||||
// Slaney scale (matches librosa default)
|
||||
const double min_log_hz = 1000.0;
|
||||
const double lin_slope = 3 / 200.;
|
||||
const double min_log_mel = min_log_hz * lin_slope;
|
||||
const double log_step = log(6.4) / 27.0;
|
||||
auto hz_to_mel = [min_log_hz, lin_slope, log_step, min_log_mel](const double f_hz) -> double {
|
||||
return (f_hz < min_log_hz) ? f_hz * lin_slope : min_log_mel + log(f_hz / min_log_hz) / log_step;
|
||||
};
|
||||
auto mel_to_hz = [min_log_hz, lin_slope, log_step, min_log_mel](const double m) -> double {
|
||||
return (m < min_log_mel) ? m / lin_slope : min_log_hz * exp((m - min_log_mel) * log_step);
|
||||
};
|
||||
|
||||
// infer N_fft from n_fft_bins
|
||||
const double bin_hz_step = double(sample_rate) / double(n_fft);
|
||||
|
||||
// mel grid: n_mel + 2 edges
|
||||
const double m_lo = hz_to_mel(fmin);
|
||||
const double m_hi = hz_to_mel(fmax);
|
||||
std::vector<double> mel_pts(n_mel + 2);
|
||||
for (int i = 0; i < n_mel + 2; ++i) {
|
||||
mel_pts[i] = m_lo + (m_hi - m_lo) * (double(i) / (n_mel + 1));
|
||||
}
|
||||
|
||||
// convert to Hz
|
||||
std::vector<double> hz_pts(n_mel + 2);
|
||||
for (int i = 0; i < n_mel + 2; ++i) {
|
||||
hz_pts[i] = mel_to_hz(mel_pts[i]);
|
||||
}
|
||||
|
||||
const int n_fft_bins = n_fft / 2 + 1;
|
||||
|
||||
// filterbank
|
||||
std::vector<float> out(n_mel * n_fft_bins, 0);
|
||||
for (int m = 0; m < n_mel; ++m) {
|
||||
const double f_left = hz_pts[m];
|
||||
const double f_center = hz_pts[m + 1];
|
||||
const double f_right = hz_pts[m + 2];
|
||||
|
||||
const double denom_l = std::max(1e-30, f_center - f_left);
|
||||
const double denom_r = std::max(1e-30, f_right - f_center);
|
||||
const double enorm = slaney_area_norm ? (2.0 / std::max(1e-30, f_right - f_left)) : 1.0;
|
||||
|
||||
for (int k = 0; k < n_fft_bins; ++k) {
|
||||
const double f = k * bin_hz_step;
|
||||
double w = 0.0;
|
||||
if (f >= f_left && f <= f_center) {
|
||||
w = (f - f_left) / denom_l;
|
||||
} else if (f > f_center && f <= f_right) {
|
||||
w = (f_right - f) / denom_r;
|
||||
}
|
||||
out[size_t(m) * size_t(n_fft_bins) + size_t(k)] = float(w * enorm * scale);
|
||||
}
|
||||
}
|
||||
|
||||
filters.n_mel = n_mel;
|
||||
filters.n_fft = n_fft;
|
||||
filters.data = std::move(out);
|
||||
|
||||
if (DEBUG) { // debug
|
||||
for (size_t i = 0; i < filters.data.size(); ++i) {
|
||||
if (filters.data[i] != 0.0f) {
|
||||
printf("filters[%zu] = %f\n", i, filters.data[i] * 1000.0f);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Unified DFT implementation for both forward and inverse transforms
|
||||
// Template parameters:
|
||||
// Inverse: false = DFT with exp(-2πi·k·n/N), no scaling
|
||||
// true = IDFT with exp(+2πi·k·n/N), scales by 1/N
|
||||
// RealInput: true = input is real-valued (stride 1), avoids imaginary computations
|
||||
// false = input is complex-valued (interleaved real/imag, stride 2)
|
||||
template <bool Inverse, bool RealInput>
|
||||
static void dft_impl(const mtmd_audio_cache & cache, const float * in, int N, float * out) {
|
||||
const int n_sin_cos_vals = cache.sin_vals.size();
|
||||
const int sin_cos_step = n_sin_cos_vals / N;
|
||||
|
||||
constexpr float sign = Inverse ? 1.0f : -1.0f;
|
||||
const float scale = Inverse ? (1.0f / N) : 1.0f;
|
||||
|
||||
for (int k = 0; k < N; k++) {
|
||||
float re = 0;
|
||||
float im = 0;
|
||||
|
||||
for (int n = 0; n < N; n++) {
|
||||
int idx = (k * n * sin_cos_step) % n_sin_cos_vals;
|
||||
float cos_val = cache.cos_vals[idx];
|
||||
float sin_val = cache.sin_vals[idx];
|
||||
|
||||
if constexpr (RealInput) {
|
||||
// Real input: in_im = 0, simplifies to:
|
||||
// re += in_re * cos_val
|
||||
// im += sign * in_re * sin_val
|
||||
float in_re = in[n];
|
||||
re += in_re * cos_val;
|
||||
im += sign * in_re * sin_val;
|
||||
} else {
|
||||
float in_re = in[n * 2 + 0];
|
||||
float in_im = in[n * 2 + 1];
|
||||
// (a + bi) * (cos + sign*i*sin) = (a*cos - sign*b*sin) + (sign*a*sin + b*cos)i
|
||||
re += in_re * cos_val - sign * in_im * sin_val;
|
||||
im += sign * in_re * sin_val + in_im * cos_val;
|
||||
}
|
||||
}
|
||||
|
||||
out[k * 2 + 0] = re * scale;
|
||||
out[k * 2 + 1] = im * scale;
|
||||
}
|
||||
}
|
||||
|
||||
// Cooley-Tukey FFT/IFFT unified implementation
|
||||
// Template parameters:
|
||||
// Inverse: false = FFT with exp(-2πi·k/N), no scaling
|
||||
// true = IFFT with exp(+2πi·k/N), scales by 0.5 at each level
|
||||
// RealInput: true = input is real-valued (stride 1)
|
||||
// false = input is complex-valued (interleaved real/imag, stride 2)
|
||||
template <bool Inverse, bool RealInput>
|
||||
static void fft_impl(const mtmd_audio_cache & cache, float * in, int N, float * out) {
|
||||
const int n_sin_cos_vals = cache.sin_vals.size();
|
||||
|
||||
if (N == 1) {
|
||||
out[0] = in[0];
|
||||
if constexpr (RealInput) {
|
||||
out[1] = 0.0f;
|
||||
} else {
|
||||
out[1] = in[1];
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
const int half_N = N / 2;
|
||||
if (N - half_N * 2 == 1) {
|
||||
// Odd N: fall back to DFT
|
||||
dft_impl<Inverse, RealInput>(cache, in, N, out);
|
||||
return;
|
||||
}
|
||||
|
||||
// Split into even and odd
|
||||
if constexpr (RealInput) {
|
||||
// Real input: stride is 1, copy only real values
|
||||
float * even = in + N;
|
||||
for (int i = 0; i < half_N; ++i) {
|
||||
even[i] = in[2 * i];
|
||||
}
|
||||
float * even_fft = out + 2 * N;
|
||||
fft_impl<Inverse, true>(cache, even, half_N, even_fft);
|
||||
|
||||
float * odd = even;
|
||||
for (int i = 0; i < half_N; ++i) {
|
||||
odd[i] = in[2 * i + 1];
|
||||
}
|
||||
float * odd_fft = even_fft + N;
|
||||
fft_impl<Inverse, true>(cache, odd, half_N, odd_fft);
|
||||
} else {
|
||||
// Complex input: stride is 2, copy complex pairs
|
||||
float * even = in + N * 2;
|
||||
for (int i = 0; i < half_N; ++i) {
|
||||
even[i * 2 + 0] = in[2 * i * 2 + 0];
|
||||
even[i * 2 + 1] = in[2 * i * 2 + 1];
|
||||
}
|
||||
float * even_fft = out + 2 * N;
|
||||
fft_impl<Inverse, false>(cache, even, half_N, even_fft);
|
||||
|
||||
float * odd = even;
|
||||
for (int i = 0; i < half_N; ++i) {
|
||||
odd[i * 2 + 0] = in[(2 * i + 1) * 2 + 0];
|
||||
odd[i * 2 + 1] = in[(2 * i + 1) * 2 + 1];
|
||||
}
|
||||
float * odd_fft = even_fft + N;
|
||||
fft_impl<Inverse, false>(cache, odd, half_N, odd_fft);
|
||||
}
|
||||
|
||||
float * even_fft = out + 2 * N;
|
||||
float * odd_fft = even_fft + N;
|
||||
|
||||
const int sin_cos_step = n_sin_cos_vals / N;
|
||||
|
||||
constexpr float sign = Inverse ? 1.0f : -1.0f;
|
||||
constexpr float scale = Inverse ? 0.5f : 1.0f;
|
||||
|
||||
for (int k = 0; k < half_N; k++) {
|
||||
int idx = k * sin_cos_step; // t = 2*M_PI*k/N
|
||||
float re = cache.cos_vals[idx];
|
||||
float im = sign * cache.sin_vals[idx];
|
||||
|
||||
float re_odd = odd_fft[2 * k + 0];
|
||||
float im_odd = odd_fft[2 * k + 1];
|
||||
|
||||
out[2 * k + 0] = scale * (even_fft[2 * k + 0] + re * re_odd - im * im_odd);
|
||||
out[2 * k + 1] = scale * (even_fft[2 * k + 1] + re * im_odd + im * re_odd);
|
||||
|
||||
out[2 * (k + half_N) + 0] = scale * (even_fft[2 * k + 0] - re * re_odd + im * im_odd);
|
||||
out[2 * (k + half_N) + 1] = scale * (even_fft[2 * k + 1] - re * im_odd - im * re_odd);
|
||||
}
|
||||
}
|
||||
|
||||
// Forward FFT for real input (used by mel spectrogram)
|
||||
static void fft(const mtmd_audio_cache & cache, float * in, int N, float * out) {
|
||||
fft_impl<false, true>(cache, in, N, out);
|
||||
}
|
||||
|
||||
// Inverse FFT for complex input
|
||||
static void ifft(const mtmd_audio_cache & cache, float * in, int N, float * out) {
|
||||
fft_impl<true, false>(cache, in, N, out);
|
||||
}
|
||||
|
||||
struct filter_params {
|
||||
int32_t n_mel;
|
||||
int32_t n_fft_bins;
|
||||
int32_t hann_window_size;
|
||||
int32_t hop_length;
|
||||
int32_t sample_rate;
|
||||
bool center_padding = false;
|
||||
float preemph = 0.f;
|
||||
bool use_natural_log = false;
|
||||
bool norm_per_feature = false;
|
||||
};
|
||||
|
||||
static void log_mel_spectrogram_worker_thread(int ith,
|
||||
const float * hann,
|
||||
const std::vector<float> & samples,
|
||||
int n_samples,
|
||||
int frame_size,
|
||||
int frame_step,
|
||||
int n_threads,
|
||||
const filter_params & params,
|
||||
const mtmd_audio_cache & cache,
|
||||
mtmd_audio_mel & out) {
|
||||
std::vector<float> fft_in(frame_size * 2, 0.0);
|
||||
std::vector<float> fft_out(frame_size * 2 * 2 * 2);
|
||||
|
||||
int n_fft_bins = params.n_fft_bins;
|
||||
int i = ith;
|
||||
|
||||
const auto & filters = cache.filters;
|
||||
|
||||
// make sure n_fft == 1 + (WHISPER_N_FFT / 2), bin_0 to bin_nyquist
|
||||
GGML_ASSERT(n_fft_bins == 1 + (frame_size / 2));
|
||||
GGML_ASSERT(cache.sin_vals.size() == cache.cos_vals.size());
|
||||
// calculate FFT only when fft_in are not all zero
|
||||
for (; i < std::min(n_samples / frame_step + 1, out.n_len); i += n_threads) {
|
||||
const int offset = i * frame_step;
|
||||
|
||||
// apply Hann window (~10% faster)
|
||||
for (int j = 0; j < std::min(frame_size, n_samples - offset); j++) {
|
||||
fft_in[j] = hann[j] * samples[offset + j];
|
||||
}
|
||||
|
||||
// fill the rest with zeros
|
||||
if (n_samples - offset < frame_size) {
|
||||
std::fill(fft_in.begin() + (n_samples - offset), fft_in.end(), 0.0);
|
||||
}
|
||||
|
||||
// FFT
|
||||
fft(cache, fft_in.data(), frame_size, fft_out.data());
|
||||
|
||||
// Calculate modulus^2 of complex numbers
|
||||
// Use pow(fft_out[2 * j + 0], 2) + pow(fft_out[2 * j + 1], 2) causes inference quality problem? Interesting.
|
||||
for (int j = 0; j < n_fft_bins; j++) {
|
||||
fft_out[j] = (fft_out[2 * j + 0] * fft_out[2 * j + 0] + fft_out[2 * j + 1] * fft_out[2 * j + 1]);
|
||||
}
|
||||
|
||||
// mel spectrogram
|
||||
for (int j = 0; j < out.n_mel; j++) {
|
||||
double sum = 0.0;
|
||||
// unroll loop (suggested by GH user @lunixbochs)
|
||||
int k = 0;
|
||||
for (k = 0; k < n_fft_bins - 3; k += 4) {
|
||||
size_t idx = size_t(j) * size_t(n_fft_bins) + size_t(k);
|
||||
sum +=
|
||||
fft_out[k + 0] * filters.data[idx + 0] +
|
||||
fft_out[k + 1] * filters.data[idx + 1] +
|
||||
fft_out[k + 2] * filters.data[idx + 2] +
|
||||
fft_out[k + 3] * filters.data[idx + 3];
|
||||
}
|
||||
// handle n_fft remainder
|
||||
for (; k < n_fft_bins; k++) {
|
||||
sum += fft_out[k] * filters.data[j * n_fft_bins + k];
|
||||
}
|
||||
sum = params.use_natural_log
|
||||
? log(sum + 5.960464477539063e-08)
|
||||
: log10(std::max(sum, 1e-10));
|
||||
out.data[j * out.n_len + i] = sum;
|
||||
}
|
||||
}
|
||||
|
||||
// Otherwise fft_out are all zero
|
||||
double sum = params.use_natural_log ? log(1e-10) : log10(1e-10);
|
||||
for (; i < out.n_len; i += n_threads) {
|
||||
for (int j = 0; j < out.n_mel; j++) {
|
||||
out.data[j * out.n_len + i] = sum;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// ref: https://github.com/openai/whisper/blob/main/whisper/audio.py#L110-L157
|
||||
static bool log_mel_spectrogram(
|
||||
const float * samples,
|
||||
const int n_samples_in,
|
||||
const int n_threads,
|
||||
const filter_params & params,
|
||||
const mtmd_audio_cache & cache,
|
||||
mtmd_audio_mel & out) {
|
||||
//const int64_t t_start_us = ggml_time_us();
|
||||
|
||||
out.n_len_org = n_samples_in;
|
||||
int n_samples = n_samples_in;
|
||||
|
||||
// Hann window
|
||||
const float * hann = cache.hann_window.data();
|
||||
const int frame_size = (params.n_fft_bins - 1) * 2;
|
||||
const int frame_step = params.hop_length;
|
||||
|
||||
// Padding
|
||||
std::vector<float> samples_padded;
|
||||
if (params.center_padding) {
|
||||
const auto pad_amount = frame_size / 2;
|
||||
samples_padded = std::vector<float>(n_samples + 2 * pad_amount, 0);
|
||||
std::copy(samples, samples + n_samples, samples_padded.data() + pad_amount);
|
||||
samples = samples_padded.data();
|
||||
n_samples = samples_padded.size();
|
||||
} else {
|
||||
// existing padding logic
|
||||
int64_t stage_1_pad = params.sample_rate * 30;
|
||||
int64_t stage_2_pad = frame_size / 2;
|
||||
samples_padded.resize(n_samples + stage_1_pad + stage_2_pad * 2);
|
||||
std::copy(samples, samples + n_samples, samples_padded.begin() + stage_2_pad);
|
||||
// pad 30 seconds of zeros at the end of audio (480,000 samples) + reflective pad 200 samples at the end of audio
|
||||
std::fill(samples_padded.begin() + n_samples + stage_2_pad, samples_padded.begin() + n_samples + stage_1_pad + 2 * stage_2_pad, 0);
|
||||
// reflective pad 200 samples at the beginning of audio
|
||||
if (n_samples < stage_2_pad + 1) {
|
||||
// TODO: Handle short audio differently or return error
|
||||
return false;
|
||||
}
|
||||
std::reverse_copy(samples + 1, samples + 1 + stage_2_pad, samples_padded.begin());
|
||||
}
|
||||
|
||||
// preemphasis
|
||||
if (params.preemph) {
|
||||
const int pad_amount = frame_size / 2;
|
||||
const float preemph = 0.97f;
|
||||
float prev = samples_padded[pad_amount];
|
||||
for (int i = pad_amount + 1; i + pad_amount < n_samples; ++i) {
|
||||
float cur = samples_padded[i];
|
||||
samples_padded[i] = cur - preemph * prev;
|
||||
prev = cur;
|
||||
}
|
||||
}
|
||||
|
||||
// pad hann window if it's smaller than frame_size
|
||||
// TODO: probably unnecessary here? (or better doing it in g_cache?)
|
||||
std::vector<float> hann_window_padded;
|
||||
if (params.hann_window_size < frame_size) {
|
||||
hann_window_padded.resize(frame_size);
|
||||
const int padding = (frame_size - params.hann_window_size) / 2;
|
||||
std::copy(hann, hann + params.hann_window_size, &hann_window_padded[padding]);
|
||||
hann = hann_window_padded.data();
|
||||
}
|
||||
|
||||
|
||||
out.n_mel = params.n_mel;
|
||||
out.n_len = (n_samples - frame_size) / frame_step + 1;
|
||||
// TODO: handle these checks better
|
||||
if (out.n_mel > 0 && (unsigned long)out.n_len > SIZE_MAX / out.n_mel) {
|
||||
LOG_ERR("%s: size overflow\n", __func__);
|
||||
return false;
|
||||
}
|
||||
if (n_samples < frame_size) {
|
||||
LOG_ERR("%s: not enough samples after padding\n", __func__);
|
||||
return false;
|
||||
}
|
||||
out.data.resize(out.n_mel * out.n_len);
|
||||
|
||||
{
|
||||
std::vector<std::thread> workers(n_threads - 1);
|
||||
for (int iw = 0; iw < n_threads - 1; ++iw) {
|
||||
workers[iw] =
|
||||
std::thread(log_mel_spectrogram_worker_thread, iw + 1, hann, std::cref(samples_padded), n_samples,
|
||||
frame_size, frame_step, n_threads, std::cref(params), std::cref(cache), std::ref(out));
|
||||
}
|
||||
|
||||
// main thread
|
||||
log_mel_spectrogram_worker_thread(0, hann, samples_padded, n_samples, frame_size, frame_step, n_threads, params,
|
||||
cache, out);
|
||||
for (int iw = 0; iw < n_threads - 1; ++iw) {
|
||||
workers[iw].join();
|
||||
}
|
||||
}
|
||||
|
||||
const int effective_n_len = n_samples_in / frame_step;
|
||||
if (params.norm_per_feature) {
|
||||
for (int i = 0; i < out.n_mel; i++) {
|
||||
double mean = 0;
|
||||
for (int j = 0; j < effective_n_len; ++j) {
|
||||
mean += out.data[i * out.n_len + j];
|
||||
}
|
||||
mean /= effective_n_len;
|
||||
|
||||
double var = 0.0;
|
||||
for (int j = 0; j < effective_n_len; ++j) {
|
||||
const double value = out.data[i * out.n_len + j] - mean;
|
||||
var += value * value;
|
||||
}
|
||||
var /= effective_n_len - 1; // unbiased
|
||||
const double mstd = std::sqrt(var + 1e-5);
|
||||
|
||||
for (int j = 0; j < effective_n_len; ++j) {
|
||||
auto &value = out.data[i * out.n_len + j];
|
||||
value = (value - mean) / mstd;
|
||||
}
|
||||
|
||||
// pad the rest with zeros
|
||||
for (int j = effective_n_len; j < out.n_len; ++j) {
|
||||
out.data[i * out.n_len + j] = 0.0;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// clamping and normalization
|
||||
double mmax = -1e20;
|
||||
for (int i = 0; i < out.n_mel*out.n_len; i++) {
|
||||
if (out.data[i] > mmax) {
|
||||
mmax = out.data[i];
|
||||
}
|
||||
}
|
||||
|
||||
mmax -= 8.0;
|
||||
|
||||
for (int i = 0; i < out.n_mel*out.n_len; i++) {
|
||||
if (out.data[i] < mmax) {
|
||||
out.data[i] = mmax;
|
||||
}
|
||||
out.data[i] = (out.data[i] + 4.0)/4.0;
|
||||
}
|
||||
}
|
||||
|
||||
// Dump log_mel_spectrogram
|
||||
if (DEBUG) {
|
||||
std::ofstream outFile("log_mel_spectrogram.json");
|
||||
outFile << "[";
|
||||
for (uint64_t i = 0; i < out.data.size() - 1; i++) {
|
||||
outFile << out.data[i] << ", ";
|
||||
}
|
||||
outFile << out.data[out.data.size() - 1] << "]";
|
||||
outFile.close();
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
//
|
||||
// mtmd_audio_preprocessor_whisper
|
||||
//
|
||||
|
||||
void mtmd_audio_preprocessor_whisper::initialize() {
|
||||
cache.fill_sin_cos_table(hparams.audio_n_fft);
|
||||
cache.fill_hann_window(hparams.audio_window_len, true);
|
||||
cache.fill_mel_filterbank_matrix(hparams.n_mel_bins, hparams.audio_n_fft, hparams.audio_sample_rate);
|
||||
}
|
||||
|
||||
bool mtmd_audio_preprocessor_whisper::preprocess(const float * samples,
|
||||
size_t n_samples,
|
||||
std::vector<mtmd_audio_mel> & output) {
|
||||
if (n_samples == 0) {
|
||||
// empty audio
|
||||
return false;
|
||||
}
|
||||
|
||||
std::vector<float> smpl;
|
||||
// if input is too short, pad with zeros
|
||||
// this is to avoid potential issues with stage1/2 padding in log_mel_spectrogram
|
||||
// TODO: maybe handle this better
|
||||
size_t min_samples = (size_t) hparams.audio_sample_rate * (hparams.audio_chunk_len + 1); // +1 second margin
|
||||
if (n_samples < min_samples) {
|
||||
smpl.resize(min_samples, 0.0f);
|
||||
std::memcpy(smpl.data(), samples, n_samples * sizeof(float));
|
||||
samples = smpl.data();
|
||||
n_samples = smpl.size();
|
||||
}
|
||||
|
||||
filter_params params;
|
||||
params.n_mel = hparams.n_mel_bins;
|
||||
params.n_fft_bins = 1 + (hparams.audio_n_fft / 2);
|
||||
params.hann_window_size = hparams.audio_window_len;
|
||||
params.hop_length = hparams.audio_hop_len;
|
||||
params.sample_rate = hparams.audio_sample_rate;
|
||||
params.center_padding = false;
|
||||
params.preemph = 0.0f; // disabled
|
||||
params.use_natural_log = false;
|
||||
params.norm_per_feature = false;
|
||||
|
||||
// make sure the cache is initialized
|
||||
GGML_ASSERT(!cache.sin_vals.empty());
|
||||
GGML_ASSERT(!cache.cos_vals.empty());
|
||||
GGML_ASSERT(!cache.filters.data.empty());
|
||||
|
||||
mtmd_audio_mel out_full;
|
||||
bool ok = log_mel_spectrogram(samples, n_samples,
|
||||
4, // n_threads
|
||||
params, cache, out_full);
|
||||
if (!ok) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// because the cgraph in clip.cpp only accepts 3000 frames each, we need to split the mel
|
||||
// we always expect the mel to have 3000 silent frames at the end
|
||||
if (DEBUG) {
|
||||
printf("output: n_mel = %d, n_len = %d\n", out_full.n_mel, out_full.n_len);
|
||||
}
|
||||
const size_t frames_per_chunk = 3000;
|
||||
GGML_ASSERT((size_t) out_full.n_len > frames_per_chunk);
|
||||
for (size_t off = 0; off < (size_t) out_full.n_len; off += frames_per_chunk) {
|
||||
int n_len = std::min(frames_per_chunk, (size_t) out_full.n_len - off);
|
||||
if ((size_t) n_len < frames_per_chunk) {
|
||||
break; // last uncomplete chunk will always be a padded chunk, safe to ignore
|
||||
}
|
||||
|
||||
mtmd_audio_mel out_chunk;
|
||||
out_chunk.n_len = n_len;
|
||||
out_chunk.n_mel = out_full.n_mel;
|
||||
out_chunk.n_len_org = out_full.n_mel; // unused
|
||||
out_chunk.data.reserve(out_chunk.n_mel * out_chunk.n_len);
|
||||
|
||||
for (int i = 0; i < out_full.n_mel; i++) {
|
||||
auto src = out_full.data.begin() + i * out_full.n_len + off;
|
||||
out_chunk.data.insert(out_chunk.data.end(), src, src + frames_per_chunk);
|
||||
}
|
||||
|
||||
output.push_back(std::move(out_chunk));
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
//
|
||||
// mtmd_audio_preprocessor_conformer
|
||||
//
|
||||
|
||||
void mtmd_audio_preprocessor_conformer::initialize() {
|
||||
cache.fill_sin_cos_table(hparams.audio_n_fft);
|
||||
cache.fill_hann_window(hparams.audio_window_len, true);
|
||||
cache.fill_mel_filterbank_matrix(hparams.n_mel_bins, hparams.audio_n_fft, hparams.audio_sample_rate);
|
||||
}
|
||||
|
||||
bool mtmd_audio_preprocessor_conformer::preprocess(const float * samples,
|
||||
size_t n_samples,
|
||||
std::vector<mtmd_audio_mel> & output) {
|
||||
// empty audio
|
||||
if (n_samples == 0) {
|
||||
return false;
|
||||
}
|
||||
|
||||
filter_params params;
|
||||
params.n_mel = hparams.n_mel_bins;
|
||||
params.n_fft_bins = 1 + (hparams.audio_n_fft / 2);
|
||||
params.hann_window_size = hparams.audio_window_len;
|
||||
params.hop_length = hparams.audio_hop_len;
|
||||
params.sample_rate = hparams.audio_sample_rate;
|
||||
params.center_padding = true;
|
||||
params.preemph = 0.97f;
|
||||
params.use_natural_log = true;
|
||||
params.norm_per_feature = true;
|
||||
|
||||
// make sure the cache is initialized
|
||||
GGML_ASSERT(!cache.sin_vals.empty());
|
||||
GGML_ASSERT(!cache.cos_vals.empty());
|
||||
GGML_ASSERT(!cache.filters.data.empty());
|
||||
|
||||
mtmd_audio_mel out_full;
|
||||
bool ok = log_mel_spectrogram(samples, n_samples,
|
||||
4, // n_threads
|
||||
params, cache, out_full);
|
||||
if (!ok) {
|
||||
return false;
|
||||
}
|
||||
|
||||
output.push_back(std::move(out_full));
|
||||
return true;
|
||||
}
|
||||
|
||||
//
|
||||
// mtmd_audio_streaming_istft implementation
|
||||
//
|
||||
|
||||
mtmd_audio_streaming_istft::mtmd_audio_streaming_istft(int n_fft, int hop_length) :
|
||||
n_fft(n_fft),
|
||||
hop_length(hop_length),
|
||||
n_fft_bins(n_fft / 2 + 1),
|
||||
overlap_buffer(n_fft, 0.0f),
|
||||
window_sum_buffer(n_fft, 0.0f),
|
||||
padding_to_remove((n_fft - hop_length) / 2),
|
||||
ifft_in(n_fft * 2 * 4, 0.0f), // extra space for recursive IFFT
|
||||
ifft_out(n_fft * 2 * 4, 0.0f) {
|
||||
cache.fill_sin_cos_table(n_fft);
|
||||
cache.fill_hann_window(n_fft, true);
|
||||
}
|
||||
|
||||
void mtmd_audio_streaming_istft::reset() {
|
||||
std::fill(overlap_buffer.begin(), overlap_buffer.end(), 0.0f);
|
||||
std::fill(window_sum_buffer.begin(), window_sum_buffer.end(), 0.0f);
|
||||
padding_to_remove = (n_fft - hop_length) / 2;
|
||||
}
|
||||
|
||||
std::vector<float> mtmd_audio_streaming_istft::process_frame(const float * frame_spectrum) {
|
||||
std::vector<float> output(hop_length);
|
||||
|
||||
// copy frequencies
|
||||
for (int j = 0; j < n_fft_bins; j++) {
|
||||
ifft_in[j * 2 + 0] = frame_spectrum[j * 2 + 0];
|
||||
ifft_in[j * 2 + 1] = frame_spectrum[j * 2 + 1];
|
||||
}
|
||||
|
||||
// mirror negative frequencies
|
||||
for (int j = 1; j < n_fft_bins - 1; j++) {
|
||||
int mirror_idx = n_fft - j;
|
||||
ifft_in[mirror_idx * 2 + 0] = ifft_in[j * 2 + 0];
|
||||
ifft_in[mirror_idx * 2 + 1] = -ifft_in[j * 2 + 1]; // conjugate
|
||||
}
|
||||
|
||||
ifft(cache, ifft_in.data(), n_fft, ifft_out.data());
|
||||
|
||||
// update window sum and overlap buffer
|
||||
for (int j = 0; j < n_fft; j++) {
|
||||
window_sum_buffer[j] += cache.hann_window[j] * cache.hann_window[j];
|
||||
overlap_buffer[j] += ifft_out[j * 2] * cache.hann_window[j];
|
||||
}
|
||||
|
||||
// extract hop_length samples with normalization
|
||||
for (int i = 0; i < hop_length; i++) {
|
||||
if (window_sum_buffer[i] > 1e-8f) {
|
||||
output[i] = overlap_buffer[i] / window_sum_buffer[i];
|
||||
} else {
|
||||
output[i] = overlap_buffer[i];
|
||||
}
|
||||
}
|
||||
|
||||
// shift buffers left by hop_length
|
||||
std::copy(overlap_buffer.begin() + hop_length, overlap_buffer.end(), overlap_buffer.begin());
|
||||
std::fill(overlap_buffer.end() - hop_length, overlap_buffer.end(), 0.0f);
|
||||
|
||||
std::copy(window_sum_buffer.begin() + hop_length, window_sum_buffer.end(), window_sum_buffer.begin());
|
||||
std::fill(window_sum_buffer.end() - hop_length, window_sum_buffer.end(), 0.0f);
|
||||
|
||||
// Remove padding if needed
|
||||
int to_remove = std::min(padding_to_remove, (int) output.size());
|
||||
padding_to_remove -= to_remove;
|
||||
output.erase(output.begin(), output.begin() + to_remove);
|
||||
|
||||
return output;
|
||||
}
|
||||
|
||||
std::vector<float> mtmd_audio_streaming_istft::flush() {
|
||||
std::vector<float> output;
|
||||
|
||||
// Extract remaining samples from overlap buffer
|
||||
// Continue until we've extracted all meaningful samples
|
||||
int remaining = n_fft - hop_length;
|
||||
while (remaining > 0) {
|
||||
int chunk_size = std::min(remaining, hop_length);
|
||||
|
||||
for (int i = 0; i < chunk_size; i++) {
|
||||
float sample;
|
||||
if (window_sum_buffer[i] > 1e-8f) {
|
||||
sample = overlap_buffer[i] / window_sum_buffer[i];
|
||||
} else {
|
||||
sample = overlap_buffer[i];
|
||||
}
|
||||
output.push_back(sample);
|
||||
}
|
||||
|
||||
// Shift buffers
|
||||
std::copy(overlap_buffer.begin() + chunk_size, overlap_buffer.end(), overlap_buffer.begin());
|
||||
std::fill(overlap_buffer.end() - chunk_size, overlap_buffer.end(), 0.0f);
|
||||
|
||||
std::copy(window_sum_buffer.begin() + chunk_size, window_sum_buffer.end(), window_sum_buffer.begin());
|
||||
std::fill(window_sum_buffer.end() - chunk_size, window_sum_buffer.end(), 0.0f);
|
||||
|
||||
remaining -= chunk_size;
|
||||
}
|
||||
|
||||
return output;
|
||||
}
|
||||
@@ -0,0 +1,113 @@
|
||||
#pragma once
|
||||
|
||||
#include "ggml.h"
|
||||
#include "clip-model.h"
|
||||
|
||||
#include <cstdint>
|
||||
#include <vector>
|
||||
#include <string>
|
||||
|
||||
#define MTMD_INTERNAL_HEADER
|
||||
|
||||
struct mtmd_audio_mel {
|
||||
int n_len;
|
||||
int n_len_org;
|
||||
int n_mel;
|
||||
|
||||
std::vector<float> data;
|
||||
};
|
||||
|
||||
struct mtmd_audio_mel_filters {
|
||||
int32_t n_mel;
|
||||
int32_t n_fft;
|
||||
|
||||
std::vector<float> data;
|
||||
};
|
||||
|
||||
// cache for audio processing, each processor instance owns its own cache
|
||||
struct mtmd_audio_cache {
|
||||
std::vector<float> sin_vals;
|
||||
std::vector<float> cos_vals;
|
||||
|
||||
std::vector<float> hann_window;
|
||||
|
||||
mtmd_audio_mel_filters filters;
|
||||
|
||||
void fill_sin_cos_table(int n);
|
||||
|
||||
void fill_hann_window(int length, bool periodic);
|
||||
|
||||
// Build mel filterbank matrix [n_mel × n_fft_bins] at runtime.
|
||||
// n_fft_bins must be (N_fft / 2 + 1). Example: if N_fft=512 -> n_fft_bins=257.
|
||||
void fill_mel_filterbank_matrix(int n_mel,
|
||||
int n_fft,
|
||||
int sample_rate, // e.g. 16000
|
||||
float fmin = 0.0f, // e.g. 0.0
|
||||
float fmax = -1.0f, // e.g. sr/2; pass -1 for auto
|
||||
bool slaney_area_norm = true,
|
||||
float scale = 1.0f // optional extra scaling
|
||||
);
|
||||
};
|
||||
|
||||
struct mtmd_audio_preprocessor {
|
||||
const clip_hparams & hparams;
|
||||
|
||||
mtmd_audio_preprocessor(const clip_ctx * ctx): hparams(*clip_get_hparams(ctx)) {}
|
||||
|
||||
virtual ~mtmd_audio_preprocessor() = default;
|
||||
virtual void initialize() = 0; // NOT thread-safe
|
||||
virtual bool preprocess(const float * samples, size_t n_samples, std::vector<mtmd_audio_mel> & output) = 0;
|
||||
};
|
||||
|
||||
struct mtmd_audio_preprocessor_whisper : mtmd_audio_preprocessor {
|
||||
mtmd_audio_preprocessor_whisper(const clip_ctx * ctx) : mtmd_audio_preprocessor(ctx) {}
|
||||
void initialize() override;
|
||||
bool preprocess(const float * samples, size_t n_samples, std::vector<mtmd_audio_mel> & output) override;
|
||||
|
||||
private:
|
||||
mtmd_audio_cache cache;
|
||||
};
|
||||
|
||||
struct mtmd_audio_preprocessor_conformer : mtmd_audio_preprocessor {
|
||||
mtmd_audio_preprocessor_conformer(const clip_ctx * ctx) : mtmd_audio_preprocessor(ctx) {}
|
||||
void initialize() override;
|
||||
bool preprocess(const float * samples, size_t n_samples, std::vector<mtmd_audio_mel> & output) override;
|
||||
|
||||
private:
|
||||
mtmd_audio_cache cache;
|
||||
};
|
||||
|
||||
//
|
||||
// streaming ISTFT - converts spectrogram frames back to audio one frame at a time
|
||||
//
|
||||
struct mtmd_audio_streaming_istft {
|
||||
mtmd_audio_streaming_istft(int n_fft, int hop_length);
|
||||
|
||||
// reset streaming state
|
||||
void reset();
|
||||
|
||||
// process a single STFT frame (streaming)
|
||||
// frame_spectrum: [n_fft_bins x 2] interleaved real/imag
|
||||
// returns: up to hop_length samples
|
||||
std::vector<float> process_frame(const float * frame_spectrum);
|
||||
|
||||
// flush remaining samples at end of stream
|
||||
std::vector<float> flush();
|
||||
|
||||
private:
|
||||
int n_fft;
|
||||
int hop_length;
|
||||
int n_fft_bins;
|
||||
|
||||
// Own cache for output processing
|
||||
mtmd_audio_cache cache;
|
||||
|
||||
// Streaming state
|
||||
std::vector<float> overlap_buffer;
|
||||
std::vector<float> window_sum_buffer;
|
||||
int padding_to_remove;
|
||||
|
||||
// Working buffers for IFFT
|
||||
std::vector<float> ifft_in;
|
||||
std::vector<float> ifft_out;
|
||||
};
|
||||
@@ -0,0 +1,430 @@
|
||||
#include "arg.h"
|
||||
#include "log.h"
|
||||
#include "common.h"
|
||||
#include "sampling.h"
|
||||
#include "llama.h"
|
||||
#include "ggml.h"
|
||||
#include "console.h"
|
||||
#include "chat.h"
|
||||
#include "mtmd.h"
|
||||
#include "mtmd-helper.h"
|
||||
|
||||
#include <vector>
|
||||
#include <limits.h>
|
||||
#include <cinttypes>
|
||||
|
||||
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
|
||||
#include <signal.h>
|
||||
#include <unistd.h>
|
||||
#elif defined (_WIN32)
|
||||
#define WIN32_LEAN_AND_MEAN
|
||||
#ifndef NOMINMAX
|
||||
#define NOMINMAX
|
||||
#endif
|
||||
#include <windows.h>
|
||||
#include <signal.h>
|
||||
#endif
|
||||
|
||||
// volatile, because of signal being an interrupt
|
||||
static volatile bool g_is_generating = false;
|
||||
static volatile bool g_is_interrupted = false;
|
||||
|
||||
/**
|
||||
* Please note that this is NOT a production-ready stuff.
|
||||
* It is a playground for trying multimodal support in llama.cpp.
|
||||
* For contributors: please keep this code simple and easy to understand.
|
||||
*/
|
||||
|
||||
static void show_additional_info(int /*argc*/, char ** argv) {
|
||||
LOG(
|
||||
"Experimental CLI for multimodal\n\n"
|
||||
"Usage: %s [options] -m <model> --mmproj <mmproj> --image <image> --audio <audio> -p <prompt>\n\n"
|
||||
" -m and --mmproj are required\n"
|
||||
" -hf user/repo can replace both -m and --mmproj in most cases\n"
|
||||
" --image, --audio and -p are optional, if NOT provided, the CLI will run in chat mode\n"
|
||||
" to disable using GPU for mmproj model, add --no-mmproj-offload\n",
|
||||
argv[0]
|
||||
);
|
||||
}
|
||||
|
||||
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
|
||||
static void sigint_handler(int signo) {
|
||||
if (signo == SIGINT) {
|
||||
if (g_is_generating) {
|
||||
g_is_generating = false;
|
||||
} else {
|
||||
console::cleanup();
|
||||
if (g_is_interrupted) {
|
||||
_exit(1);
|
||||
}
|
||||
g_is_interrupted = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
struct mtmd_cli_context {
|
||||
mtmd::context_ptr ctx_vision;
|
||||
common_init_result_ptr llama_init;
|
||||
|
||||
llama_model * model;
|
||||
llama_context * lctx;
|
||||
const llama_vocab * vocab;
|
||||
common_sampler * smpl;
|
||||
llama_batch batch;
|
||||
int n_batch;
|
||||
|
||||
mtmd::bitmaps bitmaps;
|
||||
|
||||
// chat template
|
||||
common_chat_templates_ptr tmpls;
|
||||
std::vector<common_chat_msg> chat_history;
|
||||
bool use_jinja = false;
|
||||
// TODO: support for --system-prompt with /clear command
|
||||
|
||||
// support for legacy templates (models not having EOT token)
|
||||
llama_tokens antiprompt_tokens;
|
||||
|
||||
int n_threads = 1;
|
||||
llama_pos n_past = 0;
|
||||
|
||||
mtmd_cli_context(common_params & params) : llama_init(common_init_from_params(params)) {
|
||||
model = llama_init->model();
|
||||
lctx = llama_init->context();
|
||||
vocab = llama_model_get_vocab(model);
|
||||
smpl = common_sampler_init(model, params.sampling);
|
||||
n_threads = params.cpuparams.n_threads;
|
||||
batch = llama_batch_init(1, 0, 1); // batch for next token generation
|
||||
n_batch = params.n_batch;
|
||||
|
||||
if (!model || !lctx) {
|
||||
exit(1);
|
||||
}
|
||||
|
||||
if (!llama_model_chat_template(model, nullptr) && params.chat_template.empty()) {
|
||||
LOG_ERR("Model does not have chat template.\n");
|
||||
LOG_ERR(" For old llava models, you may need to use '--chat-template vicuna'\n");
|
||||
LOG_ERR(" For MobileVLM models, use '--chat-template deepseek'\n");
|
||||
LOG_ERR(" For Mistral Small 3.1, use '--chat-template mistral-v7'\n");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
tmpls = common_chat_templates_init(model, params.chat_template);
|
||||
use_jinja = params.use_jinja;
|
||||
chat_history.clear();
|
||||
LOG_INF("%s: chat template example:\n%s\n", __func__, common_chat_format_example(tmpls.get(), params.use_jinja, params.default_template_kwargs).c_str());
|
||||
|
||||
init_vision_context(params);
|
||||
|
||||
// load antiprompt tokens for legacy templates
|
||||
if (params.chat_template == "vicuna") {
|
||||
antiprompt_tokens = common_tokenize(lctx, "ASSISTANT:", false, true);
|
||||
} else if (params.chat_template == "deepseek") {
|
||||
antiprompt_tokens = common_tokenize(lctx, "###", false, true);
|
||||
}
|
||||
}
|
||||
|
||||
~mtmd_cli_context() {
|
||||
llama_batch_free(batch);
|
||||
common_sampler_free(smpl);
|
||||
}
|
||||
|
||||
void init_vision_context(common_params & params) {
|
||||
const char * clip_path = params.mmproj.path.c_str();
|
||||
mtmd_context_params mparams = mtmd_context_params_default();
|
||||
mparams.use_gpu = params.mmproj_use_gpu;
|
||||
mparams.print_timings = true;
|
||||
mparams.n_threads = params.cpuparams.n_threads;
|
||||
mparams.flash_attn_type = params.flash_attn_type;
|
||||
mparams.warmup = params.warmup;
|
||||
mparams.image_min_tokens = params.image_min_tokens;
|
||||
mparams.image_max_tokens = params.image_max_tokens;
|
||||
ctx_vision.reset(mtmd_init_from_file(clip_path, model, mparams));
|
||||
if (!ctx_vision.get()) {
|
||||
LOG_ERR("Failed to load vision model from %s\n", clip_path);
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
bool check_antiprompt(const llama_tokens & generated_tokens) {
|
||||
if (antiprompt_tokens.empty() || generated_tokens.size() < antiprompt_tokens.size()) {
|
||||
return false;
|
||||
}
|
||||
return std::equal(
|
||||
generated_tokens.end() - antiprompt_tokens.size(),
|
||||
generated_tokens.end(),
|
||||
antiprompt_tokens.begin()
|
||||
);
|
||||
}
|
||||
|
||||
bool load_media(const std::string & fname) {
|
||||
mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_file(ctx_vision.get(), fname.c_str()));
|
||||
if (!bmp.ptr) {
|
||||
return false;
|
||||
}
|
||||
bitmaps.entries.push_back(std::move(bmp));
|
||||
return true;
|
||||
}
|
||||
};
|
||||
|
||||
static int generate_response(mtmd_cli_context & ctx, int n_predict) {
|
||||
llama_tokens generated_tokens;
|
||||
for (int i = 0; i < n_predict; i++) {
|
||||
if (i > n_predict || !g_is_generating || g_is_interrupted) {
|
||||
LOG("\n");
|
||||
break;
|
||||
}
|
||||
|
||||
llama_token token_id = common_sampler_sample(ctx.smpl, ctx.lctx, -1);
|
||||
generated_tokens.push_back(token_id);
|
||||
common_sampler_accept(ctx.smpl, token_id, true);
|
||||
|
||||
if (llama_vocab_is_eog(ctx.vocab, token_id) || ctx.check_antiprompt(generated_tokens)) {
|
||||
LOG("\n");
|
||||
break; // end of generation
|
||||
}
|
||||
|
||||
LOG("%s", common_token_to_piece(ctx.lctx, token_id).c_str());
|
||||
fflush(stdout);
|
||||
|
||||
if (g_is_interrupted) {
|
||||
LOG("\n");
|
||||
break;
|
||||
}
|
||||
|
||||
// eval the token
|
||||
common_batch_clear(ctx.batch);
|
||||
common_batch_add(ctx.batch, token_id, ctx.n_past++, {0}, true);
|
||||
if (llama_decode(ctx.lctx, ctx.batch)) {
|
||||
LOG_ERR("failed to decode token\n");
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
|
||||
std::string generated_text = common_detokenize(ctx.lctx, generated_tokens);
|
||||
common_chat_msg msg;
|
||||
msg.role = "assistant";
|
||||
msg.content = generated_text;
|
||||
ctx.chat_history.push_back(std::move(msg));
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static std::string chat_add_and_format(mtmd_cli_context & ctx, common_chat_msg & new_msg) {
|
||||
LOG_DBG("chat_add_and_format: new_msg.role='%s', new_msg.content='%s'\n",
|
||||
new_msg.role.c_str(), new_msg.content.c_str());
|
||||
auto formatted = common_chat_format_single(ctx.tmpls.get(), ctx.chat_history,
|
||||
new_msg, new_msg.role == "user",
|
||||
ctx.use_jinja);
|
||||
ctx.chat_history.push_back(new_msg);
|
||||
return formatted;
|
||||
}
|
||||
|
||||
static int eval_message(mtmd_cli_context & ctx, common_chat_msg & msg) {
|
||||
bool add_bos = ctx.chat_history.empty();
|
||||
auto formatted_chat = chat_add_and_format(ctx, msg);
|
||||
LOG_DBG("formatted_chat.prompt: %s\n", formatted_chat.c_str());
|
||||
|
||||
mtmd_input_text text;
|
||||
text.text = formatted_chat.c_str();
|
||||
text.add_special = add_bos;
|
||||
text.parse_special = true;
|
||||
|
||||
if (g_is_interrupted) return 0;
|
||||
|
||||
mtmd::input_chunks chunks(mtmd_input_chunks_init());
|
||||
auto bitmaps_c_ptr = ctx.bitmaps.c_ptr();
|
||||
int32_t res = mtmd_tokenize(ctx.ctx_vision.get(),
|
||||
chunks.ptr.get(), // output
|
||||
&text, // text
|
||||
bitmaps_c_ptr.data(),
|
||||
bitmaps_c_ptr.size());
|
||||
if (res != 0) {
|
||||
LOG_ERR("Unable to tokenize prompt, res = %d\n", res);
|
||||
return 1;
|
||||
}
|
||||
|
||||
ctx.bitmaps.entries.clear();
|
||||
|
||||
llama_pos new_n_past;
|
||||
if (mtmd_helper_eval_chunks(ctx.ctx_vision.get(),
|
||||
ctx.lctx, // lctx
|
||||
chunks.ptr.get(), // chunks
|
||||
ctx.n_past, // n_past
|
||||
0, // seq_id
|
||||
ctx.n_batch, // n_batch
|
||||
true, // logits_last
|
||||
&new_n_past)) {
|
||||
LOG_ERR("Unable to eval prompt\n");
|
||||
return 1;
|
||||
}
|
||||
|
||||
ctx.n_past = new_n_past;
|
||||
|
||||
LOG("\n");
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int main(int argc, char ** argv) {
|
||||
ggml_time_init();
|
||||
|
||||
common_params params;
|
||||
|
||||
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_MTMD, show_additional_info)) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
common_init();
|
||||
mtmd_helper_log_set(common_log_default_callback, nullptr);
|
||||
|
||||
if (params.mmproj.path.empty()) {
|
||||
show_additional_info(argc, argv);
|
||||
LOG_ERR("ERR: Missing --mmproj argument\n");
|
||||
return 1;
|
||||
}
|
||||
|
||||
mtmd_cli_context ctx(params);
|
||||
LOG_INF("%s: loading model: %s\n", __func__, params.model.path.c_str());
|
||||
|
||||
bool is_single_turn = !params.prompt.empty() && !params.image.empty();
|
||||
|
||||
int n_predict = params.n_predict < 0 ? INT_MAX : params.n_predict;
|
||||
|
||||
// Ctrl+C handling
|
||||
{
|
||||
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
|
||||
struct sigaction sigint_action;
|
||||
sigint_action.sa_handler = sigint_handler;
|
||||
sigemptyset (&sigint_action.sa_mask);
|
||||
sigint_action.sa_flags = 0;
|
||||
sigaction(SIGINT, &sigint_action, NULL);
|
||||
#elif defined (_WIN32)
|
||||
auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL {
|
||||
return (ctrl_type == CTRL_C_EVENT) ? (sigint_handler(SIGINT), true) : false;
|
||||
};
|
||||
SetConsoleCtrlHandler(reinterpret_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true);
|
||||
#endif
|
||||
}
|
||||
|
||||
if (g_is_interrupted) return 130;
|
||||
|
||||
auto eval_system_prompt_if_present = [&] {
|
||||
if (params.system_prompt.empty()) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
common_chat_msg msg;
|
||||
msg.role = "system";
|
||||
msg.content = params.system_prompt;
|
||||
return eval_message(ctx, msg);
|
||||
};
|
||||
|
||||
LOG_WRN("WARN: This is an experimental CLI for testing multimodal capability.\n");
|
||||
LOG_WRN(" For normal use cases, please use the standard llama-cli\n");
|
||||
|
||||
if (eval_system_prompt_if_present()) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
if (is_single_turn) {
|
||||
g_is_generating = true;
|
||||
if (params.prompt.find(mtmd_default_marker()) == std::string::npos) {
|
||||
for (size_t i = 0; i < params.image.size(); i++) {
|
||||
// most models require the marker before each image
|
||||
// ref: https://github.com/ggml-org/llama.cpp/pull/17616
|
||||
params.prompt = mtmd_default_marker() + params.prompt;
|
||||
}
|
||||
}
|
||||
|
||||
common_chat_msg msg;
|
||||
msg.role = "user";
|
||||
msg.content = params.prompt;
|
||||
for (const auto & image : params.image) {
|
||||
if (!ctx.load_media(image)) {
|
||||
return 1; // error is already printed by libmtmd
|
||||
}
|
||||
}
|
||||
if (eval_message(ctx, msg)) {
|
||||
return 1;
|
||||
}
|
||||
if (!g_is_interrupted && generate_response(ctx, n_predict)) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
} else {
|
||||
LOG("\n Running in chat mode, available commands:");
|
||||
if (mtmd_support_vision(ctx.ctx_vision.get())) {
|
||||
LOG("\n /image <path> load an image");
|
||||
}
|
||||
if (mtmd_support_audio(ctx.ctx_vision.get())) {
|
||||
LOG("\n /audio <path> load an audio");
|
||||
}
|
||||
LOG("\n /clear clear the chat history");
|
||||
LOG("\n /quit or /exit exit the program");
|
||||
LOG("\n");
|
||||
|
||||
std::string content;
|
||||
|
||||
while (!g_is_interrupted) {
|
||||
g_is_generating = false;
|
||||
LOG("\n> ");
|
||||
console::set_display(DISPLAY_TYPE_USER_INPUT);
|
||||
std::string line;
|
||||
console::readline(line, false);
|
||||
if (g_is_interrupted) break;
|
||||
console::set_display(DISPLAY_TYPE_RESET);
|
||||
line = string_strip(line);
|
||||
if (line.empty()) {
|
||||
continue;
|
||||
}
|
||||
if (line == "/quit" || line == "/exit") {
|
||||
break;
|
||||
}
|
||||
if (line == "/clear") {
|
||||
ctx.n_past = 0;
|
||||
ctx.chat_history.clear();
|
||||
llama_memory_clear(llama_get_memory(ctx.lctx), true);
|
||||
if (eval_system_prompt_if_present()) {
|
||||
return 1;
|
||||
}
|
||||
LOG("Chat history cleared\n\n");
|
||||
continue;
|
||||
}
|
||||
g_is_generating = true;
|
||||
bool is_image = line == "/image" || line.find("/image ") == 0;
|
||||
bool is_audio = line == "/audio" || line.find("/audio ") == 0;
|
||||
if (is_image || is_audio) {
|
||||
if (line.size() < 8) {
|
||||
LOG_ERR("ERR: Missing media filename\n");
|
||||
continue;
|
||||
}
|
||||
std::string media_path = line.substr(7);
|
||||
if (ctx.load_media(media_path)) {
|
||||
LOG("%s %s loaded\n", media_path.c_str(), is_image ? "image" : "audio");
|
||||
content += mtmd_default_marker();
|
||||
}
|
||||
// else, error is already printed by libmtmd
|
||||
continue;
|
||||
} else {
|
||||
content += line;
|
||||
}
|
||||
common_chat_msg msg;
|
||||
msg.role = "user";
|
||||
msg.content = content;
|
||||
int ret = eval_message(ctx, msg);
|
||||
if (ret) {
|
||||
return 1;
|
||||
}
|
||||
if (g_is_interrupted) break;
|
||||
if (generate_response(ctx, n_predict)) {
|
||||
return 1;
|
||||
}
|
||||
content.clear();
|
||||
}
|
||||
}
|
||||
if (g_is_interrupted) LOG("\nInterrupted by user\n");
|
||||
LOG("\n\n");
|
||||
llama_perf_context_print(ctx.lctx);
|
||||
return g_is_interrupted ? 130 : 0;
|
||||
}
|
||||
@@ -0,0 +1,521 @@
|
||||
// fix problem with std::min and std::max
|
||||
#if defined(_WIN32)
|
||||
#define WIN32_LEAN_AND_MEAN
|
||||
#ifndef NOMINMAX
|
||||
# define NOMINMAX
|
||||
#endif
|
||||
#include <windows.h>
|
||||
#endif
|
||||
|
||||
#include "mtmd.h"
|
||||
#include "mtmd-helper.h"
|
||||
#include "llama.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <cinttypes>
|
||||
#include <vector>
|
||||
|
||||
//#define MTMD_AUDIO_DEBUG
|
||||
|
||||
#define MINIAUDIO_IMPLEMENTATION
|
||||
#ifndef MTMD_AUDIO_DEBUG
|
||||
# define MA_NO_ENCODING
|
||||
#endif
|
||||
#define MA_NO_DEVICE_IO
|
||||
#define MA_NO_RESOURCE_MANAGER
|
||||
#define MA_NO_NODE_GRAPH
|
||||
#define MA_NO_ENGINE
|
||||
#define MA_NO_GENERATION
|
||||
#define MA_API static
|
||||
#include "miniaudio/miniaudio.h"
|
||||
|
||||
#define STB_IMAGE_IMPLEMENTATION
|
||||
#include "stb/stb_image.h"
|
||||
|
||||
#ifdef MTMD_INTERNAL_HEADER
|
||||
#error "mtmd-helper is a public library outside of mtmd. it must not include internal headers"
|
||||
#endif
|
||||
|
||||
//
|
||||
// internal logging functions
|
||||
//
|
||||
|
||||
struct mtmd_helper_logger {
|
||||
ggml_log_callback default_callback = [](ggml_log_level level, const char * text, void * user_data) {
|
||||
(void) level;
|
||||
(void) user_data;
|
||||
fputs(text, stderr);
|
||||
fflush(stderr);
|
||||
};
|
||||
|
||||
ggml_log_callback log_callback = default_callback;
|
||||
void * log_callback_user_data;
|
||||
|
||||
void log_v(enum ggml_log_level level, const char * format, va_list args) {
|
||||
if (format == NULL) {
|
||||
return;
|
||||
}
|
||||
va_list args_copy;
|
||||
va_copy(args_copy, args);
|
||||
char buffer[128];
|
||||
int len = vsnprintf(buffer, 128, format, args);
|
||||
if (len < 128) {
|
||||
log_callback(level, buffer, log_callback_user_data);
|
||||
} else {
|
||||
char * buffer2 = (char *) calloc(len + 1, sizeof(char));
|
||||
vsnprintf(buffer2, len + 1, format, args_copy);
|
||||
buffer2[len] = 0;
|
||||
log_callback(level, buffer2, log_callback_user_data);
|
||||
free(buffer2);
|
||||
}
|
||||
va_end(args_copy);
|
||||
}
|
||||
|
||||
void log(enum ggml_log_level level, const char * format, ...) {
|
||||
va_list args;
|
||||
va_start(args, format);
|
||||
log_v(level, format, args);
|
||||
va_end(args);
|
||||
}
|
||||
} g_logger;
|
||||
|
||||
#define LOG_INF(...) g_logger.log(GGML_LOG_LEVEL_INFO, __VA_ARGS__)
|
||||
#define LOG_WRN(...) g_logger.log(GGML_LOG_LEVEL_WARN, __VA_ARGS__)
|
||||
#define LOG_ERR(...) g_logger.log(GGML_LOG_LEVEL_ERROR, __VA_ARGS__)
|
||||
|
||||
void mtmd_helper_log_set(ggml_log_callback log_callback, void * user_data) {
|
||||
if (log_callback == nullptr) {
|
||||
log_callback = g_logger.default_callback;
|
||||
}
|
||||
g_logger.log_callback = log_callback;
|
||||
g_logger.log_callback_user_data = user_data;
|
||||
mtmd_log_set(log_callback, user_data);
|
||||
}
|
||||
|
||||
//
|
||||
// helper functions
|
||||
//
|
||||
|
||||
size_t mtmd_helper_get_n_tokens(const mtmd_input_chunks * chunks) {
|
||||
size_t n_tokens = 0;
|
||||
for (size_t i = 0; i < mtmd_input_chunks_size(chunks); i++) {
|
||||
auto chunk = mtmd_input_chunks_get(chunks, i);
|
||||
n_tokens += mtmd_input_chunk_get_n_tokens(chunk);
|
||||
}
|
||||
return n_tokens;
|
||||
}
|
||||
|
||||
llama_pos mtmd_helper_get_n_pos(const mtmd_input_chunks * chunks) {
|
||||
llama_pos n_pos = 0;
|
||||
for (size_t i = 0; i < mtmd_input_chunks_size(chunks); i++) {
|
||||
auto chunk = mtmd_input_chunks_get(chunks, i);
|
||||
n_pos += mtmd_input_chunk_get_n_pos(chunk);
|
||||
}
|
||||
return n_pos;
|
||||
}
|
||||
|
||||
// helper struct to make working with embd batch easier
|
||||
// note: this will be removed after llama_batch_ext refactoring
|
||||
struct decode_embd_batch {
|
||||
int n_pos_per_embd;
|
||||
int n_mmproj_embd;
|
||||
std::vector<llama_pos> pos;
|
||||
std::vector<llama_pos> pos_view; // used by mrope
|
||||
std::vector<int32_t> n_seq_id;
|
||||
std::vector<llama_seq_id> seq_id_0;
|
||||
std::vector<llama_seq_id *> seq_ids;
|
||||
std::vector<int8_t> logits;
|
||||
llama_batch batch;
|
||||
decode_embd_batch(float * embd, int32_t n_tokens, int n_pos_per_embd, int n_mmproj_embd) : n_pos_per_embd(n_pos_per_embd), n_mmproj_embd(n_mmproj_embd) {
|
||||
pos .resize(n_tokens * n_pos_per_embd);
|
||||
n_seq_id.resize(n_tokens);
|
||||
seq_ids .resize(n_tokens + 1);
|
||||
logits .resize(n_tokens);
|
||||
seq_id_0.resize(1);
|
||||
seq_ids [n_tokens] = nullptr;
|
||||
batch = {
|
||||
/*n_tokens =*/ n_tokens,
|
||||
/*tokens =*/ nullptr,
|
||||
/*embd =*/ embd,
|
||||
/*pos =*/ pos.data(),
|
||||
/*n_seq_id =*/ n_seq_id.data(),
|
||||
/*seq_id =*/ seq_ids.data(),
|
||||
/*logits =*/ logits.data(),
|
||||
};
|
||||
}
|
||||
|
||||
void set_position_normal(llama_pos pos_0, llama_seq_id seq_id) {
|
||||
seq_id_0[0] = seq_id;
|
||||
for (int i = 0; i < batch.n_tokens; i++) {
|
||||
batch.pos [i] = pos_0 + i;
|
||||
batch.n_seq_id[i] = 1;
|
||||
batch.seq_id [i] = seq_id_0.data();
|
||||
batch.logits [i] = false;
|
||||
}
|
||||
}
|
||||
|
||||
// M-RoPE for image
|
||||
void set_position_mrope_2d(llama_pos pos_0, int nx, int ny, llama_seq_id seq_id) {
|
||||
GGML_ASSERT(n_pos_per_embd == 4);
|
||||
seq_id_0[0] = seq_id;
|
||||
for (int y = 0; y < ny; y++) {
|
||||
for (int x = 0; x < nx; x++) {
|
||||
int i = y * nx + x;
|
||||
pos[i ] = pos_0;
|
||||
pos[i + batch.n_tokens ] = pos_0 + y;
|
||||
pos[i + batch.n_tokens * 2] = pos_0 + x;
|
||||
pos[i + batch.n_tokens * 3] = 0; // last pos dim is unused
|
||||
}
|
||||
}
|
||||
for (int i = 0; i < batch.n_tokens; i++) {
|
||||
batch.n_seq_id[i] = 1;
|
||||
batch.seq_id [i] = seq_id_0.data();
|
||||
batch.logits [i] = false;
|
||||
}
|
||||
}
|
||||
|
||||
// M-RoPE for audio
|
||||
void set_position_mrope_1d(llama_pos pos_0, llama_seq_id seq_id) {
|
||||
GGML_ASSERT(n_pos_per_embd == 4);
|
||||
seq_id_0[0] = seq_id;
|
||||
for (int i = 0; i < batch.n_tokens; i++) {
|
||||
pos[i ] = pos_0 + i;
|
||||
pos[i + batch.n_tokens ] = pos_0 + i;
|
||||
pos[i + batch.n_tokens * 2] = pos_0 + i;
|
||||
pos[i + batch.n_tokens * 3] = 0; // last pos dim is unused
|
||||
}
|
||||
for (int i = 0; i < batch.n_tokens; i++) {
|
||||
batch.n_seq_id[i] = 1;
|
||||
batch.seq_id [i] = seq_id_0.data();
|
||||
batch.logits [i] = false;
|
||||
}
|
||||
}
|
||||
|
||||
llama_batch get_view(int offset, int n_tokens) {
|
||||
llama_pos * pos_ptr;
|
||||
pos_view.clear();
|
||||
pos_view.reserve(n_tokens * n_pos_per_embd);
|
||||
if (n_pos_per_embd > 1) {
|
||||
// mrope
|
||||
// for example, with layout of src: 1234...1234...1234...1234...
|
||||
// offset 2 will give us dst: 34...34...34...34...
|
||||
for (int i = 0; i < n_pos_per_embd; i++) {
|
||||
// assume n_tokens is less than or equal to batch.n_tokens
|
||||
// batch.n_tokens is number of **total** tokens
|
||||
// n_tokens is number of viewed token
|
||||
size_t src_idx = i * batch.n_tokens + offset;
|
||||
pos_view.insert(pos_view.end(),
|
||||
pos.data() + src_idx,
|
||||
pos.data() + src_idx + n_tokens);
|
||||
}
|
||||
pos_ptr = pos_view.data();
|
||||
} else {
|
||||
// normal
|
||||
pos_ptr = pos.data() + offset;
|
||||
}
|
||||
return {
|
||||
/*n_tokens =*/ n_tokens,
|
||||
/*tokens =*/ nullptr,
|
||||
/*embd =*/ batch.embd + offset * n_mmproj_embd,
|
||||
/*pos =*/ pos_ptr,
|
||||
/*n_seq_id =*/ batch.n_seq_id + offset,
|
||||
/*seq_id =*/ batch.seq_id + offset,
|
||||
/*logits =*/ batch.logits + offset,
|
||||
};
|
||||
}
|
||||
};
|
||||
|
||||
// Helper function for decoding an image whose embeddings have already been calculated
|
||||
int32_t mtmd_helper_decode_image_chunk(
|
||||
mtmd_context * ctx,
|
||||
struct llama_context * lctx,
|
||||
const mtmd_input_chunk * chunk,
|
||||
float * encoded_embd,
|
||||
llama_pos n_past,
|
||||
llama_seq_id seq_id,
|
||||
int32_t n_batch,
|
||||
llama_pos * new_n_past) {
|
||||
auto chunk_type = mtmd_input_chunk_get_type(chunk);
|
||||
const char * name = chunk_type == MTMD_INPUT_CHUNK_TYPE_IMAGE ? "image" : "audio";
|
||||
if (chunk_type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
|
||||
LOG_ERR("failed to decode chunk: input chunk not of image/audio type\n");
|
||||
return -1;
|
||||
}
|
||||
|
||||
const llama_model * model = llama_get_model(lctx);
|
||||
int n_mmproj_embd = llama_model_n_embd_inp(model);
|
||||
int n_pos_per_embd = mtmd_decode_use_mrope(ctx) ? 4 : 1;
|
||||
|
||||
int32_t n_tokens = mtmd_input_chunk_get_n_tokens(chunk);
|
||||
int32_t i_batch = 0;
|
||||
int32_t n_img_batches = GGML_PAD(n_tokens, n_batch) / n_batch;
|
||||
decode_embd_batch batch_embd(encoded_embd, n_tokens, n_pos_per_embd, n_mmproj_embd);
|
||||
|
||||
if (mtmd_decode_use_mrope(ctx)) {
|
||||
if (chunk_type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
|
||||
const auto image_tokens = mtmd_input_chunk_get_tokens_image(chunk);
|
||||
if (!image_tokens) {
|
||||
LOG_ERR("failed to decode chunk: image tokens are null\n");
|
||||
return -1;
|
||||
}
|
||||
const int nx = mtmd_image_tokens_get_nx(image_tokens);
|
||||
const int ny = mtmd_image_tokens_get_ny(image_tokens);
|
||||
batch_embd.set_position_mrope_2d(n_past, nx, ny, seq_id);
|
||||
} else if (chunk_type == MTMD_INPUT_CHUNK_TYPE_AUDIO) {
|
||||
batch_embd.set_position_mrope_1d(n_past, seq_id);
|
||||
} else {
|
||||
GGML_ABORT("invalid chunk type for M-RoPE");
|
||||
}
|
||||
} else {
|
||||
batch_embd.set_position_normal(n_past, seq_id);
|
||||
}
|
||||
|
||||
if (mtmd_decode_use_non_causal(ctx)) {
|
||||
llama_set_causal_attn(lctx, false);
|
||||
// TODO @ngxson : need to make sure only one image is processed at a time, and n_ubatch must be enough to hold the image
|
||||
}
|
||||
|
||||
while (i_batch < n_img_batches) { // split into batches
|
||||
int pos_offset = i_batch*n_batch;
|
||||
int n_tokens_batch = std::min(n_batch, n_tokens - pos_offset);
|
||||
llama_batch batch_embd_view = batch_embd.get_view(pos_offset, n_tokens_batch);
|
||||
|
||||
LOG_INF("decoding %s batch %d/%d, n_tokens_batch = %d\n", name, i_batch+1, n_img_batches, n_tokens_batch);
|
||||
|
||||
int64_t t1 = ggml_time_ms();
|
||||
int32_t ret = llama_decode(lctx, batch_embd_view);
|
||||
if (ret != 0) {
|
||||
LOG_ERR("failed to decode %s\n", name);
|
||||
llama_set_causal_attn(lctx, true); // restore causal attn
|
||||
return ret;
|
||||
}
|
||||
|
||||
LOG_INF("%s decoded (batch %d/%d) in %" PRId64 " ms\n", name, i_batch+1, n_img_batches, ggml_time_ms() - t1);
|
||||
|
||||
i_batch++;
|
||||
}
|
||||
|
||||
n_past += mtmd_input_chunk_get_n_pos(chunk);
|
||||
*new_n_past = n_past;
|
||||
|
||||
if (mtmd_decode_use_non_causal(ctx)) {
|
||||
llama_set_causal_attn(lctx, true);
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
int32_t mtmd_helper_eval_chunk_single(mtmd_context * ctx,
|
||||
struct llama_context * lctx,
|
||||
const mtmd_input_chunk * chunk,
|
||||
llama_pos n_past,
|
||||
llama_seq_id seq_id,
|
||||
int32_t n_batch,
|
||||
bool logits_last,
|
||||
llama_pos * new_n_past) {
|
||||
int32_t ret;
|
||||
llama_batch text_batch = llama_batch_init(n_batch, 0, 1);
|
||||
auto chunk_type = mtmd_input_chunk_get_type(chunk);
|
||||
|
||||
if (chunk_type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
|
||||
size_t n_tokens;
|
||||
const auto tokens = mtmd_input_chunk_get_tokens_text(chunk, &n_tokens);
|
||||
// LOG_INF("decoding text chunk, n_tokens = %zu\n", n_tokens);
|
||||
size_t i = 0;
|
||||
while (i < n_tokens) { // split into batches
|
||||
text_batch.n_tokens = 0; // clear the batch
|
||||
for (; i < n_tokens && text_batch.n_tokens < n_batch; i++) {
|
||||
int32_t j = text_batch.n_tokens;
|
||||
text_batch.token [j] = tokens[i];
|
||||
text_batch.pos [j] = n_past++;
|
||||
text_batch.n_seq_id[j] = 1;
|
||||
text_batch.seq_id [j][0] = seq_id;
|
||||
text_batch.logits [j] = false;
|
||||
|
||||
text_batch.n_tokens++;
|
||||
}
|
||||
bool is_last_token = (i == n_tokens);
|
||||
if (logits_last && is_last_token) {
|
||||
text_batch.logits[text_batch.n_tokens - 1] = true;
|
||||
}
|
||||
ret = llama_decode(lctx, text_batch);
|
||||
if (ret != 0) {
|
||||
LOG_ERR("failed to decode text\n");
|
||||
llama_batch_free(text_batch);
|
||||
return ret;
|
||||
}
|
||||
*new_n_past += text_batch.n_tokens;
|
||||
}
|
||||
|
||||
} else if (chunk_type == MTMD_INPUT_CHUNK_TYPE_IMAGE || chunk_type == MTMD_INPUT_CHUNK_TYPE_AUDIO) {
|
||||
const char * name = chunk_type == MTMD_INPUT_CHUNK_TYPE_IMAGE ? "image" : "audio";
|
||||
int64_t t0 = ggml_time_ms();
|
||||
|
||||
LOG_INF("encoding %s slice...\n", name);
|
||||
|
||||
ret = mtmd_encode_chunk(ctx, chunk);
|
||||
if (ret != 0) {
|
||||
LOG_ERR("failed to encode %s slice\n", name);
|
||||
llama_batch_free(text_batch);
|
||||
return ret;
|
||||
}
|
||||
|
||||
LOG_INF("%s slice encoded in %" PRId64 " ms\n", name, ggml_time_ms() - t0);
|
||||
|
||||
float * embd = mtmd_get_output_embd(ctx);
|
||||
ret = mtmd_helper_decode_image_chunk(ctx, lctx, chunk, embd, n_past, seq_id, n_batch, new_n_past);
|
||||
if (ret != 0) {
|
||||
LOG_ERR("failed to decode %s\n", name);
|
||||
llama_batch_free(text_batch);
|
||||
return ret;
|
||||
}
|
||||
} else {
|
||||
GGML_ABORT("chunk type not supported");
|
||||
}
|
||||
|
||||
llama_batch_free(text_batch);
|
||||
return 0;
|
||||
}
|
||||
|
||||
int32_t mtmd_helper_eval_chunks(mtmd_context * ctx,
|
||||
struct llama_context * lctx,
|
||||
const mtmd_input_chunks * chunks,
|
||||
llama_pos n_past,
|
||||
llama_seq_id seq_id,
|
||||
int32_t n_batch,
|
||||
bool logits_last,
|
||||
llama_pos * new_n_past) {
|
||||
size_t n_chunks = mtmd_input_chunks_size(chunks);
|
||||
if (n_chunks == 0) {
|
||||
LOG_WRN("no chunks to eval\n");
|
||||
return 0;
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < n_chunks; i++) {
|
||||
bool chunk_logits_last = (i == n_chunks - 1) && logits_last;
|
||||
auto chunk = mtmd_input_chunks_get(chunks, i);
|
||||
|
||||
int32_t res = mtmd_helper_eval_chunk_single(ctx, lctx, chunk, n_past, seq_id, n_batch, chunk_logits_last, &n_past);
|
||||
if (res != 0) {
|
||||
LOG_ERR("failed to eval chunk %zu\n", i);
|
||||
return res;
|
||||
}
|
||||
*new_n_past = n_past;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
namespace audio_helpers {
|
||||
|
||||
static bool is_audio_file(const char * buf, size_t len) {
|
||||
if (len < 12) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// RIFF ref: https://en.wikipedia.org/wiki/Resource_Interchange_File_Format
|
||||
// WAV ref: https://www.mmsp.ece.mcgill.ca/Documents/AudioFormats/WAVE/WAVE.html
|
||||
bool is_wav = memcmp(buf, "RIFF", 4) == 0 && memcmp(buf + 8, "WAVE", 4) == 0;
|
||||
bool is_mp3 = len >= 3 && (
|
||||
memcmp(buf, "ID3", 3) == 0 ||
|
||||
// Check for MPEG sync word (simplified check)
|
||||
((unsigned char)buf[0] == 0xFF && ((unsigned char)buf[1] & 0xE0) == 0xE0)
|
||||
);
|
||||
bool is_flac = memcmp(buf, "fLaC", 4) == 0;
|
||||
|
||||
return is_wav || is_mp3 || is_flac;
|
||||
}
|
||||
|
||||
// returns true if the buffer is a valid audio file
|
||||
static bool decode_audio_from_buf(const unsigned char * buf_in, size_t len, int target_sampler_rate, std::vector<float> & pcmf32_mono) {
|
||||
ma_result result;
|
||||
const int channels = 1;
|
||||
ma_decoder_config decoder_config = ma_decoder_config_init(ma_format_f32, channels, target_sampler_rate);
|
||||
ma_decoder decoder;
|
||||
|
||||
result = ma_decoder_init_memory(buf_in, len, &decoder_config, &decoder);
|
||||
if (result != MA_SUCCESS) {
|
||||
return false;
|
||||
}
|
||||
|
||||
ma_uint64 frame_count;
|
||||
ma_uint64 frames_read;
|
||||
result = ma_decoder_get_length_in_pcm_frames(&decoder, &frame_count);
|
||||
if (result != MA_SUCCESS) {
|
||||
ma_decoder_uninit(&decoder);
|
||||
return false;
|
||||
}
|
||||
|
||||
pcmf32_mono.resize(frame_count);
|
||||
result = ma_decoder_read_pcm_frames(&decoder, pcmf32_mono.data(), frame_count, &frames_read);
|
||||
if (result != MA_SUCCESS) {
|
||||
ma_decoder_uninit(&decoder);
|
||||
return false;
|
||||
}
|
||||
|
||||
#ifdef MTMD_AUDIO_DEBUG
|
||||
// save audio to wav file
|
||||
ma_encoder_config config = ma_encoder_config_init(ma_encoding_format_wav, ma_format_f32, 1, target_sampler_rate);
|
||||
ma_encoder encoder;
|
||||
ma_encoder_init_file("output.wav", &config, &encoder);
|
||||
ma_encoder_write_pcm_frames(&encoder, pcmf32_mono.data(), pcmf32_mono.size(), &frames_read);
|
||||
ma_encoder_uninit(&encoder);
|
||||
#endif
|
||||
|
||||
ma_decoder_uninit(&decoder);
|
||||
return true;
|
||||
}
|
||||
|
||||
} // namespace audio_helpers
|
||||
|
||||
mtmd_bitmap * mtmd_helper_bitmap_init_from_buf(mtmd_context * ctx, const unsigned char * buf, size_t len) {
|
||||
if (audio_helpers::is_audio_file((const char *)buf, len)) {
|
||||
std::vector<float> pcmf32;
|
||||
int bitrate = mtmd_get_audio_bitrate(ctx);
|
||||
if (bitrate < 0) {
|
||||
LOG_ERR("This model does not support audio input\n");
|
||||
return nullptr;
|
||||
}
|
||||
if (!audio_helpers::decode_audio_from_buf(buf, len, bitrate, pcmf32)) {
|
||||
LOG_ERR("Unable to read WAV audio file from buffer\n");
|
||||
return nullptr;
|
||||
}
|
||||
return mtmd_bitmap_init_from_audio(pcmf32.size(), pcmf32.data());
|
||||
}
|
||||
|
||||
// otherwise, we assume it's an image
|
||||
mtmd_bitmap * result = nullptr;
|
||||
{
|
||||
int nx, ny, nc;
|
||||
auto * data = stbi_load_from_memory(buf, len, &nx, &ny, &nc, 3);
|
||||
if (!data) {
|
||||
LOG_ERR("%s: failed to decode image bytes\n", __func__);
|
||||
return nullptr;
|
||||
}
|
||||
result = mtmd_bitmap_init(nx, ny, data);
|
||||
stbi_image_free(data);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
mtmd_bitmap * mtmd_helper_bitmap_init_from_file(mtmd_context * ctx, const char * fname) {
|
||||
std::vector<unsigned char> buf;
|
||||
FILE * f = fopen(fname, "rb");
|
||||
if (!f) {
|
||||
LOG_ERR("Unable to open file %s: %s\n", fname, strerror(errno));
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
fseek(f, 0, SEEK_END);
|
||||
long file_size = ftell(f);
|
||||
fseek(f, 0, SEEK_SET);
|
||||
buf.resize(file_size);
|
||||
|
||||
size_t n_read = fread(buf.data(), 1, file_size, f);
|
||||
fclose(f);
|
||||
if (n_read != (size_t)file_size) {
|
||||
LOG_ERR("Failed to read entire file %s", fname);
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
return mtmd_helper_bitmap_init_from_buf(ctx, buf.data(), buf.size());
|
||||
}
|
||||
@@ -0,0 +1,96 @@
|
||||
#ifndef MTMD_HELPER_H
|
||||
#define MTMD_HELPER_H
|
||||
|
||||
#include "ggml.h"
|
||||
#include "llama.h"
|
||||
#include "mtmd.h"
|
||||
|
||||
#include <stddef.h>
|
||||
#include <stdint.h>
|
||||
#include <stdbool.h>
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
//
|
||||
// libmtmd helper functions
|
||||
//
|
||||
// Please note that these helpers are not guaranteed to be stable.
|
||||
// BREAKING CHANGES are expected.
|
||||
//
|
||||
|
||||
// Set callback for all future logging events.
|
||||
// If this is not called, or NULL is supplied, everything is output on stderr.
|
||||
// Note: this also call mtmd_log_set() internally
|
||||
MTMD_API void mtmd_helper_log_set(ggml_log_callback log_callback, void * user_data);
|
||||
|
||||
// helper function to construct a mtmd_bitmap from a file
|
||||
// it calls mtmd_helper_bitmap_init_from_buf() internally
|
||||
// returns nullptr on failure
|
||||
// this function is thread-safe
|
||||
MTMD_API mtmd_bitmap * mtmd_helper_bitmap_init_from_file(mtmd_context * ctx, const char * fname);
|
||||
|
||||
// helper function to construct a mtmd_bitmap from a buffer containing a file
|
||||
// supported formats:
|
||||
// image: formats supported by stb_image: jpg, png, bmp, gif, etc.
|
||||
// audio: formats supported by miniaudio: wav, mp3, flac
|
||||
// note: audio files will be auto-detected based on magic bytes
|
||||
// returns nullptr on failure
|
||||
// this function is thread-safe
|
||||
MTMD_API mtmd_bitmap * mtmd_helper_bitmap_init_from_buf(mtmd_context * ctx, const unsigned char * buf, size_t len);
|
||||
|
||||
// helper to count the total number of tokens from a list of chunks, useful to keep track of KV cache
|
||||
MTMD_API size_t mtmd_helper_get_n_tokens(const mtmd_input_chunks * chunks);
|
||||
|
||||
// helper to count the total position of tokens from a list of chunks, useful to keep track of n_past
|
||||
// normally, n_pos is equal to n_tokens, but for M-RoPE it is different
|
||||
MTMD_API llama_pos mtmd_helper_get_n_pos(const mtmd_input_chunks * chunks);
|
||||
|
||||
// helper function that automatically:
|
||||
// 1. run llama_decode() on text chunks
|
||||
// 2. run mtmd_encode() on image chunks, then mtmd_get_output_embd() and then llama_decode()
|
||||
// if any of the mtmd_encode() or llama_decode() calls return non-zero, stop and forward the error
|
||||
// otherwise, returns 0 on success
|
||||
// this function is NOT thread-safe
|
||||
MTMD_API int32_t mtmd_helper_eval_chunks(mtmd_context * ctx,
|
||||
struct llama_context * lctx,
|
||||
const mtmd_input_chunks * chunks,
|
||||
llama_pos n_past,
|
||||
llama_seq_id seq_id,
|
||||
int32_t n_batch,
|
||||
bool logits_last,
|
||||
llama_pos * new_n_past);
|
||||
|
||||
// works like mtmd_helper_eval_chunks(), but only for a single chunk
|
||||
// this function is NOT thread-safe
|
||||
MTMD_API int32_t mtmd_helper_eval_chunk_single(mtmd_context * ctx,
|
||||
struct llama_context * lctx,
|
||||
const mtmd_input_chunk * chunk,
|
||||
llama_pos n_past,
|
||||
llama_seq_id seq_id,
|
||||
int32_t n_batch,
|
||||
bool logits_last,
|
||||
llama_pos * new_n_past);
|
||||
|
||||
// helper function to decode an image whose embeddings have already been calculated
|
||||
// this helper will handle batching and pre/post decoding setup (for ex. gemma 3 requires non-causal attention)
|
||||
// ret 0 on success, -1 on chunk not being a valid image chunk, 1 on decode failure
|
||||
MTMD_API int32_t mtmd_helper_decode_image_chunk(mtmd_context * ctx,
|
||||
struct llama_context * lctx,
|
||||
const mtmd_input_chunk * chunk,
|
||||
float * encoded_embd,
|
||||
llama_pos n_past,
|
||||
llama_seq_id seq_id,
|
||||
int32_t n_batch,
|
||||
llama_pos * new_n_past);
|
||||
|
||||
#ifdef __cplusplus
|
||||
} // extern "C"
|
||||
#endif
|
||||
|
||||
//
|
||||
// C++ wrappers
|
||||
//
|
||||
|
||||
#endif
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,315 @@
|
||||
#ifndef MTMD_H
|
||||
#define MTMD_H
|
||||
|
||||
#include "ggml.h"
|
||||
#include "llama.h"
|
||||
|
||||
#include <stddef.h>
|
||||
#include <stdint.h>
|
||||
#include <stdbool.h>
|
||||
|
||||
#ifdef __cplusplus
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <cinttypes>
|
||||
#include <memory>
|
||||
#endif
|
||||
|
||||
/**
|
||||
* libmtmd: A library for multimodal support in llama.cpp.
|
||||
*
|
||||
* WARNING: This API is experimental and subject to many BREAKING CHANGES.
|
||||
* Issues related to API usage may receive lower priority support.
|
||||
*
|
||||
* For the usage, see an example in mtmd-cli.cpp
|
||||
*
|
||||
* For contributors:
|
||||
* - Make sure the C API is aligned with the libllama C API (as in llama.h)
|
||||
* - Do not include model name (e.g., qwen, gemma) in the API, use generic terms instead
|
||||
* - Keep the API minimal, do not expose internal details unless necessary
|
||||
*
|
||||
* IMPORTANT: The mtmd module does NOT accept pull requests that are fully or predominantly AI-generated.
|
||||
* We encourage human contributors to ensure the quality and reliability of the codebase.
|
||||
*/
|
||||
|
||||
#ifdef LLAMA_SHARED
|
||||
# if defined(_WIN32) && !defined(__MINGW32__)
|
||||
# ifdef LLAMA_BUILD
|
||||
# define MTMD_API __declspec(dllexport)
|
||||
# else
|
||||
# define MTMD_API __declspec(dllimport)
|
||||
# endif
|
||||
# else
|
||||
# define MTMD_API __attribute__ ((visibility ("default")))
|
||||
# endif
|
||||
#else
|
||||
# define MTMD_API
|
||||
#endif
|
||||
|
||||
// deprecated marker, use mtmd_default_marker() instead
|
||||
#define MTMD_DEFAULT_IMAGE_MARKER "<__image__>"
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
enum mtmd_input_chunk_type {
|
||||
MTMD_INPUT_CHUNK_TYPE_TEXT,
|
||||
MTMD_INPUT_CHUNK_TYPE_IMAGE,
|
||||
MTMD_INPUT_CHUNK_TYPE_AUDIO,
|
||||
};
|
||||
|
||||
// opaque types
|
||||
struct mtmd_context;
|
||||
struct mtmd_bitmap;
|
||||
struct mtmd_image_tokens;
|
||||
struct mtmd_input_chunk;
|
||||
struct mtmd_input_chunks;
|
||||
|
||||
struct mtmd_input_text {
|
||||
const char * text;
|
||||
bool add_special;
|
||||
bool parse_special;
|
||||
};
|
||||
|
||||
//
|
||||
// C API
|
||||
//
|
||||
|
||||
typedef struct mtmd_context mtmd_context;
|
||||
typedef struct mtmd_bitmap mtmd_bitmap;
|
||||
typedef struct mtmd_image_tokens mtmd_image_tokens;
|
||||
typedef struct mtmd_input_chunk mtmd_input_chunk;
|
||||
typedef struct mtmd_input_chunks mtmd_input_chunks;
|
||||
typedef struct mtmd_input_text mtmd_input_text;
|
||||
|
||||
struct mtmd_context_params {
|
||||
bool use_gpu;
|
||||
bool print_timings;
|
||||
int n_threads;
|
||||
const char * image_marker; // deprecated, use media_marker instead
|
||||
const char * media_marker;
|
||||
enum llama_flash_attn_type flash_attn_type;
|
||||
bool warmup; // whether to run a warmup encode pass after initialization
|
||||
|
||||
// limit number of image tokens, only for vision models with dynamic resolution
|
||||
int image_min_tokens; // minimum number of tokens for image input (default: read from metadata)
|
||||
int image_max_tokens; // maximum number of tokens for image input (default: read from metadata)
|
||||
};
|
||||
|
||||
MTMD_API const char * mtmd_default_marker(void);
|
||||
|
||||
MTMD_API struct mtmd_context_params mtmd_context_params_default(void);
|
||||
|
||||
// initialize the mtmd context
|
||||
// return nullptr on failure
|
||||
MTMD_API mtmd_context * mtmd_init_from_file(const char * mmproj_fname,
|
||||
const struct llama_model * text_model,
|
||||
const struct mtmd_context_params ctx_params);
|
||||
|
||||
MTMD_API void mtmd_free(mtmd_context * ctx);
|
||||
|
||||
// whether we need to set non-causal mask before llama_decode
|
||||
MTMD_API bool mtmd_decode_use_non_causal(mtmd_context * ctx);
|
||||
|
||||
// whether the current model use M-RoPE for llama_decode
|
||||
MTMD_API bool mtmd_decode_use_mrope(mtmd_context * ctx);
|
||||
|
||||
// whether the current model supports vision input
|
||||
MTMD_API bool mtmd_support_vision(mtmd_context * ctx);
|
||||
|
||||
// whether the current model supports audio input
|
||||
MTMD_API bool mtmd_support_audio(mtmd_context * ctx);
|
||||
|
||||
// get audio bitrate in Hz, for example 16000 for Whisper
|
||||
// return -1 if audio is not supported
|
||||
MTMD_API int mtmd_get_audio_bitrate(mtmd_context * ctx);
|
||||
|
||||
// mtmd_bitmap
|
||||
//
|
||||
// if bitmap is image:
|
||||
// length of data must be nx * ny * 3
|
||||
// the data is in RGBRGBRGB... format
|
||||
// if bitmap is audio:
|
||||
// length of data must be n_samples * sizeof(float)
|
||||
// the data is in float format (PCM F32)
|
||||
MTMD_API mtmd_bitmap * mtmd_bitmap_init (uint32_t nx, uint32_t ny, const unsigned char * data);
|
||||
MTMD_API mtmd_bitmap * mtmd_bitmap_init_from_audio(size_t n_samples, const float * data);
|
||||
MTMD_API uint32_t mtmd_bitmap_get_nx (const mtmd_bitmap * bitmap);
|
||||
MTMD_API uint32_t mtmd_bitmap_get_ny (const mtmd_bitmap * bitmap);
|
||||
MTMD_API const unsigned char * mtmd_bitmap_get_data (const mtmd_bitmap * bitmap);
|
||||
MTMD_API size_t mtmd_bitmap_get_n_bytes(const mtmd_bitmap * bitmap);
|
||||
MTMD_API bool mtmd_bitmap_is_audio (const mtmd_bitmap * bitmap);
|
||||
MTMD_API void mtmd_bitmap_free (mtmd_bitmap * bitmap);
|
||||
// bitmap ID is optional, but useful for KV cache tracking
|
||||
// these getters/setters are dedicated functions, so you can for example calculate the hash of the image based on mtmd_bitmap_get_data()
|
||||
MTMD_API const char * mtmd_bitmap_get_id(const mtmd_bitmap * bitmap);
|
||||
MTMD_API void mtmd_bitmap_set_id(mtmd_bitmap * bitmap, const char * id);
|
||||
|
||||
|
||||
// mtmd_input_chunks
|
||||
//
|
||||
// this is simply a list of mtmd_input_chunk
|
||||
// the elements can only be populated via mtmd_tokenize()
|
||||
MTMD_API mtmd_input_chunks * mtmd_input_chunks_init(void);
|
||||
MTMD_API size_t mtmd_input_chunks_size(const mtmd_input_chunks * chunks);
|
||||
MTMD_API const mtmd_input_chunk * mtmd_input_chunks_get (const mtmd_input_chunks * chunks, size_t idx);
|
||||
MTMD_API void mtmd_input_chunks_free(mtmd_input_chunks * chunks);
|
||||
|
||||
// mtmd_input_chunk
|
||||
//
|
||||
// the instance will be constructed via mtmd_tokenize()
|
||||
// it will be freed along with mtmd_input_chunks
|
||||
MTMD_API enum mtmd_input_chunk_type mtmd_input_chunk_get_type (const mtmd_input_chunk * chunk);
|
||||
MTMD_API const llama_token * mtmd_input_chunk_get_tokens_text (const mtmd_input_chunk * chunk, size_t * n_tokens_output);
|
||||
MTMD_API const mtmd_image_tokens * mtmd_input_chunk_get_tokens_image(const mtmd_input_chunk * chunk);
|
||||
MTMD_API size_t mtmd_input_chunk_get_n_tokens (const mtmd_input_chunk * chunk);
|
||||
// returns nullptr for ID on text chunk
|
||||
MTMD_API const char * mtmd_input_chunk_get_id (const mtmd_input_chunk * chunk);
|
||||
// number of temporal positions (equals to max(t,h,w) for M-RoPE; equals to n_tokens otherwise)
|
||||
MTMD_API llama_pos mtmd_input_chunk_get_n_pos (const mtmd_input_chunk * chunk);
|
||||
|
||||
// in case you want to use custom logic to handle the chunk (i.e. KV cache management)
|
||||
// you can move the chunk ownership to your own code by copying it
|
||||
// remember to free the chunk when you are done with it
|
||||
MTMD_API mtmd_input_chunk * mtmd_input_chunk_copy(const mtmd_input_chunk * chunk);
|
||||
MTMD_API void mtmd_input_chunk_free(mtmd_input_chunk * chunk);
|
||||
|
||||
|
||||
// mtmd_image_tokens
|
||||
//
|
||||
// the instance will be constructed via mtmd_tokenize()
|
||||
// it will be freed along with mtmd_input_chunk
|
||||
MTMD_API size_t mtmd_image_tokens_get_n_tokens(const mtmd_image_tokens * image_tokens); // TODO: deprecate
|
||||
MTMD_API size_t mtmd_image_tokens_get_nx (const mtmd_image_tokens * image_tokens);
|
||||
MTMD_API size_t mtmd_image_tokens_get_ny (const mtmd_image_tokens * image_tokens);
|
||||
MTMD_API const char * mtmd_image_tokens_get_id (const mtmd_image_tokens * image_tokens); // TODO: deprecate
|
||||
// number of temporal positions (equals to max(t,h,w) for M-RoPE; equals to n_tokens otherwise)
|
||||
MTMD_API llama_pos mtmd_image_tokens_get_n_pos (const mtmd_image_tokens * image_tokens); // TODO: deprecate
|
||||
|
||||
// tokenize an input text prompt and a list of bitmaps (images/audio)
|
||||
// the prompt must have the input image marker (default: "<__media__>") in it
|
||||
// the default marker is defined by mtmd_default_marker()
|
||||
// the marker will be replaced with the image/audio chunk
|
||||
// for example:
|
||||
// "here is an image: <__media__>\ndescribe it in detail."
|
||||
// this will gives 3 chunks:
|
||||
// 1. "here is an image: <start_of_image>"
|
||||
// 2. (image/audio tokens)
|
||||
// 3. "<end_of_image>\ndescribe it in detail."
|
||||
// number of bitmaps must be equal to the number of markers in the prompt
|
||||
// this function is thread-safe (shared ctx)
|
||||
// return values:
|
||||
// 0 on success
|
||||
// 1 on number of bitmaps not matching the number of markers
|
||||
// 2 on image preprocessing error
|
||||
MTMD_API int32_t mtmd_tokenize(mtmd_context * ctx,
|
||||
mtmd_input_chunks * output,
|
||||
const mtmd_input_text * text,
|
||||
const mtmd_bitmap ** bitmaps,
|
||||
size_t n_bitmaps);
|
||||
|
||||
// returns 0 on success
|
||||
// TODO: deprecate
|
||||
MTMD_API int32_t mtmd_encode(mtmd_context * ctx,
|
||||
const mtmd_image_tokens * image_tokens);
|
||||
|
||||
// returns 0 on success
|
||||
MTMD_API int32_t mtmd_encode_chunk(mtmd_context * ctx,
|
||||
const mtmd_input_chunk * chunk);
|
||||
|
||||
// get output embeddings from the last encode pass
|
||||
// the reading size (in bytes) is equal to:
|
||||
// llama_model_n_embd(model) * mtmd_input_chunk_get_n_tokens(chunk) * sizeof(float)
|
||||
MTMD_API float * mtmd_get_output_embd(mtmd_context * ctx);
|
||||
|
||||
// Set callback for all future logging events.
|
||||
// If this is not called, or NULL is supplied, everything is output on stderr.
|
||||
MTMD_API void mtmd_log_set(ggml_log_callback log_callback, void * user_data);
|
||||
|
||||
/////////////////////////////////////////
|
||||
|
||||
// test function, to be used in test-mtmd-c-api.c
|
||||
MTMD_API mtmd_input_chunks * mtmd_test_create_input_chunks(void);
|
||||
|
||||
#ifdef __cplusplus
|
||||
} // extern "C"
|
||||
#endif
|
||||
|
||||
//
|
||||
// C++ wrappers
|
||||
//
|
||||
|
||||
#ifdef __cplusplus
|
||||
|
||||
namespace mtmd {
|
||||
|
||||
struct mtmd_context_deleter {
|
||||
void operator()(mtmd_context * val) { mtmd_free(val); }
|
||||
};
|
||||
using context_ptr = std::unique_ptr<mtmd_context, mtmd_context_deleter>;
|
||||
|
||||
struct mtmd_bitmap_deleter {
|
||||
void operator()(mtmd_bitmap * val) { mtmd_bitmap_free(val); }
|
||||
};
|
||||
using bitmap_ptr = std::unique_ptr<mtmd_bitmap, mtmd_bitmap_deleter>;
|
||||
|
||||
struct mtmd_input_chunks_deleter {
|
||||
void operator()(mtmd_input_chunks * val) { mtmd_input_chunks_free(val); }
|
||||
};
|
||||
using input_chunks_ptr = std::unique_ptr<mtmd_input_chunks, mtmd_input_chunks_deleter>;
|
||||
|
||||
struct mtmd_input_chunk_deleter {
|
||||
void operator()(mtmd_input_chunk * val) { mtmd_input_chunk_free(val); }
|
||||
};
|
||||
using input_chunk_ptr = std::unique_ptr<mtmd_input_chunk, mtmd_input_chunk_deleter>;
|
||||
|
||||
struct bitmap {
|
||||
bitmap_ptr ptr;
|
||||
bitmap() : ptr(nullptr) {}
|
||||
bitmap(mtmd_bitmap * bitmap) : ptr(bitmap) {}
|
||||
bitmap(bitmap && other) noexcept : ptr(std::move(other.ptr)) {}
|
||||
bitmap(uint32_t nx, uint32_t ny, const unsigned char * data) {
|
||||
ptr.reset(mtmd_bitmap_init(nx, ny, data));
|
||||
}
|
||||
~bitmap() = default;
|
||||
uint32_t nx() { return mtmd_bitmap_get_nx(ptr.get()); }
|
||||
uint32_t ny() { return mtmd_bitmap_get_ny(ptr.get()); }
|
||||
const unsigned char * data() { return mtmd_bitmap_get_data(ptr.get()); }
|
||||
size_t n_bytes() { return mtmd_bitmap_get_n_bytes(ptr.get()); }
|
||||
std::string id() { return mtmd_bitmap_get_id(ptr.get()); }
|
||||
void set_id(const char * id) { mtmd_bitmap_set_id(ptr.get(), id); }
|
||||
};
|
||||
|
||||
struct bitmaps {
|
||||
std::vector<bitmap> entries;
|
||||
~bitmaps() = default;
|
||||
// return list of pointers to mtmd_bitmap
|
||||
// example:
|
||||
// auto bitmaps_c_ptr = bitmaps.c_ptr();
|
||||
// int32_t res = mtmd_tokenize(... bitmaps_c_ptr.data(), bitmaps_c_ptr.size());
|
||||
std::vector<const mtmd_bitmap *> c_ptr() {
|
||||
std::vector<const mtmd_bitmap *> res(entries.size());
|
||||
for (size_t i = 0; i < entries.size(); i++) {
|
||||
res[i] = entries[i].ptr.get();
|
||||
}
|
||||
return res;
|
||||
}
|
||||
};
|
||||
|
||||
struct input_chunks {
|
||||
input_chunks_ptr ptr;
|
||||
input_chunks() = default;
|
||||
input_chunks(mtmd_input_chunks * chunks) : ptr(chunks) {}
|
||||
~input_chunks() = default;
|
||||
size_t size() { return mtmd_input_chunks_size(ptr.get()); }
|
||||
const mtmd_input_chunk * operator[](size_t idx) {
|
||||
return mtmd_input_chunks_get(ptr.get(), idx);
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace mtmd
|
||||
|
||||
#endif
|
||||
|
||||
#endif
|
||||
Reference in New Issue
Block a user