diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index 90c2b7094c7..8e12ed36aec 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -8579,6 +8579,139 @@ def set_gguf_parameters(self):
         self.gguf_writer.add_sliding_window_pattern(sliding_window_pattern)
 
 
+@ModelBase.register("TalkieForCausalLM")
+class TalkieModel(TextModel):
+    """Convert talkie-lm/talkie-1930-13b-{base,it} to GGUF.
+
+    Writes a GPT-2 style BPE vocabulary rebuilt from the checkpoint's
+    tiktoken ``vocab.txt`` and negates the second half of every Q/K head
+    so that NEOX RoPE reproduces the checkpoint's -theta rotation.
+    """
+    model_arch = gguf.MODEL_ARCH.TALKIE
+
+    def _head_dim(self) -> int:
+        """Return ``head_dim`` from hparams, else hidden_size // num_attention_heads."""
+        return self.hparams.get("head_dim") or (
+            self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
+        )
+
+    def set_gguf_parameters(self):
+        """Add vocab size, RoPE dimension count and RMS-norm eps on top of the base keys."""
+        super().set_gguf_parameters()
+        self.gguf_writer.add_vocab_size(self.hparams["vocab_size"])
+        self.gguf_writer.add_rope_dimension_count(self._head_dim())
+        # Match PyTorch F.rms_norm default (effective eps ~0 on bf16).
+        self.gguf_writer.add_layer_norm_rms_eps(1e-9)
+
+    def set_vocab(self):
+        """Write tokens, token types, merges, special ids and the chat template.
+
+        Raises:
+            FileNotFoundError: ``vocab.txt`` is missing from the model directory.
+            ValueError: ``vocab_size`` is too small to contain ``<|endoftext|>``.
+        """
+        # Read tiktoken vocab.txt directly (no tokenizer.json upstream).
+        from tiktoken.load import load_tiktoken_bpe
+
+        vocab_path = self.dir_model / "vocab.txt"
+        if not vocab_path.exists():
+            raise FileNotFoundError(f"vocab.txt not found at {vocab_path}")
+
+        # IT specials at fixed ids; base only uses <|endoftext|>.
+        special_tokens = {
+            "<|endoftext|>": 65535,
+            "<|end|>": 65536,
+            "<|user|>": 65537,
+            "<|assistant|>": 65538,
+            "<|system|>": 65539,
+        }
+        endoftext_id = special_tokens["<|endoftext|>"]
+        vocab_size = self.hparams["vocab_size"]
+        # The unk/pad ids and the eos fallback below all use <|endoftext|>, so it must exist.
+        if vocab_size <= endoftext_id:
+            raise ValueError(
+                f"vocab_size {vocab_size} leaves no room for <|endoftext|> (id {endoftext_id})"
+            )
+        special_tokens = {k: v for k, v in special_tokens.items() if v < vocab_size}
+        special_ids = set(special_tokens.values())
+
+        mergeable_ranks = load_tiktoken_bpe(str(vocab_path))
+        # Drop ranks >= 65535 (specials live there). See tokenizer.py:54.
+        mergeable_ranks = {k: v for k, v in mergeable_ranks.items() if v < endoftext_id}
+
+        # Reverse-engineer merges via QwenModel helpers.
+        merges: list[str] = []
+        vocab: dict[str, int] = {}
+        for token_bytes, rank in mergeable_ranks.items():
+            vocab[QwenModel.token_bytes_to_string(token_bytes)] = rank
+            if len(token_bytes) == 1:
+                continue
+            merged = QwenModel.bpe(mergeable_ranks, token_bytes, max_rank=rank)
+            if len(merged) == 2:
+                merges.append(
+                    " ".join(map(QwenModel.token_bytes_to_string, merged))
+                )
+
+        reverse_vocab = {idx: tok for tok, idx in {**vocab, **special_tokens}.items()}
+        tokens: list[str] = []
+        toktypes: list[int] = []
+        for i in range(vocab_size):
+            tok = reverse_vocab.get(i)
+            if tok is None:
+                # Ids covered by neither vocab.txt nor the specials become padding.
+                tokens.append(f"[PAD{i}]")
+                toktypes.append(gguf.TokenType.UNUSED)
+                continue
+            tokens.append(tok)
+            toktypes.append(
+                gguf.TokenType.CONTROL if i in special_ids else gguf.TokenType.NORMAL
+            )
+
+        self.gguf_writer.add_tokenizer_model("gpt2")
+        self.gguf_writer.add_tokenizer_pre("talkie")
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_types(toktypes)
+        self.gguf_writer.add_token_merges(merges)
+
+        eos_id = special_tokens.get("<|end|>", endoftext_id)
+        self.gguf_writer.add_eos_token_id(eos_id)
+        self.gguf_writer.add_eot_token_id(eos_id)
+        self.gguf_writer.add_unk_token_id(endoftext_id)
+        self.gguf_writer.add_pad_token_id(endoftext_id)
+        self.gguf_writer.add_add_bos_token(False)
+        self.gguf_writer.add_add_eos_token(False)
+
+        chat_template = (
+            "{% for m in messages %}<|{{ m.role }}|>{{ m.content }}<|end|>{% endfor %}"
+            "{% if add_generation_prompt %}<|assistant|>{% endif %}"
+        )
+        self.gguf_writer.add_chat_template(chat_template)
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        """Return ``[(gguf_name, tensor)]`` for one checkpoint tensor.
+
+        Q/K projection weights get the second half of each head negated;
+        names without a ``.weight``/``.bias`` suffix get ``.weight`` appended
+        before being passed to ``map_tensor_name``.
+        """
+        is_q = name.endswith("self_attn.q_proj.weight")
+        is_k = name.endswith("self_attn.k_proj.weight")
+        if is_q or is_k:
+            # RoPE -theta: negate second half of head_dim of W_q/W_k so NEOX matches.
+            head_dim = self._head_dim()
+            n_head = self.hparams["num_attention_heads"]
+            if is_k:
+                # K rows are grouped per KV head; that count can differ from n_head (GQA).
+                n_head = self.hparams.get("num_key_value_heads") or n_head
+            w = data_torch.view(n_head, head_dim, data_torch.shape[-1]).clone()
+            w[:, head_dim // 2 :, :] = -w[:, head_dim // 2 :, :]
+            data_torch = w.view(n_head * head_dim, -1)
+
+        # Raw nn.Parameter scalars have no .weight suffix; add one for map_tensor_name.
+        canonical = name if name.endswith((".weight", ".bias")) else name + ".weight"
+        return [(self.map_tensor_name(canonical), data_torch)]
+
+
 @ModelBase.register("OlmoeForCausalLM")
 class OlmoeModel(TextModel):
     model_arch = gguf.MODEL_ARCH.OLMOE
diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py
index 83ae51ce9ce..9276d9ff535 100644
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -437,6 +437,7 @@ class MODEL_ARCH(IntEnum):
     OLMO = auto()
     OLMO2 = auto()
     OLMOE = auto()
+    TALKIE = auto()
     OPENELM = auto()
     ARCTIC = auto()
     DEEPSEEK = auto()
@@ -566,6 +567,11 @@ class MODEL_TENSOR(IntEnum):
     ATTN_K_NORM = auto()
     LAYER_OUT_NORM = auto()
     LAYER_OUT_SCALE = auto()
+    ATTN_HEAD_GAIN = auto() # talkie
+    ATTN_ACT_GAIN = auto() # talkie
+    FFN_ACT_GAIN = auto() # talkie
+    EMBED_SKIP_SCALE = auto() # talkie
+    LM_HEAD_GAIN = auto() # talkie
     PER_LAYER_TOKEN_EMBD = auto() # gemma3n
     PER_LAYER_MODEL_PROJ = auto() # gemma3n
     PER_LAYER_INP_GATE = auto() # gemma3n
@@ -923,6 +929,7 @@ class MODEL_TENSOR(IntEnum):
     MODEL_ARCH.OLMO: "olmo",
     MODEL_ARCH.OLMO2: "olmo2",
     MODEL_ARCH.OLMOE: "olmoe",
+    MODEL_ARCH.TALKIE: "talkie",
     MODEL_ARCH.OPENELM: "openelm",
     MODEL_ARCH.ARCTIC: "arctic",
     MODEL_ARCH.DEEPSEEK: "deepseek",
@@ -1021,6 +1028,11 @@ class MODEL_TENSOR(IntEnum):
     MODEL_TENSOR.ATTN_GATE: "blk.{bid}.attn_gate",
     MODEL_TENSOR.ATTN_Q_NORM: "blk.{bid}.attn_q_norm",
     MODEL_TENSOR.ATTN_K_NORM: "blk.{bid}.attn_k_norm",
+    MODEL_TENSOR.ATTN_HEAD_GAIN: "blk.{bid}.attn_head_gain",
+    MODEL_TENSOR.ATTN_ACT_GAIN: "blk.{bid}.attn_act_gain",
+    MODEL_TENSOR.FFN_ACT_GAIN: "blk.{bid}.ffn_act_gain",
+    MODEL_TENSOR.EMBED_SKIP_SCALE: "blk.{bid}.embed_skip_scale",
+    MODEL_TENSOR.LM_HEAD_GAIN: "lm_head_gain",
     MODEL_TENSOR.ATTN_OUT_NORM: "blk.{bid}.attn_output_norm",
     MODEL_TENSOR.ATTN_POST_NORM: "blk.{bid}.post_attention_norm",
     MODEL_TENSOR.FFN_GATE_INP: "blk.{bid}.ffn_gate_inp",
@@ -2663,6 +2675,22 @@ class MODEL_TENSOR(IntEnum):
         MODEL_TENSOR.FFN_DOWN,
         MODEL_TENSOR.FFN_UP,
     ],
+    MODEL_ARCH.TALKIE: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ATTN_Q,
+        MODEL_TENSOR.ATTN_K,
+        MODEL_TENSOR.ATTN_V,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.FFN_GATE,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+        MODEL_TENSOR.ATTN_HEAD_GAIN,
+        MODEL_TENSOR.ATTN_ACT_GAIN,
+        MODEL_TENSOR.FFN_ACT_GAIN,
+        MODEL_TENSOR.EMBED_SKIP_SCALE,
+        MODEL_TENSOR.LM_HEAD_GAIN,
+    ],
     MODEL_ARCH.SEED_OSS: [
         MODEL_TENSOR.TOKEN_EMBD,
         MODEL_TENSOR.ATTN_NORM,
diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py
index 01a9b236000..204626a6148 100644
--- a/gguf-py/gguf/tensor_mapping.py
+++ b/gguf-py/gguf/tensor_mapping.py
@@ -80,6 +80,9 @@ class TensorNameMap:
             "model.transformer.ff_out", # llada
             "head.decoder", # modern-bert
         ),
+        MODEL_TENSOR.LM_HEAD_GAIN: (
+            "lm_head_gain", # talkie
+        ),
         MODEL_TENSOR.DENSE_2_OUT: (
             "dense_2_out", # embeddinggemma
         ),
@@ -2138,6 +2141,18 @@ class TensorNameMap:
         MODEL_TENSOR.NEXTN_SHARED_HEAD_NORM: (
             "model.layers.{bid}.shared_head.norm",
         ),
+        MODEL_TENSOR.ATTN_HEAD_GAIN: (
+            "model.layers.{bid}.self_attn.head_gain", # talkie
+        ),
+        MODEL_TENSOR.ATTN_ACT_GAIN: (
+            "model.layers.{bid}.attn_gain", # talkie
+        ),
+        MODEL_TENSOR.FFN_ACT_GAIN: (
+            "model.layers.{bid}.mlp_gain", # talkie
+        ),
+        MODEL_TENSOR.EMBED_SKIP_SCALE: (
+            "model.layers.{bid}.embed_skip", # talkie
+        ),
     }
 
     # architecture-specific block mappings
diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp
index 633a66fc665..31cb5e7fc20 100644
--- a/src/llama-arch.cpp
+++ b/src/llama-arch.cpp
@@ -133,6 +133,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_LLAMA_EMBED, "llama-embed" },
     { LLM_ARCH_MAINCODER, "maincoder" },
     { LLM_ARCH_KIMI_LINEAR, "kimi-linear" },
+    { LLM_ARCH_TALKIE, "talkie" },
     { LLM_ARCH_UNKNOWN, "(unknown)" },
 };
 
@@ -450,6 +451,12 @@ static const std::map<llm_tensor, const char *> LLM_TENSOR_NAMES = {
     { LLM_TENSOR_NEXTN_HNORM, "blk.%d.nextn.hnorm" },
     { LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, "blk.%d.nextn.shared_head_head" },
     { LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, "blk.%d.nextn.shared_head_norm" },
+    // talkie
+    { LLM_TENSOR_ATTN_HEAD_GAIN, "blk.%d.attn_head_gain" },
+    { LLM_TENSOR_ATTN_ACT_GAIN, "blk.%d.attn_act_gain" },
+    { LLM_TENSOR_FFN_ACT_GAIN, "blk.%d.ffn_act_gain" },
+    { LLM_TENSOR_EMBED_SKIP_SCALE, "blk.%d.embed_skip_scale" },
+    { LLM_TENSOR_LM_HEAD_GAIN, "lm_head_gain" },
     { LLM_TENSOR_ATTN_SUB_NORM, "blk.%d.attn_sub_norm" },
     { LLM_TENSOR_FFN_SUB_NORM, "blk.%d.ffn_sub_norm" },
     { LLM_TENSOR_DEC_OUTPUT_NORM, "dec.output_norm" },
@@ -767,6 +774,12 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
     // Nemotron 3 Super
     {LLM_TENSOR_FFN_LATENT_DOWN, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
     {LLM_TENSOR_FFN_LATENT_UP, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+    // talkie - per-block scalar gains and per-head Q gain; lm_head gain is global.
+    {LLM_TENSOR_ATTN_HEAD_GAIN, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+    {LLM_TENSOR_ATTN_ACT_GAIN, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+    {LLM_TENSOR_FFN_ACT_GAIN, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+    {LLM_TENSOR_EMBED_SKIP_SCALE, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+    {LLM_TENSOR_LM_HEAD_GAIN, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL}},
 };
 
 LLM_KV::LLM_KV(llm_arch arch, const char * suffix) : arch(arch), suffix(suffix) {}
diff --git a/src/llama-arch.h b/src/llama-arch.h
index 8f335f5c7b3..5e15c6e6f89 100644
--- a/src/llama-arch.h
+++ b/src/llama-arch.h
@@ -137,6 +137,7 @@ enum llm_arch {
     LLM_ARCH_LLAMA_EMBED,
     LLM_ARCH_MAINCODER,
     LLM_ARCH_KIMI_LINEAR,
+    LLM_ARCH_TALKIE,
     LLM_ARCH_UNKNOWN,
 };
 
@@ -554,6 +555,12 @@ enum llm_tensor {
     LLM_TENSOR_NEXTN_HNORM,
     LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD,
     LLM_TENSOR_NEXTN_SHARED_HEAD_NORM,
+    // talkie - per-block scalars and per-head gain
+    LLM_TENSOR_ATTN_HEAD_GAIN,
+    LLM_TENSOR_ATTN_ACT_GAIN,
+    LLM_TENSOR_FFN_ACT_GAIN,
+    LLM_TENSOR_EMBED_SKIP_SCALE,
+    LLM_TENSOR_LM_HEAD_GAIN,
 };
 
 enum llm_tensor_layer {
diff --git a/src/llama-chat.cpp b/src/llama-chat.cpp
index 6554a89b28a..a814a202380 100644
--- a/src/llama-chat.cpp
+++ b/src/llama-chat.cpp
@@ -79,6 +79,7 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
     { "grok-2", LLM_CHAT_TEMPLATE_GROK_2 },
     { "pangu-embedded", LLM_CHAT_TEMPLATE_PANGU_EMBED },
     { "solar-open", LLM_CHAT_TEMPLATE_SOLAR_OPEN },
+    { "talkie", LLM_CHAT_TEMPLATE_TALKIE },
 };
 
 llm_chat_template llm_chat_template_from_str(const std::string & name) {
@@ -137,6 +138,11 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
             }
         }
     } else if (tmpl_contains("<|assistant|>") && tmpl_contains("<|end|>")) {
+        // Talkie shares Phi-3's role markers but has no newlines.
+        if (tmpl_contains("|>{{ message['content'] }}<|end|>")
+            || tmpl_contains("|>{{ m.content }}<|end|>")) {
+            return LLM_CHAT_TEMPLATE_TALKIE;
+        }
         return LLM_CHAT_TEMPLATE_PHI_3;
     } else if (tmpl_contains("[gMASK]")) {
         return LLM_CHAT_TEMPLATE_CHATGLM_4;
@@ -919,6 +925,15 @@ int32_t llm_chat_apply_template(
         if (add_ass) {
             ss << "<|begin|>assistant";
         }
+    } else if (tmpl == LLM_CHAT_TEMPLATE_TALKIE) {
+        // <|role|>content<|end|>... no newlines, no BOS.
+        for (auto message : chat) {
+            std::string role(message->role);
+            ss << "<|" << role << "|>" << message->content << "<|end|>";
+        }
+        if (add_ass) {
+            ss << "<|assistant|>";
+        }
     } else {
         // template not supported
         return -1;
diff --git a/src/llama-chat.h b/src/llama-chat.h
index 13f936a946c..f41edd1fefa 100644
--- a/src/llama-chat.h
+++ b/src/llama-chat.h
@@ -59,6 +59,7 @@ enum llm_chat_template {
     LLM_CHAT_TEMPLATE_GROK_2,
     LLM_CHAT_TEMPLATE_PANGU_EMBED,
     LLM_CHAT_TEMPLATE_SOLAR_OPEN,
+    LLM_CHAT_TEMPLATE_TALKIE,
     LLM_CHAT_TEMPLATE_UNKNOWN,
 };
 
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index 9e2a13cbd43..ad82ce785ba 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -1871,6 +1871,19 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                     default: type = LLM_TYPE_UNKNOWN;
                 }
             } break;
+        case LLM_ARCH_TALKIE:
+            {
+                // Match PyTorch F.rms_norm default (effective eps ~0).
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps, false);
+                if (hparams.f_norm_rms_eps == 0.0f) {
+                    hparams.f_norm_rms_eps = 1e-9f;
+                }
+                hparams.swa_type = LLAMA_SWA_TYPE_NONE;
+                switch (hparams.n_layer) {
+                    case 40: type = LLM_TYPE_13B; break;
+                    default: type = LLM_TYPE_UNKNOWN;
+                }
+            } break;
         case LLM_ARCH_OLMOE:
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
@@ -5062,6 +5075,29 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                         layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
                     }
                 } break;
+            case LLM_ARCH_TALKIE:
+                {
+                    // No learnable RMSNorm weights; lm_head untied.
+                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+                    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
+                    lm_head_gain = create_tensor(tn(LLM_TENSOR_LM_HEAD_GAIN, "weight"), {1}, 0);
+
+                    for (int i = 0; i < n_layer; ++i) {
+                        auto & layer = layers[i];
+
+                        create_tensor_qkv(layer, i, n_embd, n_embd, n_embd_gqa, n_embd_gqa, 0);
+                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+
+                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+                        layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
+
+                        layer.attn_head_gain = create_tensor(tn(LLM_TENSOR_ATTN_HEAD_GAIN, "weight", i), {n_head}, 0);
+                        layer.attn_act_gain = create_tensor(tn(LLM_TENSOR_ATTN_ACT_GAIN, "weight", i), {1}, 0);
+                        layer.ffn_act_gain = create_tensor(tn(LLM_TENSOR_FFN_ACT_GAIN, "weight", i), {1}, 0);
+                        layer.embed_skip_scale = create_tensor(tn(LLM_TENSOR_EMBED_SKIP_SCALE, "weight", i), {1}, 0);
+                    }
+                } break;
             case LLM_ARCH_SEED_OSS:
                 {
                     const uint32_t head_dim = hparams.n_embd_head_k();
@@ -8817,6 +8853,10 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
             {
                 llm = std::make_unique<llm_build_olmoe>(*this, params);
             } break;
+        case LLM_ARCH_TALKIE:
+            {
+                llm = std::make_unique<llm_build_talkie>(*this, params);
+            } break;
         case LLM_ARCH_OPENELM:
             {
                 llm = std::make_unique<llm_build_openelm>(*this, params);
@@ -9278,6 +9318,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
         case LLM_ARCH_RND1:
         case LLM_ARCH_OLMO2:
         case LLM_ARCH_OLMOE:
+        case LLM_ARCH_TALKIE:
         case LLM_ARCH_PHI2:
         case LLM_ARCH_PHI3:
         case LLM_ARCH_PHIMOE:
diff --git a/src/llama-model.h b/src/llama-model.h
index 5f101bd6374..494d2fafa49 100644
--- a/src/llama-model.h
+++ b/src/llama-model.h
@@ -494,6 +494,12 @@ struct llama_layer {
     struct llama_layer_shortconv shortconv;
 
     struct llama_layer_nextn nextn;
+
+    // talkie - per-block scalars and per-head Q gain
+    struct ggml_tensor * attn_head_gain = nullptr; // [n_head]
+    struct ggml_tensor * attn_act_gain = nullptr; // [1]
+    struct ggml_tensor * ffn_act_gain = nullptr; // [1]
+    struct ggml_tensor * embed_skip_scale = nullptr; // [1]
 };
 
 struct llama_device {
@@ -550,6 +556,9 @@ struct llama_model {
     struct ggml_tensor * per_layer_model_proj = nullptr;
     struct ggml_tensor * per_layer_proj_norm = nullptr;
 
+    // talkie - global learnable scalar that multiplies the lm_head matrix.
+    struct ggml_tensor * lm_head_gain = nullptr; // [1]
+
     std::vector<llama_layer> layers;
 
     //Dense linear projections for SentenceTransformers models like embeddinggemma
diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
index 163f222ef61..340a9bf17c2 100644
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@@ -368,6 +368,12 @@ struct llm_tokenizer_bpe : llm_tokenizer {
                     "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
                 };
                 break;
+            case LLAMA_VOCAB_PRE_TYPE_TALKIE:
+                // talkie/src/talkie/tokenizer.py:11-21
+                regex_exprs = {
+                    "[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]*[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]+(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]+[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]*(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
+                };
+                break;
             case LLAMA_VOCAB_PRE_TYPE_STABLELM2:
             case LLAMA_VOCAB_PRE_TYPE_QWEN2:
             case LLAMA_VOCAB_PRE_TYPE_HUNYUAN:
@@ -2035,6 +2041,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
             } else if (
                 tokenizer_pre == "olmo") {
                 pre_type = LLAMA_VOCAB_PRE_TYPE_OLMO;
+            } else if (
+                tokenizer_pre == "talkie") {
+                pre_type = LLAMA_VOCAB_PRE_TYPE_TALKIE;
+                clean_spaces = false;
             } else if (
                 tokenizer_pre == "dbrx") {
                 pre_type = LLAMA_VOCAB_PRE_TYPE_DBRX;
diff --git a/src/llama-vocab.h b/src/llama-vocab.h
index dd38f45d3a2..c4ac6bc14c2 100644
--- a/src/llama-vocab.h
+++ b/src/llama-vocab.h
@@ -59,6 +59,7 @@ enum llama_vocab_pre_type {
     LLAMA_VOCAB_PRE_TYPE_JOYAI_LLM = 48,
     LLAMA_VOCAB_PRE_TYPE_JAIS2 = 49,
     LLAMA_VOCAB_PRE_TYPE_GEMMA4 = 50,
+    LLAMA_VOCAB_PRE_TYPE_TALKIE = 51,
 };
 
 struct LLM_KV;
diff --git a/src/models/models.h b/src/models/models.h
index 94991c55fe8..3103cdfd388 100644
--- a/src/models/models.h
+++ b/src/models/models.h
@@ -472,6 +472,10 @@ struct llm_build_olmoe : public llm_graph_context {
     llm_build_olmoe(const llama_model & model, const llm_graph_params & params);
 };
 
+struct llm_build_talkie : public llm_graph_context {
+    llm_build_talkie(const llama_model & model, const llm_graph_params & params);
+};
+
 struct llm_build_olmo : public llm_graph_context {
     llm_build_olmo(const llama_model & model, const llm_graph_params & params);
 };
diff --git a/src/models/talkie.cpp b/src/models/talkie.cpp
new file mode 100644
index 00000000000..7be910f912c
--- /dev/null
+++ b/src/models/talkie.cpp
@@ -0,0 +1,131 @@
+#include "models.h"
+
+// Talkie 1930 13B. Mirrors talkie/src/talkie/model.py.
+// RoPE sign flip (-theta) is absorbed at convert time by negating
+// the second half of head_dim of W_q/W_k.
+
+llm_build_talkie::llm_build_talkie(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+    const int64_t n_embd_head = hparams.n_embd_head_v();
+
+    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+    GGML_ASSERT(n_embd_head == n_rot);
+
+    ggml_tensor * cur;
+    ggml_tensor * inpL;
+
+    inpL = build_inp_embd(model.tok_embd);
+
+    // e_x = RMSnorm(embd); same e_x added to every layer.
+    ggml_tensor * e_x = build_norm(inpL, NULL, NULL, LLM_NORM_RMS, -1);
+    cb(e_x, "embed_post_norm", -1);
+
+    // Residual stream starts as e_x (model.py:191).
+    inpL = e_x;
+
+    ggml_tensor * inp_pos = build_inp_pos();
+
+    auto * inp_attn = build_attn_inp_kv();
+
+    ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+    for (int il = 0; il < n_layer; ++il) {
+        ggml_tensor * inpSA = inpL;
+
+        cur = build_norm(inpL, NULL, NULL, LLM_NORM_RMS, il);
+        cb(cur, "attn_pre_norm", il);
+
+        // self-attention
+        {
+            ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+            cb(Qcur, "Qcur_pre_rope", il);
+
+            ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+            cb(Kcur, "Kcur_pre_rope", il);
+
+            ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+            cb(Vcur, "Vcur", il);
+
+            Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+            Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+            Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+            Qcur = ggml_rope_ext(
+                ctx0, Qcur, inp_pos, nullptr,
+                n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                ext_factor, attn_factor, beta_fast, beta_slow
+            );
+            Kcur = ggml_rope_ext(
+                ctx0, Kcur, inp_pos, nullptr,
+                n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                ext_factor, attn_factor, beta_fast, beta_slow
+            );
+            cb(Qcur, "Qcur_post_rope", il);
+            cb(Kcur, "Kcur_post_rope", il);
+
+            Kcur = build_norm(Kcur, NULL, NULL, LLM_NORM_RMS, il);
+            cb(Kcur, "Kcur_post_qknorm", il);
+
+            // Q-RMSnorm + per-head gain (RMS_NORM+MUL fusion).
+            ggml_tensor * head_gain = ggml_reshape_3d(ctx0, model.layers[il].attn_head_gain, 1, n_head, 1);
+            Qcur = build_norm(Qcur, head_gain, NULL, LLM_NORM_RMS, il);
+            cb(Qcur, "Qcur_post_headgain", il);
+
+            cur = build_attn(inp_attn,
+                    model.layers[il].wo, NULL, model.layers[il].wo_s,
+                    Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf((float) n_embd_head), il);
+            cb(cur, "attn_out", il);
+        }
+
+        if (il == n_layer - 1 && inp_out_ids) {
+            cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+            inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+            e_x = ggml_get_rows(ctx0, e_x, inp_out_ids);
+        }
+
+        cur = ggml_mul(ctx0, cur, model.layers[il].attn_act_gain);
+        cb(cur, "attn_branch_scaled", il);
+        cur = ggml_add(ctx0, cur, inpSA);
+        cb(cur, "after_attn_residual", il);
+
+        ggml_tensor * mlp_in = cur;
+
+        cur = build_norm(cur, NULL, NULL, LLM_NORM_RMS, il);
+        cb(cur, "mlp_pre_norm", il);
+
+        cur = build_ffn(cur,
+                model.layers[il].ffn_up, NULL, NULL,
+                model.layers[il].ffn_gate, NULL, NULL,
+                model.layers[il].ffn_down, NULL, NULL,
+                NULL,
+                LLM_FFN_SILU, LLM_FFN_PAR, il);
+        cb(cur, "mlp_out", il);
+
+        cur = ggml_mul(ctx0, cur, model.layers[il].ffn_act_gain);
+        cb(cur, "mlp_branch_scaled", il);
+        cur = ggml_add(ctx0, cur, mlp_in);
+        cb(cur, "after_mlp_residual", il);
+
+        // embed-skip: cur += embed_skip * e_x.
+        ggml_tensor * e_x_scaled = ggml_mul(ctx0, e_x, model.layers[il].embed_skip_scale);
+        cb(e_x_scaled, "embed_skip_branch", il);
+        cur = ggml_add(ctx0, cur, e_x_scaled);
+        cb(cur, "after_embed_skip", il);
+
+        cur = build_cvec(cur, il);
+        cb(cur, "l_out", il);
+
+        inpL = cur;
+    }
+    cur = inpL;
+
+    cur = build_norm(cur, NULL, NULL, LLM_NORM_RMS, -1);
+    cb(cur, "result_norm", -1);
+    res->t_embd = cur;
+
+    // lm_head with global gain via build_lora_mm 3-arg form.
+    cur = build_lora_mm(model.output, cur, model.lm_head_gain);
+    cb(cur, "result_output", -1);
+    res->t_logits = cur;
+
+    ggml_build_forward_expand(gf, cur);
+}