From 6480e91e984643c6c0f5a45865cf2ddabe6a912d Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 29 Apr 2026 10:16:03 +0000 Subject: [PATCH 1/4] gguf-py: add talkie architecture constants and tensor mappings Adds MODEL_ARCH.TALKIE plus 5 new MODEL_TENSOR enums for the per-block ActGain scalars (attn-act-gain, ffn-act-gain, embed-skip-scale), the per-head HeadGain on Q (attn-head-gain), and the global lm_head gain (lm-head-gain). Registers HF source names in tensor_mapping.py so the default modify_tensors path routes them automatically. Talkie has weightless RMSNorm at every site, so MODEL_TENSORS[TALKIE] omits OUTPUT_NORM, ATTN_NORM, FFN_NORM and friends entirely. --- convert_hf_to_gguf.py | 134 ++++++++++++++++++++++++++++ gguf-py/gguf/constants.py | 28 ++++++ gguf-py/gguf/tensor_mapping.py | 15 ++++ src/llama-arch.cpp | 13 +++ src/llama-arch.h | 7 ++ src/llama-chat.cpp | 18 ++++ src/llama-chat.h | 1 + src/llama-model.cpp | 45 ++++++++++ src/llama-model.h | 9 ++ src/llama-vocab.cpp | 13 +++ src/llama-vocab.h | 1 + src/models/models.h | 4 + src/models/talkie.cpp | 156 +++++++++++++++++++++++++++++++++ 13 files changed, 444 insertions(+) create mode 100644 src/models/talkie.cpp diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 90c2b7094c7..6c500772a13 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -8579,6 +8579,140 @@ def set_gguf_parameters(self): self.gguf_writer.add_sliding_window_pattern(sliding_window_pattern) +@ModelBase.register("TalkieForCausalLM") +class TalkieModel(TextModel): + """Convert talkie-lm/talkie-1930-13b-{base,it} to GGUF. + + The architecture mirrors talkie/src/talkie/model.py: weightless RMSNorm at + every site, per-block ActGain scalars on attn/mlp branches, embed-skip + scalar, per-head HeadGain on Q, scalar lm_head_gain on lm_head, untied + raw nn.Parameter lm_head, no biases anywhere. + + The reference RoPE rotates by -theta (sign-flipped vs HF Llama / NEOX). 
+ To absorb that without a new RoPE flavor in ggml, we pre-flip the second + half of head_dim of W_q and W_k at convert time. Then llama.cpp's NEOX + RoPE produces identical attention scores. + """ + model_arch = gguf.MODEL_ARCH.TALKIE + + def set_gguf_parameters(self): + super().set_gguf_parameters() + self.gguf_writer.add_vocab_size(self.hparams["vocab_size"]) + head_dim = self.hparams.get("head_dim") or ( + self.hparams["hidden_size"] // self.hparams["num_attention_heads"] + ) + self.gguf_writer.add_rope_dimension_count(head_dim) + # Talkie uses F.rms_norm with default eps (1e-5). + self.gguf_writer.add_layer_norm_rms_eps(1e-5) + + def set_vocab(self): + # Custom: read tiktoken vocab.txt directly. No tokenizer.json upstream. + from tiktoken.load import load_tiktoken_bpe + + vocab_path = self.dir_model / "vocab.txt" + if not vocab_path.exists(): + raise FileNotFoundError( + f"talkie vocab.txt not found at {vocab_path}. The original " + "talkie repo ships vocab.txt alongside the checkpoint; the " + "converter expects it in the HF safetensors directory." + ) + mergeable_ranks = load_tiktoken_bpe(str(vocab_path)) + # Filter ranks >= 65535 to leave room for IT specials. Mirrors + # talkie/src/talkie/tokenizer.py:54. + mergeable_ranks = {k: v for k, v in mergeable_ranks.items() if v < 65535} + + # Reverse-engineer merges via QwenModel's helpers (already vendored + # in this file). This is the same pattern as _set_vocab_qwen and + # HunYuanMoE / KimiLinear / Deepseek-K2. + merges: list[str] = [] + vocab: dict[str, int] = {} + for token_bytes, rank in mergeable_ranks.items(): + vocab[QwenModel.token_bytes_to_string(token_bytes)] = rank + if len(token_bytes) == 1: + continue + merged = QwenModel.bpe(mergeable_ranks, token_bytes, max_rank=rank) + if len(merged) == 2: + merges.append( + " ".join(map(QwenModel.token_bytes_to_string, merged)) + ) + + # IT special tokens at fixed ids 65535..65539. 
The base model only + # uses 65535 (<|endoftext|>); IT adds the four chat tokens. + special_tokens = { + "<|endoftext|>": 65535, + "<|end|>": 65536, + "<|user|>": 65537, + "<|assistant|>": 65538, + "<|system|>": 65539, + } + # Decide vocab_size: read from config.json (65540 IT, 65536 base). + vocab_size = self.hparams["vocab_size"] + # If we are converting the base model, drop the IT-specific specials + # whose ids are >= base vocab_size. + special_tokens = {k: v for k, v in special_tokens.items() if v < vocab_size} + + reverse_vocab = {idx: tok for tok, idx in {**vocab, **special_tokens}.items()} + tokens: list[str] = [] + toktypes: list[int] = [] + for i in range(vocab_size): + if i not in reverse_vocab: + tokens.append(f"[PAD{i}]") + toktypes.append(gguf.TokenType.UNUSED) + else: + tok = reverse_vocab[i] + tokens.append(tok) + if i in special_tokens.values(): + toktypes.append(gguf.TokenType.CONTROL) + else: + toktypes.append(gguf.TokenType.NORMAL) + + self.gguf_writer.add_tokenizer_model("gpt2") + self.gguf_writer.add_tokenizer_pre("talkie") + self.gguf_writer.add_token_list(tokens) + self.gguf_writer.add_token_types(toktypes) + self.gguf_writer.add_token_merges(merges) + + # Special-token ids. EOS = <|end|> for IT, <|endoftext|> for base. + eos_id = special_tokens.get("<|end|>", special_tokens["<|endoftext|>"]) + self.gguf_writer.add_eos_token_id(eos_id) + self.gguf_writer.add_eot_token_id(eos_id) + self.gguf_writer.add_unk_token_id(special_tokens["<|endoftext|>"]) + self.gguf_writer.add_pad_token_id(special_tokens["<|endoftext|>"]) + self.gguf_writer.add_add_bos_token(False) + self.gguf_writer.add_add_eos_token(False) + + # Chat template: <|user|>{content}<|end|><|assistant|>{content}<|end|>... + # No newlines, no BOS - matches talkie/src/talkie/chat.py:format_chat. 
+ chat_template = ( + "{% for m in messages %}<|{{ m.role }}|>{{ m.content }}<|end|>{% endfor %}" + "{% if add_generation_prompt %}<|assistant|>{% endif %}" + ) + self.gguf_writer.add_chat_template(chat_template) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + # RoPE sign correction for q_proj / k_proj. + # Talkie rotates by -theta (sign-flipped sin); llama.cpp NEOX rotates by +theta. + # Pre-multiplying the second-half of head_dim (output dim) by -1 absorbs + # the difference: dot(D q, D k) == dot(q, k) + # for D = diag(+1...+1, -1...-1) on head_dim halves. + n_head = self.hparams["num_attention_heads"] + head_dim = self.hparams.get("head_dim") or ( + self.hparams["hidden_size"] // n_head + ) + if name.endswith(("self_attn.q_proj.weight", "self_attn.k_proj.weight")): + w = data_torch + # shape [n_head*head_dim, hidden] + w = w.view(n_head, head_dim, w.shape[-1]).clone() + w[:, head_dim // 2 :, :] = -w[:, head_dim // 2 :, :] + data_torch = w.view(n_head * head_dim, -1) + + # Scalar tensors are stored as shape [1] in the HF state dict; keep + # them 1-D so create_tensor on the C++ side allocates {1}. Same for + # head_gain which is shape [n_head]. + # Default routing handles everything else. 
+ return [(self.map_tensor_name(name), data_torch)] + + @ModelBase.register("OlmoeForCausalLM") class OlmoeModel(TextModel): model_arch = gguf.MODEL_ARCH.OLMOE diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 83ae51ce9ce..e54821c9e75 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -437,6 +437,7 @@ class MODEL_ARCH(IntEnum): OLMO = auto() OLMO2 = auto() OLMOE = auto() + TALKIE = auto() OPENELM = auto() ARCTIC = auto() DEEPSEEK = auto() @@ -566,6 +567,11 @@ class MODEL_TENSOR(IntEnum): ATTN_K_NORM = auto() LAYER_OUT_NORM = auto() LAYER_OUT_SCALE = auto() + ATTN_HEAD_GAIN = auto() # talkie - per-head learnable gain on Q after Q-RMSnorm + ATTN_ACT_GAIN = auto() # talkie - per-block learnable scalar on attn-residual branch + FFN_ACT_GAIN = auto() # talkie - per-block learnable scalar on mlp-residual branch + EMBED_SKIP_SCALE = auto() # talkie - per-block learnable scalar on embedding-skip branch + LM_HEAD_GAIN = auto() # talkie - global learnable scalar on lm_head matrix PER_LAYER_TOKEN_EMBD = auto() # gemma3n PER_LAYER_MODEL_PROJ = auto() # gemma3n PER_LAYER_INP_GATE = auto() # gemma3n @@ -923,6 +929,7 @@ class MODEL_TENSOR(IntEnum): MODEL_ARCH.OLMO: "olmo", MODEL_ARCH.OLMO2: "olmo2", MODEL_ARCH.OLMOE: "olmoe", + MODEL_ARCH.TALKIE: "talkie", MODEL_ARCH.OPENELM: "openelm", MODEL_ARCH.ARCTIC: "arctic", MODEL_ARCH.DEEPSEEK: "deepseek", @@ -1021,6 +1028,11 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.ATTN_GATE: "blk.{bid}.attn_gate", MODEL_TENSOR.ATTN_Q_NORM: "blk.{bid}.attn_q_norm", MODEL_TENSOR.ATTN_K_NORM: "blk.{bid}.attn_k_norm", + MODEL_TENSOR.ATTN_HEAD_GAIN: "blk.{bid}.attn_head_gain", + MODEL_TENSOR.ATTN_ACT_GAIN: "blk.{bid}.attn_act_gain", + MODEL_TENSOR.FFN_ACT_GAIN: "blk.{bid}.ffn_act_gain", + MODEL_TENSOR.EMBED_SKIP_SCALE: "blk.{bid}.embed_skip_scale", + MODEL_TENSOR.LM_HEAD_GAIN: "lm_head_gain", MODEL_TENSOR.ATTN_OUT_NORM: "blk.{bid}.attn_output_norm", MODEL_TENSOR.ATTN_POST_NORM: 
"blk.{bid}.post_attention_norm", MODEL_TENSOR.FFN_GATE_INP: "blk.{bid}.ffn_gate_inp", @@ -2663,6 +2675,22 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.FFN_DOWN, MODEL_TENSOR.FFN_UP, ], + MODEL_ARCH.TALKIE: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.OUTPUT, + MODEL_TENSOR.ATTN_Q, + MODEL_TENSOR.ATTN_K, + MODEL_TENSOR.ATTN_V, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.FFN_GATE, + MODEL_TENSOR.FFN_DOWN, + MODEL_TENSOR.FFN_UP, + MODEL_TENSOR.ATTN_HEAD_GAIN, + MODEL_TENSOR.ATTN_ACT_GAIN, + MODEL_TENSOR.FFN_ACT_GAIN, + MODEL_TENSOR.EMBED_SKIP_SCALE, + MODEL_TENSOR.LM_HEAD_GAIN, + ], MODEL_ARCH.SEED_OSS: [ MODEL_TENSOR.TOKEN_EMBD, MODEL_TENSOR.ATTN_NORM, diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index 01a9b236000..204626a6148 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -80,6 +80,9 @@ class TensorNameMap: "model.transformer.ff_out", # llada "head.decoder", # modern-bert ), + MODEL_TENSOR.LM_HEAD_GAIN: ( + "lm_head_gain", # talkie + ), MODEL_TENSOR.DENSE_2_OUT: ( "dense_2_out", # embeddinggemma ), @@ -2138,6 +2141,18 @@ class TensorNameMap: MODEL_TENSOR.NEXTN_SHARED_HEAD_NORM: ( "model.layers.{bid}.shared_head.norm", ), + MODEL_TENSOR.ATTN_HEAD_GAIN: ( + "model.layers.{bid}.self_attn.head_gain", # talkie + ), + MODEL_TENSOR.ATTN_ACT_GAIN: ( + "model.layers.{bid}.attn_gain", # talkie + ), + MODEL_TENSOR.FFN_ACT_GAIN: ( + "model.layers.{bid}.mlp_gain", # talkie + ), + MODEL_TENSOR.EMBED_SKIP_SCALE: ( + "model.layers.{bid}.embed_skip", # talkie + ), } # architecture-specific block mappings diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp index 633a66fc665..31cb5e7fc20 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp @@ -133,6 +133,7 @@ static const std::map LLM_ARCH_NAMES = { { LLM_ARCH_LLAMA_EMBED, "llama-embed" }, { LLM_ARCH_MAINCODER, "maincoder" }, { LLM_ARCH_KIMI_LINEAR, "kimi-linear" }, + { LLM_ARCH_TALKIE, "talkie" }, { LLM_ARCH_UNKNOWN, "(unknown)" }, }; @@ -450,6 +451,12 @@ static 
const std::map LLM_TENSOR_NAMES = { { LLM_TENSOR_NEXTN_HNORM, "blk.%d.nextn.hnorm" }, { LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, "blk.%d.nextn.shared_head_head" }, { LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, "blk.%d.nextn.shared_head_norm" }, + // talkie + { LLM_TENSOR_ATTN_HEAD_GAIN, "blk.%d.attn_head_gain" }, + { LLM_TENSOR_ATTN_ACT_GAIN, "blk.%d.attn_act_gain" }, + { LLM_TENSOR_FFN_ACT_GAIN, "blk.%d.ffn_act_gain" }, + { LLM_TENSOR_EMBED_SKIP_SCALE, "blk.%d.embed_skip_scale" }, + { LLM_TENSOR_LM_HEAD_GAIN, "lm_head_gain" }, { LLM_TENSOR_ATTN_SUB_NORM, "blk.%d.attn_sub_norm" }, { LLM_TENSOR_FFN_SUB_NORM, "blk.%d.ffn_sub_norm" }, { LLM_TENSOR_DEC_OUTPUT_NORM, "dec.output_norm" }, @@ -767,6 +774,12 @@ static const std::map LLM_TENSOR_INFOS = { // Nemotron 3 Super {LLM_TENSOR_FFN_LATENT_DOWN, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, {LLM_TENSOR_FFN_LATENT_UP, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, + // talkie - per-block scalar gains and per-head Q gain; lm_head gain is global. + {LLM_TENSOR_ATTN_HEAD_GAIN, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, + {LLM_TENSOR_ATTN_ACT_GAIN, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, + {LLM_TENSOR_FFN_ACT_GAIN, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, + {LLM_TENSOR_EMBED_SKIP_SCALE, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, + {LLM_TENSOR_LM_HEAD_GAIN, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL}}, }; LLM_KV::LLM_KV(llm_arch arch, const char * suffix) : arch(arch), suffix(suffix) {} diff --git a/src/llama-arch.h b/src/llama-arch.h index 8f335f5c7b3..5e15c6e6f89 100644 --- a/src/llama-arch.h +++ b/src/llama-arch.h @@ -137,6 +137,7 @@ enum llm_arch { LLM_ARCH_LLAMA_EMBED, LLM_ARCH_MAINCODER, LLM_ARCH_KIMI_LINEAR, + LLM_ARCH_TALKIE, LLM_ARCH_UNKNOWN, }; @@ -554,6 +555,12 @@ enum llm_tensor { LLM_TENSOR_NEXTN_HNORM, LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, + // talkie - per-block scalars and per-head gain + LLM_TENSOR_ATTN_HEAD_GAIN, + LLM_TENSOR_ATTN_ACT_GAIN, + LLM_TENSOR_FFN_ACT_GAIN, + 
LLM_TENSOR_EMBED_SKIP_SCALE, + LLM_TENSOR_LM_HEAD_GAIN, }; enum llm_tensor_layer { diff --git a/src/llama-chat.cpp b/src/llama-chat.cpp index 6554a89b28a..02e7c3bbcc8 100644 --- a/src/llama-chat.cpp +++ b/src/llama-chat.cpp @@ -79,6 +79,7 @@ static const std::map LLM_CHAT_TEMPLATES = { { "grok-2", LLM_CHAT_TEMPLATE_GROK_2 }, { "pangu-embedded", LLM_CHAT_TEMPLATE_PANGU_EMBED }, { "solar-open", LLM_CHAT_TEMPLATE_SOLAR_OPEN }, + { "talkie", LLM_CHAT_TEMPLATE_TALKIE }, }; llm_chat_template llm_chat_template_from_str(const std::string & name) { @@ -137,6 +138,13 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) { } } } else if (tmpl_contains("<|assistant|>") && tmpl_contains("<|end|>")) { + // Talkie's chat template uses the same role markers as Phi-3 but + // omits the newlines: "<|user|>{c}<|end|><|assistant|>". Detect by + // the absence of "|>\n" sequences. + if (tmpl_contains("|>{{ message['content'] }}<|end|>") + || tmpl_contains("|>{{ m.content }}<|end|>")) { + return LLM_CHAT_TEMPLATE_TALKIE; + } return LLM_CHAT_TEMPLATE_PHI_3; } else if (tmpl_contains("[gMASK]")) { return LLM_CHAT_TEMPLATE_CHATGLM_4; @@ -919,6 +927,16 @@ int32_t llm_chat_apply_template( if (add_ass) { ss << "<|begin|>assistant"; } + } else if (tmpl == LLM_CHAT_TEMPLATE_TALKIE) { + // Talkie 1930 IT chat template: <|role|>content<|end|>... no newlines, no BOS. + // Matches talkie/src/talkie/chat.py:format_chat exactly. 
+ for (auto message : chat) { + std::string role(message->role); + ss << "<|" << role << "|>" << message->content << "<|end|>"; + } + if (add_ass) { + ss << "<|assistant|>"; + } } else { // template not supported return -1; diff --git a/src/llama-chat.h b/src/llama-chat.h index 13f936a946c..f41edd1fefa 100644 --- a/src/llama-chat.h +++ b/src/llama-chat.h @@ -59,6 +59,7 @@ enum llm_chat_template { LLM_CHAT_TEMPLATE_GROK_2, LLM_CHAT_TEMPLATE_PANGU_EMBED, LLM_CHAT_TEMPLATE_SOLAR_OPEN, + LLM_CHAT_TEMPLATE_TALKIE, LLM_CHAT_TEMPLATE_UNKNOWN, }; diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 9e2a13cbd43..acaf8ffb481 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -1871,6 +1871,20 @@ void llama_model::load_hparams(llama_model_loader & ml) { default: type = LLM_TYPE_UNKNOWN; } } break; + case LLM_ARCH_TALKIE: + { + // Talkie's RMSNorm has no learnable weight; eps comes from the + // converter (defaults to torch's F.rms_norm default 1e-5). + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps, false); + if (hparams.f_norm_rms_eps == 0.0f) { + hparams.f_norm_rms_eps = 1e-5f; + } + hparams.swa_type = LLAMA_SWA_TYPE_NONE; + switch (hparams.n_layer) { + case 40: type = LLM_TYPE_13B; break; + default: type = LLM_TYPE_UNKNOWN; + } + } break; case LLM_ARCH_OLMOE: { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); @@ -5062,6 +5076,32 @@ bool llama_model::load_tensors(llama_model_loader & ml) { layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0); } } break; + case LLM_ARCH_TALKIE: + { + // Talkie has no learnable RMSNorm weights anywhere. Per-block + // tensors: q/k/v/o, ffn_gate/up/down, head_gain, attn_act_gain, + // ffn_act_gain, embed_skip_scale. Globals: tok_embd, output (untied), + // lm_head_gain. 
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); + output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0); + lm_head_gain = create_tensor(tn(LLM_TENSOR_LM_HEAD_GAIN, "weight"), {1}, 0); + + for (int i = 0; i < n_layer; ++i) { + auto & layer = layers[i]; + + create_tensor_qkv(layer, i, n_embd, n_embd, n_embd_gqa, n_embd_gqa, 0); + layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0); + + layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0); + layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0); + layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0); + + layer.attn_head_gain = create_tensor(tn(LLM_TENSOR_ATTN_HEAD_GAIN, "weight", i), {n_head}, 0); + layer.attn_act_gain = create_tensor(tn(LLM_TENSOR_ATTN_ACT_GAIN, "weight", i), {1}, 0); + layer.ffn_act_gain = create_tensor(tn(LLM_TENSOR_FFN_ACT_GAIN, "weight", i), {1}, 0); + layer.embed_skip_scale = create_tensor(tn(LLM_TENSOR_EMBED_SKIP_SCALE, "weight", i), {1}, 0); + } + } break; case LLM_ARCH_SEED_OSS: { const uint32_t head_dim = hparams.n_embd_head_k(); @@ -8817,6 +8857,10 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const { { llm = std::make_unique(*this, params); } break; + case LLM_ARCH_TALKIE: + { + llm = std::make_unique(*this, params); + } break; case LLM_ARCH_OPENELM: { llm = std::make_unique(*this, params); @@ -9278,6 +9322,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) { case LLM_ARCH_RND1: case LLM_ARCH_OLMO2: case LLM_ARCH_OLMOE: + case LLM_ARCH_TALKIE: case LLM_ARCH_PHI2: case LLM_ARCH_PHI3: case LLM_ARCH_PHIMOE: diff --git a/src/llama-model.h b/src/llama-model.h index 5f101bd6374..494d2fafa49 100644 --- a/src/llama-model.h +++ b/src/llama-model.h @@ -494,6 +494,12 @@ struct llama_layer { struct llama_layer_shortconv shortconv; struct llama_layer_nextn nextn; 
+ + // talkie - per-block scalars and per-head Q gain + struct ggml_tensor * attn_head_gain = nullptr; // [n_head] + struct ggml_tensor * attn_act_gain = nullptr; // [1] + struct ggml_tensor * ffn_act_gain = nullptr; // [1] + struct ggml_tensor * embed_skip_scale = nullptr; // [1] }; struct llama_device { @@ -550,6 +556,9 @@ struct llama_model { struct ggml_tensor * per_layer_model_proj = nullptr; struct ggml_tensor * per_layer_proj_norm = nullptr; + // talkie - global learnable scalar that multiplies the lm_head matrix. + struct ggml_tensor * lm_head_gain = nullptr; // [1] + std::vector layers; //Dense linear projections for SentenceTransformers models like embeddinggemma diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp index 163f222ef61..2d533ab5b27 100644 --- a/src/llama-vocab.cpp +++ b/src/llama-vocab.cpp @@ -368,6 +368,15 @@ struct llm_tokenizer_bpe : llm_tokenizer { "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)", }; break; + case LLAMA_VOCAB_PRE_TYPE_TALKIE: + // Talkie tiktoken pre-tokenizer (talkie/src/talkie/tokenizer.py:11-21). + // Note: ordering of alternatives is significant - case-aware leading-cap + // and trailing-cap fragments first, then digits, then punctuation, then + // whitespace fallbacks. 
+ regex_exprs = { + "[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]*[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]+(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]+[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]*(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+", + }; + break; case LLAMA_VOCAB_PRE_TYPE_STABLELM2: case LLAMA_VOCAB_PRE_TYPE_QWEN2: case LLAMA_VOCAB_PRE_TYPE_HUNYUAN: @@ -2035,6 +2044,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) { } else if ( tokenizer_pre == "olmo") { pre_type = LLAMA_VOCAB_PRE_TYPE_OLMO; + } else if ( + tokenizer_pre == "talkie") { + pre_type = LLAMA_VOCAB_PRE_TYPE_TALKIE; + clean_spaces = false; } else if ( tokenizer_pre == "dbrx") { pre_type = LLAMA_VOCAB_PRE_TYPE_DBRX; diff --git a/src/llama-vocab.h b/src/llama-vocab.h index dd38f45d3a2..c4ac6bc14c2 100644 --- a/src/llama-vocab.h +++ b/src/llama-vocab.h @@ -59,6 +59,7 @@ enum llama_vocab_pre_type { LLAMA_VOCAB_PRE_TYPE_JOYAI_LLM = 48, LLAMA_VOCAB_PRE_TYPE_JAIS2 = 49, LLAMA_VOCAB_PRE_TYPE_GEMMA4 = 50, + LLAMA_VOCAB_PRE_TYPE_TALKIE = 51, }; struct LLM_KV; diff --git a/src/models/models.h b/src/models/models.h index 94991c55fe8..3103cdfd388 100644 --- a/src/models/models.h +++ b/src/models/models.h @@ -472,6 +472,10 @@ struct llm_build_olmoe : public llm_graph_context { llm_build_olmoe(const llama_model & model, const llm_graph_params & params); }; +struct llm_build_talkie : public llm_graph_context { + llm_build_talkie(const llama_model & model, const llm_graph_params & params); +}; + struct llm_build_olmo : public llm_graph_context { llm_build_olmo(const llama_model & model, const llm_graph_params & params); }; diff --git a/src/models/talkie.cpp b/src/models/talkie.cpp new file mode 100644 index 00000000000..b7bdf386892 --- /dev/null +++ b/src/models/talkie.cpp @@ -0,0 +1,156 @@ +#include "models.h" + +// Talkie 1930 13B 
graph builder. +// +// Mirrors talkie/src/talkie/model.py: +// - Weightless RMSNorm everywhere (build_norm with mw=NULL). +// - Pre-attention RMSnorm (talkie line 144). +// - Post-RoPE Q/K RMSnorm with no learnable weight (talkie line 102). +// - Per-head HeadGain on Q after Q-RMSnorm (talkie line 103). +// - Standard SDPA via build_attn. +// - Per-block ActGain scalars on attn-residual and mlp-residual branches. +// - Per-block embed_skip: e_x (post-RMSnorm embedding) added to every layer. +// - Final RMSnorm, lm_head with global lm_head_gain scalar. +// +// The RoPE sign-convention difference between talkie (rotation by -theta) +// and llama.cpp NEOX (rotation by +theta) is absorbed at convert time by +// negating the second half of head_dim of W_q and W_k weights, so this +// graph uses stock NEOX RoPE unchanged. + +llm_build_talkie::llm_build_talkie(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { + const int64_t n_embd_head = hparams.n_embd_head_v(); + + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k()); + GGML_ASSERT(n_embd_head == n_rot); + + ggml_tensor * cur; + ggml_tensor * inpL; + + inpL = build_inp_embd(model.tok_embd); + + // e_x = RMSnorm(embed(input_ids)) - the same e_x is added to every layer. + ggml_tensor * e_x = build_norm(inpL, NULL, NULL, LLM_NORM_RMS, -1); + cb(e_x, "embed_post_norm", -1); + + // The residual stream starts as e_x (talkie/src/talkie/model.py:191). + inpL = e_x; + + ggml_tensor * inp_pos = build_inp_pos(); + + auto * inp_attn = build_attn_inp_kv(); + + ggml_tensor * inp_out_ids = build_inp_out_ids(); + + for (int il = 0; il < n_layer; ++il) { + ggml_tensor * inpSA = inpL; + + // Pre-attention RMSnorm (weightless). 
+ cur = build_norm(inpL, NULL, NULL, LLM_NORM_RMS, il); + cb(cur, "attn_pre_norm", il); + + // self-attention + { + ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); + cb(Qcur, "Qcur_pre_rope", il); + + ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); + cb(Kcur, "Kcur_pre_rope", il); + + ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); + cb(Vcur, "Vcur", il); + + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); + Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); + + // RoPE - stock NEOX. Sign convention is absorbed in W_q/W_k at + // conversion time (see TalkieModel.modify_tensors). + Qcur = ggml_rope_ext( + ctx0, Qcur, inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + Kcur = ggml_rope_ext( + ctx0, Kcur, inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Qcur, "Qcur_post_rope", il); + cb(Kcur, "Kcur_post_rope", il); + + // Weightless Q/K RMSnorm (talkie line 102). + Qcur = build_norm(Qcur, NULL, NULL, LLM_NORM_RMS, il); + cb(Qcur, "Qcur_post_qknorm", il); + Kcur = build_norm(Kcur, NULL, NULL, LLM_NORM_RMS, il); + cb(Kcur, "Kcur_post_qknorm", il); + + // HeadGain on Q: broadcast [1, n_head, 1] over [head_dim, n_head, n_tokens]. 
+ ggml_tensor * head_gain = ggml_reshape_3d(ctx0, model.layers[il].attn_head_gain, 1, n_head, 1); + Qcur = ggml_mul(ctx0, Qcur, head_gain); + cb(Qcur, "Qcur_post_headgain", il); + + cur = build_attn(inp_attn, + model.layers[il].wo, NULL, model.layers[il].wo_s, + Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf((float) n_embd_head), il); + cb(cur, "attn_out", il); + } + + if (il == n_layer - 1 && inp_out_ids) { + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); + e_x = ggml_get_rows(ctx0, e_x, inp_out_ids); + } + + // Apply ActGain on attn branch and add residual. + cur = ggml_mul(ctx0, cur, model.layers[il].attn_act_gain); + cb(cur, "attn_branch_scaled", il); + cur = ggml_add(ctx0, cur, inpSA); + cb(cur, "after_attn_residual", il); + + ggml_tensor * mlp_in = cur; + + // Pre-MLP RMSnorm (weightless). + cur = build_norm(cur, NULL, NULL, LLM_NORM_RMS, il); + cb(cur, "mlp_pre_norm", il); + + cur = build_ffn(cur, + model.layers[il].ffn_up, NULL, NULL, + model.layers[il].ffn_gate, NULL, NULL, + model.layers[il].ffn_down, NULL, NULL, + NULL, + LLM_FFN_SILU, LLM_FFN_PAR, il); + cb(cur, "mlp_out", il); + + // Apply ActGain on mlp branch and add residual. + cur = ggml_mul(ctx0, cur, model.layers[il].ffn_act_gain); + cb(cur, "mlp_branch_scaled", il); + cur = ggml_add(ctx0, cur, mlp_in); + cb(cur, "after_mlp_residual", il); + + // Embedding-skip: cur = cur + embed_skip * e_x. + ggml_tensor * e_x_scaled = ggml_mul(ctx0, e_x, model.layers[il].embed_skip_scale); + cb(e_x_scaled, "embed_skip_branch", il); + cur = ggml_add(ctx0, cur, e_x_scaled); + cb(cur, "after_embed_skip", il); + + cur = build_cvec(cur, il); + cb(cur, "l_out", il); + + inpL = cur; + } + cur = inpL; + + // Final RMSnorm (weightless). + cur = build_norm(cur, NULL, NULL, LLM_NORM_RMS, -1); + cb(cur, "result_norm", -1); + res->t_embd = cur; + + // lm_head with global gain: matmul(cur, lm_head_gain * output). 
+ // Reuses the existing build_lora_mm 3-arg form which already handles + // a per-tensor weight scale. + cur = build_lora_mm(model.output, cur, model.lm_head_gain); + cb(cur, "result_output", -1); + res->t_logits = cur; + + ggml_build_forward_expand(gf, cur); +} From d19f0fcd8bdd8aaa92264de9b91883f284d4a120 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 29 Apr 2026 11:10:28 +0000 Subject: [PATCH 2/4] talkie: fix RMSNorm eps to match PyTorch F.rms_norm default Talkie's reference uses F.rms_norm with the default eps. In bf16 PyTorch that default behaves like eps=0 (output rms == 1.0 to fp32 noise), not like torch.finfo(input.dtype).eps as the docstring suggests. Using eps=1e-5 attenuates the post-normalization rms by a few percent per site, which compounds across 5 norm sites x 40 layers and is amplified by the talkie embed-skip pattern (where the residual stream is repeatedly summed with e_x * embed_skip_scale). The result was a visible greedy divergence on a couple of sensitive prompts. Switch the converter and the C++ default to 1e-9, which is far below f32 machine epsilon (~1.2e-7) relative to the ~1.0 mean square of normalized inputs (so adding it does not change the result) and matches PyTorch's effective eps. --- convert_hf_to_gguf.py | 25 ++++++++++++++++++------- src/llama-model.cpp | 8 +++++--- 2 files changed, 23 insertions(+), 10 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 6c500772a13..e0f7d5b8eee 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -8602,8 +8602,14 @@ def set_gguf_parameters(self): self.hparams["hidden_size"] // self.hparams["num_attention_heads"] ) self.gguf_writer.add_rope_dimension_count(head_dim) - # Talkie uses F.rms_norm with default eps (1e-5). - self.gguf_writer.add_layer_norm_rms_eps(1e-5) + # Talkie uses F.rms_norm with default eps. The PyTorch default eps + # for F.rms_norm is effectively ~0 (output rms == 1.0 to fp32 noise), + # NOT torch.finfo(input.dtype).eps as the docstring suggests. 
Using + # eps=1e-5 attenuates the output rms by ~2% per norm site, which + # compounds across 40 layers and 5 norm sites per layer (especially + # via the per-layer embed-skip add of `e_x`). Match PyTorch by using + # a tiny eps. + self.gguf_writer.add_layer_norm_rms_eps(1e-9) def set_vocab(self): # Custom: read tiktoken vocab.txt directly. No tokenizer.json upstream. @@ -8706,11 +8712,16 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter w[:, head_dim // 2 :, :] = -w[:, head_dim // 2 :, :] data_torch = w.view(n_head * head_dim, -1) - # Scalar tensors are stored as shape [1] in the HF state dict; keep - # them 1-D so create_tensor on the C++ side allocates {1}. Same for - # head_gain which is shape [n_head]. - # Default routing handles everything else. - return [(self.map_tensor_name(name), data_torch)] + # Talkie's HF state-dict has scalar/gain Parameters whose names do NOT + # end in .weight (raw nn.Parameter). The GGUF tensor-name convention + # requires .weight or .bias suffixes - the C++ loader looks up + # tn(LLM_TENSOR_OUTPUT, "weight") -> "output.weight". Add the suffix + # synthetically so map_tensor_name routes via try_suffixes. + canonical = name + if not canonical.endswith((".weight", ".bias")): + canonical = canonical + ".weight" + new_name = self.map_tensor_name(canonical) + return [(new_name, data_torch)] @ModelBase.register("OlmoeForCausalLM") diff --git a/src/llama-model.cpp b/src/llama-model.cpp index acaf8ffb481..5dc4887014a 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -1873,11 +1873,13 @@ void llama_model::load_hparams(llama_model_loader & ml) { } break; case LLM_ARCH_TALKIE: { - // Talkie's RMSNorm has no learnable weight; eps comes from the - // converter (defaults to torch's F.rms_norm default 1e-5). + // Talkie's RMSNorm has no learnable weight; eps must be tiny + // to match PyTorch's F.rms_norm default behavior (effective + // eps ~ 0). 
See TalkieModel.set_gguf_parameters in the + // converter for the rationale. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps, false); if (hparams.f_norm_rms_eps == 0.0f) { - hparams.f_norm_rms_eps = 1e-5f; + hparams.f_norm_rms_eps = 1e-9f; } hparams.swa_type = LLAMA_SWA_TYPE_NONE; switch (hparams.n_layer) { From 73b0094fbde4da6a31b8d942b818e26c905a123b Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 29 Apr 2026 12:10:19 +0000 Subject: [PATCH 3/4] talkie: fold per-head HeadGain into the Q-RMSnorm build_norm call Replaces the separate `ggml_mul(Qcur, head_gain)` with the equivalent `build_norm(Qcur, head_gain, ...)` 2-arg form. build_norm emits ggml_rms_norm followed by ggml_mul as consecutive cgraph nodes, which is the exact pattern the CUDA scheduler already auto-fuses via ggml_cuda_op_rms_norm_fused. Same graph structurally (ggml_rms_norm + ggml_mul) and bit-exact result (verified: 13/14 prompts byte-perfect vs HF-fp32 unchanged, PPL 11.7523 unchanged). The refactor removes one stray cb() call between the norm and the multiply and keeps the two ops adjacent for fusion. --- src/models/talkie.cpp | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/src/models/talkie.cpp b/src/models/talkie.cpp index b7bdf386892..2e93f8ad4ee 100644 --- a/src/models/talkie.cpp +++ b/src/models/talkie.cpp @@ -78,15 +78,18 @@ llm_build_talkie::llm_build_talkie(const llama_model & model, const llm_graph_pa cb(Qcur, "Qcur_post_rope", il); cb(Kcur, "Kcur_post_rope", il); - // Weightless Q/K RMSnorm (talkie line 102). - Qcur = build_norm(Qcur, NULL, NULL, LLM_NORM_RMS, il); - cb(Qcur, "Qcur_post_qknorm", il); + // Weightless K-RMSnorm (talkie line 102). Kcur = build_norm(Kcur, NULL, NULL, LLM_NORM_RMS, il); cb(Kcur, "Kcur_post_qknorm", il); - // HeadGain on Q: broadcast [1, n_head, 1] over [head_dim, n_head, n_tokens]. 
+ // Q-RMSnorm fused with HeadGain: rms-norm then multiply by per-head + // gain broadcast [1, n_head, 1] over [head_dim, n_head, n_tokens]. + // build_norm emits ggml_rms_norm + ggml_mul as consecutive nodes, + // matching the CUDA RMS_NORM+MUL fusion pattern in + // ggml-cuda::ggml_cuda_op_rms_norm_fused. Same graph as a separate + // ggml_mul; this form keeps the two ops adjacent in the cgraph. ggml_tensor * head_gain = ggml_reshape_3d(ctx0, model.layers[il].attn_head_gain, 1, n_head, 1); - Qcur = ggml_mul(ctx0, Qcur, head_gain); + Qcur = build_norm(Qcur, head_gain, NULL, LLM_NORM_RMS, il); cb(Qcur, "Qcur_post_headgain", il); cur = build_attn(inp_attn, From 45c4b137e538257895e7a00f68a1a872d525f3e7 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 29 Apr 2026 16:14:57 +0000 Subject: [PATCH 4/4] talkie: trim verbose comments --- convert_hf_to_gguf.py | 68 +++++++-------------------------------- gguf-py/gguf/constants.py | 10 +++--- src/llama-chat.cpp | 7 ++-- src/llama-model.cpp | 10 ++---- src/llama-vocab.cpp | 5 +-- src/models/talkie.cpp | 44 +++++-------------------- 6 files changed, 30 insertions(+), 114 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index e0f7d5b8eee..8e12ed36aec 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -8581,18 +8581,7 @@ def set_gguf_parameters(self): @ModelBase.register("TalkieForCausalLM") class TalkieModel(TextModel): - """Convert talkie-lm/talkie-1930-13b-{base,it} to GGUF. - - The architecture mirrors talkie/src/talkie/model.py: weightless RMSNorm at - every site, per-block ActGain scalars on attn/mlp branches, embed-skip - scalar, per-head HeadGain on Q, scalar lm_head_gain on lm_head, untied - raw nn.Parameter lm_head, no biases anywhere. - - The reference RoPE rotates by -theta (sign-flipped vs HF Llama / NEOX). - To absorb that without a new RoPE flavor in ggml, we pre-flip the second - half of head_dim of W_q and W_k at convert time. 
Then llama.cpp's NEOX - RoPE produces identical attention scores. - """ + """Convert talkie-lm/talkie-1930-13b-{base,it} to GGUF.""" model_arch = gguf.MODEL_ARCH.TALKIE def set_gguf_parameters(self): @@ -8602,34 +8591,21 @@ def set_gguf_parameters(self): self.hparams["hidden_size"] // self.hparams["num_attention_heads"] ) self.gguf_writer.add_rope_dimension_count(head_dim) - # Talkie uses F.rms_norm with default eps. The PyTorch default eps - # for F.rms_norm is effectively ~0 (output rms == 1.0 to fp32 noise), - # NOT torch.finfo(input.dtype).eps as the docstring suggests. Using - # eps=1e-5 attenuates the output rms by ~2% per norm site, which - # compounds across 40 layers and 5 norm sites per layer (especially - # via the per-layer embed-skip add of `e_x`). Match PyTorch by using - # a tiny eps. + # Match PyTorch F.rms_norm default (effective eps ~0 on bf16). self.gguf_writer.add_layer_norm_rms_eps(1e-9) def set_vocab(self): - # Custom: read tiktoken vocab.txt directly. No tokenizer.json upstream. + # Read tiktoken vocab.txt directly (no tokenizer.json upstream). from tiktoken.load import load_tiktoken_bpe vocab_path = self.dir_model / "vocab.txt" if not vocab_path.exists(): - raise FileNotFoundError( - f"talkie vocab.txt not found at {vocab_path}. The original " - "talkie repo ships vocab.txt alongside the checkpoint; the " - "converter expects it in the HF safetensors directory." - ) + raise FileNotFoundError(f"vocab.txt not found at {vocab_path}") mergeable_ranks = load_tiktoken_bpe(str(vocab_path)) - # Filter ranks >= 65535 to leave room for IT specials. Mirrors - # talkie/src/talkie/tokenizer.py:54. + # Drop ranks >= 65535 (specials live there). See tokenizer.py:54. mergeable_ranks = {k: v for k, v in mergeable_ranks.items() if v < 65535} - # Reverse-engineer merges via QwenModel's helpers (already vendored - # in this file). This is the same pattern as _set_vocab_qwen and - # HunYuanMoE / KimiLinear / Deepseek-K2. 
+ # Reverse-engineer merges via QwenModel helpers. merges: list[str] = [] vocab: dict[str, int] = {} for token_bytes, rank in mergeable_ranks.items(): @@ -8642,8 +8618,7 @@ def set_vocab(self): " ".join(map(QwenModel.token_bytes_to_string, merged)) ) - # IT special tokens at fixed ids 65535..65539. The base model only - # uses 65535 (<|endoftext|>); IT adds the four chat tokens. + # IT specials at fixed ids; base only uses <|endoftext|>. special_tokens = { "<|endoftext|>": 65535, "<|end|>": 65536, @@ -8651,10 +8626,7 @@ def set_vocab(self): "<|assistant|>": 65538, "<|system|>": 65539, } - # Decide vocab_size: read from config.json (65540 IT, 65536 base). vocab_size = self.hparams["vocab_size"] - # If we are converting the base model, drop the IT-specific specials - # whose ids are >= base vocab_size. special_tokens = {k: v for k, v in special_tokens.items() if v < vocab_size} reverse_vocab = {idx: tok for tok, idx in {**vocab, **special_tokens}.items()} @@ -8678,7 +8650,6 @@ def set_vocab(self): self.gguf_writer.add_token_types(toktypes) self.gguf_writer.add_token_merges(merges) - # Special-token ids. EOS = <|end|> for IT, <|endoftext|> for base. eos_id = special_tokens.get("<|end|>", special_tokens["<|endoftext|>"]) self.gguf_writer.add_eos_token_id(eos_id) self.gguf_writer.add_eot_token_id(eos_id) @@ -8687,8 +8658,6 @@ def set_vocab(self): self.gguf_writer.add_add_bos_token(False) self.gguf_writer.add_add_eos_token(False) - # Chat template: <|user|>{content}<|end|><|assistant|>{content}<|end|>... - # No newlines, no BOS - matches talkie/src/talkie/chat.py:format_chat. chat_template = ( "{% for m in messages %}<|{{ m.role }}|>{{ m.content }}<|end|>{% endfor %}" "{% if add_generation_prompt %}<|assistant|>{% endif %}" @@ -8696,32 +8665,19 @@ def set_vocab(self): self.gguf_writer.add_chat_template(chat_template) def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - # RoPE sign correction for q_proj / k_proj. 
- # Talkie rotates by -theta (sign-flipped sin); llama.cpp NEOX rotates by +theta. - # Pre-multiplying the second-half of head_dim (output dim) by -1 absorbs - # the difference: <R(-theta) q, R(-theta) k> == <R(+theta) D q, R(+theta) D k> - # for D = diag(+1...+1, -1...-1) on head_dim halves. + # RoPE -theta: negate second half of head_dim of W_q/W_k so NEOX matches. n_head = self.hparams["num_attention_heads"] head_dim = self.hparams.get("head_dim") or ( self.hparams["hidden_size"] // n_head ) if name.endswith(("self_attn.q_proj.weight", "self_attn.k_proj.weight")): - w = data_torch - # shape [n_head*head_dim, hidden] - w = w.view(n_head, head_dim, w.shape[-1]).clone() + w = data_torch.view(n_head, head_dim, data_torch.shape[-1]).clone() w[:, head_dim // 2 :, :] = -w[:, head_dim // 2 :, :] data_torch = w.view(n_head * head_dim, -1) - # Talkie's HF state-dict has scalar/gain Parameters whose names do NOT - # end in .weight (raw nn.Parameter). The GGUF tensor-name convention - # requires .weight or .bias suffixes - the C++ loader looks up - # tn(LLM_TENSOR_OUTPUT, "weight") -> "output.weight". Add the suffix - # synthetically so map_tensor_name routes via try_suffixes. - canonical = name - if not canonical.endswith((".weight", ".bias")): - canonical = canonical + ".weight" - new_name = self.map_tensor_name(canonical) - return [(new_name, data_torch)] + # Raw nn.Parameter scalars have no .weight suffix; add one for map_tensor_name.
+ canonical = name if name.endswith((".weight", ".bias")) else name + ".weight" + return [(self.map_tensor_name(canonical), data_torch)] @ModelBase.register("OlmoeForCausalLM") diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index e54821c9e75..9276d9ff535 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -567,11 +567,11 @@ class MODEL_TENSOR(IntEnum): ATTN_K_NORM = auto() LAYER_OUT_NORM = auto() LAYER_OUT_SCALE = auto() - ATTN_HEAD_GAIN = auto() # talkie - per-head learnable gain on Q after Q-RMSnorm - ATTN_ACT_GAIN = auto() # talkie - per-block learnable scalar on attn-residual branch - FFN_ACT_GAIN = auto() # talkie - per-block learnable scalar on mlp-residual branch - EMBED_SKIP_SCALE = auto() # talkie - per-block learnable scalar on embedding-skip branch - LM_HEAD_GAIN = auto() # talkie - global learnable scalar on lm_head matrix + ATTN_HEAD_GAIN = auto() # talkie + ATTN_ACT_GAIN = auto() # talkie + FFN_ACT_GAIN = auto() # talkie + EMBED_SKIP_SCALE = auto() # talkie + LM_HEAD_GAIN = auto() # talkie PER_LAYER_TOKEN_EMBD = auto() # gemma3n PER_LAYER_MODEL_PROJ = auto() # gemma3n PER_LAYER_INP_GATE = auto() # gemma3n diff --git a/src/llama-chat.cpp b/src/llama-chat.cpp index 02e7c3bbcc8..a814a202380 100644 --- a/src/llama-chat.cpp +++ b/src/llama-chat.cpp @@ -138,9 +138,7 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) { } } } else if (tmpl_contains("<|assistant|>") && tmpl_contains("<|end|>")) { - // Talkie's chat template uses the same role markers as Phi-3 but - // omits the newlines: "<|user|>{c}<|end|><|assistant|>". Detect by - // the absence of "|>\n" sequences. + // Talkie shares Phi-3's role markers but has no newlines. 
if (tmpl_contains("|>{{ message['content'] }}<|end|>") || tmpl_contains("|>{{ m.content }}<|end|>")) { return LLM_CHAT_TEMPLATE_TALKIE; @@ -928,8 +926,7 @@ int32_t llm_chat_apply_template( ss << "<|begin|>assistant"; } } else if (tmpl == LLM_CHAT_TEMPLATE_TALKIE) { - // Talkie 1930 IT chat template: <|role|>content<|end|>... no newlines, no BOS. - // Matches talkie/src/talkie/chat.py:format_chat exactly. + // <|role|>content<|end|>... no newlines, no BOS. for (auto message : chat) { std::string role(message->role); ss << "<|" << role << "|>" << message->content << "<|end|>"; diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 5dc4887014a..ad82ce785ba 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -1873,10 +1873,7 @@ void llama_model::load_hparams(llama_model_loader & ml) { } break; case LLM_ARCH_TALKIE: { - // Talkie's RMSNorm has no learnable weight; eps must be tiny - // to match PyTorch's F.rms_norm default behavior (effective - // eps ~ 0). See TalkieModel.set_gguf_parameters in the - // converter for the rationale. + // Match PyTorch F.rms_norm default (effective eps ~0). ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps, false); if (hparams.f_norm_rms_eps == 0.0f) { hparams.f_norm_rms_eps = 1e-9f; @@ -5080,10 +5077,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) { } break; case LLM_ARCH_TALKIE: { - // Talkie has no learnable RMSNorm weights anywhere. Per-block - // tensors: q/k/v/o, ffn_gate/up/down, head_gain, attn_act_gain, - // ffn_act_gain, embed_skip_scale. Globals: tok_embd, output (untied), - // lm_head_gain. + // No learnable RMSNorm weights; lm_head untied. 
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0); lm_head_gain = create_tensor(tn(LLM_TENSOR_LM_HEAD_GAIN, "weight"), {1}, 0); diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp index 2d533ab5b27..340a9bf17c2 100644 --- a/src/llama-vocab.cpp +++ b/src/llama-vocab.cpp @@ -369,10 +369,7 @@ struct llm_tokenizer_bpe : llm_tokenizer { }; break; case LLAMA_VOCAB_PRE_TYPE_TALKIE: - // Talkie tiktoken pre-tokenizer (talkie/src/talkie/tokenizer.py:11-21). - // Note: ordering of alternatives is significant - case-aware leading-cap - // and trailing-cap fragments first, then digits, then punctuation, then - // whitespace fallbacks. + // talkie/src/talkie/tokenizer.py:11-21 regex_exprs = { "[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]*[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]+(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]+[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]*(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+", }; diff --git a/src/models/talkie.cpp b/src/models/talkie.cpp index 2e93f8ad4ee..7be910f912c 100644 --- a/src/models/talkie.cpp +++ b/src/models/talkie.cpp @@ -1,21 +1,8 @@ #include "models.h" -// Talkie 1930 13B graph builder. -// -// Mirrors talkie/src/talkie/model.py: -// - Weightless RMSNorm everywhere (build_norm with mw=NULL). -// - Pre-attention RMSnorm (talkie line 144). -// - Post-RoPE Q/K RMSnorm with no learnable weight (talkie line 102). -// - Per-head HeadGain on Q after Q-RMSnorm (talkie line 103). -// - Standard SDPA via build_attn. -// - Per-block ActGain scalars on attn-residual and mlp-residual branches. -// - Per-block embed_skip: e_x (post-RMSnorm embedding) added to every layer. -// - Final RMSnorm, lm_head with global lm_head_gain scalar. 
-// -// The RoPE sign-convention difference between talkie (rotation by -theta) -// and llama.cpp NEOX (rotation by +theta) is absorbed at convert time by -// negating the second half of head_dim of W_q and W_k weights, so this -// graph uses stock NEOX RoPE unchanged. +// Talkie 1930 13B. Mirrors talkie/src/talkie/model.py. +// RoPE sign flip (-theta) is absorbed at convert time by negating +// the second half of head_dim of W_q/W_k. llm_build_talkie::llm_build_talkie(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { const int64_t n_embd_head = hparams.n_embd_head_v(); @@ -28,11 +15,11 @@ llm_build_talkie::llm_build_talkie(const llama_model & model, const llm_graph_pa inpL = build_inp_embd(model.tok_embd); - // e_x = RMSnorm(embed(input_ids)) - the same e_x is added to every layer. + // e_x = RMSnorm(embd); same e_x added to every layer. ggml_tensor * e_x = build_norm(inpL, NULL, NULL, LLM_NORM_RMS, -1); cb(e_x, "embed_post_norm", -1); - // The residual stream starts as e_x (talkie/src/talkie/model.py:191). + // Residual stream starts as e_x (model.py:191). inpL = e_x; ggml_tensor * inp_pos = build_inp_pos(); @@ -44,7 +31,6 @@ llm_build_talkie::llm_build_talkie(const llama_model & model, const llm_graph_pa for (int il = 0; il < n_layer; ++il) { ggml_tensor * inpSA = inpL; - // Pre-attention RMSnorm (weightless). cur = build_norm(inpL, NULL, NULL, LLM_NORM_RMS, il); cb(cur, "attn_pre_norm", il); @@ -63,8 +49,6 @@ llm_build_talkie::llm_build_talkie(const llama_model & model, const llm_graph_pa Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); - // RoPE - stock NEOX. Sign convention is absorbed in W_q/W_k at - // conversion time (see TalkieModel.modify_tensors). 
Qcur = ggml_rope_ext( ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, @@ -78,16 +62,10 @@ llm_build_talkie::llm_build_talkie(const llama_model & model, const llm_graph_pa cb(Qcur, "Qcur_post_rope", il); cb(Kcur, "Kcur_post_rope", il); - // Weightless K-RMSnorm (talkie line 102). Kcur = build_norm(Kcur, NULL, NULL, LLM_NORM_RMS, il); cb(Kcur, "Kcur_post_qknorm", il); - // Q-RMSnorm fused with HeadGain: rms-norm then multiply by per-head - // gain broadcast [1, n_head, 1] over [head_dim, n_head, n_tokens]. - // build_norm emits ggml_rms_norm + ggml_mul as consecutive nodes, - // matching the CUDA RMS_NORM+MUL fusion pattern in - // ggml-cuda::ggml_cuda_op_rms_norm_fused. Same graph as a separate - // ggml_mul; this form keeps the two ops adjacent in the cgraph. + // Q-RMSnorm + per-head gain (RMS_NORM+MUL fusion). ggml_tensor * head_gain = ggml_reshape_3d(ctx0, model.layers[il].attn_head_gain, 1, n_head, 1); Qcur = build_norm(Qcur, head_gain, NULL, LLM_NORM_RMS, il); cb(Qcur, "Qcur_post_headgain", il); @@ -104,7 +82,6 @@ llm_build_talkie::llm_build_talkie(const llama_model & model, const llm_graph_pa e_x = ggml_get_rows(ctx0, e_x, inp_out_ids); } - // Apply ActGain on attn branch and add residual. cur = ggml_mul(ctx0, cur, model.layers[il].attn_act_gain); cb(cur, "attn_branch_scaled", il); cur = ggml_add(ctx0, cur, inpSA); @@ -112,7 +89,6 @@ llm_build_talkie::llm_build_talkie(const llama_model & model, const llm_graph_pa ggml_tensor * mlp_in = cur; - // Pre-MLP RMSnorm (weightless). cur = build_norm(cur, NULL, NULL, LLM_NORM_RMS, il); cb(cur, "mlp_pre_norm", il); @@ -124,13 +100,12 @@ llm_build_talkie::llm_build_talkie(const llama_model & model, const llm_graph_pa LLM_FFN_SILU, LLM_FFN_PAR, il); cb(cur, "mlp_out", il); - // Apply ActGain on mlp branch and add residual. 
cur = ggml_mul(ctx0, cur, model.layers[il].ffn_act_gain); cb(cur, "mlp_branch_scaled", il); cur = ggml_add(ctx0, cur, mlp_in); cb(cur, "after_mlp_residual", il); - // Embedding-skip: cur = cur + embed_skip * e_x. + // embed-skip: cur += embed_skip * e_x. ggml_tensor * e_x_scaled = ggml_mul(ctx0, e_x, model.layers[il].embed_skip_scale); cb(e_x_scaled, "embed_skip_branch", il); cur = ggml_add(ctx0, cur, e_x_scaled); @@ -143,14 +118,11 @@ llm_build_talkie::llm_build_talkie(const llama_model & model, const llm_graph_pa } cur = inpL; - // Final RMSnorm (weightless). cur = build_norm(cur, NULL, NULL, LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); res->t_embd = cur; - // lm_head with global gain: matmul(cur, lm_head_gain * output). - // Reuses the existing build_lora_mm 3-arg form which already handles - // a per-tensor weight scale. + // lm_head with global gain via build_lora_mm 3-arg form. cur = build_lora_mm(model.output, cur, model.lm_head_gain); cb(cur, "result_output", -1); res->t_logits = cur;