From 6480e91e984643c6c0f5a45865cf2ddabe6a912d Mon Sep 17 00:00:00 2001
From: Daniel Han <danielhanchen@gmail.com>
Date: Wed, 29 Apr 2026 10:16:03 +0000
Subject: [PATCH 1/4] gguf-py: add talkie architecture constants and tensor
 mappings

Adds MODEL_ARCH.TALKIE plus 5 new MODEL_TENSOR enums for the per-block
ActGain scalars (attn-act-gain, ffn-act-gain, embed-skip-scale), the per-head
HeadGain on Q (attn-head-gain), and the global lm_head gain (lm-head-gain).
Registers HF source names in tensor_mapping.py so the default
modify_tensors path routes them automatically.

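Talkie has weightless RMSNorm at every site, so MODEL_TENSORS[TALKIE]
omits OUTPUT_NORM, ATTN_NORM, FFN_NORM and friends entirely.

Beyond gguf-py, this patch also carries the rest of the initial talkie
support: the TalkieModel converter in convert_hf_to_gguf.py, the
llama.cpp arch/tensor registration and tensor loading, the "talkie" chat
template and BPE pre-tokenizer, and the graph builder in
src/models/talkie.cpp.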
---
 convert_hf_to_gguf.py          | 134 ++++++++++++++++++++++++++++
 gguf-py/gguf/constants.py      |  28 ++++++
 gguf-py/gguf/tensor_mapping.py |  15 ++++
 src/llama-arch.cpp             |  13 +++
 src/llama-arch.h               |   7 ++
 src/llama-chat.cpp             |  18 ++++
 src/llama-chat.h               |   1 +
 src/llama-model.cpp            |  45 ++++++++++
 src/llama-model.h              |   9 ++
 src/llama-vocab.cpp            |  13 +++
 src/llama-vocab.h              |   1 +
 src/models/models.h            |   4 +
 src/models/talkie.cpp          | 156 +++++++++++++++++++++++++++++++++
 13 files changed, 444 insertions(+)
 create mode 100644 src/models/talkie.cpp

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index 90c2b7094c7..6c500772a13 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -8579,6 +8579,140 @@ def set_gguf_parameters(self):
             self.gguf_writer.add_sliding_window_pattern(sliding_window_pattern)
 
 
+@ModelBase.register("TalkieForCausalLM")
+class TalkieModel(TextModel):
+    """Convert talkie-lm/talkie-1930-13b-{base,it} to GGUF.
+
+    The architecture mirrors talkie/src/talkie/model.py: weightless RMSNorm at
+    every site, per-block ActGain scalars on attn/mlp branches, embed-skip
+    scalar, per-head HeadGain on Q, scalar lm_head_gain on lm_head, untied
+    raw nn.Parameter lm_head, no biases anywhere.
+
+    The reference RoPE rotates by -theta (sign-flipped vs HF Llama / NEOX).
+    To absorb that without a new RoPE flavor in ggml, we pre-flip the second
+    half of head_dim of W_q and W_k at convert time. Then llama.cpp's NEOX
+    RoPE produces identical attention scores.
+    """
+    model_arch = gguf.MODEL_ARCH.TALKIE
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        self.gguf_writer.add_vocab_size(self.hparams["vocab_size"])
+        head_dim = self.hparams.get("head_dim") or (
+            self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
+        )
+        self.gguf_writer.add_rope_dimension_count(head_dim)
+        # Talkie uses F.rms_norm with default eps (1e-5).
+        self.gguf_writer.add_layer_norm_rms_eps(1e-5)
+
+    def set_vocab(self):
+        # Custom: read tiktoken vocab.txt directly. No tokenizer.json upstream.
+        from tiktoken.load import load_tiktoken_bpe
+
+        vocab_path = self.dir_model / "vocab.txt"
+        if not vocab_path.exists():
+            raise FileNotFoundError(
+                f"talkie vocab.txt not found at {vocab_path}. The original "
+                "talkie repo ships vocab.txt alongside the checkpoint; the "
+                "converter expects it in the HF safetensors directory."
+            )
+        mergeable_ranks = load_tiktoken_bpe(str(vocab_path))
+        # Filter ranks >= 65535 to leave room for IT specials. Mirrors
+        # talkie/src/talkie/tokenizer.py:54.
+        mergeable_ranks = {k: v for k, v in mergeable_ranks.items() if v < 65535}
+
+        # Reverse-engineer merges via QwenModel's helpers (already vendored
+        # in this file). This is the same pattern as _set_vocab_qwen and
+        # HunYuanMoE / KimiLinear / Deepseek-K2.
+        merges: list[str] = []
+        vocab: dict[str, int] = {}
+        for token_bytes, rank in mergeable_ranks.items():
+            vocab[QwenModel.token_bytes_to_string(token_bytes)] = rank
+            if len(token_bytes) == 1:
+                continue
+            merged = QwenModel.bpe(mergeable_ranks, token_bytes, max_rank=rank)
+            if len(merged) == 2:
+                merges.append(
+                    " ".join(map(QwenModel.token_bytes_to_string, merged))
+                )
+
+        # IT special tokens at fixed ids 65535..65539. The base model only
+        # uses 65535 (<|endoftext|>); IT adds the four chat tokens.
+        special_tokens = {
+            "<|endoftext|>": 65535,
+            "<|end|>": 65536,
+            "<|user|>": 65537,
+            "<|assistant|>": 65538,
+            "<|system|>": 65539,
+        }
+        # Decide vocab_size: read from config.json (65540 IT, 65536 base).
+        vocab_size = self.hparams["vocab_size"]
+        # If we are converting the base model, drop the IT-specific specials
+        # whose ids are >= base vocab_size.
+        special_tokens = {k: v for k, v in special_tokens.items() if v < vocab_size}
+
+        reverse_vocab = {idx: tok for tok, idx in {**vocab, **special_tokens}.items()}
+        tokens: list[str] = []
+        toktypes: list[int] = []
+        for i in range(vocab_size):
+            if i not in reverse_vocab:
+                tokens.append(f"[PAD{i}]")
+                toktypes.append(gguf.TokenType.UNUSED)
+            else:
+                tok = reverse_vocab[i]
+                tokens.append(tok)
+                if i in special_tokens.values():
+                    toktypes.append(gguf.TokenType.CONTROL)
+                else:
+                    toktypes.append(gguf.TokenType.NORMAL)
+
+        self.gguf_writer.add_tokenizer_model("gpt2")
+        self.gguf_writer.add_tokenizer_pre("talkie")
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_types(toktypes)
+        self.gguf_writer.add_token_merges(merges)
+
+        # Special-token ids. EOS = <|end|> for IT, <|endoftext|> for base.
+        eos_id = special_tokens.get("<|end|>", special_tokens["<|endoftext|>"])
+        self.gguf_writer.add_eos_token_id(eos_id)
+        self.gguf_writer.add_eot_token_id(eos_id)
+        self.gguf_writer.add_unk_token_id(special_tokens["<|endoftext|>"])
+        self.gguf_writer.add_pad_token_id(special_tokens["<|endoftext|>"])
+        self.gguf_writer.add_add_bos_token(False)
+        self.gguf_writer.add_add_eos_token(False)
+
+        # Chat template: <|user|>{content}<|end|><|assistant|>{content}<|end|>...
+        # No newlines, no BOS - matches talkie/src/talkie/chat.py:format_chat.
+        chat_template = (
+            "{% for m in messages %}<|{{ m.role }}|>{{ m.content }}<|end|>{% endfor %}"
+            "{% if add_generation_prompt %}<|assistant|>{% endif %}"
+        )
+        self.gguf_writer.add_chat_template(chat_template)
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        # RoPE sign correction for q_proj / k_proj.
+        # Talkie rotates by -theta (sign-flipped sin); llama.cpp NEOX rotates by +theta.
+        # Pre-multiplying the second-half of head_dim (output dim) by -1 absorbs
+        # the difference: <NEOX(D q), NEOX(D k)> == <Talkie(q), Talkie(k)>
+        # for D = diag(+1...+1, -1...-1) on head_dim halves.
+        n_head = self.hparams["num_attention_heads"]
+        head_dim = self.hparams.get("head_dim") or (
+            self.hparams["hidden_size"] // n_head
+        )
+        if name.endswith(("self_attn.q_proj.weight", "self_attn.k_proj.weight")):
+            w = data_torch
+            # shape [n_head*head_dim, hidden]
+            w = w.view(n_head, head_dim, w.shape[-1]).clone()
+            w[:, head_dim // 2 :, :] = -w[:, head_dim // 2 :, :]
+            data_torch = w.view(n_head * head_dim, -1)
+
+        # Scalar tensors are stored as shape [1] in the HF state dict; keep
+        # them 1-D so create_tensor on the C++ side allocates {1}. Same for
+        # head_gain which is shape [n_head].
+        # Default routing handles everything else.
+        return [(self.map_tensor_name(name), data_torch)]
+
+
 @ModelBase.register("OlmoeForCausalLM")
 class OlmoeModel(TextModel):
     model_arch = gguf.MODEL_ARCH.OLMOE
diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py
index 83ae51ce9ce..e54821c9e75 100644
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -437,6 +437,7 @@ class MODEL_ARCH(IntEnum):
     OLMO             = auto()
     OLMO2            = auto()
     OLMOE            = auto()
+    TALKIE           = auto()
     OPENELM          = auto()
     ARCTIC           = auto()
     DEEPSEEK         = auto()
@@ -566,6 +567,11 @@ class MODEL_TENSOR(IntEnum):
     ATTN_K_NORM          = auto()
     LAYER_OUT_NORM       = auto()
     LAYER_OUT_SCALE      = auto()
+    ATTN_HEAD_GAIN       = auto() # talkie - per-head learnable gain on Q after Q-RMSnorm
+    ATTN_ACT_GAIN        = auto() # talkie - per-block learnable scalar on attn-residual branch
+    FFN_ACT_GAIN         = auto() # talkie - per-block learnable scalar on mlp-residual branch
+    EMBED_SKIP_SCALE     = auto() # talkie - per-block learnable scalar on embedding-skip branch
+    LM_HEAD_GAIN         = auto() # talkie - global learnable scalar on lm_head matrix
     PER_LAYER_TOKEN_EMBD = auto() # gemma3n
     PER_LAYER_MODEL_PROJ = auto() # gemma3n
     PER_LAYER_INP_GATE   = auto() # gemma3n
@@ -923,6 +929,7 @@ class MODEL_TENSOR(IntEnum):
     MODEL_ARCH.OLMO:             "olmo",
     MODEL_ARCH.OLMO2:            "olmo2",
     MODEL_ARCH.OLMOE:            "olmoe",
+    MODEL_ARCH.TALKIE:           "talkie",
     MODEL_ARCH.OPENELM:          "openelm",
     MODEL_ARCH.ARCTIC:           "arctic",
     MODEL_ARCH.DEEPSEEK:         "deepseek",
@@ -1021,6 +1028,11 @@ class MODEL_TENSOR(IntEnum):
     MODEL_TENSOR.ATTN_GATE:                 "blk.{bid}.attn_gate",
     MODEL_TENSOR.ATTN_Q_NORM:               "blk.{bid}.attn_q_norm",
     MODEL_TENSOR.ATTN_K_NORM:               "blk.{bid}.attn_k_norm",
+    MODEL_TENSOR.ATTN_HEAD_GAIN:            "blk.{bid}.attn_head_gain",
+    MODEL_TENSOR.ATTN_ACT_GAIN:             "blk.{bid}.attn_act_gain",
+    MODEL_TENSOR.FFN_ACT_GAIN:              "blk.{bid}.ffn_act_gain",
+    MODEL_TENSOR.EMBED_SKIP_SCALE:          "blk.{bid}.embed_skip_scale",
+    MODEL_TENSOR.LM_HEAD_GAIN:              "lm_head_gain",
     MODEL_TENSOR.ATTN_OUT_NORM:             "blk.{bid}.attn_output_norm",
     MODEL_TENSOR.ATTN_POST_NORM:            "blk.{bid}.post_attention_norm",
     MODEL_TENSOR.FFN_GATE_INP:              "blk.{bid}.ffn_gate_inp",
@@ -2663,6 +2675,22 @@ class MODEL_TENSOR(IntEnum):
         MODEL_TENSOR.FFN_DOWN,
         MODEL_TENSOR.FFN_UP,
     ],
+    MODEL_ARCH.TALKIE: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ATTN_Q,
+        MODEL_TENSOR.ATTN_K,
+        MODEL_TENSOR.ATTN_V,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.FFN_GATE,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+        MODEL_TENSOR.ATTN_HEAD_GAIN,
+        MODEL_TENSOR.ATTN_ACT_GAIN,
+        MODEL_TENSOR.FFN_ACT_GAIN,
+        MODEL_TENSOR.EMBED_SKIP_SCALE,
+        MODEL_TENSOR.LM_HEAD_GAIN,
+    ],
     MODEL_ARCH.SEED_OSS: [
         MODEL_TENSOR.TOKEN_EMBD,
         MODEL_TENSOR.ATTN_NORM,
diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py
index 01a9b236000..204626a6148 100644
--- a/gguf-py/gguf/tensor_mapping.py
+++ b/gguf-py/gguf/tensor_mapping.py
@@ -80,6 +80,9 @@ class TensorNameMap:
             "model.transformer.ff_out",  # llada
             "head.decoder",              # modern-bert
         ),
+        MODEL_TENSOR.LM_HEAD_GAIN: (
+            "lm_head_gain",  # talkie
+        ),
         MODEL_TENSOR.DENSE_2_OUT: (
             "dense_2_out",  # embeddinggemma
         ),
@@ -2138,6 +2141,18 @@ class TensorNameMap:
         MODEL_TENSOR.NEXTN_SHARED_HEAD_NORM: (
             "model.layers.{bid}.shared_head.norm",
         ),
+        MODEL_TENSOR.ATTN_HEAD_GAIN: (
+            "model.layers.{bid}.self_attn.head_gain",  # talkie
+        ),
+        MODEL_TENSOR.ATTN_ACT_GAIN: (
+            "model.layers.{bid}.attn_gain",  # talkie
+        ),
+        MODEL_TENSOR.FFN_ACT_GAIN: (
+            "model.layers.{bid}.mlp_gain",  # talkie
+        ),
+        MODEL_TENSOR.EMBED_SKIP_SCALE: (
+            "model.layers.{bid}.embed_skip",  # talkie
+        ),
     }
 
     # architecture-specific block mappings
diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp
index 633a66fc665..31cb5e7fc20 100644
--- a/src/llama-arch.cpp
+++ b/src/llama-arch.cpp
@@ -133,6 +133,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_LLAMA_EMBED,      "llama-embed"      },
     { LLM_ARCH_MAINCODER,        "maincoder"        },
     { LLM_ARCH_KIMI_LINEAR,      "kimi-linear"      },
+    { LLM_ARCH_TALKIE,           "talkie"           },
     { LLM_ARCH_UNKNOWN,          "(unknown)"        },
 };
 
@@ -450,6 +451,12 @@ static const std::map<llm_tensor, const char *> LLM_TENSOR_NAMES = {
     { LLM_TENSOR_NEXTN_HNORM,                            "blk.%d.nextn.hnorm" },
     { LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD,                 "blk.%d.nextn.shared_head_head" },
     { LLM_TENSOR_NEXTN_SHARED_HEAD_NORM,                 "blk.%d.nextn.shared_head_norm" },
+    // talkie
+    { LLM_TENSOR_ATTN_HEAD_GAIN,                         "blk.%d.attn_head_gain" },
+    { LLM_TENSOR_ATTN_ACT_GAIN,                          "blk.%d.attn_act_gain" },
+    { LLM_TENSOR_FFN_ACT_GAIN,                           "blk.%d.ffn_act_gain" },
+    { LLM_TENSOR_EMBED_SKIP_SCALE,                       "blk.%d.embed_skip_scale" },
+    { LLM_TENSOR_LM_HEAD_GAIN,                           "lm_head_gain" },
     { LLM_TENSOR_ATTN_SUB_NORM,                          "blk.%d.attn_sub_norm" },
     { LLM_TENSOR_FFN_SUB_NORM,                           "blk.%d.ffn_sub_norm" },
     { LLM_TENSOR_DEC_OUTPUT_NORM,                        "dec.output_norm" },
@@ -767,6 +774,12 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
     // Nemotron 3 Super
     {LLM_TENSOR_FFN_LATENT_DOWN,            {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
     {LLM_TENSOR_FFN_LATENT_UP,              {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+    // talkie - per-block scalar gains and per-head Q gain; lm_head gain is global.
+    {LLM_TENSOR_ATTN_HEAD_GAIN,             {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+    {LLM_TENSOR_ATTN_ACT_GAIN,              {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+    {LLM_TENSOR_FFN_ACT_GAIN,               {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+    {LLM_TENSOR_EMBED_SKIP_SCALE,           {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+    {LLM_TENSOR_LM_HEAD_GAIN,               {LLM_TENSOR_LAYER_OUTPUT,    GGML_OP_MUL}},
 };
 
 LLM_KV::LLM_KV(llm_arch arch, const char * suffix) : arch(arch), suffix(suffix) {}
diff --git a/src/llama-arch.h b/src/llama-arch.h
index 8f335f5c7b3..5e15c6e6f89 100644
--- a/src/llama-arch.h
+++ b/src/llama-arch.h
@@ -137,6 +137,7 @@ enum llm_arch {
     LLM_ARCH_LLAMA_EMBED,
     LLM_ARCH_MAINCODER,
     LLM_ARCH_KIMI_LINEAR,
+    LLM_ARCH_TALKIE,
     LLM_ARCH_UNKNOWN,
 };
 
@@ -554,6 +555,12 @@ enum llm_tensor {
     LLM_TENSOR_NEXTN_HNORM,
     LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD,
     LLM_TENSOR_NEXTN_SHARED_HEAD_NORM,
+    // talkie - per-block scalars and per-head gain
+    LLM_TENSOR_ATTN_HEAD_GAIN,
+    LLM_TENSOR_ATTN_ACT_GAIN,
+    LLM_TENSOR_FFN_ACT_GAIN,
+    LLM_TENSOR_EMBED_SKIP_SCALE,
+    LLM_TENSOR_LM_HEAD_GAIN,
 };
 
 enum llm_tensor_layer {
diff --git a/src/llama-chat.cpp b/src/llama-chat.cpp
index 6554a89b28a..02e7c3bbcc8 100644
--- a/src/llama-chat.cpp
+++ b/src/llama-chat.cpp
@@ -79,6 +79,7 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
     { "grok-2",            LLM_CHAT_TEMPLATE_GROK_2            },
     { "pangu-embedded",    LLM_CHAT_TEMPLATE_PANGU_EMBED       },
     { "solar-open",        LLM_CHAT_TEMPLATE_SOLAR_OPEN        },
+    { "talkie",            LLM_CHAT_TEMPLATE_TALKIE            },
 };
 
 llm_chat_template llm_chat_template_from_str(const std::string & name) {
@@ -137,6 +138,13 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
             }
         }
     } else if (tmpl_contains("<|assistant|>") && tmpl_contains("<|end|>")) {
+        // Talkie's chat template uses the same role markers as Phi-3 but
+        // omits the newlines: "<|user|>{c}<|end|><|assistant|>". Detect it by
+        // the content placeholder sitting directly between "|>" and "<|end|>".
+        if (tmpl_contains("|>{{ message['content'] }}<|end|>")
+                || tmpl_contains("|>{{ m.content }}<|end|>")) {
+            return LLM_CHAT_TEMPLATE_TALKIE;
+        }
         return LLM_CHAT_TEMPLATE_PHI_3;
     } else if (tmpl_contains("[gMASK]<sop>")) {
         return LLM_CHAT_TEMPLATE_CHATGLM_4;
@@ -919,6 +927,16 @@ int32_t llm_chat_apply_template(
         if (add_ass) {
             ss << "<|begin|>assistant";
         }
+    } else if (tmpl == LLM_CHAT_TEMPLATE_TALKIE) {
+        // Talkie 1930 IT chat template: <|role|>content<|end|>... no newlines, no BOS.
+        // Matches talkie/src/talkie/chat.py:format_chat exactly.
+        for (auto message : chat) {
+            std::string role(message->role);
+            ss << "<|" << role << "|>" << message->content << "<|end|>";
+        }
+        if (add_ass) {
+            ss << "<|assistant|>";
+        }
     } else {
         // template not supported
         return -1;
diff --git a/src/llama-chat.h b/src/llama-chat.h
index 13f936a946c..f41edd1fefa 100644
--- a/src/llama-chat.h
+++ b/src/llama-chat.h
@@ -59,6 +59,7 @@ enum llm_chat_template {
     LLM_CHAT_TEMPLATE_GROK_2,
     LLM_CHAT_TEMPLATE_PANGU_EMBED,
     LLM_CHAT_TEMPLATE_SOLAR_OPEN,
+    LLM_CHAT_TEMPLATE_TALKIE,
     LLM_CHAT_TEMPLATE_UNKNOWN,
 };
 
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index 9e2a13cbd43..acaf8ffb481 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -1871,6 +1871,20 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                     default: type = LLM_TYPE_UNKNOWN;
                 }
             } break;
+        case LLM_ARCH_TALKIE:
+            {
+                // Talkie's RMSNorm has no learnable weight; eps comes from the
+                // converter (defaults to torch's F.rms_norm default 1e-5).
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps, false);
+                if (hparams.f_norm_rms_eps == 0.0f) {
+                    hparams.f_norm_rms_eps = 1e-5f;
+                }
+                hparams.swa_type = LLAMA_SWA_TYPE_NONE;
+                switch (hparams.n_layer) {
+                    case 40: type = LLM_TYPE_13B; break;
+                    default: type = LLM_TYPE_UNKNOWN;
+                }
+            } break;
         case LLM_ARCH_OLMOE:
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
@@ -5062,6 +5076,32 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                         layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
                     }
                 } break;
+            case LLM_ARCH_TALKIE:
+                {
+                    // Talkie has no learnable RMSNorm weights anywhere. Per-block
+                    // tensors: q/k/v/o, ffn_gate/up/down, head_gain, attn_act_gain,
+                    // ffn_act_gain, embed_skip_scale. Globals: tok_embd, output (untied),
+                    // lm_head_gain.
+                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+                    output   = create_tensor(tn(LLM_TENSOR_OUTPUT,     "weight"), {n_embd, n_vocab}, 0);
+                    lm_head_gain = create_tensor(tn(LLM_TENSOR_LM_HEAD_GAIN, "weight"), {1}, 0);
+
+                    for (int i = 0; i < n_layer; ++i) {
+                        auto & layer = layers[i];
+
+                        create_tensor_qkv(layer, i, n_embd, n_embd, n_embd_gqa, n_embd_gqa, 0);
+                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+
+                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff}, 0);
+                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff,   n_embd}, 0);
+
+                        layer.attn_head_gain     = create_tensor(tn(LLM_TENSOR_ATTN_HEAD_GAIN,     "weight", i), {n_head}, 0);
+                        layer.attn_act_gain      = create_tensor(tn(LLM_TENSOR_ATTN_ACT_GAIN,      "weight", i), {1}, 0);
+                        layer.ffn_act_gain       = create_tensor(tn(LLM_TENSOR_FFN_ACT_GAIN,       "weight", i), {1}, 0);
+                        layer.embed_skip_scale   = create_tensor(tn(LLM_TENSOR_EMBED_SKIP_SCALE,   "weight", i), {1}, 0);
+                    }
+                } break;
             case LLM_ARCH_SEED_OSS:
                 {
                     const uint32_t head_dim             = hparams.n_embd_head_k();
@@ -8817,6 +8857,10 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
             {
                 llm = std::make_unique<llm_build_olmoe>(*this, params);
             } break;
+        case LLM_ARCH_TALKIE:
+            {
+                llm = std::make_unique<llm_build_talkie>(*this, params);
+            } break;
         case LLM_ARCH_OPENELM:
             {
                 llm = std::make_unique<llm_build_openelm>(*this, params);
@@ -9278,6 +9322,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
         case LLM_ARCH_RND1:
         case LLM_ARCH_OLMO2:
         case LLM_ARCH_OLMOE:
+        case LLM_ARCH_TALKIE:
         case LLM_ARCH_PHI2:
         case LLM_ARCH_PHI3:
         case LLM_ARCH_PHIMOE:
diff --git a/src/llama-model.h b/src/llama-model.h
index 5f101bd6374..494d2fafa49 100644
--- a/src/llama-model.h
+++ b/src/llama-model.h
@@ -494,6 +494,12 @@ struct llama_layer {
     struct llama_layer_shortconv shortconv;
 
     struct llama_layer_nextn nextn;
+
+    // talkie - per-block scalars and per-head Q gain
+    struct ggml_tensor * attn_head_gain   = nullptr;  // [n_head]
+    struct ggml_tensor * attn_act_gain    = nullptr;  // [1]
+    struct ggml_tensor * ffn_act_gain     = nullptr;  // [1]
+    struct ggml_tensor * embed_skip_scale = nullptr;  // [1]
 };
 
 struct llama_device {
@@ -550,6 +556,9 @@ struct llama_model {
     struct ggml_tensor * per_layer_model_proj = nullptr;
     struct ggml_tensor * per_layer_proj_norm  = nullptr;
 
+    // talkie - global learnable scalar that multiplies the lm_head matrix.
+    struct ggml_tensor * lm_head_gain         = nullptr;  // [1]
+
     std::vector<llama_layer> layers;
 
     //Dense linear projections for SentenceTransformers models like embeddinggemma
diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
index 163f222ef61..2d533ab5b27 100644
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@@ -368,6 +368,15 @@ struct llm_tokenizer_bpe : llm_tokenizer {
                     "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
                 };
                 break;
+            case LLAMA_VOCAB_PRE_TYPE_TALKIE:
+                // Talkie tiktoken pre-tokenizer (talkie/src/talkie/tokenizer.py:11-21).
+                // Note: ordering of alternatives is significant - case-aware leading-cap
+                // and trailing-cap fragments first, then digits, then punctuation, then
+                // whitespace fallbacks.
+                regex_exprs = {
+                    "[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]*[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]+(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]+[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]*(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
+                };
+                break;
             case LLAMA_VOCAB_PRE_TYPE_STABLELM2:
             case LLAMA_VOCAB_PRE_TYPE_QWEN2:
             case LLAMA_VOCAB_PRE_TYPE_HUNYUAN:
@@ -2035,6 +2044,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
             } else if (
                 tokenizer_pre == "olmo") {
                 pre_type = LLAMA_VOCAB_PRE_TYPE_OLMO;
+            } else if (
+                tokenizer_pre == "talkie") {
+                pre_type = LLAMA_VOCAB_PRE_TYPE_TALKIE;
+                clean_spaces = false;
             } else if (
                 tokenizer_pre == "dbrx") {
                 pre_type = LLAMA_VOCAB_PRE_TYPE_DBRX;
diff --git a/src/llama-vocab.h b/src/llama-vocab.h
index dd38f45d3a2..c4ac6bc14c2 100644
--- a/src/llama-vocab.h
+++ b/src/llama-vocab.h
@@ -59,6 +59,7 @@ enum llama_vocab_pre_type {
     LLAMA_VOCAB_PRE_TYPE_JOYAI_LLM       = 48,
     LLAMA_VOCAB_PRE_TYPE_JAIS2           = 49,
     LLAMA_VOCAB_PRE_TYPE_GEMMA4          = 50,
+    LLAMA_VOCAB_PRE_TYPE_TALKIE          = 51,
 };
 
 struct LLM_KV;
diff --git a/src/models/models.h b/src/models/models.h
index 94991c55fe8..3103cdfd388 100644
--- a/src/models/models.h
+++ b/src/models/models.h
@@ -472,6 +472,10 @@ struct llm_build_olmoe : public llm_graph_context {
     llm_build_olmoe(const llama_model & model, const llm_graph_params & params);
 };
 
+struct llm_build_talkie : public llm_graph_context {
+    llm_build_talkie(const llama_model & model, const llm_graph_params & params);
+};
+
 struct llm_build_olmo : public llm_graph_context {
     llm_build_olmo(const llama_model & model, const llm_graph_params & params);
 };
diff --git a/src/models/talkie.cpp b/src/models/talkie.cpp
new file mode 100644
index 00000000000..b7bdf386892
--- /dev/null
+++ b/src/models/talkie.cpp
@@ -0,0 +1,156 @@
+#include "models.h"
+
+// Talkie 1930 13B graph builder.
+//
+// Mirrors talkie/src/talkie/model.py:
+//   - Weightless RMSNorm everywhere (build_norm with mw=NULL).
+//   - Pre-attention RMSnorm (talkie line 144).
+//   - Post-RoPE Q/K RMSnorm with no learnable weight (talkie line 102).
+//   - Per-head HeadGain on Q after Q-RMSnorm (talkie line 103).
+//   - Standard SDPA via build_attn.
+//   - Per-block ActGain scalars on attn-residual and mlp-residual branches.
+//   - Per-block embed_skip: e_x (post-RMSnorm embedding) added to every layer.
+//   - Final RMSnorm, lm_head with global lm_head_gain scalar.
+//
+// The RoPE sign-convention difference between talkie (rotation by -theta)
+// and llama.cpp NEOX (rotation by +theta) is absorbed at convert time by
+// negating the second half of head_dim of W_q and W_k weights, so this
+// graph uses stock NEOX RoPE unchanged.
+
+llm_build_talkie::llm_build_talkie(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+    const int64_t n_embd_head = hparams.n_embd_head_v();
+
+    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+    GGML_ASSERT(n_embd_head == n_rot);
+
+    ggml_tensor * cur;
+    ggml_tensor * inpL;
+
+    inpL = build_inp_embd(model.tok_embd);
+
+    // e_x = RMSnorm(embed(input_ids)) - the same e_x is added to every layer.
+    ggml_tensor * e_x = build_norm(inpL, NULL, NULL, LLM_NORM_RMS, -1);
+    cb(e_x, "embed_post_norm", -1);
+
+    // The residual stream starts as e_x (talkie/src/talkie/model.py:191).
+    inpL = e_x;
+
+    ggml_tensor * inp_pos = build_inp_pos();
+
+    auto * inp_attn = build_attn_inp_kv();
+
+    ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+    for (int il = 0; il < n_layer; ++il) {
+        ggml_tensor * inpSA = inpL;
+
+        // Pre-attention RMSnorm (weightless).
+        cur = build_norm(inpL, NULL, NULL, LLM_NORM_RMS, il);
+        cb(cur, "attn_pre_norm", il);
+
+        // self-attention
+        {
+            ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+            cb(Qcur, "Qcur_pre_rope", il);
+
+            ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+            cb(Kcur, "Kcur_pre_rope", il);
+
+            ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+            cb(Vcur, "Vcur", il);
+
+            Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head,    n_tokens);
+            Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+            Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+            // RoPE - stock NEOX. Sign convention is absorbed in W_q/W_k at
+            // conversion time (see TalkieModel.modify_tensors).
+            Qcur = ggml_rope_ext(
+                ctx0, Qcur, inp_pos, nullptr,
+                n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                ext_factor, attn_factor, beta_fast, beta_slow
+            );
+            Kcur = ggml_rope_ext(
+                ctx0, Kcur, inp_pos, nullptr,
+                n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                ext_factor, attn_factor, beta_fast, beta_slow
+            );
+            cb(Qcur, "Qcur_post_rope", il);
+            cb(Kcur, "Kcur_post_rope", il);
+
+            // Weightless Q/K RMSnorm (talkie line 102).
+            Qcur = build_norm(Qcur, NULL, NULL, LLM_NORM_RMS, il);
+            cb(Qcur, "Qcur_post_qknorm", il);
+            Kcur = build_norm(Kcur, NULL, NULL, LLM_NORM_RMS, il);
+            cb(Kcur, "Kcur_post_qknorm", il);
+
+            // HeadGain on Q: broadcast [1, n_head, 1] over [head_dim, n_head, n_tokens].
+            ggml_tensor * head_gain = ggml_reshape_3d(ctx0, model.layers[il].attn_head_gain, 1, n_head, 1);
+            Qcur = ggml_mul(ctx0, Qcur, head_gain);
+            cb(Qcur, "Qcur_post_headgain", il);
+
+            cur = build_attn(inp_attn,
+                    model.layers[il].wo, NULL, model.layers[il].wo_s,
+                    Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf((float) n_embd_head), il);
+            cb(cur, "attn_out", il);
+        }
+
+        if (il == n_layer - 1 && inp_out_ids) {
+            cur   = ggml_get_rows(ctx0,   cur, inp_out_ids);
+            inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+            e_x   = ggml_get_rows(ctx0, e_x,   inp_out_ids);
+        }
+
+        // Apply ActGain on attn branch and add residual.
+        cur = ggml_mul(ctx0, cur, model.layers[il].attn_act_gain);
+        cb(cur, "attn_branch_scaled", il);
+        cur = ggml_add(ctx0, cur, inpSA);
+        cb(cur, "after_attn_residual", il);
+
+        ggml_tensor * mlp_in = cur;
+
+        // Pre-MLP RMSnorm (weightless).
+        cur = build_norm(cur, NULL, NULL, LLM_NORM_RMS, il);
+        cb(cur, "mlp_pre_norm", il);
+
+        cur = build_ffn(cur,
+                model.layers[il].ffn_up,   NULL, NULL,
+                model.layers[il].ffn_gate, NULL, NULL,
+                model.layers[il].ffn_down, NULL, NULL,
+                NULL,
+                LLM_FFN_SILU, LLM_FFN_PAR, il);
+        cb(cur, "mlp_out", il);
+
+        // Apply ActGain on mlp branch and add residual.
+        cur = ggml_mul(ctx0, cur, model.layers[il].ffn_act_gain);
+        cb(cur, "mlp_branch_scaled", il);
+        cur = ggml_add(ctx0, cur, mlp_in);
+        cb(cur, "after_mlp_residual", il);
+
+        // Embedding-skip: cur = cur + embed_skip * e_x.
+        ggml_tensor * e_x_scaled = ggml_mul(ctx0, e_x, model.layers[il].embed_skip_scale);
+        cb(e_x_scaled, "embed_skip_branch", il);
+        cur = ggml_add(ctx0, cur, e_x_scaled);
+        cb(cur, "after_embed_skip", il);
+
+        cur = build_cvec(cur, il);
+        cb(cur, "l_out", il);
+
+        inpL = cur;
+    }
+    cur = inpL;
+
+    // Final RMSnorm (weightless).
+    cur = build_norm(cur, NULL, NULL, LLM_NORM_RMS, -1);
+    cb(cur, "result_norm", -1);
+    res->t_embd = cur;
+
+    // lm_head with global gain: matmul(cur, lm_head_gain * output).
+    // Reuses the existing build_lora_mm 3-arg form which already handles
+    // a per-tensor weight scale.
+    cur = build_lora_mm(model.output, cur, model.lm_head_gain);
+    cb(cur, "result_output", -1);
+    res->t_logits = cur;
+
+    ggml_build_forward_expand(gf, cur);
+}

From d19f0fcd8bdd8aaa92264de9b91883f284d4a120 Mon Sep 17 00:00:00 2001
From: Daniel Han <danielhanchen@gmail.com>
Date: Wed, 29 Apr 2026 11:10:28 +0000
Subject: [PATCH 2/4] talkie: fix RMSNorm eps to match PyTorch F.rms_norm
 default

Talkie's reference uses F.rms_norm with the default eps. In bf16 PyTorch
that default behaves like eps=0 (output rms == 1.0 to fp32 noise), not
like torch.finfo(input.dtype).eps as the docstring suggests.

Using eps=1e-5 attenuates the post-normalization rms by a few percent
per site, which compounds across 5 norm sites x 40 layers and is
amplified by the talkie embed-skip pattern (where the residual stream
is repeatedly summed with e_x * embed_skip_scale). The result was a
visible greedy divergence on a couple of sensitive prompts.

Switch the converter and the C++ default to 1e-9, which is too small to
change an O(1) mean-square in f32 (machine epsilon is ~1.2e-7), so it
matches PyTorch's effective eps.
---
 convert_hf_to_gguf.py | 25 ++++++++++++++++++-------
 src/llama-model.cpp   |  8 +++++---
 2 files changed, 23 insertions(+), 10 deletions(-)

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index 6c500772a13..e0f7d5b8eee 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -8602,8 +8602,14 @@ def set_gguf_parameters(self):
             self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
         )
         self.gguf_writer.add_rope_dimension_count(head_dim)
-        # Talkie uses F.rms_norm with default eps (1e-5).
-        self.gguf_writer.add_layer_norm_rms_eps(1e-5)
+        # Talkie uses F.rms_norm with default eps. The PyTorch default eps
+        # for F.rms_norm is effectively ~0 (output rms == 1.0 to fp32 noise),
+        # NOT torch.finfo(input.dtype).eps as the docstring suggests. Using
+        # eps=1e-5 attenuates the output rms by ~2% per norm site, which
+        # compounds across 40 layers and 5 norm sites per layer (especially
+        # via the per-layer embed-skip add of `e_x`). Match PyTorch by using
+        # a tiny eps.
+        self.gguf_writer.add_layer_norm_rms_eps(1e-9)
 
     def set_vocab(self):
         # Custom: read tiktoken vocab.txt directly. No tokenizer.json upstream.
@@ -8706,11 +8712,16 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
             w[:, head_dim // 2 :, :] = -w[:, head_dim // 2 :, :]
             data_torch = w.view(n_head * head_dim, -1)
 
-        # Scalar tensors are stored as shape [1] in the HF state dict; keep
-        # them 1-D so create_tensor on the C++ side allocates {1}. Same for
-        # head_gain which is shape [n_head].
-        # Default routing handles everything else.
-        return [(self.map_tensor_name(name), data_torch)]
+        # Talkie's HF state-dict has scalar/gain Parameters whose names do NOT
+        # end in .weight (raw nn.Parameter). The GGUF tensor-name convention
+        # requires .weight or .bias suffixes - the C++ loader looks up
+        # tn(LLM_TENSOR_OUTPUT, "weight") -> "output.weight". Add the suffix
+        # synthetically so map_tensor_name routes via try_suffixes.
+        canonical = name
+        if not canonical.endswith((".weight", ".bias")):
+            canonical = canonical + ".weight"
+        new_name = self.map_tensor_name(canonical)
+        return [(new_name, data_torch)]
 
 
 @ModelBase.register("OlmoeForCausalLM")
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index acaf8ffb481..5dc4887014a 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -1873,11 +1873,13 @@ void llama_model::load_hparams(llama_model_loader & ml) {
             } break;
         case LLM_ARCH_TALKIE:
             {
-                // Talkie's RMSNorm has no learnable weight; eps comes from the
-                // converter (defaults to torch's F.rms_norm default 1e-5).
+                // Talkie's RMSNorm has no learnable weight; eps must be tiny
+                // to match PyTorch's F.rms_norm default behavior (effective
+                // eps ~ 0). See TalkieModel.set_gguf_parameters in the
+                // converter for the rationale.
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps, false);
                 if (hparams.f_norm_rms_eps == 0.0f) {
-                    hparams.f_norm_rms_eps = 1e-5f;
+                    hparams.f_norm_rms_eps = 1e-9f;
                 }
                 hparams.swa_type = LLAMA_SWA_TYPE_NONE;
                 switch (hparams.n_layer) {

From 73b0094fbde4da6a31b8d942b818e26c905a123b Mon Sep 17 00:00:00 2001
From: Daniel Han <danielhanchen@gmail.com>
Date: Wed, 29 Apr 2026 12:10:19 +0000
Subject: [PATCH 3/4] talkie: fold per-head HeadGain into the Q-RMSnorm
 build_norm call

Replaces the separate `ggml_mul(Qcur, head_gain)` with the equivalent
`build_norm(Qcur, head_gain, ...)` call, which passes head_gain as the norm
weight (mw) instead of NULL. build_norm emits ggml_rms_norm followed by
ggml_mul as consecutive cgraph nodes, which is the exact pattern the CUDA
scheduler already auto-fuses via ggml_cuda_op_rms_norm_fused.

Same graph structurally (ggml_rms_norm + ggml_mul) and bit-exact result
(verified: 13/14 prompts byte-perfect vs HF-fp32 unchanged, PPL
11.7523 unchanged). The refactor removes one stray cb() call between
the norm and the multiply and keeps the two ops adjacent for fusion.
---
 src/models/talkie.cpp | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/src/models/talkie.cpp b/src/models/talkie.cpp
index b7bdf386892..2e93f8ad4ee 100644
--- a/src/models/talkie.cpp
+++ b/src/models/talkie.cpp
@@ -78,15 +78,18 @@ llm_build_talkie::llm_build_talkie(const llama_model & model, const llm_graph_pa
             cb(Qcur, "Qcur_post_rope", il);
             cb(Kcur, "Kcur_post_rope", il);
 
-            // Weightless Q/K RMSnorm (talkie line 102).
-            Qcur = build_norm(Qcur, NULL, NULL, LLM_NORM_RMS, il);
-            cb(Qcur, "Qcur_post_qknorm", il);
+            // Weightless K-RMSnorm (talkie line 102).
             Kcur = build_norm(Kcur, NULL, NULL, LLM_NORM_RMS, il);
             cb(Kcur, "Kcur_post_qknorm", il);
 
-            // HeadGain on Q: broadcast [1, n_head, 1] over [head_dim, n_head, n_tokens].
+            // Q-RMSnorm fused with HeadGain: rms-norm then multiply by per-head
+            // gain broadcast [1, n_head, 1] over [head_dim, n_head, n_tokens].
+            // build_norm emits ggml_rms_norm + ggml_mul as consecutive nodes,
+            // matching the CUDA RMS_NORM+MUL fusion pattern in
+            // ggml-cuda::ggml_cuda_op_rms_norm_fused. Same graph as a separate
+            // ggml_mul; this form keeps the two ops adjacent in the cgraph.
             ggml_tensor * head_gain = ggml_reshape_3d(ctx0, model.layers[il].attn_head_gain, 1, n_head, 1);
-            Qcur = ggml_mul(ctx0, Qcur, head_gain);
+            Qcur = build_norm(Qcur, head_gain, NULL, LLM_NORM_RMS, il);
             cb(Qcur, "Qcur_post_headgain", il);
 
             cur = build_attn(inp_attn,

From 45c4b137e538257895e7a00f68a1a872d525f3e7 Mon Sep 17 00:00:00 2001
From: Daniel Han <danielhanchen@gmail.com>
Date: Wed, 29 Apr 2026 16:14:57 +0000
Subject: [PATCH 4/4] talkie: trim verbose comments

---
 convert_hf_to_gguf.py     | 68 +++++++--------------------------------
 gguf-py/gguf/constants.py | 10 +++---
 src/llama-chat.cpp        |  7 ++--
 src/llama-model.cpp       | 10 ++----
 src/llama-vocab.cpp       |  5 +--
 src/models/talkie.cpp     | 44 +++++--------------------
 6 files changed, 30 insertions(+), 114 deletions(-)

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index e0f7d5b8eee..8e12ed36aec 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -8581,18 +8581,7 @@ def set_gguf_parameters(self):
 
 @ModelBase.register("TalkieForCausalLM")
 class TalkieModel(TextModel):
-    """Convert talkie-lm/talkie-1930-13b-{base,it} to GGUF.
-
-    The architecture mirrors talkie/src/talkie/model.py: weightless RMSNorm at
-    every site, per-block ActGain scalars on attn/mlp branches, embed-skip
-    scalar, per-head HeadGain on Q, scalar lm_head_gain on lm_head, untied
-    raw nn.Parameter lm_head, no biases anywhere.
-
-    The reference RoPE rotates by -theta (sign-flipped vs HF Llama / NEOX).
-    To absorb that without a new RoPE flavor in ggml, we pre-flip the second
-    half of head_dim of W_q and W_k at convert time. Then llama.cpp's NEOX
-    RoPE produces identical attention scores.
-    """
+    """Convert talkie-lm/talkie-1930-13b-{base,it} to GGUF."""
     model_arch = gguf.MODEL_ARCH.TALKIE
 
     def set_gguf_parameters(self):
@@ -8602,34 +8591,21 @@ def set_gguf_parameters(self):
             self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
         )
         self.gguf_writer.add_rope_dimension_count(head_dim)
-        # Talkie uses F.rms_norm with default eps. The PyTorch default eps
-        # for F.rms_norm is effectively ~0 (output rms == 1.0 to fp32 noise),
-        # NOT torch.finfo(input.dtype).eps as the docstring suggests. Using
-        # eps=1e-5 attenuates the output rms by ~2% per norm site, which
-        # compounds across 40 layers and 5 norm sites per layer (especially
-        # via the per-layer embed-skip add of `e_x`). Match PyTorch by using
-        # a tiny eps.
+        # Match PyTorch F.rms_norm default (effective eps ~0 on bf16).
         self.gguf_writer.add_layer_norm_rms_eps(1e-9)
 
     def set_vocab(self):
-        # Custom: read tiktoken vocab.txt directly. No tokenizer.json upstream.
+        # Read tiktoken vocab.txt directly (no tokenizer.json upstream).
         from tiktoken.load import load_tiktoken_bpe
 
         vocab_path = self.dir_model / "vocab.txt"
         if not vocab_path.exists():
-            raise FileNotFoundError(
-                f"talkie vocab.txt not found at {vocab_path}. The original "
-                "talkie repo ships vocab.txt alongside the checkpoint; the "
-                "converter expects it in the HF safetensors directory."
-            )
+            raise FileNotFoundError(f"vocab.txt not found at {vocab_path}")
         mergeable_ranks = load_tiktoken_bpe(str(vocab_path))
-        # Filter ranks >= 65535 to leave room for IT specials. Mirrors
-        # talkie/src/talkie/tokenizer.py:54.
+        # Drop ranks >= 65535 (specials live there). See tokenizer.py:54.
         mergeable_ranks = {k: v for k, v in mergeable_ranks.items() if v < 65535}
 
-        # Reverse-engineer merges via QwenModel's helpers (already vendored
-        # in this file). This is the same pattern as _set_vocab_qwen and
-        # HunYuanMoE / KimiLinear / Deepseek-K2.
+        # Reverse-engineer merges via QwenModel helpers.
         merges: list[str] = []
         vocab: dict[str, int] = {}
         for token_bytes, rank in mergeable_ranks.items():
@@ -8642,8 +8618,7 @@ def set_vocab(self):
                     " ".join(map(QwenModel.token_bytes_to_string, merged))
                 )
 
-        # IT special tokens at fixed ids 65535..65539. The base model only
-        # uses 65535 (<|endoftext|>); IT adds the four chat tokens.
+        # IT specials at fixed ids; base only uses <|endoftext|>.
         special_tokens = {
             "<|endoftext|>": 65535,
             "<|end|>": 65536,
@@ -8651,10 +8626,7 @@ def set_vocab(self):
             "<|assistant|>": 65538,
             "<|system|>": 65539,
         }
-        # Decide vocab_size: read from config.json (65540 IT, 65536 base).
         vocab_size = self.hparams["vocab_size"]
-        # If we are converting the base model, drop the IT-specific specials
-        # whose ids are >= base vocab_size.
         special_tokens = {k: v for k, v in special_tokens.items() if v < vocab_size}
 
         reverse_vocab = {idx: tok for tok, idx in {**vocab, **special_tokens}.items()}
@@ -8678,7 +8650,6 @@ def set_vocab(self):
         self.gguf_writer.add_token_types(toktypes)
         self.gguf_writer.add_token_merges(merges)
 
-        # Special-token ids. EOS = <|end|> for IT, <|endoftext|> for base.
         eos_id = special_tokens.get("<|end|>", special_tokens["<|endoftext|>"])
         self.gguf_writer.add_eos_token_id(eos_id)
         self.gguf_writer.add_eot_token_id(eos_id)
@@ -8687,8 +8658,6 @@ def set_vocab(self):
         self.gguf_writer.add_add_bos_token(False)
         self.gguf_writer.add_add_eos_token(False)
 
-        # Chat template: <|user|>{content}<|end|><|assistant|>{content}<|end|>...
-        # No newlines, no BOS - matches talkie/src/talkie/chat.py:format_chat.
         chat_template = (
             "{% for m in messages %}<|{{ m.role }}|>{{ m.content }}<|end|>{% endfor %}"
             "{% if add_generation_prompt %}<|assistant|>{% endif %}"
@@ -8696,32 +8665,19 @@ def set_vocab(self):
         self.gguf_writer.add_chat_template(chat_template)
 
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
-        # RoPE sign correction for q_proj / k_proj.
-        # Talkie rotates by -theta (sign-flipped sin); llama.cpp NEOX rotates by +theta.
-        # Pre-multiplying the second-half of head_dim (output dim) by -1 absorbs
-        # the difference: <NEOX(D q), NEOX(D k)> == <Talkie(q), Talkie(k)>
-        # for D = diag(+1...+1, -1...-1) on head_dim halves.
+        # RoPE -theta: negate second half of head_dim of W_q/W_k so NEOX matches.
         n_head = self.hparams["num_attention_heads"]
         head_dim = self.hparams.get("head_dim") or (
             self.hparams["hidden_size"] // n_head
         )
         if name.endswith(("self_attn.q_proj.weight", "self_attn.k_proj.weight")):
-            w = data_torch
-            # shape [n_head*head_dim, hidden]
-            w = w.view(n_head, head_dim, w.shape[-1]).clone()
+            w = data_torch.view(n_head, head_dim, data_torch.shape[-1]).clone()
             w[:, head_dim // 2 :, :] = -w[:, head_dim // 2 :, :]
             data_torch = w.view(n_head * head_dim, -1)
 
-        # Talkie's HF state-dict has scalar/gain Parameters whose names do NOT
-        # end in .weight (raw nn.Parameter). The GGUF tensor-name convention
-        # requires .weight or .bias suffixes - the C++ loader looks up
-        # tn(LLM_TENSOR_OUTPUT, "weight") -> "output.weight". Add the suffix
-        # synthetically so map_tensor_name routes via try_suffixes.
-        canonical = name
-        if not canonical.endswith((".weight", ".bias")):
-            canonical = canonical + ".weight"
-        new_name = self.map_tensor_name(canonical)
-        return [(new_name, data_torch)]
+        # Raw nn.Parameter scalars have no .weight suffix; add one for map_tensor_name.
+        canonical = name if name.endswith((".weight", ".bias")) else name + ".weight"
+        return [(self.map_tensor_name(canonical), data_torch)]
 
 
 @ModelBase.register("OlmoeForCausalLM")
diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py
index e54821c9e75..9276d9ff535 100644
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -567,11 +567,11 @@ class MODEL_TENSOR(IntEnum):
     ATTN_K_NORM          = auto()
     LAYER_OUT_NORM       = auto()
     LAYER_OUT_SCALE      = auto()
-    ATTN_HEAD_GAIN       = auto() # talkie - per-head learnable gain on Q after Q-RMSnorm
-    ATTN_ACT_GAIN        = auto() # talkie - per-block learnable scalar on attn-residual branch
-    FFN_ACT_GAIN         = auto() # talkie - per-block learnable scalar on mlp-residual branch
-    EMBED_SKIP_SCALE     = auto() # talkie - per-block learnable scalar on embedding-skip branch
-    LM_HEAD_GAIN         = auto() # talkie - global learnable scalar on lm_head matrix
+    ATTN_HEAD_GAIN       = auto() # talkie
+    ATTN_ACT_GAIN        = auto() # talkie
+    FFN_ACT_GAIN         = auto() # talkie
+    EMBED_SKIP_SCALE     = auto() # talkie
+    LM_HEAD_GAIN         = auto() # talkie
     PER_LAYER_TOKEN_EMBD = auto() # gemma3n
     PER_LAYER_MODEL_PROJ = auto() # gemma3n
     PER_LAYER_INP_GATE   = auto() # gemma3n
diff --git a/src/llama-chat.cpp b/src/llama-chat.cpp
index 02e7c3bbcc8..a814a202380 100644
--- a/src/llama-chat.cpp
+++ b/src/llama-chat.cpp
@@ -138,9 +138,7 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
             }
         }
     } else if (tmpl_contains("<|assistant|>") && tmpl_contains("<|end|>")) {
-        // Talkie's chat template uses the same role markers as Phi-3 but
-        // omits the newlines: "<|user|>{c}<|end|><|assistant|>". Detect by
-        // the absence of "|>\n" sequences.
+        // Talkie shares Phi-3's role markers but has no newlines.
         if (tmpl_contains("|>{{ message['content'] }}<|end|>")
                 || tmpl_contains("|>{{ m.content }}<|end|>")) {
             return LLM_CHAT_TEMPLATE_TALKIE;
@@ -928,8 +926,7 @@ int32_t llm_chat_apply_template(
             ss << "<|begin|>assistant";
         }
     } else if (tmpl == LLM_CHAT_TEMPLATE_TALKIE) {
-        // Talkie 1930 IT chat template: <|role|>content<|end|>... no newlines, no BOS.
-        // Matches talkie/src/talkie/chat.py:format_chat exactly.
+        // <|role|>content<|end|>... no newlines, no BOS.
         for (auto message : chat) {
             std::string role(message->role);
             ss << "<|" << role << "|>" << message->content << "<|end|>";
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index 5dc4887014a..ad82ce785ba 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -1873,10 +1873,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
             } break;
         case LLM_ARCH_TALKIE:
             {
-                // Talkie's RMSNorm has no learnable weight; eps must be tiny
-                // to match PyTorch's F.rms_norm default behavior (effective
-                // eps ~ 0). See TalkieModel.set_gguf_parameters in the
-                // converter for the rationale.
+                // Match PyTorch F.rms_norm default (effective eps ~0).
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps, false);
                 if (hparams.f_norm_rms_eps == 0.0f) {
                     hparams.f_norm_rms_eps = 1e-9f;
@@ -5080,10 +5077,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                 } break;
             case LLM_ARCH_TALKIE:
                 {
-                    // Talkie has no learnable RMSNorm weights anywhere. Per-block
-                    // tensors: q/k/v/o, ffn_gate/up/down, head_gain, attn_act_gain,
-                    // ffn_act_gain, embed_skip_scale. Globals: tok_embd, output (untied),
-                    // lm_head_gain.
+                    // No learnable RMSNorm weights; lm_head untied.
                     tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
                     output   = create_tensor(tn(LLM_TENSOR_OUTPUT,     "weight"), {n_embd, n_vocab}, 0);
                     lm_head_gain = create_tensor(tn(LLM_TENSOR_LM_HEAD_GAIN, "weight"), {1}, 0);
diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
index 2d533ab5b27..340a9bf17c2 100644
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@@ -369,10 +369,7 @@ struct llm_tokenizer_bpe : llm_tokenizer {
                 };
                 break;
             case LLAMA_VOCAB_PRE_TYPE_TALKIE:
-                // Talkie tiktoken pre-tokenizer (talkie/src/talkie/tokenizer.py:11-21).
-                // Note: ordering of alternatives is significant - case-aware leading-cap
-                // and trailing-cap fragments first, then digits, then punctuation, then
-                // whitespace fallbacks.
+                // talkie/src/talkie/tokenizer.py:11-21
                 regex_exprs = {
                     "[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]*[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]+(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]+[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]*(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
                 };
diff --git a/src/models/talkie.cpp b/src/models/talkie.cpp
index 2e93f8ad4ee..7be910f912c 100644
--- a/src/models/talkie.cpp
+++ b/src/models/talkie.cpp
@@ -1,21 +1,8 @@
 #include "models.h"
 
-// Talkie 1930 13B graph builder.
-//
-// Mirrors talkie/src/talkie/model.py:
-//   - Weightless RMSNorm everywhere (build_norm with mw=NULL).
-//   - Pre-attention RMSnorm (talkie line 144).
-//   - Post-RoPE Q/K RMSnorm with no learnable weight (talkie line 102).
-//   - Per-head HeadGain on Q after Q-RMSnorm (talkie line 103).
-//   - Standard SDPA via build_attn.
-//   - Per-block ActGain scalars on attn-residual and mlp-residual branches.
-//   - Per-block embed_skip: e_x (post-RMSnorm embedding) added to every layer.
-//   - Final RMSnorm, lm_head with global lm_head_gain scalar.
-//
-// The RoPE sign-convention difference between talkie (rotation by -theta)
-// and llama.cpp NEOX (rotation by +theta) is absorbed at convert time by
-// negating the second half of head_dim of W_q and W_k weights, so this
-// graph uses stock NEOX RoPE unchanged.
+// Talkie 1930 13B. Mirrors talkie/src/talkie/model.py.
+// RoPE sign flip (-theta) is absorbed at convert time by negating
+// the second half of head_dim of W_q/W_k.
 
 llm_build_talkie::llm_build_talkie(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
     const int64_t n_embd_head = hparams.n_embd_head_v();
@@ -28,11 +15,11 @@ llm_build_talkie::llm_build_talkie(const llama_model & model, const llm_graph_pa
 
     inpL = build_inp_embd(model.tok_embd);
 
-    // e_x = RMSnorm(embed(input_ids)) - the same e_x is added to every layer.
+    // e_x = RMSnorm(embd); same e_x added to every layer.
     ggml_tensor * e_x = build_norm(inpL, NULL, NULL, LLM_NORM_RMS, -1);
     cb(e_x, "embed_post_norm", -1);
 
-    // The residual stream starts as e_x (talkie/src/talkie/model.py:191).
+    // Residual stream starts as e_x (model.py:191).
     inpL = e_x;
 
     ggml_tensor * inp_pos = build_inp_pos();
@@ -44,7 +31,6 @@ llm_build_talkie::llm_build_talkie(const llama_model & model, const llm_graph_pa
     for (int il = 0; il < n_layer; ++il) {
         ggml_tensor * inpSA = inpL;
 
-        // Pre-attention RMSnorm (weightless).
         cur = build_norm(inpL, NULL, NULL, LLM_NORM_RMS, il);
         cb(cur, "attn_pre_norm", il);
 
@@ -63,8 +49,6 @@ llm_build_talkie::llm_build_talkie(const llama_model & model, const llm_graph_pa
             Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
             Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
 
-            // RoPE - stock NEOX. Sign convention is absorbed in W_q/W_k at
-            // conversion time (see TalkieModel.modify_tensors).
             Qcur = ggml_rope_ext(
                 ctx0, Qcur, inp_pos, nullptr,
                 n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
@@ -78,16 +62,10 @@ llm_build_talkie::llm_build_talkie(const llama_model & model, const llm_graph_pa
             cb(Qcur, "Qcur_post_rope", il);
             cb(Kcur, "Kcur_post_rope", il);
 
-            // Weightless K-RMSnorm (talkie line 102).
             Kcur = build_norm(Kcur, NULL, NULL, LLM_NORM_RMS, il);
             cb(Kcur, "Kcur_post_qknorm", il);
 
-            // Q-RMSnorm fused with HeadGain: rms-norm then multiply by per-head
-            // gain broadcast [1, n_head, 1] over [head_dim, n_head, n_tokens].
-            // build_norm emits ggml_rms_norm + ggml_mul as consecutive nodes,
-            // matching the CUDA RMS_NORM+MUL fusion pattern in
-            // ggml-cuda::ggml_cuda_op_rms_norm_fused. Same graph as a separate
-            // ggml_mul; this form keeps the two ops adjacent in the cgraph.
+            // Q-RMSnorm + per-head gain (RMS_NORM+MUL fusion).
             ggml_tensor * head_gain = ggml_reshape_3d(ctx0, model.layers[il].attn_head_gain, 1, n_head, 1);
             Qcur = build_norm(Qcur, head_gain, NULL, LLM_NORM_RMS, il);
             cb(Qcur, "Qcur_post_headgain", il);
@@ -104,7 +82,6 @@ llm_build_talkie::llm_build_talkie(const llama_model & model, const llm_graph_pa
             e_x   = ggml_get_rows(ctx0, e_x,   inp_out_ids);
         }
 
-        // Apply ActGain on attn branch and add residual.
         cur = ggml_mul(ctx0, cur, model.layers[il].attn_act_gain);
         cb(cur, "attn_branch_scaled", il);
         cur = ggml_add(ctx0, cur, inpSA);
@@ -112,7 +89,6 @@ llm_build_talkie::llm_build_talkie(const llama_model & model, const llm_graph_pa
 
         ggml_tensor * mlp_in = cur;
 
-        // Pre-MLP RMSnorm (weightless).
         cur = build_norm(cur, NULL, NULL, LLM_NORM_RMS, il);
         cb(cur, "mlp_pre_norm", il);
 
@@ -124,13 +100,12 @@ llm_build_talkie::llm_build_talkie(const llama_model & model, const llm_graph_pa
                 LLM_FFN_SILU, LLM_FFN_PAR, il);
         cb(cur, "mlp_out", il);
 
-        // Apply ActGain on mlp branch and add residual.
         cur = ggml_mul(ctx0, cur, model.layers[il].ffn_act_gain);
         cb(cur, "mlp_branch_scaled", il);
         cur = ggml_add(ctx0, cur, mlp_in);
         cb(cur, "after_mlp_residual", il);
 
-        // Embedding-skip: cur = cur + embed_skip * e_x.
+        // embed-skip: cur += embed_skip * e_x.
         ggml_tensor * e_x_scaled = ggml_mul(ctx0, e_x, model.layers[il].embed_skip_scale);
         cb(e_x_scaled, "embed_skip_branch", il);
         cur = ggml_add(ctx0, cur, e_x_scaled);
@@ -143,14 +118,11 @@ llm_build_talkie::llm_build_talkie(const llama_model & model, const llm_graph_pa
     }
     cur = inpL;
 
-    // Final RMSnorm (weightless).
     cur = build_norm(cur, NULL, NULL, LLM_NORM_RMS, -1);
     cb(cur, "result_norm", -1);
     res->t_embd = cur;
 
-    // lm_head with global gain: matmul(cur, lm_head_gain * output).
-    // Reuses the existing build_lora_mm 3-arg form which already handles
-    // a per-tensor weight scale.
+    // lm_head with global gain via build_lora_mm 3-arg form.
     cur = build_lora_mm(model.output, cur, model.lm_head_gain);
     cb(cur, "result_output", -1);
     res->t_logits = cur;