Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
101 changes: 101 additions & 0 deletions convert_hf_to_gguf.py
Original file line number Diff line number Diff line change
Expand Up @@ -8579,6 +8579,107 @@ def set_gguf_parameters(self):
self.gguf_writer.add_sliding_window_pattern(sliding_window_pattern)


@ModelBase.register("TalkieForCausalLM")
class TalkieModel(TextModel):
    """Convert talkie-lm/talkie-1930-13b-{base,it} to GGUF.

    The converter writes the Talkie hyper-parameters, rebuilds a GPT-2 style
    BPE vocabulary from the checkpoint's tiktoken ``vocab.txt`` and flips the
    sign of the second half of every Q/K head so the weights match the RoPE
    convention expected at inference time.
    """
    model_arch = gguf.MODEL_ARCH.TALKIE

    def _talkie_head_dim(self) -> int:
        """Return the per-head width.

        Uses ``hparams["head_dim"]`` when present and truthy, otherwise
        ``hidden_size // num_attention_heads``.
        """
        return self.hparams.get("head_dim") or (
            self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
        )

    def set_gguf_parameters(self):
        """Write the base parameters plus vocab size, RoPE width and RMS eps."""
        super().set_gguf_parameters()
        self.gguf_writer.add_vocab_size(self.hparams["vocab_size"])
        self.gguf_writer.add_rope_dimension_count(self._talkie_head_dim())
        # Match PyTorch F.rms_norm default (effective eps ~0 on bf16).
        self.gguf_writer.add_layer_norm_rms_eps(1e-9)

    def set_vocab(self):
        """Build the tokenizer metadata from the checkpoint's ``vocab.txt``.

        Raises:
            FileNotFoundError: if ``vocab.txt`` is missing from the model dir.
            ValueError: if ``vocab_size`` is too small to contain the
                ``<|endoftext|>`` token, which is used for unk/pad (and for
                eos when ``<|end|>`` is absent).
        """
        # Read tiktoken vocab.txt directly (no tokenizer.json upstream).
        from tiktoken.load import load_tiktoken_bpe

        vocab_path = self.dir_model / "vocab.txt"
        if not vocab_path.exists():
            raise FileNotFoundError(f"vocab.txt not found at {vocab_path}")

        # IT specials at fixed ids; base only uses <|endoftext|>.
        vocab_size = self.hparams["vocab_size"]
        all_specials = {
            "<|endoftext|>": 65535,
            "<|end|>": 65536,
            "<|user|>": 65537,
            "<|assistant|>": 65538,
            "<|system|>": 65539,
        }
        special_tokens = {
            name: idx for name, idx in all_specials.items() if idx < vocab_size
        }
        # Validate before touching the writer so a bad config fails with a
        # clear message instead of a bare KeyError after partial output.
        if "<|endoftext|>" not in special_tokens:
            raise ValueError(
                f"vocab_size={vocab_size} leaves no room for <|endoftext|> "
                f"(id {all_specials['<|endoftext|>']})"
            )
        endoftext_id = special_tokens["<|endoftext|>"]
        special_ids = set(special_tokens.values())

        # Drop ranks >= 65535 (specials live there). See tokenizer.py:54.
        mergeable_ranks = {
            token_bytes: rank
            for token_bytes, rank in load_tiktoken_bpe(str(vocab_path)).items()
            if rank < 65535
        }

        # Reverse-engineer merges via QwenModel helpers.
        merges: list[str] = []
        vocab: dict[str, int] = {}
        for token_bytes, rank in mergeable_ranks.items():
            vocab[QwenModel.token_bytes_to_string(token_bytes)] = rank
            if len(token_bytes) == 1:
                # Single bytes are base symbols and have no merge rule.
                continue
            merged = QwenModel.bpe(mergeable_ranks, token_bytes, max_rank=rank)
            if len(merged) == 2:
                merges.append(
                    " ".join(map(QwenModel.token_bytes_to_string, merged))
                )

        # id -> token text; specials override any regular token with the
        # same text, exactly as the merged dict did before.
        reverse_vocab = {
            idx: tok for tok, idx in {**vocab, **special_tokens}.items()
        }
        tokens: list[str] = []
        toktypes: list[int] = []
        for i in range(vocab_size):
            tok = reverse_vocab.get(i)
            if tok is None:
                # Fill holes in the id space with unused placeholders.
                tokens.append(f"[PAD{i}]")
                toktypes.append(gguf.TokenType.UNUSED)
            elif i in special_ids:
                tokens.append(tok)
                toktypes.append(gguf.TokenType.CONTROL)
            else:
                tokens.append(tok)
                toktypes.append(gguf.TokenType.NORMAL)

        self.gguf_writer.add_tokenizer_model("gpt2")
        self.gguf_writer.add_tokenizer_pre("talkie")
        self.gguf_writer.add_token_list(tokens)
        self.gguf_writer.add_token_types(toktypes)
        self.gguf_writer.add_token_merges(merges)

        # Prefer <|end|> as the stop token when the vocab contains it.
        eos_id = special_tokens.get("<|end|>", endoftext_id)
        self.gguf_writer.add_eos_token_id(eos_id)
        self.gguf_writer.add_eot_token_id(eos_id)
        self.gguf_writer.add_unk_token_id(endoftext_id)
        self.gguf_writer.add_pad_token_id(endoftext_id)
        self.gguf_writer.add_add_bos_token(False)
        self.gguf_writer.add_add_eos_token(False)

        chat_template = (
            "{% for m in messages %}<|{{ m.role }}|>{{ m.content }}<|end|>{% endfor %}"
            "{% if add_generation_prompt %}<|assistant|>{% endif %}"
        )
        self.gguf_writer.add_chat_template(chat_template)

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        """Map one checkpoint tensor to its GGUF name, adjusting Q/K for RoPE.

        Args:
            data_torch: the tensor as stored in the checkpoint.
            name: the checkpoint tensor name.
            bid: block index supplied by the caller; not used here.

        Returns:
            A single-element list of ``(gguf_name, tensor)``.
        """
        del bid  # unused

        # RoPE -theta: negate second half of head_dim of W_q/W_k so NEOX matches.
        if name.endswith(("self_attn.q_proj.weight", "self_attn.k_proj.weight")):
            head_dim = self._talkie_head_dim()
            in_features = data_torch.shape[-1]
            # Infer the head count from the tensor itself: k_proj may have
            # fewer heads than q_proj when the checkpoint uses grouped-query
            # attention, so num_attention_heads cannot be assumed for both.
            w = data_torch.reshape(-1, head_dim, in_features).clone()
            w[:, head_dim // 2 :, :] = -w[:, head_dim // 2 :, :]
            data_torch = w.reshape(-1, in_features)

        # Raw nn.Parameter scalars have no .weight suffix; add one for map_tensor_name.
        canonical = name if name.endswith((".weight", ".bias")) else name + ".weight"
        return [(self.map_tensor_name(canonical), data_torch)]


@ModelBase.register("OlmoeForCausalLM")
class OlmoeModel(TextModel):
model_arch = gguf.MODEL_ARCH.OLMOE
Expand Down
28 changes: 28 additions & 0 deletions gguf-py/gguf/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -437,6 +437,7 @@ class MODEL_ARCH(IntEnum):
OLMO = auto()
OLMO2 = auto()
OLMOE = auto()
TALKIE = auto()
OPENELM = auto()
ARCTIC = auto()
DEEPSEEK = auto()
Expand Down Expand Up @@ -566,6 +567,11 @@ class MODEL_TENSOR(IntEnum):
ATTN_K_NORM = auto()
LAYER_OUT_NORM = auto()
LAYER_OUT_SCALE = auto()
ATTN_HEAD_GAIN = auto() # talkie
ATTN_ACT_GAIN = auto() # talkie
FFN_ACT_GAIN = auto() # talkie
EMBED_SKIP_SCALE = auto() # talkie
LM_HEAD_GAIN = auto() # talkie
PER_LAYER_TOKEN_EMBD = auto() # gemma3n
PER_LAYER_MODEL_PROJ = auto() # gemma3n
PER_LAYER_INP_GATE = auto() # gemma3n
Expand Down Expand Up @@ -923,6 +929,7 @@ class MODEL_TENSOR(IntEnum):
MODEL_ARCH.OLMO: "olmo",
MODEL_ARCH.OLMO2: "olmo2",
MODEL_ARCH.OLMOE: "olmoe",
MODEL_ARCH.TALKIE: "talkie",
MODEL_ARCH.OPENELM: "openelm",
MODEL_ARCH.ARCTIC: "arctic",
MODEL_ARCH.DEEPSEEK: "deepseek",
Expand Down Expand Up @@ -1021,6 +1028,11 @@ class MODEL_TENSOR(IntEnum):
MODEL_TENSOR.ATTN_GATE: "blk.{bid}.attn_gate",
MODEL_TENSOR.ATTN_Q_NORM: "blk.{bid}.attn_q_norm",
MODEL_TENSOR.ATTN_K_NORM: "blk.{bid}.attn_k_norm",
MODEL_TENSOR.ATTN_HEAD_GAIN: "blk.{bid}.attn_head_gain",
MODEL_TENSOR.ATTN_ACT_GAIN: "blk.{bid}.attn_act_gain",
MODEL_TENSOR.FFN_ACT_GAIN: "blk.{bid}.ffn_act_gain",
MODEL_TENSOR.EMBED_SKIP_SCALE: "blk.{bid}.embed_skip_scale",
MODEL_TENSOR.LM_HEAD_GAIN: "lm_head_gain",
MODEL_TENSOR.ATTN_OUT_NORM: "blk.{bid}.attn_output_norm",
MODEL_TENSOR.ATTN_POST_NORM: "blk.{bid}.post_attention_norm",
MODEL_TENSOR.FFN_GATE_INP: "blk.{bid}.ffn_gate_inp",
Expand Down Expand Up @@ -2663,6 +2675,22 @@ class MODEL_TENSOR(IntEnum):
MODEL_TENSOR.FFN_DOWN,
MODEL_TENSOR.FFN_UP,
],
MODEL_ARCH.TALKIE: [
MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.OUTPUT,
MODEL_TENSOR.ATTN_Q,
MODEL_TENSOR.ATTN_K,
MODEL_TENSOR.ATTN_V,
MODEL_TENSOR.ATTN_OUT,
MODEL_TENSOR.FFN_GATE,
MODEL_TENSOR.FFN_DOWN,
MODEL_TENSOR.FFN_UP,
MODEL_TENSOR.ATTN_HEAD_GAIN,
MODEL_TENSOR.ATTN_ACT_GAIN,
MODEL_TENSOR.FFN_ACT_GAIN,
MODEL_TENSOR.EMBED_SKIP_SCALE,
MODEL_TENSOR.LM_HEAD_GAIN,
],
MODEL_ARCH.SEED_OSS: [
MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.ATTN_NORM,
Expand Down
15 changes: 15 additions & 0 deletions gguf-py/gguf/tensor_mapping.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,9 @@ class TensorNameMap:
"model.transformer.ff_out", # llada
"head.decoder", # modern-bert
),
MODEL_TENSOR.LM_HEAD_GAIN: (
"lm_head_gain", # talkie
),
MODEL_TENSOR.DENSE_2_OUT: (
"dense_2_out", # embeddinggemma
),
Expand Down Expand Up @@ -2138,6 +2141,18 @@ class TensorNameMap:
MODEL_TENSOR.NEXTN_SHARED_HEAD_NORM: (
"model.layers.{bid}.shared_head.norm",
),
MODEL_TENSOR.ATTN_HEAD_GAIN: (
"model.layers.{bid}.self_attn.head_gain", # talkie
),
MODEL_TENSOR.ATTN_ACT_GAIN: (
"model.layers.{bid}.attn_gain", # talkie
),
MODEL_TENSOR.FFN_ACT_GAIN: (
"model.layers.{bid}.mlp_gain", # talkie
),
MODEL_TENSOR.EMBED_SKIP_SCALE: (
"model.layers.{bid}.embed_skip", # talkie
),
}

# architecture-specific block mappings
Expand Down
13 changes: 13 additions & 0 deletions src/llama-arch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_LLAMA_EMBED, "llama-embed" },
{ LLM_ARCH_MAINCODER, "maincoder" },
{ LLM_ARCH_KIMI_LINEAR, "kimi-linear" },
{ LLM_ARCH_TALKIE, "talkie" },
{ LLM_ARCH_UNKNOWN, "(unknown)" },
};

Expand Down Expand Up @@ -450,6 +451,12 @@ static const std::map<llm_tensor, const char *> LLM_TENSOR_NAMES = {
{ LLM_TENSOR_NEXTN_HNORM, "blk.%d.nextn.hnorm" },
{ LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, "blk.%d.nextn.shared_head_head" },
{ LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, "blk.%d.nextn.shared_head_norm" },
// talkie
{ LLM_TENSOR_ATTN_HEAD_GAIN, "blk.%d.attn_head_gain" },
{ LLM_TENSOR_ATTN_ACT_GAIN, "blk.%d.attn_act_gain" },
{ LLM_TENSOR_FFN_ACT_GAIN, "blk.%d.ffn_act_gain" },
{ LLM_TENSOR_EMBED_SKIP_SCALE, "blk.%d.embed_skip_scale" },
{ LLM_TENSOR_LM_HEAD_GAIN, "lm_head_gain" },
{ LLM_TENSOR_ATTN_SUB_NORM, "blk.%d.attn_sub_norm" },
{ LLM_TENSOR_FFN_SUB_NORM, "blk.%d.ffn_sub_norm" },
{ LLM_TENSOR_DEC_OUTPUT_NORM, "dec.output_norm" },
Expand Down Expand Up @@ -767,6 +774,12 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
// Nemotron 3 Super
{LLM_TENSOR_FFN_LATENT_DOWN, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
{LLM_TENSOR_FFN_LATENT_UP, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
// talkie - per-block scalar gains and per-head Q gain; lm_head gain is global.
{LLM_TENSOR_ATTN_HEAD_GAIN, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
{LLM_TENSOR_ATTN_ACT_GAIN, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
{LLM_TENSOR_FFN_ACT_GAIN, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
{LLM_TENSOR_EMBED_SKIP_SCALE, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
{LLM_TENSOR_LM_HEAD_GAIN, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL}},
};

LLM_KV::LLM_KV(llm_arch arch, const char * suffix) : arch(arch), suffix(suffix) {}
Expand Down
7 changes: 7 additions & 0 deletions src/llama-arch.h
Original file line number Diff line number Diff line change
Expand Up @@ -137,6 +137,7 @@ enum llm_arch {
LLM_ARCH_LLAMA_EMBED,
LLM_ARCH_MAINCODER,
LLM_ARCH_KIMI_LINEAR,
LLM_ARCH_TALKIE,
LLM_ARCH_UNKNOWN,
};

Expand Down Expand Up @@ -554,6 +555,12 @@ enum llm_tensor {
LLM_TENSOR_NEXTN_HNORM,
LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD,
LLM_TENSOR_NEXTN_SHARED_HEAD_NORM,
// talkie - per-block scalars and per-head gain
LLM_TENSOR_ATTN_HEAD_GAIN,
LLM_TENSOR_ATTN_ACT_GAIN,
LLM_TENSOR_FFN_ACT_GAIN,
LLM_TENSOR_EMBED_SKIP_SCALE,
LLM_TENSOR_LM_HEAD_GAIN,
};

enum llm_tensor_layer {
Expand Down
15 changes: 15 additions & 0 deletions src/llama-chat.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,7 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
{ "grok-2", LLM_CHAT_TEMPLATE_GROK_2 },
{ "pangu-embedded", LLM_CHAT_TEMPLATE_PANGU_EMBED },
{ "solar-open", LLM_CHAT_TEMPLATE_SOLAR_OPEN },
{ "talkie", LLM_CHAT_TEMPLATE_TALKIE },
};

llm_chat_template llm_chat_template_from_str(const std::string & name) {
Expand Down Expand Up @@ -137,6 +138,11 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
}
}
} else if (tmpl_contains("<|assistant|>") && tmpl_contains("<|end|>")) {
// Talkie shares Phi-3's role markers but has no newlines.
if (tmpl_contains("|>{{ message['content'] }}<|end|>")
|| tmpl_contains("|>{{ m.content }}<|end|>")) {
return LLM_CHAT_TEMPLATE_TALKIE;
}
return LLM_CHAT_TEMPLATE_PHI_3;
} else if (tmpl_contains("[gMASK]<sop>")) {
return LLM_CHAT_TEMPLATE_CHATGLM_4;
Expand Down Expand Up @@ -919,6 +925,15 @@ int32_t llm_chat_apply_template(
if (add_ass) {
ss << "<|begin|>assistant";
}
} else if (tmpl == LLM_CHAT_TEMPLATE_TALKIE) {
// <|role|>content<|end|>... no newlines, no BOS.
for (auto message : chat) {
std::string role(message->role);
ss << "<|" << role << "|>" << message->content << "<|end|>";
}
if (add_ass) {
ss << "<|assistant|>";
}
} else {
// template not supported
return -1;
Expand Down
1 change: 1 addition & 0 deletions src/llama-chat.h
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@ enum llm_chat_template {
LLM_CHAT_TEMPLATE_GROK_2,
LLM_CHAT_TEMPLATE_PANGU_EMBED,
LLM_CHAT_TEMPLATE_SOLAR_OPEN,
LLM_CHAT_TEMPLATE_TALKIE,
LLM_CHAT_TEMPLATE_UNKNOWN,
};

Expand Down
41 changes: 41 additions & 0 deletions src/llama-model.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1871,6 +1871,19 @@ void llama_model::load_hparams(llama_model_loader & ml) {
default: type = LLM_TYPE_UNKNOWN;
}
} break;
case LLM_ARCH_TALKIE:
{
// Match PyTorch F.rms_norm default (effective eps ~0).
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps, false);
if (hparams.f_norm_rms_eps == 0.0f) {
hparams.f_norm_rms_eps = 1e-9f;
}
hparams.swa_type = LLAMA_SWA_TYPE_NONE;
switch (hparams.n_layer) {
case 40: type = LLM_TYPE_13B; break;
default: type = LLM_TYPE_UNKNOWN;
}
} break;
case LLM_ARCH_OLMOE:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
Expand Down Expand Up @@ -5062,6 +5075,29 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
}
} break;
case LLM_ARCH_TALKIE:
{
// No learnable RMSNorm weights; lm_head untied.
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
lm_head_gain = create_tensor(tn(LLM_TENSOR_LM_HEAD_GAIN, "weight"), {1}, 0);

for (int i = 0; i < n_layer; ++i) {
auto & layer = layers[i];

create_tensor_qkv(layer, i, n_embd, n_embd, n_embd_gqa, n_embd_gqa, 0);
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);

layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);

layer.attn_head_gain = create_tensor(tn(LLM_TENSOR_ATTN_HEAD_GAIN, "weight", i), {n_head}, 0);
layer.attn_act_gain = create_tensor(tn(LLM_TENSOR_ATTN_ACT_GAIN, "weight", i), {1}, 0);
layer.ffn_act_gain = create_tensor(tn(LLM_TENSOR_FFN_ACT_GAIN, "weight", i), {1}, 0);
layer.embed_skip_scale = create_tensor(tn(LLM_TENSOR_EMBED_SKIP_SCALE, "weight", i), {1}, 0);
}
} break;
case LLM_ARCH_SEED_OSS:
{
const uint32_t head_dim = hparams.n_embd_head_k();
Expand Down Expand Up @@ -8817,6 +8853,10 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
{
llm = std::make_unique<llm_build_olmoe>(*this, params);
} break;
case LLM_ARCH_TALKIE:
{
llm = std::make_unique<llm_build_talkie>(*this, params);
} break;
case LLM_ARCH_OPENELM:
{
llm = std::make_unique<llm_build_openelm>(*this, params);
Expand Down Expand Up @@ -9278,6 +9318,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
case LLM_ARCH_RND1:
case LLM_ARCH_OLMO2:
case LLM_ARCH_OLMOE:
case LLM_ARCH_TALKIE:
case LLM_ARCH_PHI2:
case LLM_ARCH_PHI3:
case LLM_ARCH_PHIMOE:
Expand Down
9 changes: 9 additions & 0 deletions src/llama-model.h
Original file line number Diff line number Diff line change
Expand Up @@ -494,6 +494,12 @@ struct llama_layer {
struct llama_layer_shortconv shortconv;

struct llama_layer_nextn nextn;

// talkie - per-block scalars and per-head Q gain
struct ggml_tensor * attn_head_gain = nullptr; // [n_head]
struct ggml_tensor * attn_act_gain = nullptr; // [1]
struct ggml_tensor * ffn_act_gain = nullptr; // [1]
struct ggml_tensor * embed_skip_scale = nullptr; // [1]
};

struct llama_device {
Expand Down Expand Up @@ -550,6 +556,9 @@ struct llama_model {
struct ggml_tensor * per_layer_model_proj = nullptr;
struct ggml_tensor * per_layer_proj_norm = nullptr;

// talkie - global learnable scalar that multiplies the lm_head matrix.
struct ggml_tensor * lm_head_gain = nullptr; // [1]

std::vector<llama_layer> layers;

//Dense linear projections for SentenceTransformers models like embeddinggemma
Expand Down
Loading