Skip to content
Open
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
*.jl.cov
*.jl.*.cov
*.jl.mem
Manifest.toml
167 changes: 167 additions & 0 deletions Manifest.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,167 @@
# This file is machine-generated - editing it directly is not advised

[[Artifacts]]
deps = ["Pkg"]
git-tree-sha1 = "c30985d8821e0cd73870b17b0ed0ce6dc44cb744"
uuid = "56f22d72-fd6d-98f1-02f0-08ddc0907c33"
version = "1.3.0"

[[Base64]]
uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f"

[[BinaryProvider]]
deps = ["Libdl", "Logging", "SHA"]
git-tree-sha1 = "ecdec412a9abc8db54c0efc5548c64dfce072058"
uuid = "b99e7846-7c00-51b0-8f62-c81ae34c0232"
version = "0.5.10"

[[DataDeps]]
deps = ["BinaryProvider", "HTTP", "Libdl", "Reexport", "SHA", "p7zip_jll"]
git-tree-sha1 = "4f0e41ff461d42cfc62ff0de4f1cd44c6e6b3771"
uuid = "124859b0-ceae-595e-8997-d05f6a7a8dfe"
version = "0.7.7"

[[Dates]]
deps = ["Printf"]
uuid = "ade2ca70-3891-5945-98fb-dc099432e06a"

[[Distributed]]
deps = ["Random", "Serialization", "Sockets"]
uuid = "8ba89e20-285c-5b6f-9357-94700520ee1b"

[[HTML_Entities]]
deps = ["StrTables"]
git-tree-sha1 = "aa19515d6ebe7f91a39cfc1dc6341f38fcac1282"
uuid = "7693890a-d069-55fe-a829-b4a6d304f0ee"
version = "1.0.0"

[[HTTP]]
deps = ["Base64", "Dates", "IniFile", "MbedTLS", "NetworkOptions", "Sockets", "URIs"]
git-tree-sha1 = "c9f380c76d8aaa1fa7ea9cf97bddbc0d5b15adc2"
uuid = "cd3eb016-35fb-5094-929b-558a96fad6f3"
version = "0.9.5"

[[IniFile]]
deps = ["Test"]
git-tree-sha1 = "098e4d2c533924c921f9f9847274f2ad89e018b8"
uuid = "83e8ac13-25f8-5344-8a64-a9f2b223428f"
version = "0.5.0"

[[InteractiveUtils]]
deps = ["Markdown"]
uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240"

[[InternedStrings]]
deps = ["Random", "Test"]
git-tree-sha1 = "eb05b5625bc5d821b8075a77e4c421933e20c76b"
uuid = "7d512f48-7fb1-5a58-b986-67e6dc259f01"
version = "0.7.0"

[[JLLWrappers]]
git-tree-sha1 = "a431f5f2ca3f4feef3bd7a5e94b8b8d4f2f647a0"
uuid = "692b3bcd-3c85-4b1f-b108-f13ce0eb3210"
version = "1.2.0"

[[JSON]]
deps = ["Dates", "Mmap", "Parsers", "Unicode"]
git-tree-sha1 = "81690084b6198a2e1da36fcfda16eeca9f9f24e4"
uuid = "682c06a0-de6a-54ab-a142-c8b1cf79cde6"
version = "0.21.1"

[[LibGit2]]
deps = ["Printf"]
uuid = "76f85450-5226-5b5a-8eaa-529ad045b433"

[[Libdl]]
uuid = "8f399da3-3557-5675-b5ff-fb832c97cbdb"

[[Logging]]
uuid = "56ddb016-857b-54e1-b83d-db4d58db5568"

[[Markdown]]
deps = ["Base64"]
uuid = "d6f4376e-aef5-505a-96c1-9c027394607a"

[[MbedTLS]]
deps = ["Dates", "MbedTLS_jll", "Random", "Sockets"]
git-tree-sha1 = "1c38e51c3d08ef2278062ebceade0e46cefc96fe"
uuid = "739be429-bea8-5141-9913-cc70e7f3736d"
version = "1.0.3"

[[MbedTLS_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
git-tree-sha1 = "0eef589dd1c26a3ac9d753fe1a8bcad63f956fa6"
uuid = "c8ffd9c3-330d-5841-b78e-0817d7145fa1"
version = "2.16.8+1"

[[Mmap]]
uuid = "a63ad114-7e13-5084-954f-fe012c677804"

[[NetworkOptions]]
git-tree-sha1 = "ed3157f48a05543cce9b241e1f2815f7e843d96e"
uuid = "ca575930-c2e3-43a9-ace4-1e988b2c1908"
version = "1.2.0"

[[Parsers]]
deps = ["Dates"]
git-tree-sha1 = "223a825cccef2228f3fdbf2ecc7ca93363059073"
uuid = "69de0a69-1ddd-5017-9359-2bf0b02dc9f0"
version = "1.0.16"

[[Pkg]]
deps = ["Dates", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "UUIDs"]
uuid = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"

[[Printf]]
deps = ["Unicode"]
uuid = "de0858da-6303-5e67-8744-51eddeeeb8d7"

[[REPL]]
deps = ["InteractiveUtils", "Markdown", "Sockets"]
uuid = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb"

[[Random]]
deps = ["Serialization"]
uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"

[[Reexport]]
git-tree-sha1 = "57d8440b0c7d98fc4f889e478e80f268d534c9d5"
uuid = "189a3867-3050-52da-a836-e630ba90ab69"
version = "1.0.0"

[[SHA]]
uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce"

[[Serialization]]
uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b"

[[Sockets]]
uuid = "6462fe0b-24de-5631-8697-dd941f90decc"

[[StrTables]]
deps = ["Dates"]
git-tree-sha1 = "5998faae8c6308acc25c25896562a1e66a3bb038"
uuid = "9700d1a9-a7c8-5760-9816-a99fda30bb8f"
version = "1.0.1"

[[Test]]
deps = ["Distributed", "InteractiveUtils", "Logging", "Random"]
uuid = "8dfed614-e22c-5e08-85e1-65c5234f0b40"

[[URIs]]
git-tree-sha1 = "7855809b88d7b16e9b029afd17880930626f54a2"
uuid = "5c2747f8-b7ea-4ff2-ba2e-563bfd36b1d4"
version = "1.2.0"

[[UUIDs]]
deps = ["Random", "SHA"]
uuid = "cf7118a7-6976-5b1a-9a39-7adc72f591a4"

[[Unicode]]
uuid = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5"

[[p7zip_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
git-tree-sha1 = "ee65cfa19bea645698a0224bfa216f2b1c8b559f"
uuid = "3f19e933-33d8-53b3-aaab-bd5110c3b7a0"
version = "16.2.0+3"
Comment thread
shikhargoswami marked this conversation as resolved.
Outdated
8 changes: 6 additions & 2 deletions Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,18 @@ version = "0.5.6"
[deps]
DataDeps = "124859b0-ceae-595e-8997-d05f6a7a8dfe"
HTML_Entities = "7693890a-d069-55fe-a829-b4a6d304f0ee"
InternedStrings = "7d512f48-7fb1-5a58-b986-67e6dc259f01"
JSON = "682c06a0-de6a-54ab-a142-c8b1cf79cde6"
Comment thread
shikhargoswami marked this conversation as resolved.
StrTables = "9700d1a9-a7c8-5760-9816-a99fda30bb8f"
Unicode = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5"

[compat]
DataDeps = "0.6.5, 0.7"
julia = "1"
HTML_Entities= "1"
HTML_Entities = "1"
StrTables = "1"
julia = "1"
JSON = "0.21.1"
InternedStrings = "0.7.0"

[extras]
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
Expand Down
14 changes: 12 additions & 2 deletions src/WordTokenizers.jl
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ module WordTokenizers
using HTML_Entities
using StrTables
using Unicode
using DataDeps
using DataDeps, JSON, InternedStrings

abstract type PretrainedTokenizer end

Expand All @@ -17,7 +17,9 @@ export poormans_tokenize, punctuation_space_tokenize,
set_tokenizer, set_sentence_splitter,
rev_tokenize, rev_detokenize,
toktok_tokenize
export ALBERT_V1, ALBERT_V2, load, tokenizer, sentence_from_tokens, ids_from_tokens

export ALBERT_V1, ALBERT_V2, GPT2
export load, tokenizer, sentence_from_tokens, ids_from_tokens, tokenize, sentence_from_tokens_gpt2
export PretrainedTokenizer, tokenizer_files
include("words/fast.jl")

Expand All @@ -33,6 +35,7 @@ include("set_method_api.jl")
include("split_api.jl")

include("statistical/unigram.jl")
include("statistical/gpt2tokenizer.jl")

const pretrained = Dict{DataType, Vector{String}}()
function tokenizer_files(::Type{T}) where T<:PretrainedTokenizer
Expand All @@ -47,4 +50,11 @@ function __init__()
init_vocab_datadeps()
end

# Symbol-keyed entry points: each forwards to the loader for the named pretrained
# tokenizer. They are kept so existing `load(Val(:NAME))` callers continue to work.
load(::Val{:ALBERT_V1}) = load_sp(ALBERT_V1)
load(::Val{:ALBERT_V2}) = load_sp(ALBERT_V2)
load(::Val{:GPT2}) = load_gpt2(GPT2)

# Type-keyed entry point: `load(GPT2)` forwards to `load(Val(:GPT2))`, and likewise for
# the other tokenizer types.
# `nameof(T)` is used instead of `Symbol(T)` because `Symbol(T)` goes through `string(T)`,
# which includes the module prefix (e.g. "WordTokenizers.GPT2") whenever the type is not
# visible from `Main` (for instance after `import WordTokenizers`). That produced a key
# that matches none of the `Val` methods above and ended in a `MethodError`.
load(::Type{T}) where T<:PretrainedTokenizer = load(Val(nameof(T)))


end # module
17 changes: 16 additions & 1 deletion src/statistical/Vocab_DataDeps.jl
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# Marker types naming the supported pretrained tokenizers. Being abstract types they
# carry no data; elsewhere in this package they are passed as arguments to select a
# tokenizer (e.g. `tokenizer_files(ALBERT_V2)`).
abstract type ALBERT_V1 <: PretrainedTokenizer end
abstract type ALBERT_V2 <: PretrainedTokenizer end
abstract type GPT2 <: PretrainedTokenizer end

const vectors_albertversion1 = [
("albert_base_v1_30k-clean.vocab",
Expand Down Expand Up @@ -40,6 +41,8 @@ const vectors_albertversion2 = [
"https://raw.githubusercontent.com/tejasvaidhyadev/ALBERT.jl/master/src/Vocabs/albert_xxlarge_v2_30k-clean.vocab")
]

# File names of the GPT2 vocabulary resources. `init_vocab_datadeps` appends each name
# to the GPT2 download URL and records it as "GPT2/<file>" in `tokenizer_files(GPT2)`.
const vectors_gpt2 = ["encoder.json", "vocab.bpe"]

function init_vocab_datadeps()
for (depname, description, sha, link) in vectors_albertversion1
register(DataDep(depname,
Expand Down Expand Up @@ -70,5 +73,17 @@ function init_vocab_datadeps()
))
append!(tokenizer_files(ALBERT_V2), ["$depname"])
end
end

register(DataDep("GPT2",
"""
Pretrained gpt2 vocabulary and merges file by Open AI.
Website: https://openai.com/blog/better-language-models/
Author: Radford et al
Licence: MIT
All GPT2 Models are trained on same size vocabulary.
""",
["https://openaipublic.blob.core.windows.net/gpt-2/models/117M/$(file)" for file in vectors_gpt2],
"05805f21f823300551adf0646abe905eb036fb272f97c279f0d9c656c845ca46"))

append!(tokenizer_files(GPT2), ["GPT2/$(file)" for file in vectors_gpt2])
end
Loading