From 66823b0b3b81e3bb8b6c5f6aaf9f0c01cd942b10 Mon Sep 17 00:00:00 2001 From: Lyndon White Date: Fri, 8 Jun 2018 13:19:07 +0800 Subject: [PATCH 1/3] =prootype for loading the binary file --- src/proto.ipynb | 338 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 338 insertions(+) create mode 100644 src/proto.ipynb diff --git a/src/proto.ipynb b/src/proto.ipynb new file mode 100644 index 0000000..71b7ae0 --- /dev/null +++ b/src/proto.ipynb @@ -0,0 +1,338 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "using PretrainedEmbeddings\n", + "\n", + "using DataDeps" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "\"FastText fr CommonCrawl Binary/cc.fr.300.bin\"" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dd_name = language_files(PretrainedEmbeddings.FastText_Bin{:fr}) |> first" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "StatStruct(mode=0o100644, size=7238894263)" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "stat" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#=\n", + "struct entry {\n", + " std::string word;\n", + " int64_t count;\n", + " entry_type type;\n", + " std::vector subwords;\n", + "};\n", + " #=" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [], + "source": [ + "#https://github.com/facebookresearch/fastText/blob/master/src/dictionary.cc" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "1-element Array{String,1}:\n", + " \"cc.fr.300.bin\"" + ] + }, + "execution_count": 20, + 
"metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "readdir(datadep\"FastText fr CommonCrawl Binary\")" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Entry" + ] + }, + "execution_count": 33, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "@enum EntryType::Int8 word_type=0 label_type=1\n", + "\n", + "struct Entry\n", + " word::String\n", + " count::Int64\n", + " entry_type:: EntryType\n", + " subwords::Vector{Int32}\n", + "end\n", + "Entry()=Entry(\"\", 0, word_type, Int32[])\n" + ] + }, + { + "cell_type": "code", + "execution_count": 78, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "magic = read(fh, Int32) = 793712314\n", + "version = read(fh, Int32) = 12\n", + "\n", + "args_dim = read(fh, Int32) = 300\n", + "args_ws = read(fh, Int32) = 5\n", + "args_epoch = read(fh, Int32) = 1\n", + "args_minCount = read(fh, Int32) = 5\n", + "args_neg = read(fh, Int32) = 10\n", + "args_wordNgrams = read(fh, Int32) = 1\n", + "args_loss = read(fh, Int32) = 2\n", + "args_model = read(fh, Int32) = 1\n", + "args_bucket = read(fh, Int32) = 2000000\n", + "args_minn = read(fh, Int32) = 5\n", + "args_maxn = read(fh, Int32) = 5\n", + "args_lrUpdateRate = read(fh, Int32) = 100\n", + "args_t = read(fh, Float64) = 9.999999747378752e-6\n", + "\n", + "size_ = read(fh, Int32) = 2000000\n", + "nwords = read(fh, Int32) = 2000000\n", + "nlabels = read(fh, Int32) = 0\n", + "ntokens = read(fh, Int64) = 68358270953\n", + "pruneidx_size_ = read(fh, Int64) = -1\n", + "\n", + "length(words_) = 2000000\n", + "words_[1] = Entry(\",\", 2854010684, word_type::EntryType = 0, Int32[])\n", + "words_[2] = Entry(\"de\", 2742946523, word_type::EntryType = 0, Int32[])\n", + "words_[3] = Entry(\".\", 1675680641, word_type::EntryType = 0, Int32[])\n", + "words_[end - 1] = Entry(\"Fautereau\", 235, word_type::EntryType = 0, 
Int32[])\n", + "words_[end] = Entry(\"IdealCoque\", 235, word_type::EntryType = 0, Int32[])\n", + "\n", + "\n", + "quant_input = read(fh, Bool) = false\n", + "m_ = read(fh, Int64) = 4000000\n", + "n_ = read(fh, Int64) = 300\n", + "(typeof(data), size(data)) = (Array{Float32,2}, (4000000, 300))\n", + "quant_output = read(fh, Bool) = false\n", + "m_ = read(fh, Int64) = 2000000\n", + "n_ = read(fh, Int64) = 300\n", + "(typeof(data), size(data)) = (Array{Float32,2}, (2000000, 300))\n" + ] + } + ], + "source": [ + "const FASTTEXT_VERSION = Int32(12); # Version 1b \n", + "const FASTTEXT_FILEFORMAT_MAGIC_INT32 = Int32(793712314);\n", + "\n", + "\n", + "function load_header(fh)\n", + "\t### Check Model\n", + " @show magic = read(fh, Int32)\n", + " @assert magic== FASTTEXT_FILEFORMAT_MAGIC_INT32\n", + " @show version = read(fh, Int32)\n", + " @assert version == FASTTEXT_VERSION\n", + " println()\n", + "end\n", + "\n", + "function load_args(fh)\n", + " ## Load Args https://github.com/facebookresearch/fastText/blob/master/src/args.cc#L261\n", + " @show args_dim = read(fh, Int32)\n", + " @show args_ws = read(fh, Int32)\n", + " @show args_epoch = read(fh, Int32)\n", + " @show args_minCount = read(fh, Int32)\n", + " @show args_neg = read(fh, Int32)\n", + " @show args_wordNgrams = read(fh, Int32)\n", + " @show args_loss = read(fh, Int32)\n", + " @show args_model = read(fh, Int32)\n", + " @show args_bucket = read(fh, Int32)\n", + " @show args_minn = read(fh, Int32)\n", + " @show args_maxn = read(fh, Int32)\n", + " @show args_lrUpdateRate = read(fh, Int32)\n", + " @show args_t = read(fh, Float64)\n", + " println()\n", + "end\n", + "\n", + "function load_dict(fh)\n", + " ## Load model dict, https://github.com/facebookresearch/fastText/blob/master/src/dictionary.cc#L419 \n", + " @show size_ = read(fh, Int32)\n", + " @show nwords = read(fh, Int32)\n", + " @show nlabels = read(fh, Int32)\n", + " @show ntokens = read(fh, Int64)\n", + " @show pruneidx_size_ = read(fh, Int64)\n", + " \n", 
+ " println()\n", + " words_ = map(1:size_) do ii\n", + " e_word=readuntil(fh, '\\0')[1:end-1]\n", + " e_count=read(fh, Int64)\n", + " e_entry_type=read(fh, EntryType)\n", + " Entry(e_word, e_count, e_entry_type, Int32[])\n", + " end\n", + " @show length(words_)\n", + " @show words_[1]\n", + " @show words_[2]\n", + " @show words_[3]\n", + " @show words_[end-1]\n", + " @show words_[end]\n", + " println()\n", + " @assert pruneidx_size_ < 0 \n", + " # Avoid loading this stuff https://github.com/facebookresearch/fastText/blob/master/src/dictionary.cc#L437\n", + " println()\n", + "\t\n", + "\twords_\n", + "end\n", + "\n", + "function load_matrix(fh)\n", + " ### Load Matrix\n", + " #https://github.com/facebookresearch/fastText/blob/master/src/matrix.cc#L114\n", + " \n", + " @show m_ = read(fh, Int64)\n", + " @show n_ = read(fh, Int64)\n", + " data = read(fh, Float32, (m_, n_)) # Note `real` is a typedef for `float32`\n", + " @show typeof(data), size(data)\n", + "\tdata\n", + "end\n", + "\n", + "function load_fasttext_bin(filename)\n", + "\topen(filename) do fh\n", + "\t\tload_header(fh)\n", + "\t\tload_args(fh)\n", + "\t\tload_dict(fh)\n", + "\t\t\n", + "\t\t\n", + "\t\t@show quant_input = read(fh, Bool)\n", + "\t\t@assert !quant_input # avoid that stuff\n", + "\t\tinput_ = load_matrix(fh)\n", + "\t\t\n", + "\t\t@show quant_output = read(fh, Bool)\n", + "\t\t@assert !quant_output # avoid that stuff\n", + "\t\toutput_ = load_matrix(fh)\n", + "\t\t\n", + " @assert(eof(fh))\n", + "\tend\n", + "end\n", + "\n", + "\n", + "load_fasttext_bin(@datadep_str dd_name)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "search: \u001b[1mr\u001b[22m\u001b[1me\u001b[22m\u001b[1ma\u001b[22m\u001b[1md\u001b[22m\u001b[1ms\u001b[22m\u001b[1mt\u001b[22m\u001b[1mr\u001b[22m\u001b[1mi\u001b[22m\u001b[1mn\u001b[22m\u001b[1mg\u001b[22m\n", + "\n" + ] + }, + { + "data": { + 
"text/markdown": [ + "```\n", + "readstring(stream::IO)\n", + "readstring(filename::AbstractString)\n", + "```\n", + "\n", + "Read the entire contents of an I/O stream or a file as a string. The text is assumed to be encoded in UTF-8.\n" + ], + "text/plain": [ + "```\n", + "readstring(stream::IO)\n", + "readstring(filename::AbstractString)\n", + "```\n", + "\n", + "Read the entire contents of an I/O stream or a file as a string. The text is assumed to be encoded in UTF-8.\n" + ] + }, + "execution_count": 42, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "?readstring" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Julia 0.6.2", + "language": "julia", + "name": "julia-0.6" + }, + "language_info": { + "file_extension": ".jl", + "mimetype": "application/julia", + "name": "julia", + "version": "0.6.2" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From c576fa9391e47a63254619b9feb6780e1a0fab18 Mon Sep 17 00:00:00 2001 From: Lyndon White Date: Tue, 9 Oct 2018 15:21:56 +0800 Subject: [PATCH 2/3] =not much --- src/proto.ipynb | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/proto.ipynb b/src/proto.ipynb index 71b7ae0..3c63a4e 100644 --- a/src/proto.ipynb +++ b/src/proto.ipynb @@ -2,11 +2,11 @@ "cells": [ { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "using PretrainedEmbeddings\n", + "using Embeddings\n", "\n", "using DataDeps" ] @@ -322,7 +322,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Julia 0.6.2", + "display_name": "Julia 0.6.3", "language": "julia", "name": "julia-0.6" }, From b7cb5b37fa587b47df1373d99433af197489bf61 Mon Sep 17 00:00:00 2001 From: Lyndon White Date: Tue, 9 Oct 2018 20:21:24 +0800 Subject: [PATCH 3/3] =test it --- src/fasttext.jl | 623 +++++++++++++++---------------- src/proto.ipynb 
| 963 ++++++++++++++++++++++++++++++++++++++++-------- 2 files changed, 1132 insertions(+), 454 deletions(-) diff --git a/src/fasttext.jl b/src/fasttext.jl index b3e854a..66fc07b 100644 --- a/src/fasttext.jl +++ b/src/fasttext.jl @@ -98,20 +98,27 @@ function init(::Type{FastText}) end end - for (lang, hashstring) in fast_text_wiki_languages_and_hashes - # TODO Add Binary files as well + for (lang, text_hashstring, bin_hashstring) in fast_wiki_languages_and_hashes push!(language_files(FastText_Text{lang}), "FastText $lang Wiki Text/wiki.$(lang).vec") - register(DataDep("FastText $lang Wiki Text", - """ - Dataset: 300 dimentional FastText Word Embeddings for $lang, trained on Wikipedia - Website: https://fasttext.cc/docs/en/pretrained-vectors.html - Author: Bojanowski et. al. (Facebook) - License: CC-SA 3.0 - Citation: P. Bojanowski*, E. Grave*, A. Joulin, T. Mikolov, Enriching Word Vectors with Subword Information - """, - "https://s3-us-west-1.amazonaws.com/fasttext-vectors/wiki.$(lang).vec", - hashstring; - )); + push!(language_files(FastText_Bin{lang}), "FastText $lang Wiki Binary/wiki.$(lang).bin") + for (mode, hashstring, ext) in [ + ("Text", text_hashstring, "vec"), + ("Binary", bin_hashstring, "zip") + ] + + register(DataDep("FastText $lang Wiki $mode", + """ + Dataset: 300 dimentional FastText Word Embeddings for $lang, trained on Wikipedia + Website: https://fasttext.cc/docs/en/pretrained-vectors.html + Author: Bojanowski et. al. (Facebook) + License: CC-SA 3.0 + Citation: P. Bojanowski*, E. Grave*, A. Joulin, T. Mikolov, Enriching Word Vectors with Subword Information + """, + "https://s3-us-west-1.amazonaws.com/fasttext-vectors/wiki.$(lang).$(ext)", + hashstring; + post_fetch_method = mode == "Binary" ? 
DataDeps.unpack : identity + )); + end end @@ -282,300 +289,300 @@ const fast_commoncrawl_languages_and_hashes = [ -const fast_text_wiki_languages_and_hashes = [ - (:en, "f4d87723baad28804f89c0ecf74fd0f52ac2ae194c270cb4c89b0a84f0bcf53b"), +const fast_wiki_languages_and_hashes = [ # PR Welcome to add Hashes for the ones below - (:ab, nothing), - (:ace, nothing), - (:ady, nothing), - (:aa, nothing), - (:af, nothing), - (:ak, nothing), - (:sq, nothing), - (:als, nothing), - (:am, nothing), - (:ang, nothing), - (:ar, nothing), - (:an, nothing), - (:arc, nothing), - (:hy, nothing), - (:roa_rup, nothing), - (:as, nothing), - (:ast, nothing), - (:av, nothing), - (:ay, nothing), - (:az, nothing), - (:bm, nothing), - (:bjn, nothing), - (:map_bms, nothing), - (:ba, nothing), - (:eu, nothing), - (:bar, nothing), - (:be, nothing), - (:bn, nothing), - (:bh, nothing), - (:bpy, nothing), - (:bi, nothing), - (:bs, nothing), - (:br, nothing), - (:bug, nothing), - (:bg, nothing), - (:my, nothing), - (:bxr, nothing), - (:zh_yue, nothing), - (:ca, nothing), - (:ceb, nothing), - (:bcl, nothing), - (:ch, nothing), - (:cbk_zam, nothing), - (:ce, nothing), - (:chr, nothing), - (:chy, nothing), - (:ny, nothing), - (:zh, nothing), - (:cho, nothing), - (:cv, nothing), - (:zh_classical, nothing), - (:kw, nothing), - (:co, nothing), - (:cr, nothing), - (:crh, nothing), - (:hr, nothing), - (:cs, nothing), - (:da, nothing), - (:dv, nothing), - (:nl, nothing), - (:nds_nl, nothing), - (:dz, nothing), - (:pa, nothing), - (:arz, nothing), - (:eml, nothing), - (:myv, nothing), - (:eo, nothing), - (:et, nothing), - (:ee, nothing), - (:ext, nothing), - (:fo, nothing), - (:hif, nothing), - (:fj, nothing), - (:fi, nothing), - (:frp, nothing), - (:fr, nothing), - (:fur, nothing), - (:ff, nothing), - (:gag, nothing), - (:gl, nothing), - (:gan, nothing), - (:ka, nothing), - (:de, nothing), - (:glk, nothing), - (:gom, nothing), - (:got, nothing), - (:el, nothing), - (:kl, nothing), - (:gn, nothing), - (:gu, 
nothing), - (:ht, nothing), - (:hak, nothing), - (:ha, nothing), - (:haw, nothing), - (:he, nothing), - (:hz, nothing), - (:mrj, nothing), - (:hi, nothing), - (:ho, nothing), - (:hu, nothing), - (:is, nothing), - (:io, nothing), - (:ig, nothing), - (:ilo, nothing), - (:id, nothing), - (:ia, nothing), - (:ie, nothing), - (:iu, nothing), - (:ik, nothing), - (:ga, nothing), - (:it, nothing), - (:jam, nothing), - (:ja, nothing), - (:jv, nothing), - (:kbd, nothing), - (:kab, nothing), - (:xal, nothing), - (:kn, nothing), - (:kr, nothing), - (:pam, nothing), - (:krc, nothing), - (:kaa, nothing), - (:ks, nothing), - (:csb, nothing), - (:kk, nothing), - (:km, nothing), - (:ki, nothing), - (:rw, nothing), - (:ky, nothing), - (:rn, nothing), - (:kv, nothing), - (:koi, nothing), - (:kg, nothing), - (:ko, nothing), - (:kj, nothing), - (:ku, nothing), - (:ckb, nothing), - (:lad, nothing), - (:lbe, nothing), - (:lo, nothing), - (:ltg, nothing), - (:la, nothing), - (:lv, nothing), - (:lez, nothing), - (:lij, nothing), - (:li, nothing), - (:ln, nothing), - (:lt, nothing), - (:olo, nothing), - (:jbo, nothing), - (:lmo, nothing), - (:nds, nothing), - (:dsb, nothing), - (:lg, nothing), - (:lb, nothing), - (:mk, nothing), - (:mai, nothing), - (:mg, nothing), - (:ms, nothing), - (:ml, nothing), - (:mt, nothing), - (:gv, nothing), - (:mi, nothing), - (:mr, nothing), - (:mh, nothing), - (:mzn, nothing), - (:mhr, nothing), - (:cdo, nothing), - (:zh_min_nan, nothing), - (:min, nothing), - (:xmf, nothing), - (:mwl, nothing), - (:mdf, nothing), - (:mo, nothing), - (:mn, nothing), - (:mus, nothing), - (:nah, nothing), - (:na, nothing), - (:nv, nothing), - (:ng, nothing), - (:nap, nothing), - (:ne, nothing), - (:new, nothing), - (:pih, nothing), - (:nrm, nothing), - (:frr, nothing), - (:lrc, nothing), - (:se, nothing), - (:nso, nothing), - (:no, nothing), - (:nn, nothing), - (:nov, nothing), - (:ii, nothing), - (:oc, nothing), - (:cu, nothing), - (:or, nothing), - (:om, nothing), - (:os, 
nothing), - (:pfl, nothing), - (:pi, nothing), - (:pag, nothing), - (:pap, nothing), - (:ps, nothing), - (:pdc, nothing), - (:fa, nothing), - (:pcd, nothing), - (:pms, nothing), - (:pl, nothing), - (:pnt, nothing), - (:pt, nothing), - (:qu, nothing), - (:ksh, nothing), - (:rmy, nothing), - (:ro, nothing), - (:rm, nothing), - (:ru, nothing), - (:rue, nothing), - (:sah, nothing), - (:sm, nothing), - (:bat_smg, nothing), - (:sg, nothing), - (:sa, nothing), - (:sc, nothing), - (:stq, nothing), - (:sco, nothing), - (:gd, nothing), - (:sr, nothing), - (:sh, nothing), - (:st, nothing), - (:sn, nothing), - (:scn, nothing), - (:szl, nothing), - (:simple, nothing), - (:sd, nothing), - (:si, nothing), - (:sk, nothing), - (:sl, nothing), - (:so, nothing), - (:azb, nothing), - (:es, nothing), - (:srn, nothing), - (:su, nothing), - (:sw, nothing), - (:ss, nothing), - (:sv, nothing), - (:tl, nothing), - (:ty, nothing), - (:tg, nothing), - (:ta, nothing), - (:roa_tara, nothing), - (:tt, nothing), - (:te, nothing), - (:tet, nothing), - (:th, nothing), - (:bo, nothing), - (:ti, nothing), - (:tpi, nothing), - (:to, nothing), - (:ts, nothing), - (:tn, nothing), - (:tcy, nothing), - (:tum, nothing), - (:tr, nothing), - (:tk, nothing), - (:tyv, nothing), - (:tw, nothing), - (:udm, nothing), - (:uk, nothing), - (:hsb, nothing), - (:ur, nothing), - (:ug, nothing), - (:uz, nothing), - (:ve, nothing), - (:vec, nothing), - (:vep, nothing), - (:vi, nothing), - (:vo, nothing), - (:fiu_vro, nothing), - (:wa, nothing), - (:war, nothing), - (:cy, nothing), - (:vls, nothing), - (:fy, nothing), - (:pnb, nothing), - (:wo, nothing), - (:wuu, nothing), - (:xh, nothing), - (:yi, nothing), - (:yo, nothing), - (:diq, nothing), - (:zea, nothing), - (:za, nothing), - (:zu, nothing), + (:en, "f4d87723baad28804f89c0ecf74fd0f52ac2ae194c270cb4c89b0a84f0bcf53b", nothing), + (:ab, nothing, nothing), + (:ace, nothing, nothing), + (:ady, nothing, nothing), + (:aa, nothing, nothing), + (:af, nothing, nothing), + 
(:ak, nothing, nothing), + (:sq, nothing, nothing), + (:als, nothing, nothing), + (:am, nothing, nothing), + (:ang, nothing, nothing), + (:ar, nothing, nothing), + (:an, nothing, nothing), + (:arc, nothing, nothing), + (:hy, nothing, nothing), + (:roa_rup, nothing, nothing), + (:as, nothing, nothing), + (:ast, nothing, nothing), + (:av, nothing, nothing), + (:ay, nothing, nothing), + (:az, nothing, nothing), + (:bm, nothing, nothing), + (:bjn, nothing, nothing), + (:map_bms, nothing, nothing), + (:ba, nothing, nothing), + (:eu, nothing, nothing), + (:bar, nothing, nothing), + (:be, nothing, nothing), + (:bn, nothing, nothing), + (:bh, nothing, nothing), + (:bpy, nothing, nothing), + (:bi, nothing, nothing), + (:bs, nothing, nothing), + (:br, nothing, nothing), + (:bug, nothing, nothing), + (:bg, nothing, nothing), + (:my, nothing, nothing), + (:bxr, nothing, nothing), + (:zh_yue, nothing, nothing), + (:ca, nothing, nothing), + (:ceb, nothing, nothing), + (:bcl, nothing, nothing), + (:ch, nothing, nothing), + (:cbk_zam, nothing, nothing), + (:ce, nothing, nothing), + (:chr, nothing, nothing), + (:chy, nothing, nothing), + (:ny, nothing, nothing), + (:zh, nothing, nothing), + (:cho, nothing, nothing), + (:cv, nothing, nothing), + (:zh_classical, nothing, nothing), + (:kw, nothing, nothing), + (:co, nothing, nothing), + (:cr, nothing, nothing), + (:crh, nothing, nothing), + (:hr, nothing, nothing), + (:cs, nothing, nothing), + (:da, nothing, nothing), + (:dv, nothing, nothing), + (:nl, nothing, nothing), + (:nds_nl, nothing, nothing), + (:dz, nothing, nothing), + (:pa, nothing, nothing), + (:arz, nothing, nothing), + (:eml, nothing, nothing), + (:myv, nothing, nothing), + (:eo, nothing, nothing), + (:et, nothing, nothing), + (:ee, nothing, nothing), + (:ext, nothing, nothing), + (:fo, nothing, nothing), + (:hif, nothing, nothing), + (:fj, nothing, nothing), + (:fi, nothing, nothing), + (:frp, nothing, nothing), + (:fr, nothing, nothing), + (:fur, nothing, nothing), + 
(:ff, nothing, nothing), + (:gag, nothing, nothing), + (:gl, nothing, nothing), + (:gan, nothing, nothing), + (:ka, nothing, nothing), + (:de, nothing, nothing), + (:glk, nothing, nothing), + (:gom, nothing, nothing), + (:got, nothing, nothing), + (:el, nothing, nothing), + (:kl, nothing, nothing), + (:gn, nothing, nothing), + (:gu, nothing, nothing), + (:ht, nothing, nothing), + (:hak, nothing, nothing), + (:ha, nothing, nothing), + (:haw, nothing, nothing), + (:he, nothing, nothing), + (:hz, nothing, nothing), + (:mrj, nothing, nothing), + (:hi, nothing, nothing), + (:ho, nothing, nothing), + (:hu, nothing, nothing), + (:is, nothing, nothing), + (:io, nothing, nothing), + (:ig, nothing, nothing), + (:ilo, nothing, nothing), + (:id, nothing, nothing), + (:ia, nothing, nothing), + (:ie, nothing, nothing), + (:iu, nothing, nothing), + (:ik, nothing, nothing), + (:ga, nothing, nothing), + (:it, nothing, nothing), + (:jam, nothing, nothing), + (:ja, nothing, nothing), + (:jv, nothing, nothing), + (:kbd, nothing, nothing), + (:kab, nothing, nothing), + (:xal, nothing, nothing), + (:kn, nothing, nothing), + (:kr, nothing, nothing), + (:pam, nothing, nothing), + (:krc, nothing, nothing), + (:kaa, nothing, nothing), + (:ks, nothing, nothing), + (:csb, nothing, nothing), + (:kk, nothing, nothing), + (:km, nothing, nothing), + (:ki, nothing, nothing), + (:rw, nothing, nothing), + (:ky, nothing, nothing), + (:rn, nothing, nothing), + (:kv, nothing, nothing), + (:koi, nothing, nothing), + (:kg, nothing, nothing), + (:ko, nothing, nothing), + (:kj, nothing, nothing), + (:ku, nothing, nothing), + (:ckb, nothing, nothing), + (:lad, nothing, nothing), + (:lbe, nothing, nothing), + (:lo, nothing, nothing), + (:ltg, nothing, nothing), + (:la, nothing, nothing), + (:lv, nothing, nothing), + (:lez, nothing, nothing), + (:lij, nothing, nothing), + (:li, nothing, nothing), + (:ln, nothing, nothing), + (:lt, nothing, nothing), + (:olo, nothing, nothing), + (:jbo, nothing, nothing), + 
(:lmo, nothing, nothing), + (:nds, nothing, nothing), + (:dsb, nothing, nothing), + (:lg, nothing, nothing), + (:lb, nothing, nothing), + (:mk, nothing, nothing), + (:mai, nothing, nothing), + (:mg, nothing, nothing), + (:ms, nothing, nothing), + (:ml, nothing, nothing), + (:mt, nothing, nothing), + (:gv, nothing, nothing), + (:mi, nothing, nothing), + (:mr, nothing, nothing), + (:mh, nothing, nothing), + (:mzn, nothing, nothing), + (:mhr, nothing, nothing), + (:cdo, nothing, nothing), + (:zh_min_nan, nothing, nothing), + (:min, nothing, nothing), + (:xmf, nothing, nothing), + (:mwl, nothing, nothing), + (:mdf, nothing, nothing), + (:mo, nothing, nothing), + (:mn, nothing, nothing), + (:mus, nothing, nothing), + (:nah, nothing, nothing), + (:na, nothing, nothing), + (:nv, nothing, nothing), + (:ng, nothing, nothing), + (:nap, nothing, nothing), + (:ne, nothing, nothing), + (:new, nothing, nothing), + (:pih, nothing, nothing), + (:nrm, nothing, nothing), + (:frr, nothing, nothing), + (:lrc, nothing, nothing), + (:se, nothing, nothing), + (:nso, nothing, nothing), + (:no, nothing, nothing), + (:nn, nothing, nothing), + (:nov, nothing, nothing), + (:ii, nothing, nothing), + (:oc, nothing, nothing), + (:cu, nothing, nothing), + (:or, nothing, nothing), + (:om, nothing, nothing), + (:os, nothing, nothing), + (:pfl, nothing, nothing), + (:pi, nothing, nothing), + (:pag, nothing, nothing), + (:pap, nothing, nothing), + (:ps, nothing, nothing), + (:pdc, nothing, nothing), + (:fa, nothing, nothing), + (:pcd, nothing, nothing), + (:pms, nothing, nothing), + (:pl, nothing, nothing), + (:pnt, nothing, nothing), + (:pt, nothing, nothing), + (:qu, nothing, nothing), + (:ksh, nothing, nothing), + (:rmy, nothing, nothing), + (:ro, nothing, nothing), + (:rm, nothing, nothing), + (:ru, nothing, nothing), + (:rue, nothing, nothing), + (:sah, nothing, nothing), + (:sm, nothing, nothing), + (:bat_smg, nothing, nothing), + (:sg, nothing, nothing), + (:sa, nothing, nothing), + (:sc, 
nothing, nothing), + (:stq, nothing, nothing), + (:sco, nothing, nothing), + (:gd, nothing, nothing), + (:sr, nothing, nothing), + (:sh, nothing, nothing), + (:st, nothing, nothing), + (:sn, nothing, nothing), + (:scn, nothing, nothing), + (:szl, nothing, nothing), + (:simple, nothing, nothing), + (:sd, nothing, nothing), + (:si, nothing, nothing), + (:sk, nothing, nothing), + (:sl, nothing, nothing), + (:so, nothing, nothing), + (:azb, nothing, nothing), + (:es, nothing, nothing), + (:srn, nothing, nothing), + (:su, nothing, nothing), + (:sw, nothing, nothing), + (:ss, nothing, nothing), + (:sv, nothing, nothing), + (:tl, nothing, nothing), + (:ty, nothing, nothing), + (:tg, nothing, nothing), + (:ta, nothing, nothing), + (:roa_tara, nothing, nothing), + (:tt, nothing, nothing), + (:te, nothing, nothing), + (:tet, nothing, nothing), + (:th, nothing, nothing), + (:bo, nothing, nothing), + (:ti, nothing, nothing), + (:tpi, nothing, nothing), + (:to, nothing, nothing), + (:ts, nothing, nothing), + (:tn, nothing, nothing), + (:tcy, nothing, nothing), + (:tum, nothing, nothing), + (:tr, nothing, nothing), + (:tk, nothing, nothing), + (:tyv, nothing, nothing), + (:tw, nothing, nothing), + (:udm, nothing, nothing), + (:uk, nothing, nothing), + (:hsb, nothing, nothing), + (:ur, nothing, nothing), + (:ug, nothing, nothing), + (:uz, nothing, nothing), + (:ve, nothing, nothing), + (:vec, nothing, nothing), + (:vep, nothing, nothing), + (:vi, nothing, nothing), + (:vo, nothing, nothing), + (:fiu_vro, nothing, nothing), + (:wa, nothing, nothing), + (:war, nothing, nothing), + (:cy, nothing, nothing), + (:vls, nothing, nothing), + (:fy, nothing, nothing), + (:pnb, nothing, nothing), + (:wo, nothing, nothing), + (:wuu, nothing, nothing), + (:xh, nothing, nothing), + (:yi, nothing, nothing), + (:yo, nothing, nothing), + (:diq, nothing, nothing), + (:zea, nothing, nothing), + (:za, nothing, nothing), + (:zu, nothing, nothing), ] diff --git a/src/proto.ipynb b/src/proto.ipynb index 
3c63a4e..8b737b3 100644 --- a/src/proto.ipynb +++ b/src/proto.ipynb @@ -2,74 +2,48 @@ "cells": [ { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ - "using Embeddings\n", - "\n", - "using DataDeps" + "using Pkg\n", + "pkg\"activate ../../..\"\n" ] }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 2, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "\"FastText fr CommonCrawl Binary/cc.fr.300.bin\"" - ] - }, - "execution_count": 21, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ - "dd_name = language_files(PretrainedEmbeddings.FastText_Bin{:fr}) |> first" + "using Embeddings\n", + "\n", + "using DataDeps" ] }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "StatStruct(mode=0o100644, size=7238894263)" + "\"FastText fr Wiki Binary/wiki.fr.bin\"" ] }, - "execution_count": 22, + "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "stat" + "dd_name = language_files(Embeddings.FastText_Bin{:fr}) |> last" ] }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "#=\n", - "struct entry {\n", - " std::string word;\n", - " int64_t count;\n", - " entry_type type;\n", - " std::vector subwords;\n", - "};\n", - " #=" - ] - }, - { - "cell_type": "code", - "execution_count": 24, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ @@ -78,28 +52,27 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "1-element Array{String,1}:\n", - " \"cc.fr.300.bin\"" + "StatStruct(mode=0o100644, size=8493673445)" ] }, - "execution_count": 20, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "readdir(datadep\"FastText fr CommonCrawl Binary\")" + 
"stat(datadep\"FastText en Wiki Binary/wiki.en.bin\")" ] }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 12, "metadata": {}, "outputs": [ { @@ -108,7 +81,7 @@ "Entry" ] }, - "execution_count": 33, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } @@ -127,53 +100,29 @@ }, { "cell_type": "code", - "execution_count": 78, + "execution_count": 14, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "magic = read(fh, Int32) = 793712314\n", - "version = read(fh, Int32) = 12\n", "\n", - "args_dim = read(fh, Int32) = 300\n", - "args_ws = read(fh, Int32) = 5\n", - "args_epoch = read(fh, Int32) = 1\n", - "args_minCount = read(fh, Int32) = 5\n", - "args_neg = read(fh, Int32) = 10\n", - "args_wordNgrams = read(fh, Int32) = 1\n", - "args_loss = read(fh, Int32) = 2\n", - "args_model = read(fh, Int32) = 1\n", - "args_bucket = read(fh, Int32) = 2000000\n", - "args_minn = read(fh, Int32) = 5\n", - "args_maxn = read(fh, Int32) = 5\n", - "args_lrUpdateRate = read(fh, Int32) = 100\n", - "args_t = read(fh, Float64) = 9.999999747378752e-6\n", "\n", - "size_ = read(fh, Int32) = 2000000\n", - "nwords = read(fh, Int32) = 2000000\n", - "nlabels = read(fh, Int32) = 0\n", - "ntokens = read(fh, Int64) = 68358270953\n", - "pruneidx_size_ = read(fh, Int64) = -1\n", - "\n", - "length(words_) = 2000000\n", - "words_[1] = Entry(\",\", 2854010684, word_type::EntryType = 0, Int32[])\n", - "words_[2] = Entry(\"de\", 2742946523, word_type::EntryType = 0, Int32[])\n", - "words_[3] = Entry(\".\", 1675680641, word_type::EntryType = 0, Int32[])\n", - "words_[end - 1] = Entry(\"Fautereau\", 235, word_type::EntryType = 0, Int32[])\n", - "words_[end] = Entry(\"IdealCoque\", 235, word_type::EntryType = 0, Int32[])\n", - "\n", - "\n", - "quant_input = read(fh, Bool) = false\n", - "m_ = read(fh, Int64) = 4000000\n", - "n_ = read(fh, Int64) = 300\n", - "(typeof(data), size(data)) = (Array{Float32,2}, (4000000, 300))\n", - 
"quant_output = read(fh, Bool) = false\n", - "m_ = read(fh, Int64) = 2000000\n", - "n_ = read(fh, Int64) = 300\n", - "(typeof(data), size(data)) = (Array{Float32,2}, (2000000, 300))\n" + "size(dict_) = (2519370,)\n", + "size(input_) = (4519370, 300)\n", + "size(output_) = (2519370, 300)\n" ] + }, + { + "data": { + "text/plain": [ + "(2519370, 300)" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ @@ -183,133 +132,855 @@ "\n", "function load_header(fh)\n", "\t### Check Model\n", - " @show magic = read(fh, Int32)\n", + " magic = read(fh, Int32)\n", " @assert magic== FASTTEXT_FILEFORMAT_MAGIC_INT32\n", - " @show version = read(fh, Int32)\n", - " @assert version == FASTTEXT_VERSION\n", + " version = read(fh, Int32)\n", + " version == FASTTEXT_VERSION || @debug \"Unexpected FastText Version\" expected=FASTTEXT_VERSION actual=version\n", " println()\n", "end\n", "\n", "function load_args(fh)\n", " ## Load Args https://github.com/facebookresearch/fastText/blob/master/src/args.cc#L261\n", - " @show args_dim = read(fh, Int32)\n", - " @show args_ws = read(fh, Int32)\n", - " @show args_epoch = read(fh, Int32)\n", - " @show args_minCount = read(fh, Int32)\n", - " @show args_neg = read(fh, Int32)\n", - " @show args_wordNgrams = read(fh, Int32)\n", - " @show args_loss = read(fh, Int32)\n", - " @show args_model = read(fh, Int32)\n", - " @show args_bucket = read(fh, Int32)\n", - " @show args_minn = read(fh, Int32)\n", - " @show args_maxn = read(fh, Int32)\n", - " @show args_lrUpdateRate = read(fh, Int32)\n", - " @show args_t = read(fh, Float64)\n", + " args_dim = read(fh, Int32)\n", + " args_ws = read(fh, Int32)\n", + " args_epoch = read(fh, Int32)\n", + " args_minCount = read(fh, Int32)\n", + " args_neg = read(fh, Int32)\n", + " args_wordNgrams = read(fh, Int32)\n", + " args_loss = read(fh, Int32)\n", + " args_model = read(fh, Int32)\n", + " args_bucket = read(fh, Int32)\n", + " args_minn = read(fh, Int32)\n", + " args_maxn = 
read(fh, Int32)\n", + " args_lrUpdateRate = read(fh, Int32)\n", + " args_t = read(fh, Float64)\n", " println()\n", "end\n", "\n", - "function load_dict(fh)\n", + "function load_fastext_dict(fh)\n", " ## Load model dict, https://github.com/facebookresearch/fastText/blob/master/src/dictionary.cc#L419 \n", - " @show size_ = read(fh, Int32)\n", - " @show nwords = read(fh, Int32)\n", - " @show nlabels = read(fh, Int32)\n", - " @show ntokens = read(fh, Int64)\n", - " @show pruneidx_size_ = read(fh, Int64)\n", + " size_ = read(fh, Int32)\n", + " nwords = read(fh, Int32)\n", + " nlabels = read(fh, Int32)\n", + " ntokens = read(fh, Int64)\n", + " pruneidx_size_ = read(fh, Int64)\n", " \n", - " println()\n", " words_ = map(1:size_) do ii\n", - " e_word=readuntil(fh, '\\0')[1:end-1]\n", + " e_word=readuntil(fh, '\\0')\n", " e_count=read(fh, Int64)\n", " e_entry_type=read(fh, EntryType)\n", - " Entry(e_word, e_count, e_entry_type, Int32[])\n", + " Entry(e_word, e_count, e_entry_type, Int32[ii]) # Assume no subwords, just self.\n", " end\n", - " @show length(words_)\n", - " @show words_[1]\n", - " @show words_[2]\n", - " @show words_[3]\n", - " @show words_[end-1]\n", - " @show words_[end]\n", - " println()\n", " @assert pruneidx_size_ < 0 \n", " # Avoid loading this stuff https://github.com/facebookresearch/fastText/blob/master/src/dictionary.cc#L437\n", - " println()\n", - "\t\n", - "\twords_\n", + " words_\n", "end\n", "\n", "function load_matrix(fh)\n", " ### Load Matrix\n", " #https://github.com/facebookresearch/fastText/blob/master/src/matrix.cc#L114\n", " \n", - " @show m_ = read(fh, Int64)\n", - " @show n_ = read(fh, Int64)\n", - " data = read(fh, Float32, (m_, n_)) # Note `real` is a typedef for `float32`\n", - " @show typeof(data), size(data)\n", - "\tdata\n", + " m_ = read(fh, Int64)\n", + " n_ = read(fh, Int64)\n", + " data = read!(fh, Array{Float32}(undef, (m_, n_))) # Note `real` is a typedef for `float32`\n", + " data\n", "end\n", "\n", "function 
load_fasttext_bin(filename)\n", + " local dict_, input_, output_\n", "\topen(filename) do fh\n", - "\t\tload_header(fh)\n", - "\t\tload_args(fh)\n", - "\t\tload_dict(fh)\n", - "\t\t\n", + "\t\tload_header(fh) #Discard them\n", + "\t\tload_args(fh) # Discard them\n", + "\t\tdict_ = load_fastext_dict(fh)\n", "\t\t\n", - "\t\t@show quant_input = read(fh, Bool)\n", + "\t\tquant_input = read(fh, Bool)\n", "\t\t@assert !quant_input # avoid that stuff\n", "\t\tinput_ = load_matrix(fh)\n", "\t\t\n", - "\t\t@show quant_output = read(fh, Bool)\n", + "\t\tquant_output = read(fh, Bool)\n", "\t\t@assert !quant_output # avoid that stuff\n", "\t\toutput_ = load_matrix(fh)\n", "\t\t\n", " @assert(eof(fh))\n", "\tend\n", + " dict_, input_, output_\n", "end\n", "\n", "\n", - "load_fasttext_bin(@datadep_str dd_name)\n" + "dict_, input_, output_ = load_fasttext_bin(@datadep_str \"FastText en Wiki Binary/wiki.en.bin\")\n", + "@show size(dict_)\n", + "@show size(input_)\n", + "@show size(output_)" ] }, { "cell_type": "code", - "execution_count": 42, + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "using Embeddings: FastText_Bin" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [], + "source": [ + "function Embeddings._load_embeddings(::Type{<:FastText_Bin}, embedding_file, max_vocab_size=Inf, keep_words=[])\n", + " isempty(keep_words) || throw(ArgumentError(\"keep_words argument is not supported by FastText_Bin all words are kept\")) \n", + " # TODO: If memory mapping is ever implemented, keep_words could potentially be supported through it\n", + " # Reads only the dictionary and the input matrix from the FastText binary file `embedding_file`.\n", + " # Returns (embeddings with one column per word, vocabulary words), both limited to max_vocab_size entries.\n", + " local dict_, input_\n", + "\topen(embedding_file) do fh\n", + "\t\tload_header(fh) #Discard them\n", + "\t\tload_args(fh) # Discard them\n", + "\t\tdict_ = load_fastext_dict(fh)\n", + "\t\t\n", + "\t\tquant_input = read(fh, Bool)\n", + "\t\t@assert !quant_input # avoid that stuff\n", + "\t\tinput_ = load_matrix(fh)\n", + " end\n", + " \n", + " max_vocab_size = 
Int(min(max_vocab_size, length(dict_))) # Int(): with the Inf default min() yields a Float64, which cannot be used as an index\n", + " (@view input_[1:max_vocab_size, :])' , [entry.word for entry in dict_[1:max_vocab_size]]\n", + "end" + ] + }, + { + "cell_type": "code", + "execution_count": 38, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "search: \u001b[1mr\u001b[22m\u001b[1me\u001b[22m\u001b[1ma\u001b[22m\u001b[1md\u001b[22m\u001b[1ms\u001b[22m\u001b[1mt\u001b[22m\u001b[1mr\u001b[22m\u001b[1mi\u001b[22m\u001b[1mn\u001b[22m\u001b[1mg\u001b[22m\n", + "\n", "\n" ] }, + { + "ename": "InterruptException", + "evalue": "InterruptException:", + "output_type": "error", + "traceback": [ + "InterruptException:", + "", + "Stacktrace:", + " [1] Type at ./boot.jl:394 [inlined]", + " [2] getindex at ./array.jl:366 [inlined]", + " [3] (::getfield(Main, Symbol(\"##15#16\")){IOStream})(::Int64) at ./In[14]:44", + " [4] iterate at ./generator.jl:47 [inlined]", + " [5] collect_to! at ./array.jl:656 [inlined]", + " [6] collect_to_with_first!(::Array{Entry,1}, ::Entry, ::Base.Generator{UnitRange{Int64},getfield(Main, Symbol(\"##15#16\")){IOStream}}, ::Int64) at ./array.jl:643", + " [7] _collect(::UnitRange{Int64}, ::Base.Generator{UnitRange{Int64},getfield(Main, Symbol(\"##15#16\")){IOStream}}, ::Base.EltypeUnknown, ::Base.HasShape{1}) at ./array.jl:637", + " [8] collect_similar at ./array.jl:561 [inlined]", + " [9] map at ./abstractarray.jl:1995 [inlined]", + " [10] load_fastext_dict(::IOStream) at ./In[14]:40", + " [11] (::getfield(Main, Symbol(\"##47#49\")))(::IOStream) at ./In[35]:10", + " [12] #open#298(::Base.Iterators.Pairs{Union{},Union{},Tuple{},NamedTuple{(),Tuple{}}}, ::Function, ::getfield(Main, Symbol(\"##47#49\")), ::String) at ./iostream.jl:369", + " [13] open at ./iostream.jl:367 [inlined]", + " [14] _load_embeddings(::Type{FastText_Bin}, ::String, ::Int64, ::Set{Any}) at ./In[35]:7", + " [15] #load_embeddings#12(::Int64, ::Set{Any}, ::Function, ::Type{FastText_Bin}, ::String) at 
/home/wheel/oxinabox/.julia/environments/EmbeddingsFastText/dev/Embeddings/src/Embeddings.jl:99", + " [16] (::getfield(Embeddings, Symbol(\"#kw##load_embeddings\")))(::NamedTuple{(:max_vocab_size, :keep_words),Tuple{Int64,Set{Any}}}, ::typeof(load_embeddings), ::Type{FastText_Bin}, ::String) at ./none:0", + " [17] #load_embeddings#11(::Int64, ::Set{Any}, ::Function, ::Type{FastText_Bin}, ::Int64) at /home/wheel/oxinabox/.julia/environments/EmbeddingsFastText/dev/Embeddings/src/Embeddings.jl:91", + " [18] load_embeddings at /home/wheel/oxinabox/.julia/environments/EmbeddingsFastText/dev/Embeddings/src/Embeddings.jl:90 [inlined] (repeats 2 times)", + " [19] top-level scope at In[38]:2" + ] + } + ], + "source": [ + "using MLDataUtils\n", + "\n", + "o_embedding_table = load_embeddings(FastText_Bin) \n", + "\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "to_ind (generic function with 2 methods)" + ] + }, + "execution_count": 39, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "to_ind(lbl, enc=LabelEnc.NativeLabels(o_embedding_table.vocab)) = convertlabel(LabelEnc.Indices, lbl , enc)" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "foods = split(\"turkey chicken duck apple banana cheese sausage milk egg\")\n", + "sports = split(\"cricket golf baseball football soccer rugby run walk swim dive\")\n", + "colors = split(\"orange yellow blue green red\")\n", + "tools = split(\"tape glue nails hammer saw drill\")\n", + "objects = split(\"phone car truck record shed house castle rook\")\n", + "other = split(\"down up danger risk reward new old fresh stale glass stone china wood face\");\n", + "\n", + "words_by_class = [foods,sports,colors,tools,objects,other]\n", + "all_words = reduce(vcat, words_by_class)\n", + "\n", + "embeddings = 
o_embedding_table.embeddings[:,to_ind(all_words)]\n", + "\n", + "classes = map(all_words) do word\n", + " findfirst(col -> word ∈ col, words_by_class)\n", + "end;" + ] + }, + { + "cell_type": "code", + "execution_count": 64, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[32mComputing t-SNE 2%|█ | ETA: 0:00:04\u001b[39m\n", + "\u001b[32mComputing t-SNE 5%|██ | ETA: 0:00:04\u001b[39m\n", + "\u001b[32mComputing t-SNE 6%|██ | ETA: 0:00:04\u001b[39m\n", + "\u001b[32mComputing t-SNE 9%|███ | ETA: 0:00:04\u001b[39m\n", + "\u001b[32mComputing t-SNE 11%|████ | ETA: 0:00:04\u001b[39m\n", + "\u001b[32mComputing t-SNE 13%|█████ | ETA: 0:00:04\u001b[39m\n", + "\u001b[32mComputing t-SNE 16%|██████ | ETA: 0:00:04\u001b[39m\n", + "\u001b[32mComputing t-SNE 18%|██████ | ETA: 0:00:04\u001b[39m\n", + "\u001b[32mComputing t-SNE 20%|███████ | ETA: 0:00:04\u001b[39m\n", + "\u001b[32mComputing t-SNE 23%|████████ | ETA: 0:00:03\u001b[39m\n", + "\u001b[32mComputing t-SNE 25%|█████████ | ETA: 0:00:03\u001b[39m\n", + "\u001b[32mComputing t-SNE 27%|██████████ | ETA: 0:00:03\u001b[39m\n", + "\u001b[32mComputing t-SNE 29%|███████████ | ETA: 0:00:03\u001b[39m\n", + "\u001b[32mComputing t-SNE 32%|████████████ | ETA: 0:00:03\u001b[39m\n", + "\u001b[32mComputing t-SNE 34%|████████████ | ETA: 0:00:03\u001b[39m\n", + "\u001b[32mComputing t-SNE 36%|█████████████ | ETA: 0:00:03\u001b[39m\n", + "\u001b[32mComputing t-SNE 38%|██████████████ | ETA: 0:00:03\u001b[39m\n", + "\u001b[32mComputing t-SNE 40%|███████████████ | ETA: 0:00:03\u001b[39m\n", + "\u001b[32mComputing t-SNE 42%|███████████████ | ETA: 0:00:03\u001b[39m\n", + "\u001b[32mComputing t-SNE 45%|████████████████ | ETA: 0:00:03\u001b[39m\n", + "\u001b[32mComputing t-SNE 47%|█████████████████ | ETA: 0:00:02\u001b[39m\n", + "\u001b[32mComputing t-SNE 49%|██████████████████ | ETA: 0:00:02\u001b[39m\n", + "\u001b[32mComputing t-SNE 51%|███████████████████ | ETA: 0:00:02\u001b[39m\n", + 
"\u001b[32mComputing t-SNE 54%|███████████████████ | ETA: 0:00:02\u001b[39m\n", + "\u001b[32mComputing t-SNE 56%|████████████████████ | ETA: 0:00:02\u001b[39m\n", + "\u001b[32mComputing t-SNE 58%|█████████████████████ | ETA: 0:00:02\u001b[39m\n", + "\u001b[32mComputing t-SNE 60%|██████████████████████ | ETA: 0:00:02\u001b[39m\n", + "\u001b[32mComputing t-SNE 63%|███████████████████████ | ETA: 0:00:02\u001b[39m\n", + "\u001b[32mComputing t-SNE 65%|███████████████████████ | ETA: 0:00:02\u001b[39m\n", + "\u001b[32mComputing t-SNE 67%|████████████████████████ | ETA: 0:00:01\u001b[39m\n", + "\u001b[32mComputing t-SNE 70%|█████████████████████████ | ETA: 0:00:01\u001b[39m\n", + "\u001b[32mComputing t-SNE 72%|██████████████████████████ | ETA: 0:00:01\u001b[39m\n", + "\u001b[32mComputing t-SNE 74%|███████████████████████████ | ETA: 0:00:01\u001b[39m\n", + "\u001b[32mComputing t-SNE 76%|███████████████████████████ | ETA: 0:00:01\u001b[39m\n", + "\u001b[32mComputing t-SNE 79%|████████████████████████████ | ETA: 0:00:01\u001b[39m\n", + "\u001b[32mComputing t-SNE 81%|█████████████████████████████ | ETA: 0:00:01\u001b[39m\n", + "\u001b[32mComputing t-SNE 83%|██████████████████████████████ | ETA: 0:00:01\u001b[39m\n", + "\u001b[32mComputing t-SNE 86%|███████████████████████████████ | ETA: 0:00:01\u001b[39m\n", + "\u001b[32mComputing t-SNE 88%|████████████████████████████████ | ETA: 0:00:01\u001b[39m\n", + "\u001b[32mComputing t-SNE 90%|████████████████████████████████ | ETA: 0:00:00\u001b[39m\n", + "\u001b[32mComputing t-SNE 92%|█████████████████████████████████ | ETA: 0:00:00\u001b[39m\n", + "\u001b[32mComputing t-SNE 95%|██████████████████████████████████ | ETA: 0:00:00\u001b[39m\n", + "\u001b[32mComputing t-SNE 97%|███████████████████████████████████ | ETA: 0:00:00\u001b[39m\n", + "\u001b[32mComputing t-SNE 99%|████████████████████████████████████| ETA: 0:00:00\u001b[39m\n", + "\u001b[32mComputing t-SNE100%|████████████████████████████████████| Time: 0:00:05\u001b[39m\n", + 
"\u001b[34m KL_divergence: 0.6509\u001b[39m\n" + ] + }, + { + "data": { + "image/svg+xml": [ + "\n", + "\n", + "\n", + " \n", + " \n", + " \n", + "\n", + "\n", + " \n", + " \n", + " \n", + "\n", + "\n", + "\n", + " \n", + " \n", + " \n", + "\n", + "\n", + "\n", + " \n", + " \n", + " \n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "-100\n", + "\n", + "\n", + "-50\n", + "\n", + "\n", + "0\n", + "\n", + "\n", + "50\n", + "\n", + "\n", + "100\n", + "\n", + "\n", + "-100\n", + "\n", + "\n", + "-50\n", + "\n", + "\n", + "0\n", + "\n", + "\n", + "50\n", + "\n", + "\n", + "100\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "turkey\n", + "\n", + "\n", + "chicken\n", + "\n", + "\n", + "duck\n", + "\n", + "\n", + "apple\n", + "\n", + "\n", + "banana\n", + "\n", + "\n", + "cheese\n", + "\n", + "\n", + "sausage\n", + "\n", + "\n", + "milk\n", + "\n", + "\n", + "egg\n", + "\n", + "\n", + "cricket\n", + "\n", + "\n", + "golf\n", + "\n", + "\n", + "baseball\n", + "\n", + "\n", + "football\n", + "\n", + "\n", + "soccer\n", + "\n", + "\n", + 
"rugby\n", + "\n", + "\n", + "run\n", + "\n", + "\n", + "walk\n", + "\n", + "\n", + "swim\n", + "\n", + "\n", + "dive\n", + "\n", + "\n", + "orange\n", + "\n", + "\n", + "yellow\n", + "\n", + "\n", + "blue\n", + "\n", + "\n", + "green\n", + "\n", + "\n", + "red\n", + "\n", + "\n", + "tape\n", + "\n", + "\n", + "glue\n", + "\n", + "\n", + "nails\n", + "\n", + "\n", + "hammer\n", + "\n", + "\n", + "saw\n", + "\n", + "\n", + "drill\n", + "\n", + "\n", + "phone\n", + "\n", + "\n", + "car\n", + "\n", + "\n", + "truck\n", + "\n", + "\n", + "record\n", + "\n", + "\n", + "shed\n", + "\n", + "\n", + "house\n", + "\n", + "\n", + "castle\n", + "\n", + "\n", + "rook\n", + "\n", + "\n", + "down\n", + "\n", + "\n", + "up\n", + "\n", + "\n", + "danger\n", + "\n", + "\n", + "risk\n", + "\n", + "\n", + "reward\n", + "\n", + "\n", + "new\n", + "\n", + "\n", + "old\n", + "\n", + "\n", + "fresh\n", + "\n", + "\n", + "stale\n", + "\n", + "\n", + "glass\n", + "\n", + "\n", + "stone\n", + "\n", + "\n", + "china\n", + "\n", + "\n", + "wood\n", + "\n", + "\n", + "face\n", + "\n", + "\n", + " \n", + " \n", + " \n", + "\n", + "\n", + "\n", + "\n", + "\n", + "1\n", + "\n", + "\n", + "2\n", + "\n", + "\n", + "3\n", + "\n", + "\n", + "4\n", + "\n", + "\n", + "5\n", + "\n", + "\n", + "6\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "y1\n", + "\n", + "\n" + ] + }, + "execution_count": 64, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "using Plots\n", + "using TSne\n", + "xs = tsne(embeddings', 2, 500, 2000, 20.0)'\n", + "scatter(xs[1,:], xs[2,:]; series_annotations=all_words, zcolor=classes)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# get_word_embedding\n", + "```C\n", + "void FastText::getWordVector(Vector& vec, const std::string& word) const {\n", + " const std::vector& ngrams = dict_->getSubwords(word);\n", + " vec.zero();\n", + " for (int i = 0; i < ngrams.size(); i ++) {\n", + " addInputVector(vec, ngrams[i]); 
\n", + " }\n", + " if (ngrams.size() > 0) {\n", + " vec.mul(1.0 / ngrams.size());\n", + " }\n", + "}```" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "52-element Array{Float32,1}:\n", + " -0.08835874 \n", + " 0.028605528\n", + " 0.4468927 \n", + " -0.1664481 \n", + " 0.038865376\n", + " 0.012204368\n", + " 0.047387525\n", + " 0.41482207 \n", + " 0.07806629 \n", + " 0.090056784\n", + " -1.2282208 \n", + " 0.04789894 \n", + " -0.45578927 \n", + " ⋮ \n", + " -0.04354975 \n", + " 0.36804572 \n", + " 0.64148843 \n", + " 0.22869174 \n", + " -0.9106075 \n", + " 0.07064346 \n", + " -0.92673564 \n", + " -0.68822235 \n", + " -0.09732055 \n", + " 0.13649319 \n", + " -0.11606625 \n", + " -0.72149366 " + ] + }, + "execution_count": 46, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "embeddings" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ { "data": { "text/markdown": [ + "No documentation found.\n", + "\n", + "`Embeddings.EmbeddingTable` is of type `UnionAll`.\n", + "\n", + "# Summary\n", + "\n", "```\n", - "readstring(stream::IO)\n", - "readstring(filename::AbstractString)\n", + "struct UnionAll <: Type{T}\n", "```\n", "\n", - "Read the entire contents of an I/O stream or a file as a string. 
The text is assumed to be encoded in UTF-8.\n" - ], - "text/plain": [ + "# Fields\n", + "\n", + "```\n", + "var :: TypeVar\n", + "body :: Any\n", "```\n", - "readstring(stream::IO)\n", - "readstring(filename::AbstractString)\n", + "\n", + "# Supertype Hierarchy\n", + "\n", "```\n", + "UnionAll <: Type{T} <: Any\n", + "```\n" + ], + "text/plain": [ + " No documentation found.\n", + "\n", + " \u001b[36mEmbeddings.EmbeddingTable\u001b[39m is of type \u001b[36mUnionAll\u001b[39m.\n", + "\n", + "\u001b[1m Summary\u001b[22m\n", + "\u001b[1m ≡≡≡≡≡≡≡≡≡\u001b[22m\n", + "\n", + "\u001b[36m struct UnionAll <: Type{T}\u001b[39m\n", + "\n", + "\u001b[1m Fields\u001b[22m\n", + "\u001b[1m ≡≡≡≡≡≡≡≡\u001b[22m\n", + "\n", + "\u001b[36m var :: TypeVar\u001b[39m\n", + "\u001b[36m body :: Any\u001b[39m\n", + "\n", + "\u001b[1m Supertype Hierarchy\u001b[22m\n", + "\u001b[1m ≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡≡\u001b[22m\n", "\n", - "Read the entire contents of an I/O stream or a file as a string. The text is assumed to be encoded in UTF-8.\n" + "\u001b[36m UnionAll <: Type{T} <: Any\u001b[39m" ] }, - "execution_count": 42, + "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "?readstring" + "?Embeddings.EmbeddingTable" ] }, { @@ -322,15 +993,15 @@ ], "metadata": { "kernelspec": { - "display_name": "Julia 0.6.3", + "display_name": "Julia 0.7.0", "language": "julia", - "name": "julia-0.6" + "name": "julia-0.7" }, "language_info": { "file_extension": ".jl", "mimetype": "application/julia", "name": "julia", - "version": "0.6.2" + "version": "0.7.0" } }, "nbformat": 4,