Skip to content

Commit a6362a6

Browse files
authored
Add support for storing pinyin/table code in user history (#119)
1 parent 0884c21 commit a6362a6

15 files changed

Lines changed: 772 additions & 225 deletions

src/libime/core/historybigram.cpp

Lines changed: 282 additions & 120 deletions
Large diffs are not rendered by default.

src/libime/core/historybigram.h

Lines changed: 46 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -7,12 +7,15 @@
77
#define _FCITX_LIBIME_CORE_HISTORYBIGRAM_H_
88

99
#include <cstddef>
10+
#include <cstdint>
11+
#include <functional>
1012
#include <istream>
1113
#include <memory>
1214
#include <ostream>
1315
#include <string>
1416
#include <string_view>
1517
#include <unordered_set>
18+
#include <utility>
1619
#include <vector>
1720
#include <fcitx-utils/macros.h>
1821
#include <libime/core/lattice.h>
@@ -22,8 +25,13 @@ namespace libime {
2225

2326
class HistoryBigramPrivate;
2427

28+
/// Callback type that takes a WordNode pointer and returns a code string
/// for it. HistoryBigram::scoreWithCode() and HistoryBigram::addWithCode()
/// accept it by const reference to obtain the code of each word.
using ValidationCodeExtractor = std::function<std::string(const WordNode *)>;
29+
2530
class LIBIMECORE_EXPORT HistoryBigram {
2631
public:
32+
using WordWithCode = std::pair<std::string, std::string>;
33+
using WordWithCodeView = std::pair<std::string_view, std::string_view>;
34+
2735
HistoryBigram();
2836

2937
FCITX_DECLARE_VIRTUAL_DTOR_MOVE(HistoryBigram);
@@ -43,14 +51,20 @@ class LIBIMECORE_EXPORT HistoryBigram {
4351
bool useOnlyUnigram() const;
4452

4553
void forget(std::string_view word);
54+
void forget(std::string_view word, std::string_view code);
4655

4756
bool isUnknown(std::string_view v) const;
48-
float score(const WordNode *prev, const WordNode *cur) const {
49-
return score(prev ? prev->word() : "", cur ? cur->word() : "");
50-
}
57+
float score(const WordNode *prev, const WordNode *cur) const;
5158
float score(std::string_view prev, std::string_view cur) const;
59+
float scoreWithCode(WordWithCodeView prev, WordWithCodeView cur) const;
60+
float scoreWithCode(const WordNode *prev, const WordNode *cur,
61+
const ValidationCodeExtractor &extractor) const;
5262
void add(const SentenceResult &sentence);
5363
void add(const std::vector<std::string> &sentence);
64+
void addWithCode(const SentenceResult &sentence,
65+
const ValidationCodeExtractor &validationCodeExtractor);
66+
void
67+
addWithCode(const std::vector<WordWithCode> &sentenceWithValidationCode);
5468

5569
/// Fill the prediction based on current sentence.
5670
void fillPredict(std::unordered_set<std::string> &words,
@@ -59,6 +73,35 @@ class LIBIMECORE_EXPORT HistoryBigram {
5973

6074
bool containsBigram(std::string_view prev, std::string_view cur) const;
6175

76+
/**
77+
* Query the weighted frequency of the unigram.
78+
*
79+
* @since 1.1.14
80+
*/
81+
float unigramFrequency(WordWithCodeView word) const;
82+
83+
/**
84+
* Query the weighted frequency of the bigram.
85+
*
86+
* @since 1.1.14
87+
*/
88+
float bigramFrequency(WordWithCodeView prev, WordWithCodeView cur) const;
89+
90+
/**
91+
* Query the raw frequency of the unigram.
92+
*
93+
* @since 1.1.14
94+
*/
95+
int32_t rawUnigramFrequency(WordWithCodeView word) const;
96+
97+
/**
98+
* Query the raw frequency of the bigram.
99+
*
100+
* @since 1.1.14
101+
*/
102+
int32_t rawBigramFrequency(WordWithCodeView prev,
103+
WordWithCodeView cur) const;
104+
62105
private:
63106
std::unique_ptr<HistoryBigramPrivate> d_ptr;
64107
FCITX_DECLARE_PRIVATE(HistoryBigram);

src/libime/core/userlanguagemodel.cpp

Lines changed: 12 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -32,6 +32,7 @@ class UserLanguageModelPrivate {
3232
bool useOnlyUnigram_ = false;
3333

3434
HistoryBigram history_;
35+
ValidationCodeExtractor extractor_;
3536
float weight_ = DEFAULT_USER_LANGUAGE_MODEL_USER_WEIGHT;
3637
// log(wa * exp(a) + wb * exp(b))
3738
// log(exp(log(wa) + a) + exp(b + log(wb))
@@ -128,7 +129,12 @@ float UserLanguageModel::score(const State &state, const WordNode &word,
128129
score = LanguageModel::score(state, word, out);
129130
}
130131
const auto *prev = d->wordFromState(state);
131-
float userScore = d->history_.score(prev, &word);
132+
float userScore;
133+
if (d->extractor_) {
134+
userScore = d->history_.scoreWithCode(prev, &word, d->extractor_);
135+
} else {
136+
userScore = d->history_.score(prev, &word);
137+
}
132138
d->setWordToState(out, &word);
133139
return std::max(score, sum_log_prob(score + d->wa_, userScore + d->wb_));
134140
}
@@ -170,4 +176,9 @@ bool UserLanguageModel::containsNonUnigram(
170176
return LanguageModel::maxNgramLength(words) > 1;
171177
}
172178

179+
/// Store the callback that UserLanguageModel::score() uses to choose the
/// history scoring path.
///
/// score() tests the stored extractor: when it is non-empty the user score
/// comes from HistoryBigram::scoreWithCode(prev, &word, extractor); when it
/// is empty, HistoryBigram::score(prev, &word) is used instead. Passing an
/// empty std::function therefore selects the code-less path.
///
/// @param extractor callback mapping a WordNode to its code string; taken
///                  by value and moved into the private data.
void UserLanguageModel::setCodeExtractor(ValidationCodeExtractor extractor) {
180+
// NOTE(review): FCITX_D() presumably declares the private-data pointer `d`
// used on the next statement -- confirm against fcitx-utils/macros.h.
FCITX_D();
181+
d->extractor_ = std::move(extractor); // sink parameter: move, do not copy
182+
}
183+
173184
} // namespace libime

src/libime/core/userlanguagemodel.h

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -13,6 +13,7 @@
1313
#include <string_view>
1414
#include <vector>
1515
#include <fcitx-utils/macros.h>
16+
#include <libime/core/historybigram.h>
1617
#include <libime/core/languagemodel.h>
1718
#include <libime/core/libimecore_export.h>
1819

@@ -48,6 +49,8 @@ class LIBIMECORE_EXPORT UserLanguageModel : public LanguageModel {
4849

4950
bool containsNonUnigram(const std::vector<std::string> &words) const;
5051

52+
void setCodeExtractor(ValidationCodeExtractor extractor);
53+
5154
private:
5255
std::unique_ptr<UserLanguageModelPrivate> d_ptr;
5356
FCITX_DECLARE_PRIVATE(UserLanguageModel);

0 commit comments

Comments
 (0)