Skip to content

Commit 8a32ab9

Browse files
committed
Update learn word logic: if the selected words contain a non-unigram, save them as a sentence instead of as words.
1 parent a4fd3f3 commit 8a32ab9

8 files changed

Lines changed: 102 additions & 3 deletions

File tree

src/libime/core/historybigram.cpp

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -571,4 +571,14 @@ void HistoryBigram::fillPredict(std::unordered_set<std::string> &words,
571571
pool.fillPredict(words, lookup, maxSize);
572572
});
573573
}
574+
575+
bool HistoryBigram::containsBigram(std::string_view prev,
576+
std::string_view cur) const {
577+
FCITX_D();
578+
return std::ranges::any_of(d->pools_,
579+
[&prev, &cur](const HistoryBigramPool &pool) {
580+
return pool.bigramFreq(prev, cur) > 0;
581+
});
582+
}
583+
574584
} // namespace libime

src/libime/core/historybigram.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,8 @@ class LIBIMECORE_EXPORT HistoryBigram {
5757
const std::vector<std::string> &sentence,
5858
size_t maxSize) const;
5959

60+
/// Check whether the bigram (prev, cur) has a positive frequency in any of
/// the history pools.
bool containsBigram(std::string_view prev, std::string_view cur) const;
61+
6062
private:
6163
std::unique_ptr<HistoryBigramPrivate> d_ptr;
6264
FCITX_DECLARE_PRIVATE(HistoryBigram);

src/libime/core/languagemodel.cpp

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
*/
66

77
#include "languagemodel.h"
8+
#include <algorithm>
89
#include <cassert>
910
#include <cmath>
1011
#include <cstdlib>
@@ -27,9 +28,11 @@
2728
#include "lm/config.hh"
2829
#include "lm/lm_exception.hh"
2930
#include "lm/model.hh"
31+
#include "lm/return.hh"
3032
#include "lm/state.hh"
3133
#include "lm/word_index.hh"
3234
#include "util/string_piece.hh"
35+
#include "utils.h"
3336

3437
namespace libime {
3538

@@ -72,6 +75,8 @@ const DATrie<float> &StaticLanguageModelFile::predictionTrie() const {
7275

7376
static_assert(sizeof(void *) + sizeof(lm::ngram::State) <= StateSize, "Size");
7477

78+
// Out-of-line definition of the virtual destructor declared in the header.
// NOTE(review): presumably kept in this translation unit so the vtable is
// emitted here rather than in every includer -- confirm.
LanguageModelBase::~LanguageModelBase() = default;
79+
7580
bool LanguageModelBase::isNodeUnknown(const LatticeNode &node) const {
7681
return isUnknown(node.idx(), node.word());
7782
}
@@ -217,6 +222,32 @@ bool LanguageModel::isUnknown(WordIndex idx, std::string_view /*word*/) const {
217222
return idx == unknown();
218223
}
219224

225+
unsigned int
226+
LanguageModel::maxNgramLength(const std::vector<std::string> &words) const {
227+
FCITX_D();
228+
if (!d->model()) {
229+
return 0;
230+
}
231+
State state = nullState();
232+
State outState;
233+
234+
unsigned int maxNgramLength = 0;
235+
std::vector<WordNode> nodes;
236+
for (const auto &word : words) {
237+
const auto idx = index(word);
238+
lm::FullScoreReturn full =
239+
d->model()->FullScore(lmState(state), idx, lmState(outState));
240+
unsigned int ngramLength = full.ngram_length;
241+
if (ngramLength == 1 && idx == unknown()) {
242+
ngramLength = 0;
243+
}
244+
245+
maxNgramLength = std::max(maxNgramLength, ngramLength);
246+
state = outState;
247+
}
248+
return maxNgramLength;
249+
}
250+
220251
void LanguageModel::setUnknownPenalty(float unknown) {
221252
FCITX_D();
222253
d->unknown_ = unknown;

src/libime/core/languagemodel.h

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ class LanguageModelResolverPrivate;
3232

3333
class LIBIMECORE_EXPORT LanguageModelBase {
3434
public:
35-
virtual ~LanguageModelBase() {}
35+
virtual ~LanguageModelBase();
3636

3737
virtual WordIndex beginSentence() const = 0;
3838
virtual WordIndex endSentence() const = 0;
@@ -89,6 +89,8 @@ class LIBIMECORE_EXPORT LanguageModel : public LanguageModelBase {
8989
void setUnknownPenalty(float unknown);
9090
float unknownPenalty() const;
9191

92+
/// Score \p words in order against the static model and return the longest
/// matched n-gram length; returns 0 when no model is loaded.
unsigned int maxNgramLength(const std::vector<std::string> &words) const;
93+
9294
private:
9395
std::unique_ptr<LanguageModelPrivate> d_ptr;
9496
FCITX_DECLARE_PRIVATE(LanguageModel);

src/libime/core/userlanguagemodel.cpp

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,10 +9,13 @@
99
#include <cassert>
1010
#include <cmath>
1111
#include <istream>
12+
#include <iterator>
1213
#include <memory>
1314
#include <ostream>
15+
#include <string>
1416
#include <string_view>
1517
#include <utility>
18+
#include <vector>
1619
#include <fcitx-utils/macros.h>
1720
#include "constants.h"
1821
#include "historybigram.h"
@@ -150,4 +153,21 @@ bool UserLanguageModel::useOnlyUnigram() const {
150153
FCITX_D();
151154
return d->useOnlyUnigram_;
152155
}
156+
157+
bool UserLanguageModel::containsNonUnigram(
158+
const std::vector<std::string> &words) const {
159+
FCITX_D();
160+
if (words.size() <= 1 || d->useOnlyUnigram_) {
161+
return false;
162+
}
163+
164+
for (auto iter = words.begin(); iter != std::prev(words.end()); ++iter) {
165+
if (d->history_.containsBigram(*iter, *(std::next(iter)))) {
166+
return true;
167+
}
168+
}
169+
170+
return LanguageModel::maxNgramLength(words) > 1;
171+
}
172+
153173
} // namespace libime

src/libime/core/userlanguagemodel.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,9 @@
99
#include <istream>
1010
#include <memory>
1111
#include <ostream>
12+
#include <string>
1213
#include <string_view>
14+
#include <vector>
1315
#include <fcitx-utils/macros.h>
1416
#include <libime/core/languagemodel.h>
1517
#include <libime/core/libimecore_export.h>
@@ -44,6 +46,8 @@ class LIBIMECORE_EXPORT UserLanguageModel : public LanguageModel {
4446
State &out) const override;
4547
bool isUnknown(WordIndex idx, std::string_view view) const override;
4648

49+
/// Check whether \p words contains an adjacent pair known to the history
/// bigrams, or is matched by the static model with an n-gram longer than one
/// word. Always false in unigram-only mode or for fewer than two words.
bool containsNonUnigram(const std::vector<std::string> &words) const;
50+
4751
private:
4852
std::unique_ptr<UserLanguageModelPrivate> d_ptr;
4953
FCITX_DECLARE_PRIVATE(UserLanguageModel);

src/libime/pinyin/pinyincontext.cpp

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -251,9 +251,17 @@ class PinyinContextPrivate : public fcitx::QPtrHolder<PinyinContext> {
251251
totalPinyinLength += item.encodedPinyin_.size() / 2;
252252
}
253253
}
254-
if (!isAllSingleWord && !hasCustom && totalPinyinLength > 4) {
255-
return LearnWordResult::Ignored;
254+
255+
FCITX_Q();
256+
if (!hasCustom) {
257+
if ((!isAllSingleWord && totalPinyinLength > 4)) {
258+
return LearnWordResult::Ignored;
259+
}
260+
if (ime_->model()->containsNonUnigram(q->selectedWords())) {
261+
return LearnWordResult::Ignored;
262+
}
256263
}
264+
257265
for (auto &s : selected_) {
258266
for (auto &item : s) {
259267
if (item.type_ == SelectedPinyinType::Separator) {

test/testpinyincontext.cpp

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -285,5 +285,27 @@ int main() {
285285
}
286286
}
287287

288+
{
289+
c.clear();
290+
c.clearContextWords();
291+
FCITX_ASSERT(!ime.model()->history().containsBigram("", ""));
292+
c.type("taai");
293+
size_t i = 0;
294+
for (const auto &candidate : c.candidatesToCursor()) {
295+
if (candidate.toString() == "他爱") {
296+
break;
297+
}
298+
i++;
299+
}
300+
FCITX_ASSERT(i < c.candidatesToCursor().size());
301+
c.selectCandidatesToCursor(i);
302+
303+
FCITX_ASSERT(c.selected());
304+
FCITX_ASSERT(c.selectedSentence() == "他爱");
305+
c.learn();
306+
c.clear();
307+
FCITX_ASSERT(ime.model()->history().containsBigram("", ""));
308+
}
309+
288310
return 0;
289311
}

0 commit comments

Comments
 (0)