Skip to content

Commit df855d8

Browse files
committed
Try to append to history if there is context
Fix #122
1 parent f95faeb commit df855d8

4 files changed

Lines changed: 76 additions & 19 deletions

File tree

src/libime/core/historybigram.cpp

Lines changed: 35 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -395,6 +395,32 @@ class HistoryBigramPool {
395395
bigram_.fillPredict(words, word, maxSize);
396396
}
397397

398+
bool maybeAppendToLatestSentence(const std::vector<WordWithCode> &context,
399+
std::vector<WordWithCode> &newSentence) {
400+
if (recent_.empty() || newSentence.empty()) {
401+
return false;
402+
}
403+
auto &latestSentence = recent_.front();
404+
if (latestSentence.size() < context.size() ||
405+
!std::ranges::equal(
406+
context,
407+
std::views::drop(latestSentence,
408+
latestSentence.size() - context.size()))) {
409+
return false;
410+
}
411+
412+
const int delta = 1;
413+
decBigram(latestSentence.back(), {"</s>", ""}, delta);
414+
for (auto &item : newSentence) {
415+
unigram_.incFreq(item, delta);
416+
incBigram(latestSentence.back(), item, delta);
417+
latestSentence.push_back(std::move(item));
418+
}
419+
incBigram(latestSentence.back(), {"</s>", ""}, delta);
420+
421+
return true;
422+
}
423+
398424
private:
399425
template <typename R>
400426
void remove(const R &sentence) {
@@ -742,4 +768,13 @@ float HistoryBigram::scoreWithCode(
742768
{cur ? cur->word() : "", extractor && cur ? extractor(cur) : ""});
743769
}
744770

771+
void HistoryBigram::addWithContext(const std::vector<WordWithCode> &context,
772+
std::vector<WordWithCode> newSentence) {
773+
FCITX_D();
774+
if (context.empty() ||
775+
!d->pools_[0].maybeAppendToLatestSentence(context, newSentence)) {
776+
addWithCode(newSentence);
777+
}
778+
}
779+
745780
} // namespace libime

src/libime/core/historybigram.h

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -102,6 +102,9 @@ class LIBIMECORE_EXPORT HistoryBigram {
102102
int32_t rawBigramFrequency(WordWithCodeView prev,
103103
WordWithCodeView cur) const;
104104

105+
void addWithContext(const std::vector<WordWithCode> &context,
106+
std::vector<WordWithCode> newSentence);
107+
105108
private:
106109
std::unique_ptr<HistoryBigramPrivate> d_ptr;
107110
FCITX_DECLARE_PRIVATE(HistoryBigram);

src/libime/pinyin/pinyincontext.cpp

Lines changed: 18 additions & 19 deletions
Original file line number | Diff line number | Diff line change
@@ -919,10 +919,10 @@ std::vector<std::string> PinyinContext::selectedWords() const {
919919
return newSentence;
920920
}
921921

922-
std::vector<std::pair<std::string, std::string>>
922+
std::vector<HistoryBigram::WordWithCode>
923923
PinyinContext::selectedWordsWithPinyin() const {
924924
FCITX_D();
925-
std::vector<std::pair<std::string, std::string>> newSentence;
925+
std::vector<HistoryBigram::WordWithCode> newSentence;
926926
for (const auto &s : d->selected_) {
927927
for (const auto &item : s) {
928928
if (item.type_ != SelectedPinyinType::Separator) {
@@ -976,31 +976,30 @@ void PinyinContext::learn() {
976976
return;
977977
}
978978

979+
std::vector<HistoryBigram::WordWithCode> newSentence;
979980
if (auto [result, encodedWordPinyin] = d->learnWord();
980981
result != LearnWordResult::Ignored) {
981982
// Do not insert custom to history for the first time.
982983
if (result == LearnWordResult::Normal) {
983984
// Create new sentence with the whole new learned word.
984-
std::vector<HistoryBigram::WordWithCode> newSentence{
985-
{sentence(), encodedWordPinyin}};
986-
d->ime_->model()->history().addWithCode(newSentence);
985+
newSentence.push_back({sentence(), encodedWordPinyin});
986+
} else {
987+
return;
987988
}
988989
} else {
989-
std::vector<HistoryBigram::WordWithCode> newSentence;
990-
for (auto &s : d->selected_) {
991-
for (auto &item : s) {
992-
if (item.type_ != SelectedPinyinType::Separator) {
993-
// Non pinyin word. Skip it.
994-
if (item.encodedPinyin().empty()) {
995-
return;
996-
}
997-
newSentence.push_back(
998-
{item.word_.word(), item.encodedPinyin()});
999-
}
1000-
}
1001-
}
1002-
d->ime_->model()->history().addWithCode(newSentence);
990+
newSentence = selectedWordsWithPinyin();
991+
}
992+
993+
if (std::ranges::any_of(newSentence, [](const auto &word) {
994+
return word.second.empty();
995+
})) {
996+
// Don't add to history if there is any non-pinyin word.
997+
return;
1003998
}
999+
1000+
auto context = contextWordsWithPinyin();
1001+
d->ime_->model()->history().addWithContext(contextWordsWithPinyin(),
1002+
std::move(newSentence));
10041003
}
10051004

10061005
void PinyinContext::setContextWords(

test/testhistorybigram.cpp

Lines changed: 20 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -12,6 +12,7 @@
1212
#include <string>
1313
#include <unordered_set>
1414
#include <fcitx-utils/log.h>
15+
#include <fcitx-utils/stringutils.h>
1516
#include "libime/core/historybigram.h"
1617

1718
namespace {
@@ -266,6 +267,24 @@ void testWithCodePredict() {
266267
}
267268
}
268269

270+
void testAppend() {
271+
using namespace libime;
272+
HistoryBigram history;
273+
history.addWithCode({{"", "code1"}, {"", "code2"}, {"一个", "code3"}});
274+
275+
history.addWithContext({{"", "code2"}, {"一个", "code3"}},
276+
{{"好人", "code4"}});
277+
278+
history.addWithContext({{"不是", "code5"}}, {{"你的", "code6"}});
279+
std::stringstream ss;
280+
history.dump(ss);
281+
auto lines = fcitx::stringutils::split(ss.str(), "\n");
282+
FCITX_ASSERT(lines.size() == 2) << lines.size();
283+
FCITX_ASSERT(lines[0] == "你的\tcode6") << lines[0];
284+
FCITX_ASSERT(lines[1] == "\tcode1 是\tcode2 一个\tcode3 好人\tcode4")
285+
<< lines[1];
286+
}
287+
269288
} // namespace
270289

271290
int main() {
@@ -276,5 +295,6 @@ int main() {
276295
testSaveAndLoadText();
277296
testWithCode();
278297
testWithCodePredict();
298+
testAppend();
279299
return 0;
280300
}

0 commit comments

Comments
 (0)