From 23d27e18582bb49045d002f83d08e138c468f860 Mon Sep 17 00:00:00 2001 From: Andrey Cheb <37665782+cheb0@users.noreply.github.com> Date: Thu, 11 Jun 2026 12:56:20 +0400 Subject: [PATCH 1/3] fix: add overlap check for TID block in pattern search (#437) --- frac/sealed/token/provider.go | 4 ++++ frac/sealed/token/table_entry.go | 12 ++++++++++++ 2 files changed, 16 insertions(+) diff --git a/frac/sealed/token/provider.go b/frac/sealed/token/provider.go index 9decae91..0d0e177e 100644 --- a/frac/sealed/token/provider.go +++ b/frac/sealed/token/provider.go @@ -75,6 +75,10 @@ func (tp *Provider) findInBlocks(firstTID, lastTID uint32, search func(*Block, i var tids []uint32 for _, entry := range tp.entries { + if !entry.checkTIDsInBlock(firstTID, lastTID) { + continue + } + block := tp.findBlock(entry.BlockIndex) firstIndex, lastIndex := entry.narrowIndexes(firstTID, lastTID) indexes, err := search(block, firstIndex, lastIndex) diff --git a/frac/sealed/token/table_entry.go b/frac/sealed/token/table_entry.go index 244e8a41..6cf01486 100644 --- a/frac/sealed/token/table_entry.go +++ b/frac/sealed/token/table_entry.go @@ -34,6 +34,18 @@ func (t *TableEntry) narrowIndexes(firstTID, lastTID uint32) (int, int) { return firstIndex, lastIndex } +func (t *TableEntry) checkTIDsInBlock(firstTID, lastTID uint32) bool { + if lastTID < t.StartTID { + return false + } + + if firstTID > t.getLastTID() { + return false + } + + return true +} + func (t *TableEntry) checkTIDInBlock(tid uint32) bool { if tid < t.StartTID { return false From cabafa090902ac9ec3717eee246399724661c701 Mon Sep 17 00:00:00 2001 From: Andrei Cheboksarov <37665782+cheb0@users.noreply.github.com> Date: Thu, 11 Jun 2026 13:40:30 +0400 Subject: [PATCH 2/3] make token block configurable --- cmd/seq-db/seq-db.go | 1 + config/config.go | 5 +++++ config/validation.go | 1 + config/validation_test.go | 15 +++++++++++++++ docs/en/02-configuration.md | 6 ++++++ docs/ru/02-configuration.md | 6 ++++++ frac/common/seal_params.go | 1 + frac/fraction_test.go | 1 + fracmanager/config.go | 3 +++ indexwriter/index.go | 12 ++++++++---- sealing/sealer.go | 4 ++++ 11 files changed, 51 insertions(+), 4 deletions(-) diff --git a/cmd/seq-db/seq-db.go b/cmd/seq-db/seq-db.go index de29ac4c..d607e691 100644 --- a/cmd/seq-db/seq-db.go +++ b/cmd/seq-db/seq-db.go @@ -267,6 +267,7 @@ func startStore( IDsZstdLevel: cfg.Compression.SealedZstdCompressionLevel, LIDsZstdLevel: cfg.Compression.SealedZstdCompressionLevel, LIDBlockSize: cfg.Sealing.Lids.BlockSize, + TokenBlockSize: int(cfg.Sealing.Tokens.BlockSize), TokenListZstdLevel: cfg.Compression.SealedZstdCompressionLevel, DocsPositionsZstdLevel: cfg.Compression.SealedZstdCompressionLevel, TokenTableZstdLevel: cfg.Compression.SealedZstdCompressionLevel, diff --git a/config/config.go b/config/config.go index 0d929a7f..91049312 100644 --- a/config/config.go +++ b/config/config.go @@ -71,6 +71,11 @@ type Config struct { } `config:"storage"` Sealing struct { + Tokens struct { + // BlockSize sets max token block size in bytes. + BlockSize Bytes `config:"block_size" default:"16KiB"` + } `config:"tokens"` + Lids struct { // BlockSize sets max lids (postings) saved per LIDs block. BlockSize int `config:"block_size" default:"65536"` diff --git a/config/validation.go b/config/validation.go index c305a706..e1f160ac 100644 --- a/config/validation.go +++ b/config/validation.go @@ -70,6 +70,7 @@ func (c *Config) storeValidations() []validateFn { inRange("compression.doc_block_zstd_compression_level", -7, 22, c.Compression.DocBlockZstdCompressionLevel), greaterThan("sealing.lids.block_size", 0, c.Sealing.Lids.BlockSize), lessOrEqThan("sealing.lids.block_size", 65536, c.Sealing.Lids.BlockSize), + greaterThan("sealing.tokens.block_size", 0, c.Sealing.Tokens.BlockSize), inRange("offloading.queue_size_percent", 0, 100, c.Offloading.QueueSizePercent), greaterThan("experimental.max_regex_tokens_check", -1, c.Experimental.MaxRegexTokensCheck), diff --git a/config/validation_test.go b/config/validation_test.go index b0813697..885c20ff 100644 --- a/config/validation_test.go +++ b/config/validation_test.go @@ -98,6 +98,21 @@ limits: env: map[string]string{"SEQDB_SEALING_LIDS_BLOCK_SIZE": "8192"}, expectErr: false, }, + { + name: "Invalid sealing.tokens.block_size", + cfg: createCfgFile(t, base+` +sealing: + tokens: + block_size: -1B +`), + expectErr: true, + }, + { + name: "Valid sealing.tokens.block_size", + cfg: baseCfg, + env: map[string]string{"SEQDB_SEALING_TOKENS_BLOCK_SIZE": "32KiB"}, + expectErr: false, + }, } for _, tt := range tests { diff --git a/docs/en/02-configuration.md b/docs/en/02-configuration.md index 2be41678..bd1108c0 100644 --- a/docs/en/02-configuration.md +++ b/docs/en/02-configuration.md @@ -115,6 +115,12 @@ Compression level settings for various data types. Settings for fraction sealing. +### Tokens + +| Field | Type | Default | Description | +|-------|------|---------|-------------| +| `sealing.tokens.block_size` | Bytes | `16KiB` | Max token block size in bytes | + ### Lids | Field | Type | Default | Description | diff --git a/docs/ru/02-configuration.md b/docs/ru/02-configuration.md index 6b91ff14..9e33fe5d 100644 --- a/docs/ru/02-configuration.md +++ b/docs/ru/02-configuration.md @@ -115,6 +115,12 @@ id: configuration Настройки запечатывания фракций. +### Tokens + +| Параметр | Тип | Значение по умолчанию | Описание | +|----------|-----|----------------------|-----------| +| `sealing.tokens.block_size` | Bytes | `16KiB` | Максимальный размер блока токенов в байтах | + ### Lids | Параметр | Тип | Значение по умолчанию | Описание | diff --git a/frac/common/seal_params.go b/frac/common/seal_params.go index 05f89696..b887c29b 100644 --- a/frac/common/seal_params.go +++ b/frac/common/seal_params.go @@ -9,5 +9,6 @@ type SealParams struct { DocBlocksZstdLevel int // DocBlocksZstdLevel is the zstd compress level of each document block. LIDBlockSize int + TokenBlockSize int DocBlockSize int // DocBlockSize is decompressed payload size of document block. } diff --git a/frac/fraction_test.go b/frac/fraction_test.go index d5fc39ed..13731e16 100644 --- a/frac/fraction_test.go +++ b/frac/fraction_test.go @@ -99,6 +99,7 @@ func (s *FractionTestSuite) SetupTestCommon() { TokenTableZstdLevel: 1, DocBlocksZstdLevel: 1, LIDBlockSize: 512, + TokenBlockSize: 128, DocBlockSize: 128 * int(units.KiB), } diff --git a/fracmanager/config.go b/fracmanager/config.go index e295aada..acef2911 100644 --- a/fracmanager/config.go +++ b/fracmanager/config.go @@ -60,6 +60,9 @@ func FillConfigWithDefault(config *Config) *Config { if config.SealParams.LIDBlockSize == 0 { config.SealParams.LIDBlockSize = consts.DefaultLIDBlockCap } + if config.SealParams.TokenBlockSize == 0 { + config.SealParams.TokenBlockSize = consts.RegularBlockSize + } if config.SealParams.TokenListZstdLevel == 0 { config.SealParams.TokenListZstdLevel = zstdDefaultLevel } diff --git a/indexwriter/index.go b/indexwriter/index.go index c28c7b6e..db829413 100644 --- a/indexwriter/index.go +++ b/indexwriter/index.go @@ -67,12 +67,16 @@ type IndexWriter struct { } func New(params common.SealParams) *IndexWriter { + if params.TokenBlockSize == 0 { + params.TokenBlockSize = consts.RegularBlockSize + } + return &IndexWriter{ params: params, - buf1: make([]byte, 0, consts.RegularBlockSize), - buf2: make([]byte, 0, consts.RegularBlockSize), + buf1: make([]byte, 0, params.TokenBlockSize), + buf2: make([]byte, 0, params.TokenBlockSize), buf32: make([]uint32, 0, consts.DefaultLIDBlockCap), - buf64: make([]uint64, 0, consts.RegularBlockSize), + buf64: make([]uint64, 0, params.TokenBlockSize), } } @@ -153,7 +157,7 @@ func (s *IndexWriter) WriteTokenTriplet(tws, lws io.WriteSeeker, src Source) err ) var allFieldsTables []token.FieldTable - for pair, err := range tokenBlock(src.TokenTriplet(), lidAccumulator.add, consts.RegularBlockSize) { + for pair, err := range tokenBlock(src.TokenTriplet(), lidAccumulator.add, s.params.TokenBlockSize) { if err != nil { return err } diff --git a/sealing/sealer.go b/sealing/sealer.go index 0c21ffc4..fd90c1d8 100644 --- a/sealing/sealer.go +++ b/sealing/sealer.go @@ -20,6 +20,10 @@ type Source = indexwriter.Source // and returns PreloadedData for fast initialization of the sealed fraction. func Seal(src Source, params common.SealParams) (*sealed.PreloadedData, error) { info := src.Info() + info.ConstRegularBlockSize = params.TokenBlockSize + if info.ConstRegularBlockSize == 0 { + info.ConstRegularBlockSize = consts.RegularBlockSize + } if info.To == 0 { return nil, errors.New("sealing of an empty active fraction is not supported") From 84b185b66df52c7cee23fc83db860c4059374afb Mon Sep 17 00:00:00 2001 From: Andrei Cheboksarov <37665782+cheb0@users.noreply.github.com> Date: Thu, 11 Jun 2026 13:44:14 +0400 Subject: [PATCH 3/3] make token block configurable --- frac/fraction_test.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/frac/fraction_test.go b/frac/fraction_test.go index 13731e16..209b492d 100644 --- a/frac/fraction_test.go +++ b/frac/fraction_test.go @@ -1902,11 +1902,11 @@ func (s *FractionTestSuite) TestFractionInfo() { s.Require().Equal(uint64(0), info.IndexOnDisk, "index on disk doesn't match") case *Sealed: s.Require().Equal(uint64(0), info.MetaOnDisk, "meta on disk doesn't match. actual value") - s.Require().True(info.IndexOnDisk > uint64(1300) && info.IndexOnDisk < uint64(1450), + s.Require().True(info.IndexOnDisk > uint64(1300) && info.IndexOnDisk < uint64(1500), "index on disk doesn't match. actual value: %d", info.IndexOnDisk) case *Remote: s.Require().Equal(uint64(0), info.MetaOnDisk, "meta on disk doesn't match. actual value") - s.Require().True(info.IndexOnDisk > uint64(1300) && info.IndexOnDisk < uint64(1450), + s.Require().True(info.IndexOnDisk > uint64(1300) && info.IndexOnDisk < uint64(1500), "index on disk doesn't match. actual value: %d", info.IndexOnDisk) default: s.Require().Fail("unsupported fraction type")