From d241ea84922dd2ee86ed75646486c5a15130999b Mon Sep 17 00:00:00 2001 From: ykiko Date: Mon, 20 Apr 2026 02:45:34 +0800 Subject: [PATCH] refactor(index): migrate FlatBuffers from flatc IDL to kotatsu reflection MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace the flatc-generated serialization layer with kotatsu's arena codec driven directly by the in-memory index types. No hand-written DTOs: the on-wire layout is derived from reflection over the existing structs, with type-level customization where needed. - Drop `schema.fbs`, `serialization.h`, and the flatc build step - Delete `wire_types.h` — no more parallel wire representation - Add `kotatsu_adapters.h` with `kota::codec::serialize_traits` / `deserialize_traits` specializations (type adapters) for RelationKind, SymbolKind, Bitmap, and std::chrono::milliseconds - Mark runtime-only FileID-keyed maps with `kota::meta::skip<>` so they are excluded from reflection slots; serialize via `main_file_index` and `path_file_indices` (keyed by path id) - Restore MergedIndex's dual dispatch: in-memory path when `impl` is live, lazy flatbuffers path via `kfb::table_view::from_bytes()` and `root[&Impl::field]` proxy access when only the buffer is held - Add default member initializers to LocalSourceRange, a padding field to Relation, and a path_id lookup struct to IncludeLocation so reflection picks up all stored state - Propagate buffer size through `TUIndex::from` / `ProjectIndex::from` (kota codec requires an explicit size for bounds verification) All 551 unit tests pass; 9 environment-gated integration tests skipped. 
Co-Authored-By: Claude Opus 4.6 --- CMakeLists.txt | 23 +- cmake/package.cmake | 16 +- src/index/include_graph.h | 6 +- src/index/kotatsu_adapters.h | 121 +++++++++ src/index/merged_index.cpp | 306 +++++++++-------------- src/index/project_index.cpp | 92 ++----- src/index/project_index.h | 70 +++++- src/index/schema.fbs | 173 ------------- src/index/serialization.h | 79 ------ src/index/tu_index.cpp | 143 +++-------- src/index/tu_index.h | 19 +- src/semantic/relation_kind.h | 4 + src/server/compiler.cpp | 3 +- src/server/indexer.cpp | 5 +- src/syntax/token.h | 2 + tests/unit/index/project_index_tests.cpp | 4 +- 16 files changed, 396 insertions(+), 670 deletions(-) create mode 100644 src/index/kotatsu_adapters.h delete mode 100644 src/index/schema.fbs delete mode 100644 src/index/serialization.h diff --git a/CMakeLists.txt b/CMakeLists.txt index bbca11d60..915b864b9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -124,42 +124,21 @@ if(CLICE_CI_ENVIRONMENT) target_compile_definitions(clice_options INTERFACE CLICE_CI_ENVIRONMENT=1) endif() -set(FBS_SCHEMA_FILE "${PROJECT_SOURCE_DIR}/src/index/schema.fbs") -set(GENERATED_HEADER "${PROJECT_BINARY_DIR}/generated/schema_generated.h") - -if(CMAKE_CROSSCOMPILING) - find_program(FLATC_EXECUTABLE flatc REQUIRED) - set(FLATC_CMD "${FLATC_EXECUTABLE}") -else() - set(FLATC_CMD "$") -endif() - -add_custom_command( - OUTPUT "${GENERATED_HEADER}" - COMMAND ${FLATC_CMD} --cpp -o "${PROJECT_BINARY_DIR}/generated" "${FBS_SCHEMA_FILE}" - DEPENDS "${FBS_SCHEMA_FILE}" - COMMENT "Generating C++ header from ${FBS_SCHEMA_FILE}" -) - -add_custom_target(generate_flatbuffers_schema DEPENDS "${GENERATED_HEADER}") - file(GLOB_RECURSE CLICE_CORE_SOURCES CONFIGURE_DEPENDS "${PROJECT_SOURCE_DIR}/src/*.cpp") add_library(clice-core STATIC ${CLICE_CORE_SOURCES}) add_library(clice::core ALIAS clice-core) -add_dependencies(clice-core generate_flatbuffers_schema) target_include_directories(clice-core PUBLIC "${PROJECT_SOURCE_DIR}/src" - 
"${PROJECT_BINARY_DIR}/generated" ) target_link_libraries(clice-core PUBLIC clice_options llvm-libs spdlog::spdlog roaring::roaring - flatbuffers kota::ipc::lsp kota::codec::toml + kota::codec::flatbuffers simdjson::simdjson ) diff --git a/cmake/package.cmake b/cmake/package.cmake index c056e75df..2dff8cd9d 100644 --- a/cmake/package.cmake +++ b/cmake/package.cmake @@ -27,21 +27,10 @@ FetchContent_Declare( set(ENABLE_ROARING_TESTS OFF CACHE INTERNAL "" FORCE) set(ENABLE_ROARING_MICROBENCHMARKS OFF CACHE INTERNAL "" FORCE) -# flatbuffers -FetchContent_Declare( - flatbuffers - GIT_REPOSITORY https://github.com/google/flatbuffers.git - GIT_TAG v25.9.23 - GIT_SHALLOW TRUE -) -set(FLATBUFFERS_BUILD_GRPC OFF CACHE BOOL "" FORCE) -set(FLATBUFFERS_BUILD_TESTS OFF CACHE BOOL "" FORCE) -set(FLATBUFFERS_BUILD_FLATHASH OFF CACHE BOOL "" FORCE) - FetchContent_Declare( kotatsu GIT_REPOSITORY https://github.com/clice-io/kotatsu - GIT_TAG main + GIT_TAG refactor/flatbuffers-schema-driven GIT_SHALLOW TRUE ) @@ -50,7 +39,8 @@ set(KOTA_ENABLE_TEST OFF) set(KOTA_CODEC_ENABLE_SIMDJSON ON) set(KOTA_CODEC_ENABLE_YYJSON ON) set(KOTA_CODEC_ENABLE_TOML ON) +set(KOTA_CODEC_ENABLE_FLATBUFFERS ON) set(KOTA_ENABLE_EXCEPTIONS OFF) set(KOTA_ENABLE_RTTI OFF) -FetchContent_MakeAvailable(kotatsu spdlog croaring flatbuffers) +FetchContent_MakeAvailable(kotatsu spdlog croaring) diff --git a/src/index/include_graph.h b/src/index/include_graph.h index 8e9da218c..4af5209d7 100644 --- a/src/index/include_graph.h +++ b/src/index/include_graph.h @@ -7,6 +7,7 @@ #include "syntax/token.h" +#include "kota/meta/annotation.h" #include "llvm/ADT/DenseMap.h" namespace clice { @@ -42,7 +43,10 @@ struct IncludeGraph { /// Each `FileID` represents a new header context and is introduced /// by a new include directive. So a include directive is a new header /// context. A map between FileID and its include location. 
- llvm::DenseMap file_table; + /// + /// Runtime-only: `clang::FileID` is an AST-scoped handle; on-disk the + /// include graph is fully described by `paths` + `locations`. + kota::meta::skip> file_table; static IncludeGraph from(CompilationUnitRef unit); diff --git a/src/index/kotatsu_adapters.h b/src/index/kotatsu_adapters.h new file mode 100644 index 000000000..550c7f968 --- /dev/null +++ b/src/index/kotatsu_adapters.h @@ -0,0 +1,121 @@ +#pragma once + +#include +#include +#include +#include +#include + +#include "semantic/relation_kind.h" +#include "semantic/symbol_kind.h" +#include "support/bitmap.h" + +#include "kota/codec/arena/traits.h" +#include "kota/codec/detail/fwd.h" + +/// Type-level wire traits for clice index types. +/// +/// These partially specialize the primary +/// `kota::codec::serialize_traits` / `deserialize_traits` +/// templates, constrained so only arena backends pick them up. They +/// declare the wire representation for `T` and propagate through map +/// values, sequence elements, and nested containers — no per-field +/// `annotation>` required. + +namespace kota::codec { + +/// `std::chrono::milliseconds` ⇄ `int64` tick count. +template + requires arena::arena_serializer_like +struct serialize_traits { + using wire_type = std::int64_t; + + static std::int64_t serialize(S&, std::chrono::milliseconds value) noexcept { + return value.count(); + } +}; + +template + requires arena::arena_deserializer_like +struct deserialize_traits { + using wire_type = std::int64_t; + + static std::chrono::milliseconds deserialize(const D&, std::int64_t value) noexcept { + return std::chrono::milliseconds(value); + } +}; + +/// `RelationKind` ⇄ underlying `uint32` bitflags. 
+template + requires arena::arena_serializer_like +struct serialize_traits { + using wire_type = std::uint32_t; + + static std::uint32_t serialize(S&, const clice::RelationKind& k) noexcept { + return k.value(); + } +}; + +template + requires arena::arena_deserializer_like +struct deserialize_traits { + using wire_type = std::uint32_t; + + static clice::RelationKind deserialize(const D&, std::uint32_t v) noexcept { + return clice::RelationKind(static_cast(v)); + } +}; + +/// `SymbolKind` ⇄ underlying `uint8`. +template + requires arena::arena_serializer_like +struct serialize_traits { + using wire_type = std::uint8_t; + + static std::uint8_t serialize(S&, const clice::SymbolKind& k) noexcept { + return k.value(); + } +}; + +template + requires arena::arena_deserializer_like +struct deserialize_traits { + using wire_type = std::uint8_t; + + static clice::SymbolKind deserialize(const D&, std::uint8_t v) noexcept { + return clice::SymbolKind(v); + } +}; + +/// `clice::Bitmap` (= `roaring::Roaring`) ⇄ opaque byte blob produced by +/// Roaring's non-portable serialization (matches the legacy wire format). 
+template + requires arena::arena_serializer_like +struct serialize_traits { + using wire_type = std::vector; + + static std::vector serialize(S&, const clice::Bitmap& bitmap) { + std::vector buffer; + if(bitmap.isEmpty()) { + return buffer; + } + buffer.resize(bitmap.getSizeInBytes(false)); + bitmap.write(reinterpret_cast(buffer.data()), false); + return buffer; + } +}; + +template + requires arena::arena_deserializer_like +struct deserialize_traits { + using wire_type = std::vector; + + static clice::Bitmap deserialize(const D&, std::vector bytes) { + if(bytes.empty()) { + return clice::Bitmap(); + } + return clice::Bitmap::read(reinterpret_cast(bytes.data()), false); + } +}; + +} // namespace kota::codec diff --git a/src/index/merged_index.cpp b/src/index/merged_index.cpp index b8c2dbbf2..2fc950432 100644 --- a/src/index/merged_index.cpp +++ b/src/index/merged_index.cpp @@ -1,11 +1,18 @@ #include "index/merged_index.h" +#include +#include #include +#include #include -#include "index/serialization.h" +#include "index/kotatsu_adapters.h" // type_adapter specializations #include "support/filesystem.h" +#include "kota/codec/flatbuffers/deserializer.h" +#include "kota/codec/flatbuffers/proxy.h" +#include "kota/codec/flatbuffers/serializer.h" +#include "kota/meta/annotation.h" #include "llvm/ADT/DenseSet.h" #include "llvm/Support/raw_os_ostream.h" @@ -97,7 +104,7 @@ struct CompilationContext { std::uint32_t canonical_id = 0; - std::uint64_t build_at; + std::uint64_t build_at = 0; std::vector include_locations; @@ -125,8 +132,9 @@ struct MergedIndex::Impl { /// The max canonical id we have allocated. std::uint32_t max_canonical_id = 0; - /// The reference count of each canonical id. - std::vector canonical_ref_counts; + /// Reference counts per canonical id — derivable from header/compilation + /// contexts at load time, so it doesn't need to live on the wire. + kota::meta::skip> canonical_ref_counts; /// The canonical id set of removed index. 
roaring::Roaring removed; @@ -137,8 +145,8 @@ struct MergedIndex::Impl { /// All merged symbol relations. llvm::DenseMap> relations; - /// Sorted occurrences cache for fast lookup. - std::vector occurrences_cache; + /// Sorted occurrences cache for fast lookup — rebuilt on demand. + kota::meta::skip> occurrences_cache; void merge(this Impl& self, std::uint32_t path_id, FileIndex& index, auto&& add_context) { auto hash = index.hash(); @@ -172,6 +180,18 @@ struct MergedIndex::Impl { friend bool operator==(const Impl&, const Impl&) = default; }; +namespace { + +namespace kfb = kota::codec::flatbuffers; + +std::span buffer_bytes(const llvm::MemoryBuffer& buffer) { + return std::span( + reinterpret_cast(buffer.getBufferStart()), + buffer.getBufferSize()); +} + +} // namespace + MergedIndex::MergedIndex(std::unique_ptr buffer, std::unique_ptr impl) : buffer(std::move(buffer)), impl(std::move(impl)) {} @@ -196,65 +216,24 @@ void MergedIndex::load_in_memory(this Self& self) { return; } - auto& index = *self.impl; - auto root = fbs::GetRoot(self.buffer->getBufferStart()); - - index.max_canonical_id = root->max_canonical_id(); - - for(auto entry: *root->canonical_cache()) { - index.canonical_cache.try_emplace(entry->sha256()->string_view(), entry->canonical_id()); + auto bytes = buffer_bytes(*self.buffer); + auto result = kfb::from_flatbuffer(bytes, *self.impl); + if(!result) { + self.buffer.reset(); + return; } + // Rebuild the ref count table from the already-loaded contexts. 
+ auto& index = *self.impl; + index.canonical_ref_counts.clear(); index.canonical_ref_counts.resize(index.max_canonical_id, 0); - - for(auto entry: *root->header_contexts()) { - HeaderContext context; - auto path = entry->path_id(); - context.version = entry->version(); - for(auto include: *entry->includes()) { - index.canonical_ref_counts[include->canonical_id()] += 1; - context.includes.emplace_back(*safe_cast(include)); + for(auto& [_, ctx]: index.header_contexts) { + for(auto& inc: ctx.includes) { + index.canonical_ref_counts[inc.canonical_id] += 1; } - index.header_contexts.try_emplace(path, std::move(context)); } - - for(auto entry: *root->compilation_contexts()) { - CompilationContext context; - auto path = entry->path_id(); - context.version = entry->version(); - context.canonical_id = entry->canonical_id(); - context.build_at = entry->build_at(); - for(auto include: *entry->include_locations()) { - context.include_locations.emplace_back(*safe_cast(include)); - } - index.compilation_contexts.try_emplace(path, std::move(context)); - } - - // Count ref counts from compilation contexts. - for(auto entry: *root->compilation_contexts()) { - index.canonical_ref_counts[entry->canonical_id()] += 1; - } - - // Deserialize removed bitmap. 
- if(root->removed() && root->removed()->size() > 0) { - index.removed = read_bitmap(root->removed()); - } - - for(auto entry: *root->occurrences()) { - index.occurrences.try_emplace(*safe_cast(entry->occurrence()), - read_bitmap(entry->context())); - } - - for(auto entry: *root->relations()) { - auto& relations = index.relations[entry->symbol()]; - for(auto relation_entry: *entry->relations()) { - relations.try_emplace(*safe_cast(relation_entry->relation()), - read_bitmap(relation_entry->context())); - } - } - - if(root->content()) { - index.content = root->content()->str(); + for(auto& [_, ctx]: index.compilation_contexts) { + index.canonical_ref_counts[ctx.canonical_id] += 1; } self.buffer.reset(); @@ -279,100 +258,9 @@ void MergedIndex::serialize(this const Self& self, llvm::raw_ostream& out) { return; } - auto& index = self.impl; - - fbs::FlatBufferBuilder builder(1024); - - llvm::SmallVector buffer; - - auto canonical_cache = transform(index->canonical_cache, [&](auto&& value) { - auto&& [hash, canonical_id] = value; - return binary::CreateCacheEntry(builder, CreateString(builder, hash), canonical_id); - }); - - auto header_contexts = transform(index->header_contexts, [&](auto&& value) { - auto& [path_id, context] = value; - return binary::CreateHeaderContextEntry( - builder, - path_id, - context.version, - CreateStructVector(builder, context.includes)); - }); - - auto compilation_contexts = transform(index->compilation_contexts, [&](auto&& value) { - auto& [path_id, context] = value; - return binary::CreateCompilationContextEntry( - builder, - path_id, - context.version, - context.canonical_id, - context.build_at, - CreateStructVector(builder, context.include_locations)); - }); - - llvm::SmallVector occurrence_keys; - occurrence_keys.reserve(index->occurrences.size()); - auto occurrences = transform(index->occurrences, [&](auto&& value) { - auto&& [occurrence, bitmap] = value; - buffer.clear(); - buffer.resize_for_overwrite(bitmap.getSizeInBytes(false)); - 
bitmap.write(buffer.data(), false); - occurrence_keys.emplace_back(&occurrence); - return binary::CreateOccurrenceEntry(builder, - safe_cast(&occurrence), - CreateVector(builder, buffer)); - }); - std::ranges::sort(std::views::zip(occurrence_keys, occurrences), [](auto lhs, auto rhs) { - const auto& lo = *std::get<0>(lhs); - const auto& ro = *std::get<0>(rhs); - return std::tuple(lo.range.begin, lo.range.end, lo.target) < - std::tuple(ro.range.begin, ro.range.end, ro.target); - }); - - llvm::SmallVector relation_keys; - relation_keys.reserve(index->relations.size()); - auto relations = transform(index->relations, [&](auto&& value) { - auto&& [symbol_id, symbol_relations] = value; - auto relations = transform(symbol_relations, [&](auto&& value) { - auto&& [relation, bitmap] = value; - buffer.clear(); - buffer.resize_for_overwrite(bitmap.getSizeInBytes(false)); - bitmap.write(buffer.data(), false); - return binary::CreateRelationEntry(builder, - safe_cast(&relation), - CreateVector(builder, buffer)); - }); - relation_keys.emplace_back(symbol_id); - return binary::CreateSymbolRelationsEntry(builder, - symbol_id, - CreateVector(builder, relations)); - }); - std::ranges::sort(std::views::zip(relation_keys, relations), {}, [](auto e) { - return std::get<0>(e); - }); - - // Serialize removed bitmap. 
- buffer.clear(); - if(!index->removed.isEmpty()) { - buffer.resize_for_overwrite(index->removed.getSizeInBytes(false)); - index->removed.write(buffer.data(), false); - } - auto removed = CreateVector(builder, buffer); - - auto content_offset = CreateString(builder, index->content); - - auto merged_index = binary::CreateMergedIndex(builder, - index->max_canonical_id, - CreateVector(builder, canonical_cache), - CreateVector(builder, header_contexts), - CreateVector(builder, compilation_contexts), - CreateVector(builder, occurrences), - CreateVector(builder, relations), - removed, - content_offset); - builder.Finish(merged_index); - - out.write(safe_cast(builder.GetBufferPointer()), builder.GetSize()); + auto bytes = kfb::to_flatbuffer(*self.impl); + assert(bytes && "MergedIndex flatbuffer serialization failed"); + out.write(reinterpret_cast(bytes->data()), bytes->size()); } void MergedIndex::lookup(this const Self& self, @@ -420,25 +308,43 @@ void MergedIndex::lookup(this const Self& self, break; } } else if(self.buffer) { - auto index = fbs::GetRoot(self.buffer->getBufferStart()); - auto& occurrences = *index->occurrences(); - - auto it = std::ranges::lower_bound(occurrences, offset, {}, [](auto o) { - return o->occurrence()->range().end(); - }); - - while(it != occurrences.end()) { - auto o = safe_cast(it->occurrence()); - if(o->range.contains(offset)) { - if(!callback(*o)) { - break; - } + // Lazy path: binary-search the sorted occurrences array directly in + // the flatbuffer without materializing the in-memory Impl. 
+ auto root = kfb::table_view::from_bytes(buffer_bytes(*self.buffer)); + auto entries = root[&Impl::occurrences]; + + auto read_occurrence = [](auto occ_view) -> Occurrence { + auto range_view = occ_view[&Occurrence::range]; + return Occurrence{ + LocalSourceRange{range_view[&LocalSourceRange::begin], + range_view[&LocalSourceRange::end]}, + occ_view[&Occurrence::target], + }; + }; - it++; - continue; + const std::size_t count = entries.size(); + std::size_t lo = 0; + std::size_t hi = count; + while(lo < hi) { + auto mid = lo + (hi - lo) / 2; + auto entry = entries.at(mid); + auto range_view = entry.template get<0>()[&Occurrence::range]; + if(range_view[&LocalSourceRange::end] < offset) { + lo = mid + 1; + } else { + hi = mid; } + } - break; + for(; lo < count; ++lo) { + auto entry = entries.at(lo); + auto occurrence = read_occurrence(entry.template get<0>()); + if(!occurrence.range.contains(offset)) { + break; + } + if(!callback(occurrence)) { + break; + } } } } @@ -470,18 +376,31 @@ void MergedIndex::lookup(this const Self& self, } } } else if(self.buffer) { - auto index = fbs::GetRoot(self.buffer->getBufferStart()); - auto& entries = *index->relations(); - - auto it = std::ranges::lower_bound(entries, symbol, {}, [](auto e) { return e->symbol(); }); - if(it == entries.end() || it->symbol() != symbol) [[unlikely]] { + // Lazy path: binary-search the outer relations map and iterate the + // inner map without materializing Impl. + auto root = kfb::table_view::from_bytes(buffer_bytes(*self.buffer)); + auto outer = root[&Impl::relations]; + auto entry = outer.find(symbol); + if(!entry) { return; } - - for(auto entry: *it->relations()) { - auto r = safe_cast(entry->relation()); - if(r->kind & kind) { - if(!callback(*r)) { + auto inner = entry->template get<1>(); + const std::size_t count = inner.size(); + for(std::size_t i = 0; i < count; ++i) { + auto rel_view = inner.at(i).template get<0>(); + // Kind comes back as the wire uint32 via the type_adapter; rewrap it. 
+ auto relation_kind = + RelationKind(static_cast(rel_view[&Relation::kind])); + if(relation_kind & kind) { + auto range_view = rel_view[&Relation::range]; + Relation relation{ + .kind = relation_kind, + .padding = rel_view[&Relation::padding], + .range = LocalSourceRange{range_view[&LocalSourceRange::begin], + range_view[&LocalSourceRange::end]}, + .target_symbol = rel_view[&Relation::target_symbol], + }; + if(!callback(relation)) { break; } } @@ -516,25 +435,31 @@ bool MergedIndex::need_update(this const Self& self, llvm::ArrayRef(self.buffer->getBufferStart()); - if(index->compilation_contexts()->empty()) { + auto root = kfb::table_view::from_bytes(buffer_bytes(*self.buffer)); + auto contexts = root[&Impl::compilation_contexts]; + if(contexts.empty()) { return true; } - auto context = *index->compilation_contexts()->begin(); + auto context = contexts.at(0).template get<1>(); + auto build_at = context[&CompilationContext::build_at]; + auto include_locations = context[&CompilationContext::include_locations]; llvm::DenseSet deps; - for(auto location: *context->include_locations()) { - auto [_, success] = deps.insert(location->path_id()); + const std::size_t count = include_locations.size(); + for(std::size_t i = 0; i < count; ++i) { + auto location = include_locations.at(i); + auto path_id = location[&IncludeLocation::path_id]; + auto [_, success] = deps.insert(path_id); if(success) { fs::file_status status; - if(auto err = fs::status(path_mapping[location->path_id()], status)) { + if(auto err = fs::status(path_mapping[path_id], status)) { return true; } auto time = std::chrono::duration_cast( status.getLastModificationTime().time_since_epoch()); - if(time.count() > context->build_at()) { + if(time.count() > build_at) { return true; } } @@ -616,10 +541,9 @@ llvm::StringRef MergedIndex::content(this const Self& self) { if(self.impl) { return self.impl->content; } else if(self.buffer) { - auto root = fbs::GetRoot(self.buffer->getBufferStart()); - if(root->content()) 
{ - return root->content()->string_view(); - } + auto root = kfb::table_view::from_bytes(buffer_bytes(*self.buffer)); + auto view = root[&Impl::content]; + return llvm::StringRef(view.data(), view.size()); } return {}; } diff --git a/src/index/project_index.cpp b/src/index/project_index.cpp index 087413838..c600900aa 100644 --- a/src/index/project_index.cpp +++ b/src/index/project_index.cpp @@ -1,9 +1,22 @@ #include "index/project_index.h" -#include "index/serialization.h" +#include +#include +#include + +#include "index/kotatsu_adapters.h" // type_adapter specializations + +#include "kota/codec/flatbuffers/deserializer.h" +#include "kota/codec/flatbuffers/serializer.h" namespace clice::index { +namespace { + +namespace kfb = kota::codec::flatbuffers; + +} // namespace + llvm::SmallVector ProjectIndex::merge(this ProjectIndex& self, TUIndex& index) { auto& paths = index.graph.paths; llvm::SmallVector file_ids_map; @@ -28,79 +41,22 @@ llvm::SmallVector ProjectIndex::merge(this ProjectIndex& self, TU } void ProjectIndex::serialize(this ProjectIndex& self, llvm::raw_ostream& os) { - fbs::FlatBufferBuilder builder(1024); - - llvm::SmallVector buffer; - - auto i = 0; - auto paths = transform(self.path_pool.paths, [&](llvm::StringRef path) { - auto entry = - binary::CreatePathEntry(builder, CreateString(builder, self.path_pool.paths[i]), i); - i += 1; - return entry; - }); - - auto indices = transform(self.indices, [&](auto&& value) { - auto&& [source, index] = value; - return binary::PathMapEntry(source, index); - }); - - auto symbols = transform(self.symbols, [&](auto&& value) { - auto& [symbol_id, symbol] = value; - - buffer.clear(); - buffer.resize_for_overwrite(symbol.reference_files.getSizeInBytes(false)); - symbol.reference_files.write(buffer.data(), false); - - return binary::CreateSymbolEntry(builder, - symbol_id, - binary::CreateSymbol(builder, - CreateString(builder, symbol.name), - symbol.kind.value(), - CreateVector(builder, buffer))); - }); - - auto 
project_index = - binary::CreateProjectIndex(builder, - CreateVector(builder, paths), - CreateStructVector(builder, indices), - CreateVector(builder, symbols)); - - builder.Finish(project_index); - os.write(safe_cast(builder.GetBufferPointer()), builder.GetSize()); + auto bytes = kfb::to_flatbuffer(self); + assert(bytes && "ProjectIndex flatbuffer serialization failed"); + os.write(reinterpret_cast(bytes->data()), bytes->size()); } -ProjectIndex ProjectIndex::from(const void* data) { - auto root = fbs::GetRoot(data); - +ProjectIndex ProjectIndex::from(const void* data, std::size_t size) { ProjectIndex index; - - auto& pool = index.path_pool; - pool.paths.resize(root->paths()->size()); - for(auto entry: *root->paths()) { - // Normalize backslashes to forward slashes for cross-platform consistency - // (persisted index may contain native-separator paths from Windows). - llvm::SmallString<256> normalized(entry->path()->string_view()); - std::replace(normalized.begin(), normalized.end(), '\\', '/'); - auto k = pool.save(normalized.str()); - pool.paths[entry->id()] = k; - pool.cache.try_emplace(k, entry->id()); + if(data == nullptr || size == 0) { + return index; } - for(auto entry: *root->indices()) { - index.indices.try_emplace(entry->source(), entry->index()); + std::span bytes(static_cast(data), size); + auto result = kfb::from_flatbuffer(bytes, index); + if(!result) { + return ProjectIndex(); } - - for(auto entry: *root->symbols()) { - auto& symbol = index.symbols[entry->symbol_id()]; - auto* fb_symbol = entry->symbol(); - if(auto* name = fb_symbol->name()) { - symbol.name = name->str(); - } - symbol.kind = SymbolKind(static_cast(fb_symbol->kind())); - symbol.reference_files = read_bitmap(fb_symbol->refs()); - } - return index; } diff --git a/src/index/project_index.h b/src/index/project_index.h index 12bbebe01..5191af08b 100644 --- a/src/index/project_index.h +++ b/src/index/project_index.h @@ -2,10 +2,14 @@ #include #include +#include #include #include 
"index/tu_index.h" +#include "kota/codec/arena/traits.h" +#include "kota/codec/detail/fwd.h" +#include "kota/support/expected_try.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/SmallString.h" #include "llvm/ADT/SmallVector.h" @@ -84,7 +88,71 @@ struct ProjectIndex { void serialize(this ProjectIndex& self, llvm::raw_ostream& os); - static ProjectIndex from(const void* data); + static ProjectIndex from(const void* data, std::size_t size); }; } // namespace clice::index + +namespace kota::codec { + +/// `PathPool` on the wire is a flat list of absolute paths; `id` is the +/// position in the vector. The allocator and reverse cache are runtime-only. +/// +/// Streaming serialize: iterate `pool.paths` and allocate strings directly +/// into the builder, avoiding the double-copy that a value-mode +/// `wire_type = std::vector` conversion would introduce. +template + requires arena::arena_serializer_like +struct serialize_traits { + // Structural wire shape — declared so the flatbuffers proxy views + // a `PathPool` field as an `array_view`. + using wire_type = std::vector; + + static auto serialize(S& s, const clice::index::PathPool& pool) + -> std::expected { + std::vector offsets; + offsets.reserve(pool.paths.size()); + for(const auto& path: pool.paths) { + auto r = s.alloc_string(std::string_view(path.data(), path.size())); + if(!r) { + return std::unexpected(r.error()); + } + offsets.push_back(*r); + } + return s.alloc_string_vector( + std::span(offsets.data(), offsets.size())); + } +}; + +/// Streaming deserialize: read each path out of the flatbuffer's +/// string-vector view directly, interning it into the pool's allocator +/// in-place. Avoids the transient `std::vector` the +/// value-mode form would materialize. 
+template + requires arena::arena_deserializer_like +struct deserialize_traits { + using wire_type = std::vector; + + static auto deserialize(const D& d, + typename D::TableView view, + typename D::slot_id sid, + clice::index::PathPool& out) + -> std::expected { + if(!view.has(sid)) { + return {}; + } + KOTA_EXPECTED_TRY_V(auto vec, d.get_string_vector(view, sid)); + out.paths.resize(vec.size()); + for(std::size_t i = 0; i < vec.size(); ++i) { + auto sv = vec[i]; + llvm::SmallString<256> normalized(llvm::StringRef(sv.data(), sv.size())); + std::replace(normalized.begin(), normalized.end(), '\\', '/'); + auto interned = out.save(normalized.str()); + out.paths[i] = interned; + out.cache.try_emplace(interned, static_cast(i)); + } + return {}; + } +}; + +} // namespace kota::codec diff --git a/src/index/schema.fbs b/src/index/schema.fbs deleted file mode 100644 index e25e1f293..000000000 --- a/src/index/schema.fbs +++ /dev/null @@ -1,173 +0,0 @@ -namespace clice.index.binary; - -struct Range { - begin : uint; - end : uint; -} - -struct Occurrence { - range : Range; - target : ulong; -} - -struct Relation { - kind : uint; - padding : uint; - range : Range; - target_symbol : ulong; -} - -table CacheEntry { -sha256: - string; -canonical_id: - uint; -} - -struct IncludeContext { - include_id : uint; - canonical_id : uint; -} - -table HeaderContextEntry { -path_id: - uint; -version: - uint; -includes: - [IncludeContext]; -} - -struct IncludeLocation { - path_id : uint; - line : uint; - include_id : uint; -} - -table CompilationContextEntry { -path_id: - uint; -version: - uint; -canonical_id: - uint; -build_at: - ulong; -include_locations: - [IncludeLocation]; -} - -table OccurrenceEntry { -occurrence: - Occurrence; -context: - [ubyte]; -} - -table RelationEntry { -relation: - Relation; -context: - [ubyte]; -} - -table SymbolRelationsEntry { -symbol: - ulong; -relations: - [RelationEntry]; -} - -table Symbol { -name: - string; -kind: - ubyte; -refs: - [ubyte]; -} - -table 
SymbolEntry { -symbol_id: - ulong; -symbol: - Symbol; -} - -table MergedIndex { -max_canonical_id: - uint; - -canonical_cache: - [CacheEntry]; - -header_contexts: - [HeaderContextEntry]; - -compilation_contexts: - [CompilationContextEntry]; - -occurrences: - [OccurrenceEntry]; - -relations: - [SymbolRelationsEntry]; - -removed: - [ubyte]; - -content: - string; -} - -table TUFileRelationsEntry { -symbol: - ulong; -relations: - [Relation]; -} - -table TUFileIndexEntry { -file_id: - uint; -occurrences: - [Occurrence]; -relations: - [TUFileRelationsEntry]; -} - -table TUIndex { -built_at: - ulong; -paths: - [string]; -locations: - [IncludeLocation]; -symbols: - [SymbolEntry]; -file_indices: - [TUFileIndexEntry]; -main_file_index: - TUFileIndexEntry; -} - -table PathEntry { -path: - string; -id: - uint; -} - -struct PathMapEntry { - source : uint; - index : uint; -} - -table ProjectIndex { -paths: - [PathEntry]; -indices: - [PathMapEntry]; -symbols: - [SymbolEntry]; -} diff --git a/src/index/serialization.h b/src/index/serialization.h deleted file mode 100644 index cac3cc3f3..000000000 --- a/src/index/serialization.h +++ /dev/null @@ -1,79 +0,0 @@ -#include -#include -#include - -#include "schema_generated.h" -#include "support/bitmap.h" - -#include "llvm/ADT/SmallVector.h" -#include "llvm/ADT/StringRef.h" - -namespace clice::index { - -namespace fbs = flatbuffers; - -namespace { - -template -concept sequence_range = std::ranges::input_range && - !requires { typename Range::key_type; } && requires(const Range& r) { - r.data(); - r.size(); - }; - -template -using Offsets = llvm::SmallVector, 0>; - -template -const U* safe_cast(const V* v) { - static_assert(sizeof(U) == sizeof(V), "size mismatch"); - static_assert(alignof(U) == alignof(V), "alignment mismatch"); - static_assert(std::is_trivially_copyable_v && std::is_trivially_copyable_v, - "requires trivially copyable"); - /// If aliasing issues arise, prefer copying into a temporary SmallVector. 
- return reinterpret_cast(v); -} - -auto CreateString(fbs::FlatBufferBuilder& builder, llvm::StringRef string) { - return builder.CreateString(string.data(), string.size()); -} - -template -auto CreateVector(fbs::FlatBufferBuilder& builder, const Range& range) { - return builder.CreateVector(range.data(), range.size()); -} - -auto CreateVector(fbs::FlatBufferBuilder& builder, const llvm::SmallVector& range) { - return builder.CreateVector(reinterpret_cast(range.data()), range.size()); -} - -template -auto CreateStructVector(fbs::FlatBufferBuilder& builder, const Range& range) { - using V = std::ranges::range_value_t; - (void)sizeof(V); - return builder.CreateVectorOfStructs(safe_cast(range.data()), range.size()); -} - -template -auto transform(const Range& range, const Functor& functor) { - using V = std::ranges::range_value_t; - using R = std::invoke_result_t; - - llvm::SmallVector result; - result.resize_for_overwrite(std::ranges::size(range)); - - auto i = 0; - for(auto&& v: range) { - result[i] = functor(v); - i += 1; - } - return result; -} - -Bitmap read_bitmap(const fbs::Vector* buffer) { - return Bitmap::read(reinterpret_cast(buffer->data()), false); -} - -} // namespace - -} // namespace clice::index diff --git a/src/index/tu_index.cpp b/src/index/tu_index.cpp index 4ba6b1d3c..c76fb9326 100644 --- a/src/index/tu_index.cpp +++ b/src/index/tu_index.cpp @@ -1,17 +1,24 @@ #include "index/tu_index.h" +#include +#include +#include #include -#include "index/serialization.h" +#include "index/kotatsu_adapters.h" // type_adapter specializations #include "semantic/ast_utility.h" #include "semantic/semantic_visitor.h" +#include "kota/codec/flatbuffers/deserializer.h" +#include "kota/codec/flatbuffers/serializer.h" #include "llvm/Support/SHA256.h" namespace clice::index { namespace { +namespace kfb = kota::codec::flatbuffers; + class Builder : public SemanticVisitor { public: Builder(TUIndex& result, CompilationUnitRef unit, bool interested_only) : @@ -114,6 +121,8 @@ 
class Builder : public SemanticVisitor { void build() { run(); + auto interested = unit.interested_file(); + for(auto& [fid, index]: result.file_indices) { for(auto& [symbol_id, relations]: index.relations) { std::ranges::sort(relations, [](const Relation& lhs, const Relation& rhs) { @@ -144,13 +153,19 @@ class Builder : public SemanticVisitor { return lhs.range == rhs.range && lhs.target == rhs.target; }); index.occurrences.erase(range.begin(), range.end()); + } - if(fid == unit.interested_file()) { - result.main_file_index = std::move(index); + // Populate main_file_index (interested file) and path_file_indices + // (keyed by path_id) for serialization. `file_indices` itself is + // `skip`-marked (runtime-only, keyed by clang::FileID) and retained + // for in-memory consumers/tests that need FileID access. + for(auto& [fid, index]: result.file_indices) { + if(fid == interested) { + result.main_file_index = index; + } else { + result.path_file_indices[result.graph.path_id(fid)] = index; } } - - result.file_indices.erase(unit.interested_file()); } private: @@ -198,119 +213,23 @@ TUIndex TUIndex::build(CompilationUnitRef unit, bool interested_only) { return index; } -void TUIndex::serialize(llvm::raw_ostream& os) const { - fbs::FlatBufferBuilder builder(4096); - - llvm::SmallVector buffer; - - auto paths = - transform(graph.paths, [&](const std::string& p) { return builder.CreateString(p); }); - - auto syms = transform(symbols, [&](auto&& value) { - auto& [symbol_id, symbol] = value; - buffer.clear(); - buffer.resize_for_overwrite(symbol.reference_files.getSizeInBytes(false)); - symbol.reference_files.write(buffer.data(), false); - return binary::CreateSymbolEntry(builder, - symbol_id, - binary::CreateSymbol(builder, - CreateString(builder, symbol.name), - symbol.kind.value(), - CreateVector(builder, buffer))); - }); - - /// Serialize a single FileIndex into a TUFileIndexEntry. 
- auto serialize_file_index = [&](std::uint32_t fid, const FileIndex& index) { - auto occs = CreateStructVector(builder, index.occurrences); - auto rels = transform(index.relations, [&](auto&& value) { - auto& [symbol_id, relations] = value; - return binary::CreateTUFileRelationsEntry( - builder, - symbol_id, - CreateStructVector(builder, relations)); - }); - return binary::CreateTUFileIndexEntry(builder, fid, occs, CreateVector(builder, rels)); - }; - - /// Convert FileID-keyed file_indices to path_id-keyed entries. - llvm::SmallVector> file_idx_vec; - for(auto& [fid, index]: file_indices) { - auto pid = graph.path_id(fid); - file_idx_vec.push_back(serialize_file_index(pid, index)); - } - - /// Main file is the last path in graph.paths (convention from IncludeGraph). - auto main_idx = - serialize_file_index(static_cast(graph.paths.size() - 1), main_file_index); - - auto tu_index = - binary::CreateTUIndex(builder, - static_cast(built_at.count()), - CreateVector(builder, paths), - CreateStructVector(builder, graph.locations), - CreateVector(builder, syms), - builder.CreateVector(file_idx_vec.data(), file_idx_vec.size()), - main_idx); - - builder.Finish(tu_index); - os.write(safe_cast(builder.GetBufferPointer()), builder.GetSize()); +void TUIndex::serialize(llvm::raw_ostream& os) { + auto bytes = kfb::to_flatbuffer(*this); + assert(bytes && "TUIndex flatbuffer serialization failed"); + os.write(reinterpret_cast(bytes->data()), bytes->size()); } -TUIndex TUIndex::from(const void* data) { - auto root = fbs::GetRoot(data); - +TUIndex TUIndex::from(const void* data, std::size_t size) { TUIndex index; - index.built_at = std::chrono::milliseconds(root->built_at()); - - for(auto p: *root->paths()) { - index.graph.paths.emplace_back(p->str()); - } - - for(auto loc: *root->locations()) { - index.graph.locations.emplace_back(*safe_cast(loc)); + if(data == nullptr || size == 0) { + return index; } - for(auto entry: *root->symbols()) { - auto& symbol = 
index.symbols[entry->symbol_id()]; - symbol.name = entry->symbol()->name()->str(); - symbol.kind = SymbolKind(static_cast(entry->symbol()->kind())); - symbol.reference_files = read_bitmap(entry->symbol()->refs()); + std::span bytes(static_cast(data), size); + auto result = kfb::from_flatbuffer(bytes, index); + if(!result) { + return TUIndex(); } - - /// Helper to deserialize a TUFileIndexEntry into a FileIndex. - auto deserialize_file_index = [](const binary::TUFileIndexEntry* entry) -> FileIndex { - FileIndex fi; - if(entry->occurrences()) { - fi.occurrences.reserve(entry->occurrences()->size()); - for(auto o: *entry->occurrences()) { - fi.occurrences.emplace_back(*safe_cast(o)); - } - } - if(entry->relations()) { - for(auto rel_entry: *entry->relations()) { - auto& rels = fi.relations[rel_entry->symbol()]; - if(rel_entry->relations()) { - rels.reserve(rel_entry->relations()->size()); - for(auto r: *rel_entry->relations()) { - rels.emplace_back(*safe_cast(r)); - } - } - } - } - return fi; - }; - - /// Populate path_file_indices keyed by path_id (no clang::FileID needed). 
- if(root->file_indices()) { - for(auto entry: *root->file_indices()) { - index.path_file_indices[entry->file_id()] = deserialize_file_index(entry); - } - } - - if(root->main_file_index()) { - index.main_file_index = deserialize_file_index(root->main_file_index()); - } - return index; } diff --git a/src/index/tu_index.h b/src/index/tu_index.h index 65e905009..1c4bab32d 100644 --- a/src/index/tu_index.h +++ b/src/index/tu_index.h @@ -12,6 +12,7 @@ #include "semantic/symbol_kind.h" #include "support/bitmap.h" +#include "kota/meta/annotation.h" #include "llvm/Support/raw_ostream.h" namespace clice::index { @@ -35,6 +36,10 @@ struct Relation { constexpr auto definition_range() { return std::bit_cast(target_symbol); } + + friend bool operator==(const Relation&, const Relation&) = default; + + friend auto operator<=>(const Relation&, const Relation&) = default; }; struct Occurrence { @@ -45,6 +50,8 @@ struct Occurrence { SymbolHash target; friend bool operator==(const Occurrence&, const Occurrence&) = default; + + friend auto operator<=>(const Occurrence&, const Occurrence&) = default; }; struct FileIndex { @@ -77,19 +84,21 @@ struct TUIndex { SymbolTable symbols; - llvm::DenseMap file_indices; + /// Runtime-only: keyed by AST-scoped `clang::FileID` during build; flushed + /// into `path_file_indices` (keyed by path id) before serialization. + kota::meta::skip> file_indices; - /// File indices keyed by path_id, populated by from() for deserialized data. - /// When built from AST, this is empty and file_indices (keyed by FileID) is used. + /// File indices keyed by path_id. Populated from `file_indices` at + /// serialize time, and directly from the wire on deserialize. 
llvm::DenseMap path_file_indices; FileIndex main_file_index; static TUIndex build(CompilationUnitRef unit, bool interested_only = false); - void serialize(llvm::raw_ostream& os) const; + void serialize(llvm::raw_ostream& os); - static TUIndex from(const void* data); + static TUIndex from(const void* data, std::size_t size); }; } // namespace clice::index diff --git a/src/semantic/relation_kind.h b/src/semantic/relation_kind.h index 9916a0ca4..9b4f7f136 100644 --- a/src/semantic/relation_kind.h +++ b/src/semantic/relation_kind.h @@ -71,6 +71,10 @@ constexpr bool operator==(RelationKind lhs, RelationKind rhs) { return lhs.value() == rhs.value(); } +constexpr auto operator<=>(RelationKind lhs, RelationKind rhs) { + return lhs.value() <=> rhs.value(); +} + constexpr bool operator&(RelationKind lhs, RelationKind rhs) { return lhs.value() == rhs.value(); } diff --git a/src/server/compiler.cpp b/src/server/compiler.cpp index 349dfeb8d..512f582d0 100644 --- a/src/server/compiler.cpp +++ b/src/server/compiler.cpp @@ -763,7 +763,8 @@ kota::task Compiler::ensure_compiled(Session& session) { // Store open file index from the stateful worker's TUIndex. 
if(!result.value().tu_index_data.empty()) { - auto tu_index = index::TUIndex::from(result.value().tu_index_data.data()); + auto tu_index = index::TUIndex::from(result.value().tu_index_data.data(), + result.value().tu_index_data.size()); OpenFileIndex ofi; ofi.file_index = std::move(tu_index.main_file_index); ofi.symbols = std::move(tu_index.symbols); diff --git a/src/server/indexer.cpp b/src/server/indexer.cpp index a192e06ba..093a82a2d 100644 --- a/src/server/indexer.cpp +++ b/src/server/indexer.cpp @@ -25,7 +25,7 @@ namespace clice { namespace lsp = kota::ipc::lsp; void Indexer::merge(const void* tu_index_data, std::size_t size) { - auto tu_index = index::TUIndex::from(tu_index_data); + auto tu_index = index::TUIndex::from(tu_index_data, size); if(tu_index.graph.paths.empty()) { LOG_WARN("Ignoring TUIndex with empty path graph"); return; @@ -144,7 +144,8 @@ void Indexer::load(llvm::StringRef index_dir) { auto project_path = path::join(index_dir, "project.idx"); auto buf = llvm::MemoryBuffer::getFile(project_path); if(buf) { - workspace.project_index = index::ProjectIndex::from((*buf)->getBufferStart()); + workspace.project_index = + index::ProjectIndex::from((*buf)->getBufferStart(), (*buf)->getBufferSize()); LOG_INFO("Loaded ProjectIndex: {} symbols", workspace.project_index.symbols.size()); } diff --git a/src/syntax/token.h b/src/syntax/token.h index ad844fadc..fba41674f 100644 --- a/src/syntax/token.h +++ b/src/syntax/token.h @@ -46,6 +46,8 @@ struct LocalSourceRange { constexpr bool operator==(const LocalSourceRange& other) const = default; + constexpr auto operator<=>(const LocalSourceRange& other) const = default; + constexpr std::uint32_t length() const { return end - begin; } diff --git a/tests/unit/index/project_index_tests.cpp b/tests/unit/index/project_index_tests.cpp index eb9f36663..afe699bc0 100644 --- a/tests/unit/index/project_index_tests.cpp +++ b/tests/unit/index/project_index_tests.cpp @@ -128,7 +128,7 @@ TEST_CASE(SerializationRoundTrip) { 
project.serialize(os); // Deserialize. - auto restored = index::ProjectIndex::from(buf.data()); + auto restored = index::ProjectIndex::from(buf.data(), buf.size()); // Path pools should match. ASSERT_EQ(project.path_pool.paths.size(), restored.path_pool.paths.size()); @@ -190,7 +190,7 @@ TEST_CASE(NameSurvivesRoundTrip) { llvm::SmallString<4096> buf; llvm::raw_svector_ostream os(buf); project.serialize(os); - auto restored = index::ProjectIndex::from(buf.data()); + auto restored = index::ProjectIndex::from(buf.data(), buf.size()); // Verify names survive round-trip. for(auto& [hash, symbol]: project.symbols) {