From 005fc9ac210ae5344546be426761d35fe02822ae Mon Sep 17 00:00:00 2001 From: ykiko Date: Sat, 11 Apr 2026 14:04:42 +0800 Subject: [PATCH 1/2] feat(index): optimize merge with ContextBitmap, hash caching, and selective merge Introduce ContextBitmap (tagged-pointer inline bitmap for <63 ids) to replace roaring::Roaring in MergedIndex hot paths, eliminating heap allocations in the common case. Cache FileIndex SHA256 hashes to avoid recomputation. Add selective ProjectIndex::merge that skips symbols not referencing newly-merged files. MergedIndex::merge now returns bool to indicate cache hits. Co-Authored-By: Claude Opus 4.6 --- CMakeLists.txt | 8 + benchmarks/index_benchmark.cpp | 662 ++++++++++++++++++++ src/index/merged_index.cpp | 79 +-- src/index/merged_index.h | 6 +- src/index/project_index.cpp | 36 ++ src/index/project_index.h | 6 + src/index/tu_index.cpp | 29 +- src/index/tu_index.h | 6 + src/support/bitmap.h | 162 +++++ tests/unit/index/merged_index_tests.cpp | 39 ++ tests/unit/index/project_index_tests.cpp | 52 ++ tests/unit/index/tu_index_tests.cpp | 12 + tests/unit/support/context_bitmap_tests.cpp | 254 ++++++++ 13 files changed, 1306 insertions(+), 45 deletions(-) create mode 100644 benchmarks/index_benchmark.cpp create mode 100644 tests/unit/support/context_bitmap_tests.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index d5c45d14b..64a788258 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -200,6 +200,14 @@ if(CLICE_ENABLE_BENCHMARK) "${PROJECT_SOURCE_DIR}/src" ) target_link_libraries(scan_benchmark PRIVATE clice::core kota::deco) + + add_executable(index_benchmark + "${PROJECT_SOURCE_DIR}/benchmarks/index_benchmark.cpp" + ) + target_include_directories(index_benchmark PRIVATE + "${PROJECT_SOURCE_DIR}/src" + ) + target_link_libraries(index_benchmark PRIVATE clice::core kota::deco) endif() if(CLICE_RELEASE) diff --git a/benchmarks/index_benchmark.cpp b/benchmarks/index_benchmark.cpp new file mode 100644 index 000000000..9c792c5c8 --- /dev/null +++ b/benchmarks/index_benchmark.cpp @@ -0,0 +1,662 @@ +/// Benchmark for TUIndex build, MergedIndex merge, and query performance. +/// +/// Usage: +/// index_benchmark [OPTIONS] +/// +/// Example: +/// ./build/RelWithDebInfo/bin/index_benchmark \ +/// /home/ykiko/C++/clice/.llvm/build-debug/compile_commands.json + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "command/command.h" +#include "compile/compilation.h" +#include "index/merged_index.h" +#include "index/project_index.h" +#include "index/tu_index.h" +#include "support/logging.h" + +#include "kota/deco/deco.h" +#include "kota/ipc/lsp/position.h" +#include "llvm/ADT/StringMap.h" +#include "llvm/Support/raw_ostream.h" + +using namespace clice; + +struct Options { + DecoKV(names = {"--log-level"}; help = "Log level"; required = false;) + log_level = "off"; + + DecoKV(names = {"--limit"}; help = "Max files to index (0 = all)"; required = false;) + limit = 100; + + DecoKV(names = {"--jobs", "-j"}; help = "Parallel compile jobs (0 = nproc)"; required = false;) + jobs = 0; + + DecoFlag(names = {"-h", "--help"}; help = "Show help"; required = false;) + help; + + DecoInput(meta_var = "CDB"; help = "Path to compile_commands.json"; required = false;) + cdb_path; +}; + +struct Timer { + std::chrono::steady_clock::time_point start = std::chrono::steady_clock::now(); + + double ms() const { + auto end = std::chrono::steady_clock::now(); + return std::chrono::duration(end - start).count(); + } + + void reset() { + start = std::chrono::steady_clock::now(); + } +}; + +struct IndexStats { + std::string file; + double compile_ms = 0; + double build_index_ms = 0; + double serialize_ms = 0; + std::size_t symbols = 0; + std::size_t occurrences = 0; + std::size_t relations = 0; + std::size_t serialized_bytes = 0; + std::size_t file_count = 0; +}; + +struct IndexResult { + IndexStats stats; + index::TUIndex tu_index; +}; + +struct MergeStats { + std::size_t total_files = 0; + std::size_t total_occurrences = 0; + std::size_t total_relations = 0; + double merge_ms = 0; + double serialize_ms = 0; + std::size_t total_serialized_bytes = 0; + std::size_t shard_count = 0; +}; + +void print_size(std::size_t bytes) { + if(bytes < 1024) + std::print("{}B", bytes); + else if(bytes < 1024 * 1024) + std::print("{:.1f}KB", bytes / 1024.0); + else + std::print("{:.1f}MB", bytes / (1024.0 * 1024.0)); +} + +/// Per-task: compile one file, build TUIndex, serialize, collect stats. +/// Everything here is thread-local, no shared state. +static std::optional index_one_file(CompilationDatabase& cdb, + const CompilationEntry& entry) { + auto file = cdb.resolve_path(entry.file); + auto cmds = cdb.lookup(file); + if(cmds.empty()) + return std::nullopt; + + auto& cmd = cmds[0]; + auto argv_vec = cmd.to_argv(); + + CompilationParams cp; + cp.kind = CompilationKind::Indexing; + cp.directory = cmd.resolved.directory; + for(auto* arg: argv_vec) + cp.arguments.push_back(arg); + + IndexResult result; + result.stats.file = std::string(file); + + Timer t; + auto unit = compile(cp); + result.stats.compile_ms = t.ms(); + + if(!unit.completed()) + return std::nullopt; + + t.reset(); + result.tu_index = index::TUIndex::build(unit); + result.stats.build_index_ms = t.ms(); + + result.stats.symbols = result.tu_index.symbols.size(); + result.stats.file_count = result.tu_index.file_indices.size() + 1; + for(auto& [fid, fi]: result.tu_index.file_indices) { + result.stats.occurrences += fi.occurrences.size(); + for(auto& [_, rels]: fi.relations) + result.stats.relations += rels.size(); + } + result.stats.occurrences += result.tu_index.main_file_index.occurrences.size(); + for(auto& [_, rels]: result.tu_index.main_file_index.relations) + result.stats.relations += rels.size(); + + t.reset(); + std::string serialized; + llvm::raw_string_ostream os(serialized); + result.tu_index.serialize(os); + result.stats.serialize_ms = t.ms(); + result.stats.serialized_bytes = serialized.size(); + + return result; +} + +int main(int argc, const char** argv) { + auto args = deco::util::argvify(argc, argv); + auto parsed = deco::cli::parse(args); + if(!parsed.has_value()) { + std::println(stderr, "Error: {}", parsed.error().message); + return 1; + } + + auto& opts = parsed->options; + if(opts.help.value_or(false) || !opts.cdb_path.has_value()) { + std::ostringstream oss; + deco::cli::write_usage_for(oss, "index_benchmark [OPTIONS] "); + std::print("{}", oss.str()); + return 0; + } + + auto level = spdlog::level::from_str(*opts.log_level); + clice::logging::options.level = level; + clice::logging::stderr_logger("index_benchmark", clice::logging::options); + + auto num_jobs = static_cast(*opts.jobs); + if(num_jobs == 0) + num_jobs = std::thread::hardware_concurrency(); + if(num_jobs == 0) + num_jobs = 4; + + // Load CDB. + CompilationDatabase cdb; + auto count = cdb.load(*opts.cdb_path); + std::println("CDB: {} entries from {}", count, *opts.cdb_path); + + auto entries = cdb.get_entries(); + auto limit = static_cast(*opts.limit); + if(limit > 0 && limit < entries.size()) + entries = entries.take_front(limit); + std::println("Indexing {} files with {} threads...\n", entries.size(), num_jobs); + + // Phase 1: Parallel compile & build TUIndex. + // Each thread picks tasks from a shared atomic counter. + // Results are written to a pre-sized vector (one slot per entry). + std::vector> results(entries.size()); + std::atomic next_task{0}; + std::atomic completed{0}; + + Timer phase1_wall; + + { + std::vector workers; + workers.reserve(num_jobs); + for(unsigned i = 0; i < num_jobs; i++) { + workers.emplace_back([&] { + while(true) { + auto idx = next_task.fetch_add(1); + if(idx >= entries.size()) + break; + results[idx] = index_one_file(cdb, entries[idx]); + auto done = completed.fetch_add(1) + 1; + if(done % 100 == 0) + std::println(" ... {}/{} done", done, entries.size()); + } + }); + } + } + + double phase1_wall_ms = phase1_wall.ms(); + + // Collect results on main thread. + std::vector all_stats; + std::vector> tu_indices; + all_stats.reserve(entries.size()); + tu_indices.reserve(entries.size()); + + double total_compile_ms = 0; + double total_build_ms = 0; + double total_serialize_ms = 0; + std::size_t total_serialized = 0; + + for(auto& r: results) { + if(!r) + continue; + total_compile_ms += r->stats.compile_ms; + total_build_ms += r->stats.build_index_ms; + total_serialize_ms += r->stats.serialize_ms; + total_serialized += r->stats.serialized_bytes; + tu_indices.emplace_back(std::move(r->stats.file), std::move(r->tu_index)); + all_stats.push_back(std::move(r->stats)); + } + + std::println("=== Phase 1: Compile & Build TUIndex ({} threads) ===", num_jobs); + std::println(" Files indexed: {}", all_stats.size()); + std::println(" Wall time: {:.0f}ms", phase1_wall_ms); + std::println(" Total compile: {:.0f}ms (sum of per-thread)", total_compile_ms); + std::println(" Total build idx: {:.0f}ms (sum of per-thread)", total_build_ms); + std::println(" Total serialize: {:.0f}ms (sum of per-thread)", total_serialize_ms); + std::print(" Total TUIndex size: "); + print_size(total_serialized); + std::println(""); + + // Size distribution. + if(!all_stats.empty()) { + std::vector sizes; + std::vector compile_times; + std::vector build_times; + for(auto& s: all_stats) { + sizes.push_back(s.serialized_bytes); + compile_times.push_back(s.compile_ms); + build_times.push_back(s.build_index_ms); + } + std::ranges::sort(sizes); + std::ranges::sort(compile_times); + std::ranges::sort(build_times); + auto p = [](auto& v, double pct) { + return v[static_cast(v.size() * pct)]; + }; + + std::println("\n TUIndex size distribution:"); + std::print(" p50="); + print_size(p(sizes, 0.5)); + std::print(" p90="); + print_size(p(sizes, 0.9)); + std::print(" p99="); + print_size(p(sizes, 0.99)); + std::print(" max="); + print_size(sizes.back()); + std::println(""); + + std::println(" Compile time distribution:"); + std::println(" p50={:.0f}ms p90={:.0f}ms p99={:.0f}ms max={:.0f}ms", + p(compile_times, 0.5), + p(compile_times, 0.9), + p(compile_times, 0.99), + compile_times.back()); + + std::println(" Build index time distribution:"); + std::println(" p50={:.1f}ms p90={:.1f}ms p99={:.1f}ms max={:.1f}ms", + p(build_times, 0.5), + p(build_times, 0.9), + p(build_times, 0.99), + build_times.back()); + } + + // Phase 2: Merge into MergedIndex shards (main thread). + std::println("\n=== Phase 2: Merge into MergedIndex ==="); + + index::ProjectIndex project_index; + llvm::DenseMap merged; + MergeStats merge_stats; + Timer merge_timer; + + llvm::DenseMap file_contents; + + auto read_content = [&](std::uint32_t path_id, llvm::StringRef path) -> llvm::StringRef { + auto it = file_contents.find(path_id); + if(it != file_contents.end()) + return it->second; + auto buf = llvm::MemoryBuffer::getFile(path); + if(!buf) + return {}; + auto [inserted, _] = file_contents.try_emplace(path_id, (*buf)->getBuffer().str()); + return inserted->second; + }; + + // Pre-compute SHA256 hashes (simulates offloading to worker threads). + // With cached hash, this warms the cache so MergedIndex::merge won't recompute. + Timer hash_timer; + std::size_t hash_count = 0; + for(auto& [file, tu_index]: tu_indices) { + for(auto& [fid, fi]: tu_index.file_indices) { + fi.hash(); + hash_count++; + } + tu_index.main_file_index.hash(); + hash_count++; + } + double precompute_hash_ms = hash_timer.ms(); + + // Merge with both optimizations: + // 1. SHA256 already cached from above (MergedIndex::merge uses cached values) + // 2. Selective ProjectIndex update (skip symbols only in cache-hit files) + double project_merge_full_ms = 0; + double project_merge_selective_ms = 0; + double merged_index_merge_ms = 0; + std::size_t cache_hits = 0; + std::size_t cache_misses = 0; + + for(auto& [file, tu_index]: tu_indices) { + auto& graph = tu_index.graph; + auto main_tu_path_id = static_cast(graph.paths.size() - 1); + + // First: MergedIndex merges to find out which files are new. + Bitmap new_file_ids; + Timer mt; + for(auto& [fid, fi]: tu_index.file_indices) { + auto tu_path_id = graph.path_id(fid); + auto global_path_id = project_index.path_pool.path_id(graph.paths[tu_path_id]); + auto content = read_content(global_path_id, graph.paths[tu_path_id]); + auto include_id = graph.include_location_id(fid); + bool hit = merged[global_path_id].merge(global_path_id, include_id, fi, content); + if(hit) { + cache_hits++; + } else { + cache_misses++; + new_file_ids.add(tu_path_id); + } + merge_stats.total_files++; + } + + // Main file is always new. + new_file_ids.add(main_tu_path_id); + auto global_main_id = project_index.path_pool.path_id(graph.paths[main_tu_path_id]); + auto content = read_content(global_main_id, graph.paths[main_tu_path_id]); + auto now = std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()); + + std::vector remapped_locs; + for(auto& loc: graph.locations) { + auto remapped = loc; + remapped.path_id = project_index.path_pool.path_id(graph.paths[loc.path_id]); + remapped_locs.push_back(remapped); + } + merged[global_main_id].merge(global_main_id, + now, + std::move(remapped_locs), + tu_index.main_file_index, + content); + cache_misses++; + merged_index_merge_ms += mt.ms(); + + // Then: selective ProjectIndex merge (only symbols in new files). + Timer pt; + project_index.merge(tu_index, new_file_ids); + project_merge_selective_ms += pt.ms(); + } + + merge_stats.merge_ms = merge_timer.ms(); + merge_stats.shard_count = merged.size(); + + // Serialize all shards to measure size. + merge_timer.reset(); + for(auto& [path, shard]: merged) { + std::string buf; + llvm::raw_string_ostream os(buf); + shard.serialize(os); + merge_stats.total_serialized_bytes += buf.size(); + } + merge_stats.serialize_ms = merge_timer.ms(); + + std::println(" Shards: {}", merge_stats.shard_count); + std::println(" SHA256 precompute (offloadable): {:.0f}ms ({} hashes)", + precompute_hash_ms, + hash_count); + std::println(" MergedIndex merge (cached hash): {:.0f}ms (hit: {}, miss: {})", + merged_index_merge_ms, + cache_hits, + cache_misses); + std::println(" ProjectIndex merge (selective): {:.0f}ms", project_merge_selective_ms); + std::println(" Main-thread total: {:.0f}ms ({:.1f}ms/TU)", + merged_index_merge_ms + project_merge_selective_ms, + (merged_index_merge_ms + project_merge_selective_ms) / tu_indices.size()); + std::println(" Serialize time: {:.0f}ms", merge_stats.serialize_ms); + std::print(" Total size: "); + print_size(merge_stats.total_serialized_bytes); + std::println(""); + + // Shard size distribution. + { + std::vector shard_sizes; + shard_sizes.reserve(merged.size()); + for(auto& [path, shard]: merged) { + std::string buf; + llvm::raw_string_ostream os(buf); + shard.serialize(os); + shard_sizes.push_back(buf.size()); + } + std::ranges::sort(shard_sizes); + if(!shard_sizes.empty()) { + auto p = [&](double pct) { + return shard_sizes[static_cast(shard_sizes.size() * pct)]; + }; + std::println("\n Shard size distribution:"); + std::print(" p50="); + print_size(p(0.5)); + std::print(" p90="); + print_size(p(0.9)); + std::print(" p99="); + print_size(p(0.99)); + std::print(" max="); + print_size(shard_sizes.back()); + std::println(""); + } + } + + // Phase 3: Query benchmark. + std::println("\n=== Phase 3: Query Benchmark ==="); + + // Collect some symbol hashes to query. + llvm::SmallVector sample_hashes; + for(auto& [file, tu_index]: tu_indices) { + for(auto& [hash, sym]: tu_index.symbols) { + sample_hashes.push_back(hash); + if(sample_hashes.size() >= 1000) + break; + } + if(sample_hashes.size() >= 1000) + break; + } + + // Occurrence lookup by offset (raw, no position conversion). + { + std::size_t hit_count = 0; + Timer t; + constexpr int rounds = 10; + for(int r = 0; r < rounds; r++) { + for(auto& [path, shard]: merged) { + for(std::uint32_t offset = 0; offset < 1000; offset += 50) { + shard.lookup(offset, [&](const index::Occurrence&) { + hit_count++; + return true; + }); + } + } + } + auto elapsed = t.ms(); + auto queries = merged.size() * 20 * rounds; + std::println(" Occurrence lookup (raw): {} queries in {:.1f}ms ({:.0f} q/ms, {} hits)", + queries, + elapsed, + queries / elapsed, + hit_count); + } + + // Relation lookup by symbol hash (raw, no position conversion). + { + std::size_t hit_count = 0; + Timer t; + constexpr int rounds = 10; + for(int r = 0; r < rounds; r++) { + for(auto& [path, shard]: merged) { + for(auto hash: sample_hashes) { + shard.lookup(hash, RelationKind::Definition, [&](const index::Relation&) { + hit_count++; + return true; + }); + } + } + } + auto elapsed = t.ms(); + auto queries = merged.size() * sample_hashes.size() * rounds; + std::println(" Relation lookup (raw): {} queries in {:.1f}ms ({:.0f} q/ms, {} hits)", + queries, + elapsed, + queries / elapsed, + hit_count); + } + + // PositionMapper construction benchmark. + { + using kota::ipc::lsp::PositionMapper; + using kota::ipc::lsp::PositionEncoding; + + std::size_t mapper_count = 0; + std::vector construct_times; + + for(auto& [path, shard]: merged) { + auto content = shard.content(); + if(content.empty()) + continue; + Timer t; + PositionMapper mapper(content, PositionEncoding::UTF16); + construct_times.push_back(t.ms()); + mapper_count++; + } + + std::ranges::sort(construct_times); + if(!construct_times.empty()) { + auto sum = std::accumulate(construct_times.begin(), construct_times.end(), 0.0); + auto p = [&](double pct) { + return construct_times[static_cast(construct_times.size() * pct)]; + }; + std::println("\n PositionMapper construction ({} shards):", mapper_count); + std::println(" Total: {:.1f}ms", sum); + std::println(" p50={:.3f}ms p90={:.3f}ms p99={:.3f}ms max={:.3f}ms", + p(0.5), + p(0.9), + p(0.99), + construct_times.back()); + } + } + + // Relation lookup with full position conversion (simulates real LSP query). + { + using kota::ipc::lsp::PositionMapper; + using kota::ipc::lsp::PositionEncoding; + + // Pre-build mappers for all shards. + llvm::DenseMap mappers; + for(auto& [path_id, shard]: merged) { + auto content = shard.content(); + if(!content.empty()) + mappers.try_emplace(path_id, content, PositionEncoding::UTF16); + } + + std::size_t hit_count = 0; + std::size_t convert_count = 0; + Timer t; + constexpr int rounds = 10; + for(int r = 0; r < rounds; r++) { + for(auto& [path_id, shard]: merged) { + auto mapper_it = mappers.find(path_id); + if(mapper_it == mappers.end()) + continue; + auto& mapper = mapper_it->second; + for(auto hash: sample_hashes) { + shard.lookup(hash, RelationKind::Definition, [&](const index::Relation& rel) { + auto start = mapper.to_position(rel.range.begin); + auto end = mapper.to_position(rel.range.end); + if(start && end) + convert_count++; + hit_count++; + return true; + }); + } + } + } + auto elapsed = t.ms(); + auto queries = merged.size() * sample_hashes.size() * rounds; + std::println("\n Relation lookup + to_position:"); + std::println(" {} queries in {:.1f}ms ({:.0f} q/ms, {} hits, {} converted)", + queries, + elapsed, + queries / elapsed, + hit_count, + convert_count); + } + + // Phase 4: Realistic "find references" for specific high-frequency symbols. + std::println("=== Phase 4: Realistic Find-References ==="); + { + using kota::ipc::lsp::PositionMapper; + using kota::ipc::lsp::PositionEncoding; + + // Use the properly merged ProjectIndex symbol table. + auto& symbol_table = project_index.symbols; + + // Find symbols by name to query. Collect ALL hashes whose name contains the target. + llvm::StringRef targets[] = {"vector", + "StringRef", + "SourceLocation", + "DenseMap", + "SmallVector"}; + for(auto target_name: targets) { + llvm::SmallVector matching_hashes; + std::size_t ref_file_count = 0; + for(auto& [hash, sym]: symbol_table) { + if(sym.name == target_name) { + matching_hashes.push_back(hash); + ref_file_count += sym.reference_files.cardinality(); + } + } + + if(matching_hashes.empty()) { + std::println(" {}: not found in symbol table", target_name); + continue; + } + + // Query all shards for this symbol's references (simulating real find-references). + std::size_t total_refs = 0; + std::size_t shards_queried = 0; + RelationKind kinds[] = { + RelationKind::Reference, + RelationKind::Definition, + RelationKind::Declaration, + }; + Timer t; + for(auto& [path, shard]: merged) { + bool found = false; + for(auto target_hash: matching_hashes) { + for(auto kind: kinds) { + shard.lookup(target_hash, kind, [&](const index::Relation&) { + total_refs++; + found = true; + return true; + }); + } + } + if(found) + shards_queried++; + } + auto elapsed = t.ms(); + + std::println( + " {} ({} hashes, {} ref_files): {} refs across {} shards, " "scanned {} shards in {:.2f}ms", + target_name, + matching_hashes.size(), + ref_file_count, + total_refs, + shards_queried, + merged.size(), + elapsed); + + // Now with reference_files bitmap to skip irrelevant shards. + // We don't have path_id mapping here, so just measure the full scan above + // vs a targeted scan using reference_files count as an estimate. + } + } + + std::println(""); + return 0; +} diff --git a/src/index/merged_index.cpp b/src/index/merged_index.cpp index b8c2dbbf2..3ff555874 100644 --- a/src/index/merged_index.cpp +++ b/src/index/merged_index.cpp @@ -129,18 +129,19 @@ struct MergedIndex::Impl { std::vector canonical_ref_counts; /// The canonical id set of removed index. - roaring::Roaring removed; + ContextBitmap removed; /// All merged symbol occurrences. - llvm::DenseMap occurrences; + llvm::DenseMap occurrences; /// All merged symbol relations. - llvm::DenseMap> relations; + llvm::DenseMap> relations; /// Sorted occurrences cache for fast lookup. std::vector occurrences_cache; - void merge(this Impl& self, std::uint32_t path_id, FileIndex& index, auto&& add_context) { + /// Returns true if this was a cache hit (no new data inserted). + bool merge(this Impl& self, std::uint32_t path_id, FileIndex& index, auto&& add_context) { auto hash = index.hash(); auto hash_key = llvm::StringRef(reinterpret_cast(hash.data()), hash.size()); auto [it, success] = self.canonical_cache.try_emplace(hash_key, self.max_canonical_id); @@ -151,7 +152,7 @@ struct MergedIndex::Impl { if(!success) { self.canonical_ref_counts[canonical_id] += 1; self.removed.remove(canonical_id); - return; + return true; } for(auto& occurrence: index.occurrences) { @@ -167,6 +168,7 @@ struct MergedIndex::Impl { self.canonical_ref_counts.emplace_back(1); self.max_canonical_id += 1; + return false; } friend bool operator==(const Impl&, const Impl&) = default; @@ -237,19 +239,20 @@ void MergedIndex::load_in_memory(this Self& self) { // Deserialize removed bitmap. if(root->removed() && root->removed()->size() > 0) { - index.removed = read_bitmap(root->removed()); + index.removed = ContextBitmap::from_roaring(read_bitmap(root->removed())); } for(auto entry: *root->occurrences()) { index.occurrences.try_emplace(*safe_cast(entry->occurrence()), - read_bitmap(entry->context())); + ContextBitmap::from_roaring(read_bitmap(entry->context()))); } for(auto entry: *root->relations()) { auto& relations = index.relations[entry->symbol()]; for(auto relation_entry: *entry->relations()) { - relations.try_emplace(*safe_cast(relation_entry->relation()), - read_bitmap(relation_entry->context())); + relations.try_emplace( + *safe_cast(relation_entry->relation()), + ContextBitmap::from_roaring(read_bitmap(relation_entry->context()))); } } @@ -310,17 +313,22 @@ void MergedIndex::serialize(this const Self& self, llvm::raw_ostream& out) { CreateStructVector(builder, context.include_locations)); }); + auto serialize_ctx_bitmap = [&](const ContextBitmap& ctx) { + auto r = ctx.to_roaring(); + buffer.clear(); + buffer.resize_for_overwrite(r.getSizeInBytes(false)); + r.write(buffer.data(), false); + return CreateVector(builder, buffer); + }; + llvm::SmallVector occurrence_keys; occurrence_keys.reserve(index->occurrences.size()); auto occurrences = transform(index->occurrences, [&](auto&& value) { auto&& [occurrence, bitmap] = value; - buffer.clear(); - buffer.resize_for_overwrite(bitmap.getSizeInBytes(false)); - bitmap.write(buffer.data(), false); occurrence_keys.emplace_back(&occurrence); return binary::CreateOccurrenceEntry(builder, safe_cast(&occurrence), - CreateVector(builder, buffer)); + serialize_ctx_bitmap(bitmap)); }); std::ranges::sort(std::views::zip(occurrence_keys, occurrences), [](auto lhs, auto rhs) { const auto& lo = *std::get<0>(lhs); @@ -335,12 +343,9 @@ void MergedIndex::serialize(this const Self& self, llvm::raw_ostream& out) { auto&& [symbol_id, symbol_relations] = value; auto relations = transform(symbol_relations, [&](auto&& value) { auto&& [relation, bitmap] = value; - buffer.clear(); - buffer.resize_for_overwrite(bitmap.getSizeInBytes(false)); - bitmap.write(buffer.data(), false); return binary::CreateRelationEntry(builder, safe_cast(&relation), - CreateVector(builder, buffer)); + serialize_ctx_bitmap(bitmap)); }); relation_keys.emplace_back(symbol_id); return binary::CreateSymbolRelationsEntry(builder, @@ -353,9 +358,10 @@ void MergedIndex::serialize(this const Self& self, llvm::raw_ostream& out) { // Serialize removed bitmap. buffer.clear(); - if(!index->removed.isEmpty()) { - buffer.resize_for_overwrite(index->removed.getSizeInBytes(false)); - index->removed.write(buffer.data(), false); + if(!index->removed.is_empty()) { + auto r = index->removed.to_roaring(); + buffer.resize_for_overwrite(r.getSizeInBytes(false)); + r.write(buffer.data(), false); } auto removed = CreateVector(builder, buffer); @@ -398,14 +404,12 @@ void MergedIndex::lookup(this const Self& self, while(it != occurrences.end()) { if(it->range.contains(offset)) { // Skip occurrences whose canonical_ids are all removed. - if(!index.removed.isEmpty()) { + if(!index.removed.is_empty()) { auto bitmap_it = index.occurrences.find(*it); - if(bitmap_it != index.occurrences.end()) { - auto remaining = bitmap_it->second - index.removed; - if(remaining.isEmpty()) { - it++; - continue; - } + if(bitmap_it != index.occurrences.end() && + !bitmap_it->second.any_not_in(index.removed)) { + ++it; + continue; } } @@ -413,7 +417,7 @@ void MergedIndex::lookup(this const Self& self, break; } - it++; + ++it; continue; } @@ -434,7 +438,7 @@ void MergedIndex::lookup(this const Self& self, break; } - it++; + ++it; continue; } @@ -457,11 +461,8 @@ void MergedIndex::lookup(this const Self& self, for(auto& [relation, bitmap]: relations) { if(relation.kind & kind) { // Skip relations whose canonical_ids are all removed. - if(!self.impl->removed.isEmpty()) { - auto remaining = bitmap - self.impl->removed; - if(remaining.isEmpty()) { - continue; - } + if(!self.impl->removed.is_empty() && !bitmap.any_not_in(self.impl->removed)) { + continue; } if(!callback(relation)) { @@ -579,7 +580,7 @@ void MergedIndex::remove(this Self& self, std::uint32_t path_id) { index.occurrences_cache.clear(); } -void MergedIndex::merge(this Self& self, +bool MergedIndex::merge(this Self& self, std::uint32_t path_id, std::chrono::milliseconds build_at, std::vector include_locations, @@ -587,16 +588,17 @@ void MergedIndex::merge(this Self& self, llvm::StringRef content) { self.load_in_memory(); self.impl->content = content.str(); - self.impl->merge(path_id, index, [&](Impl& self, std::uint32_t canonical_id) { + bool hit = self.impl->merge(path_id, index, [&](Impl& self, std::uint32_t canonical_id) { auto& context = self.compilation_contexts[path_id]; context.canonical_id = canonical_id; context.build_at = build_at.count(); context.include_locations = std::move(include_locations); }); self.impl->occurrences_cache.clear(); + return hit; } -void MergedIndex::merge(this Self& self, +bool MergedIndex::merge(this Self& self, std::uint32_t path_id, std::uint32_t include_id, FileIndex& index, @@ -605,11 +607,12 @@ void MergedIndex::merge(this Self& self, if(self.impl->content.empty() && !content.empty()) { self.impl->content = content.str(); } - self.impl->merge(path_id, index, [&](Impl& self, std::uint32_t canonical_id) { + bool hit = self.impl->merge(path_id, index, [&](Impl& self, std::uint32_t canonical_id) { auto& context = self.header_contexts[path_id]; context.includes.emplace_back(include_id, canonical_id); }); self.impl->occurrences_cache.clear(); + return hit; } llvm::StringRef MergedIndex::content(this const Self& self) { diff --git a/src/index/merged_index.h b/src/index/merged_index.h index 2e4bc375e..b356356b6 100644 --- a/src/index/merged_index.h +++ b/src/index/merged_index.h @@ -68,7 +68,8 @@ class MergedIndex { llvm::StringRef content(this const Self& self); /// Merge the index with given compilation context. - void merge(this Self& self, + /// Returns true if this was a cache hit (no new data inserted). + bool merge(this Self& self, std::uint32_t path_id, std::chrono::milliseconds build_at, std::vector include_locations, @@ -76,7 +77,8 @@ class MergedIndex { llvm::StringRef content); /// Merge the index with given header context. - void merge(this Self& self, + /// Returns true if this was a cache hit (no new data inserted). + bool merge(this Self& self, std::uint32_t path_id, std::uint32_t include_id, FileIndex& index, diff --git a/src/index/project_index.cpp b/src/index/project_index.cpp index 087413838..2f3fafabe 100644 --- a/src/index/project_index.cpp +++ b/src/index/project_index.cpp @@ -27,6 +27,42 @@ llvm::SmallVector ProjectIndex::merge(this ProjectIndex& self, TU return file_ids_map; } +llvm::SmallVector ProjectIndex::merge(this ProjectIndex& self, + TUIndex& index, + const Bitmap& new_file_ids) { + auto& paths = index.graph.paths; + llvm::SmallVector file_ids_map; + file_ids_map.resize_for_overwrite(paths.size()); + + for(std::uint32_t i = 0; i < paths.size(); i++) { + file_ids_map[i] = self.path_pool.path_id(paths[i]); + } + + for(auto& [symbol_id, symbol]: index.symbols) { + // Skip symbols that don't reference any new file. + bool references_new = false; + for(auto ref: symbol.reference_files) { + if(new_file_ids.contains(ref)) { + references_new = true; + break; + } + } + if(!references_new) + continue; + + auto& target_symbol = self.symbols[symbol_id]; + if(target_symbol.name.empty()) { + target_symbol.name = symbol.name; + target_symbol.kind = symbol.kind; + } + for(auto ref: symbol.reference_files) { + target_symbol.reference_files.add(file_ids_map[ref]); + } + } + + return file_ids_map; +} + void ProjectIndex::serialize(this ProjectIndex& self, llvm::raw_ostream& os) { fbs::FlatBufferBuilder builder(1024); diff --git a/src/index/project_index.h b/src/index/project_index.h index 12bbebe01..259bd404c 100644 --- a/src/index/project_index.h +++ b/src/index/project_index.h @@ -82,6 +82,12 @@ struct ProjectIndex { llvm::SmallVector merge(this ProjectIndex& self, TUIndex& index); + /// Selective merge: only update symbols referenced by files in new_file_ids. + /// new_file_ids contains TU-local path indices for cache-miss FileIndices. + llvm::SmallVector merge(this ProjectIndex& self, + TUIndex& index, + const Bitmap& new_file_ids); + void serialize(this ProjectIndex& self, llvm::raw_ostream& os); static ProjectIndex from(const void* data); diff --git a/src/index/tu_index.cpp b/src/index/tu_index.cpp index 4ba6b1d3c..0049f133c 100644 --- a/src/index/tu_index.cpp +++ b/src/index/tu_index.cpp @@ -1,5 +1,6 @@ #include "index/tu_index.h" +#include #include #include "index/serialization.h" @@ -160,6 +161,9 @@ class Builder : public SemanticVisitor { } // namespace std::array FileIndex::hash() { + if(cached_hash) + return *cached_hash; + llvm::SHA256 hasher; using u8 = std::uint8_t; @@ -172,20 +176,35 @@ std::array FileIndex::hash() { hasher.update(llvm::ArrayRef(data, size)); } - for(auto& [symbol_id, relations]: relations) { + // Sort keys to ensure deterministic iteration order over DenseMap. + llvm::SmallVector sorted_keys; + sorted_keys.reserve(relations.size()); + for(auto& [symbol_id, _]: relations) { + sorted_keys.push_back(symbol_id); + } + std::ranges::sort(sorted_keys); + + for(auto symbol_id: sorted_keys) { hasher.update(std::bit_cast>(symbol_id)); static_assert(sizeof(Relation) == sizeof(RelationKind) + 4 + sizeof(Range) + sizeof(SymbolHash)); static_assert(sizeof(Relation) % 8 == 0); - if(!relations.empty()) { - auto data = reinterpret_cast(relations.data()); - auto size = relations.size() * sizeof(Relation); + auto& rels = relations[symbol_id]; + std::ranges::sort(rels, [](const Relation& lhs, const Relation& rhs) { + return std::tuple(lhs.kind.value(), lhs.range.begin, lhs.range.end, lhs.target_symbol) < + std::tuple(rhs.kind.value(), rhs.range.begin, rhs.range.end, rhs.target_symbol); + }); + + if(!rels.empty()) { + auto data = reinterpret_cast(rels.data()); + auto size = rels.size() * sizeof(Relation); hasher.update(llvm::ArrayRef(data, size)); } } - return hasher.final(); + cached_hash = hasher.final(); + return *cached_hash; } TUIndex TUIndex::build(CompilationUnitRef unit, bool interested_only) { diff --git a/src/index/tu_index.h b/src/index/tu_index.h index 65e905009..0666e4b9a 100644 --- a/src/index/tu_index.h +++ b/src/index/tu_index.h @@ -4,6 +4,7 @@ #include #include #include +#include #include #include @@ -52,7 +53,12 @@ struct FileIndex { std::vector occurrences; + /// Compute (and cache) SHA256 hash of this index's content. + /// Each FileIndex instance must only be hashed by one thread at a time. std::array hash(); + +private: + std::optional> cached_hash; }; struct Symbol { diff --git a/src/support/bitmap.h b/src/support/bitmap.h index c2209ad23..7e0a33ba2 100644 --- a/src/support/bitmap.h +++ b/src/support/bitmap.h @@ -1,6 +1,9 @@ #pragma once +#include +#include #include +#include #define ROARING_EXCEPTIONS 0 #define ROARING_TERMINATE(message) std::abort() @@ -10,4 +13,163 @@ namespace clice { using Bitmap = roaring::Roaring; +/// Compact bitmap optimized for small canonical-id sets. +/// +/// Uses a tagged pointer scheme in a single uint64_t: +/// - Low bit 1: inline mode, bits 1-63 store the bitmap (ids 0-62) +/// - Low bit 0: heap mode, the value is a roaring::Roaring* (aligned, so low bit is 0) +/// +/// Default state is inline-empty (data == 1). Upgrades to heap when an id >= 63 +/// is added. The common case (< 63 canonical ids per MergedIndex) stays entirely +/// inline with zero heap allocation and single-instruction bitwise operations. +class ContextBitmap { + constexpr static std::uint32_t inline_capacity = 63; + constexpr static std::uint64_t inline_tag = 1; + + std::uint64_t data = inline_tag; + + bool is_inline() const { + return data & inline_tag; + } + + roaring::Roaring* as_heap() const { + return reinterpret_cast(data); + } + + std::uint64_t bits() const { + return data >> 1; + } + + void free_heap() { + if(!is_inline()) + delete as_heap(); + } + + void upgrade(std::uint32_t new_id) { + auto* r = new roaring::Roaring(); + auto b = bits(); + while(b) { + r->add(static_cast(std::countr_zero(b))); + b &= b - 1; + } + r->add(new_id); + data = reinterpret_cast(r); + } + +public: + ContextBitmap() = default; + + ~ContextBitmap() { + free_heap(); + } + + ContextBitmap(ContextBitmap&& other) noexcept : data(other.data) { + other.data = inline_tag; + } + + ContextBitmap& operator=(ContextBitmap&& other) noexcept { + if(this != &other) { + free_heap(); + data = other.data; + other.data = inline_tag; + } + return *this; + } + + ContextBitmap(const ContextBitmap&) = delete; + ContextBitmap& operator=(const ContextBitmap&) = delete; + + void add(std::uint32_t id) { + if(is_inline()) { + if(id < inline_capacity) { + data |= (1ULL << (id + 1)); + } else { + upgrade(id); + } + } else { + as_heap()->add(id); + } + } + + void remove(std::uint32_t id) { + if(is_inline()) { + if(id < inline_capacity) + data &= ~(1ULL << (id + 1)); + } else { + as_heap()->remove(id); + } + } + + bool is_empty() const { + if(is_inline()) + return bits() == 0; + return as_heap()->isEmpty(); + } + + /// Check if (this - other) has any bits set, without allocating in the common inline case. + bool any_not_in(const ContextBitmap& other) const { + if(is_inline() && other.is_inline()) + return (bits() & ~other.bits()) != 0; + + // Rare: at least one side is heap. Use roaring operations. + if(is_inline()) { + auto b = bits(); + while(b) { + auto i = static_cast(std::countr_zero(b)); + if(!other.as_heap()->contains(i)) + return true; + b &= b - 1; + } + return false; + } + + if(other.is_inline()) { + for(auto v: *as_heap()) { + if(v >= inline_capacity || !(other.bits() & (1ULL << v))) + return true; + } + return false; + } + + return !(*as_heap() - *other.as_heap()).isEmpty(); + } + + friend bool operator==(const ContextBitmap& lhs, const ContextBitmap& rhs) { + if(lhs.is_inline() && rhs.is_inline()) + return lhs.data == rhs.data; + if(lhs.is_inline() != rhs.is_inline()) + return false; + return *lhs.as_heap() == *rhs.as_heap(); + } + + /// Convert to roaring::Roaring (for serialization). + roaring::Roaring to_roaring() const { + roaring::Roaring r; + if(is_inline()) { + auto b = bits(); + while(b) { + r.add(static_cast(std::countr_zero(b))); + b &= b - 1; + } + } else { + r = *as_heap(); + } + return r; + } + + /// Construct from roaring::Roaring (for deserialization). + static ContextBitmap from_roaring(const roaring::Roaring& r) { + ContextBitmap result; + if(r.isEmpty()) + return result; + if(r.maximum() < inline_capacity) { + for(auto v: r) + result.data |= (1ULL << (v + 1)); + } else { + result.data = reinterpret_cast(new roaring::Roaring(r)); + } + return result; + } +}; + } // namespace clice diff --git a/tests/unit/index/merged_index_tests.cpp b/tests/unit/index/merged_index_tests.cpp index 88bc1a5bb..2cf61fdaf 100644 --- a/tests/unit/index/merged_index_tests.cpp +++ b/tests/unit/index/merged_index_tests.cpp @@ -357,6 +357,45 @@ TEST_CASE(CacheInvalidatedAfterMerge) { ASSERT_TRUE(found_second); } +TEST_CASE(MergeCacheHit) { + add_file("header.h", R"( + #pragma once + inline int shared() { return 1; } + )"); + add_main("a.cpp", R"( + #include "header.h" + int use_a() { return shared(); } + )"); + ASSERT_TRUE(compile()); + auto tu_a = index::TUIndex::build(*unit); + + add_file("header.h", R"( + #pragma once + inline int shared() { return 1; } + )"); + add_main("b.cpp", R"( + #include "header.h" + int use_b() { return shared(); } + )"); + ASSERT_TRUE(compile()); + auto tu_b = index::TUIndex::build(*unit); + + // First merge of a FileIndex should return false (new data). + index::MergedIndex merged; + bool first_hit = false; + for(auto& [fid, file_index]: tu_a.file_indices) { + first_hit = merged.merge(0, tu_a.graph.include_location_id(fid), file_index, {}); + } + ASSERT_FALSE(first_hit); + + // Second merge with identical content should return true (cache hit). + bool second_hit = false; + for(auto& [fid, file_index]: tu_b.file_indices) { + second_hit = merged.merge(1, tu_b.graph.include_location_id(fid), file_index, {}); + } + ASSERT_TRUE(second_hit); +} + }; // TEST_SUITE(MergedIndex) } // namespace } // namespace clice::testing diff --git a/tests/unit/index/project_index_tests.cpp b/tests/unit/index/project_index_tests.cpp index eb9f36663..fd1b85cce 100644 --- a/tests/unit/index/project_index_tests.cpp +++ b/tests/unit/index/project_index_tests.cpp @@ -200,6 +200,58 @@ TEST_CASE(NameSurvivesRoundTrip) { } } +TEST_CASE(SelectiveMergeFilters) { + add_file("header.h", R"( + #pragma once + inline int shared() { return 1; } + )"); + add_main("main.cpp", R"( + #include "header.h" + int use() { return shared(); } + )"); + ASSERT_TRUE(compile()); + auto tu = index::TUIndex::build(*unit); + + // Build a bitmap containing only the main file's path index. + auto main_path_id = static_cast(tu.graph.paths.size() - 1); + Bitmap new_file_ids; + new_file_ids.add(main_path_id); + + index::ProjectIndex project; + project.merge(tu, new_file_ids); + + // Symbols that reference the main file should be present. + bool has_main_symbols = false; + for(auto& [hash, symbol]: tu.symbols) { + for(auto ref: symbol.reference_files) { + if(ref == main_path_id && project.symbols.contains(hash)) { + has_main_symbols = true; + // ALL reference_files should be added, not just the new ones. + auto& proj_sym = project.symbols[hash]; + ASSERT_TRUE(proj_sym.reference_files.cardinality() >= + symbol.reference_files.cardinality()); + break; + } + } + } + ASSERT_TRUE(has_main_symbols); +} + +TEST_CASE(SelectiveMergeEmpty) { + index::TUIndex tu; + ASSERT_TRUE(build_and_index(R"( + int foo() { return 42; } + )", + tu)); + + Bitmap empty_bitmap; + index::ProjectIndex project; + project.merge(tu, empty_bitmap); + + // No symbols should be added when the bitmap is empty. + ASSERT_TRUE(project.symbols.empty()); +} + }; // TEST_SUITE(ProjectIndex) } // namespace } // namespace clice::testing diff --git a/tests/unit/index/tu_index_tests.cpp b/tests/unit/index/tu_index_tests.cpp index 8deadbc2f..d425892e2 100644 --- a/tests/unit/index/tu_index_tests.cpp +++ b/tests/unit/index/tu_index_tests.cpp @@ -500,6 +500,18 @@ TEST_CASE(SymbolKinds) { check_kind("ns", SymbolKind::Namespace); } +TEST_CASE(HashCaching) { + build_index(R"( + int foo() { return 42; } + int bar() { return foo() + 1; } + )"); + + auto& index = tu_index.main_file_index; + auto hash1 = index.hash(); + auto hash2 = index.hash(); + ASSERT_EQ(hash1, hash2); +} + }; // TEST_SUITE(TUIndex) } // namespace } // namespace clice::testing diff --git a/tests/unit/support/context_bitmap_tests.cpp b/tests/unit/support/context_bitmap_tests.cpp new file mode 100644 index 000000000..1d3885629 --- /dev/null +++ b/tests/unit/support/context_bitmap_tests.cpp @@ -0,0 +1,254 @@ +#include "test/test.h" +#include "support/bitmap.h" + +namespace clice::testing { +namespace { + +TEST_SUITE(ContextBitmap) { + +TEST_CASE(EmptyByDefault) { + ContextBitmap bm; + EXPECT_TRUE(bm.is_empty()); +} + +TEST_CASE(AddInline) { + ContextBitmap bm; + bm.add(0); + EXPECT_FALSE(bm.is_empty()); + bm.add(62); + EXPECT_FALSE(bm.is_empty()); + + auto r = bm.to_roaring(); + EXPECT_TRUE(r.contains(0)); + EXPECT_TRUE(r.contains(62)); + EXPECT_FALSE(r.contains(1)); + EXPECT_EQ(r.cardinality(), 2U); +} + +TEST_CASE(RemoveInline) { + ContextBitmap bm; + bm.add(5); + bm.add(10); + bm.remove(5); + + auto r = bm.to_roaring(); + EXPECT_FALSE(r.contains(5)); + EXPECT_TRUE(r.contains(10)); + EXPECT_EQ(r.cardinality(), 1U); + + bm.remove(10); + EXPECT_TRUE(bm.is_empty()); +} + +TEST_CASE(UpgradeToHeap) { + ContextBitmap bm; + bm.add(5); + bm.add(63); + + EXPECT_FALSE(bm.is_empty()); + auto r = bm.to_roaring(); + EXPECT_TRUE(r.contains(5)); + EXPECT_TRUE(r.contains(63)); + EXPECT_EQ(r.cardinality(), 2U); +} + +TEST_CASE(HeapAddRemove) { + ContextBitmap bm; + bm.add(100); + bm.add(200); + bm.remove(100); + + auto r = bm.to_roaring(); + EXPECT_FALSE(r.contains(100)); + EXPECT_TRUE(r.contains(200)); +} + +TEST_CASE(AnyNotInBothInline) { + ContextBitmap a, b; + a.add(1); + a.add(2); + b.add(1); + b.add(2); + EXPECT_FALSE(a.any_not_in(b)); + + a.add(3); + EXPECT_TRUE(a.any_not_in(b)); +} + +TEST_CASE(AnyNotInInlineVsHeap) { + ContextBitmap a, b; + a.add(5); + b.add(5); + b.add(100); + EXPECT_FALSE(a.any_not_in(b)); + + a.add(10); + EXPECT_TRUE(a.any_not_in(b)); +} + +TEST_CASE(AnyNotInHeapVsInline) { + ContextBitmap a, b; + a.add(100); + b.add(5); + // 100 >= 63 not in inline b + EXPECT_TRUE(a.any_not_in(b)); + + ContextBitmap c, d; + c.add(5); + c.add(100); + d.add(5); + // c has 100 which d doesn't have + EXPECT_TRUE(c.any_not_in(d)); +} + +TEST_CASE(AnyNotInBothHeap) { + ContextBitmap a, b; + a.add(100); + a.add(200); + b.add(100); + b.add(200); + EXPECT_FALSE(a.any_not_in(b)); + + a.add(300); + EXPECT_TRUE(a.any_not_in(b)); +} + +TEST_CASE(MoveConstruct) { + ContextBitmap a; + a.add(5); + a.add(10); + ContextBitmap b(std::move(a)); + + EXPECT_TRUE(a.is_empty()); + EXPECT_FALSE(b.is_empty()); + auto r = b.to_roaring(); + EXPECT_TRUE(r.contains(5)); + EXPECT_TRUE(r.contains(10)); +} + +TEST_CASE(MoveConstructHeap) { + ContextBitmap a; + a.add(100); + ContextBitmap b(std::move(a)); + + EXPECT_TRUE(a.is_empty()); + auto r = b.to_roaring(); + EXPECT_TRUE(r.contains(100)); +} + +TEST_CASE(MoveAssign) { + ContextBitmap a, b; + a.add(3); + b.add(7); + b = std::move(a); + + EXPECT_TRUE(a.is_empty()); + auto r = b.to_roaring(); + EXPECT_TRUE(r.contains(3)); + EXPECT_FALSE(r.contains(7)); +} + +TEST_CASE(FromRoaringInline) { + roaring::Roaring r; + r.add(0); + r.add(30); + r.add(62); + auto bm = ContextBitmap::from_roaring(r); + + EXPECT_FALSE(bm.is_empty()); + auto out = bm.to_roaring(); + EXPECT_EQ(out.cardinality(), 3U); + EXPECT_TRUE(out.contains(0)); + EXPECT_TRUE(out.contains(30)); + EXPECT_TRUE(out.contains(62)); +} + +TEST_CASE(FromRoaringHeap) { + roaring::Roaring r; + r.add(5); + r.add(100); + auto bm = ContextBitmap::from_roaring(r); + + auto out = bm.to_roaring(); + EXPECT_EQ(out.cardinality(), 2U); + EXPECT_TRUE(out.contains(5)); + EXPECT_TRUE(out.contains(100)); +} + +TEST_CASE(FromRoaringEmpty) { + roaring::Roaring r; + auto bm = ContextBitmap::from_roaring(r); + EXPECT_TRUE(bm.is_empty()); +} + +TEST_CASE(Equality) { + ContextBitmap a, b; + a.add(1); + a.add(2); + b.add(1); + b.add(2); + EXPECT_TRUE(a == b); + + b.add(3); + EXPECT_FALSE(a == b); +} + +TEST_CASE(BoundaryId62) { + ContextBitmap bm; + bm.add(62); + auto r = bm.to_roaring(); + EXPECT_TRUE(r.contains(62)); + EXPECT_EQ(r.cardinality(), 1U); +} + +TEST_CASE(BoundaryId63) { + ContextBitmap bm; + bm.add(63); + auto r = bm.to_roaring(); + EXPECT_TRUE(r.contains(63)); + EXPECT_EQ(r.cardinality(), 1U); +} + +TEST_CASE(MoveAssignHeap) { + ContextBitmap a, b; + a.add(100); + a.add(200); + b.add(300); + b.add(400); + b = std::move(a); + + EXPECT_TRUE(a.is_empty()); + auto r = b.to_roaring(); + EXPECT_TRUE(r.contains(100)); + EXPECT_TRUE(r.contains(200)); + EXPECT_FALSE(r.contains(300)); + EXPECT_FALSE(r.contains(400)); +} + +TEST_CASE(EqualityHeap) { + ContextBitmap a, b; + a.add(100); + a.add(200); + b.add(100); + b.add(200); + EXPECT_TRUE(a == b); + + b.add(300); + EXPECT_FALSE(a == b); +} + +TEST_CASE(AddSmallAfterUpgrade) { + ContextBitmap bm; + bm.add(100); + bm.add(5); + + auto r = bm.to_roaring(); + EXPECT_TRUE(r.contains(100)); + EXPECT_TRUE(r.contains(5)); + EXPECT_EQ(r.cardinality(), 2U); +} + +}; // TEST_SUITE(ContextBitmap) + +} // namespace +} // namespace clice::testing From e1dde9ee92697b03720f204a1de1ed274610932c Mon Sep 17 00:00:00 2001 From: ykiko Date: Sat, 11 Apr 2026 15:00:07 +0800 Subject: [PATCH 2/2] fix(index): address pre-PR review findings - Fix use-after-partial-move and division-by-zero in benchmark - Use std::exchange in ContextBitmap move operations - Add clarifying comments on operator== mixed mode and hash() sorting - Add tests: compilation-context merge cache hit, hash determinism, any_not_in empty inputs, remove edge cases Co-Authored-By: Claude Opus 4.6 --- benchmarks/index_benchmark.cpp | 10 +++-- src/index/tu_index.cpp | 3 +- src/support/bitmap.h | 9 ++--- tests/unit/index/merged_index_tests.cpp | 23 +++++++++++ tests/unit/index/tu_index_tests.cpp | 18 +++++++++ tests/unit/support/context_bitmap_tests.cpp | 43 +++++++++++++++++++++ 6 files changed, 96 insertions(+), 10 deletions(-) diff --git a/benchmarks/index_benchmark.cpp b/benchmarks/index_benchmark.cpp index 9c792c5c8..cd25e388a 100644 --- a/benchmarks/index_benchmark.cpp +++ b/benchmarks/index_benchmark.cpp @@ -235,8 +235,8 @@ int main(int argc, const char** argv) { total_build_ms += r->stats.build_index_ms; total_serialize_ms += r->stats.serialize_ms; total_serialized += r->stats.serialized_bytes; + all_stats.push_back(r->stats); tu_indices.emplace_back(std::move(r->stats.file), std::move(r->tu_index)); - all_stats.push_back(std::move(r->stats)); } std::println("=== Phase 1: Compile & Build TUIndex ({} threads) ===", num_jobs); @@ -407,9 +407,11 @@ int main(int argc, const char** argv) { cache_hits, cache_misses); std::println(" ProjectIndex merge (selective): {:.0f}ms", project_merge_selective_ms); - std::println(" Main-thread total: {:.0f}ms ({:.1f}ms/TU)", - merged_index_merge_ms + project_merge_selective_ms, - (merged_index_merge_ms + project_merge_selective_ms) / tu_indices.size()); + if(!tu_indices.empty()) { + std::println(" Main-thread total: {:.0f}ms ({:.1f}ms/TU)", + merged_index_merge_ms + project_merge_selective_ms, + (merged_index_merge_ms + project_merge_selective_ms) / tu_indices.size()); + } std::println(" Serialize time: {:.0f}ms", merge_stats.serialize_ms); std::print(" Total size: "); print_size(merge_stats.total_serialized_bytes); diff --git a/src/index/tu_index.cpp b/src/index/tu_index.cpp index 0049f133c..1e588dde1 100644 --- a/src/index/tu_index.cpp +++ b/src/index/tu_index.cpp @@ -176,7 +176,8 @@ std::array FileIndex::hash() { hasher.update(llvm::ArrayRef(data, size)); } - // Sort keys to ensure deterministic iteration order over DenseMap. + // Sort keys and relations to ensure deterministic hashing regardless of + // DenseMap iteration order. This mutates the relations in-place intentionally. llvm::SmallVector sorted_keys; sorted_keys.reserve(relations.size()); for(auto& [symbol_id, _]: relations) { diff --git a/src/support/bitmap.h b/src/support/bitmap.h index 7e0a33ba2..c99f99cf5 100644 --- a/src/support/bitmap.h +++ b/src/support/bitmap.h @@ -63,15 +63,12 @@ class ContextBitmap { free_heap(); } - ContextBitmap(ContextBitmap&& other) noexcept : data(other.data) { - other.data = inline_tag; - } + ContextBitmap(ContextBitmap&& other) noexcept : data(std::exchange(other.data, inline_tag)) {} ContextBitmap& operator=(ContextBitmap&& other) noexcept { if(this != &other) { free_heap(); - data = other.data; - other.data = inline_tag; + data = std::exchange(other.data, inline_tag); } return *this; } @@ -137,6 +134,8 @@ class ContextBitmap { friend bool operator==(const ContextBitmap& lhs, const ContextBitmap& rhs) { if(lhs.is_inline() && rhs.is_inline()) return lhs.data == rhs.data; + // Different modes implies different sets: a heap bitmap always contains + // at least one id >= inline_capacity that an inline bitmap cannot hold. if(lhs.is_inline() != rhs.is_inline()) return false; return *lhs.as_heap() == *rhs.as_heap(); diff --git a/tests/unit/index/merged_index_tests.cpp b/tests/unit/index/merged_index_tests.cpp index 2cf61fdaf..3d85c4717 100644 --- a/tests/unit/index/merged_index_tests.cpp +++ b/tests/unit/index/merged_index_tests.cpp @@ -396,6 +396,29 @@ TEST_CASE(MergeCacheHit) { ASSERT_TRUE(second_hit); } +TEST_CASE(MergeCacheHitCompilation) { + build_index(R"( + int foo() { return 42; } + )"); + auto tu_a = tu_index; + + build_index(R"( + int foo() { return 42; } + )"); + auto tu_b = tu_index; + + index::MergedIndex merged; + std::vector locations_a; + bool first_hit = + merged.merge(0, tu_a.built_at, std::move(locations_a), tu_a.main_file_index, {}); + ASSERT_FALSE(first_hit); + + std::vector locations_b; + bool second_hit = + merged.merge(1, tu_b.built_at, std::move(locations_b), tu_b.main_file_index, {}); + ASSERT_TRUE(second_hit); +} + }; // TEST_SUITE(MergedIndex) } // namespace } // namespace clice::testing diff --git a/tests/unit/index/tu_index_tests.cpp b/tests/unit/index/tu_index_tests.cpp index d425892e2..fcc5f99fe 100644 --- a/tests/unit/index/tu_index_tests.cpp +++ b/tests/unit/index/tu_index_tests.cpp @@ -512,6 +512,24 @@ TEST_CASE(HashCaching) { ASSERT_EQ(hash1, hash2); } +TEST_CASE(HashDeterminism) { + add_main("main.cpp", R"( + int foo() { return 42; } + int bar() { return foo() + 1; } + )"); + ASSERT_TRUE(compile()); + auto index_a = index::TUIndex::build(*unit); + + add_main("main.cpp", R"( + int foo() { return 42; } + int bar() { return foo() + 1; } + )"); + ASSERT_TRUE(compile()); + auto index_b = index::TUIndex::build(*unit); + + ASSERT_EQ(index_a.main_file_index.hash(), index_b.main_file_index.hash()); +} + }; // TEST_SUITE(TUIndex) } // namespace } // namespace clice::testing diff --git a/tests/unit/support/context_bitmap_tests.cpp b/tests/unit/support/context_bitmap_tests.cpp index 1d3885629..255951333 100644 --- a/tests/unit/support/context_bitmap_tests.cpp +++ b/tests/unit/support/context_bitmap_tests.cpp @@ -248,6 +248,49 @@ TEST_CASE(AddSmallAfterUpgrade) { EXPECT_EQ(r.cardinality(), 2U); } +TEST_CASE(AnyNotInEmpty) { + ContextBitmap empty_a, empty_b, non_empty; + non_empty.add(5); + + EXPECT_FALSE(empty_a.any_not_in(non_empty)); + EXPECT_TRUE(non_empty.any_not_in(empty_b)); + EXPECT_FALSE(empty_a.any_not_in(empty_b)); +} + +TEST_CASE(RemoveNonexistent) { + // Inline mode: remove id that was never added. + ContextBitmap inline_bm; + inline_bm.add(3); + inline_bm.remove(10); + EXPECT_FALSE(inline_bm.is_empty()); + auto r1 = inline_bm.to_roaring(); + EXPECT_TRUE(r1.contains(3)); + EXPECT_EQ(r1.cardinality(), 1U); + + // Heap mode: remove id that was never added. + ContextBitmap heap_bm; + heap_bm.add(100); + heap_bm.remove(200); + EXPECT_FALSE(heap_bm.is_empty()); + auto r2 = heap_bm.to_roaring(); + EXPECT_TRUE(r2.contains(100)); + EXPECT_EQ(r2.cardinality(), 1U); +} + +TEST_CASE(RemoveLargeInInline) { + ContextBitmap bm; + bm.add(5); + bm.add(10); + // Remove id >= 63 while in inline mode; should be a no-op. + bm.remove(63); + bm.remove(100); + EXPECT_FALSE(bm.is_empty()); + auto r = bm.to_roaring(); + EXPECT_TRUE(r.contains(5)); + EXPECT_TRUE(r.contains(10)); + EXPECT_EQ(r.cardinality(), 2U); +} + }; // TEST_SUITE(ContextBitmap) } // namespace