-
Notifications
You must be signed in to change notification settings - Fork 235
feat(entity-caching-1): raw event pipeline (draft) #2827
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Draft
SkArchon
wants to merge
18
commits into
milinda/entity-intelligence-0
Choose a base branch
from
milinda/entity-caching-1-raw-event-pipeline
base: milinda/entity-intelligence-0
Could not load branches
Branch not found: {{ refName }}
Loading
Could not load tags
Nothing to show
Loading
Are you sure you want to change the base?
Some commits from the old base branch may be removed from the timeline,
and old review comments may become outdated.
Draft
Changes from 9 commits
Commits
Show all changes
18 commits
Select commit
Hold shift + click to select a range
774aed4
feat: router proto update for cache event pipeline
SkArchon 0d08e26
feat: export cache metrics from the router pipeline
SkArchon bb40f9a
feat: add pipeline in to the graphqlmetrics
SkArchon 6190de3
fix(entity-caching-1): align with pinned graphql-go-tools
SkArchon 51a54f6
ci(graphqlmetrics): include cacheevents proto in inlined buf generate
SkArchon c7bdb57
style(graphqlmetrics): gofmt server.go
SkArchon 25c20ba
style(router): gofmt cacheevents/exporter/graphqlmetrics test files
SkArchon 9d8e029
test(entity-caching-1): cover aggregate, exporter, builder helpers, a…
SkArchon d1cfe52
fix: review comments
SkArchon e9ea140
fix: review comments
SkArchon 031785e
fix: review comments
SkArchon f47c3c3
fix(controlplane): split gql_cache_events migration to single statement
SkArchon e7b8178
fix(graphqlmetrics): align cacheevents Append order with gql_cache_ev…
SkArchon 77b4abd
merge milinda/entity-intelligence-0 into entity-caching-1
SkArchon 8455b62
merge milinda/entity-intelligence-0 into entity-caching-1
SkArchon c4d5882
fix(router): use buffered ArenaResolveGraphQLResponse to avoid trunca…
SkArchon 0dfdaf9
Merge branch 'milinda/entity-caching-0' into milinda/entity-caching-1…
SkArchon 0e742a9
feat(router): force hash analytics keys when cache events export is e…
SkArchon File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
220 changes: 220 additions & 0 deletions
220
controlplane/clickhouse/migrations/20260427120000_create_gql_cache_events.sql
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,220 @@ | ||
| -- migrate:up | ||
|
|
||
| CREATE TABLE IF NOT EXISTS gql_cache_events_raw | ||
| ( | ||
| -- See https://github.com/PostHog/posthog/issues/10616 why ZSTD(3) is used | ||
| Timestamp DateTime64(9, 'UTC') CODEC(Delta, ZSTD(3)), | ||
|
|
||
| -- Tenant | ||
| OrganizationID LowCardinality(String) CODEC(ZSTD(3)), | ||
| FederatedGraphID LowCardinality(String) CODEC(ZSTD(3)), | ||
| RouterConfigVersion LowCardinality(String) CODEC(ZSTD(3)), | ||
|
|
||
| -- Event discriminator. Canonical lowercase string. Values: | ||
| -- 'l1_read','l2_read','l1_write','l2_write','fetch_timing', | ||
| -- 'subgraph_error','shadow_comparison','mutation','header_impact', | ||
| -- 'cache_op_error' | ||
| EventType LowCardinality(String) CODEC(ZSTD(3)), | ||
|
|
||
| -- Operation context | ||
| OperationHash LowCardinality(String) CODEC(ZSTD(3)), | ||
| OperationName LowCardinality(String) CODEC(ZSTD(3)), | ||
| OperationType LowCardinality(String) CODEC(ZSTD(3)), | ||
| ClientName LowCardinality(String) CODEC(ZSTD(3)), | ||
| ClientVersion LowCardinality(String) CODEC(ZSTD(3)), | ||
| TraceID String CODEC(ZSTD(3)), | ||
| IsShadow Bool CODEC(ZSTD(3)), | ||
|
|
||
| -- Cache identity | ||
| EntityType LowCardinality(String) CODEC(ZSTD(3)), | ||
| SubgraphID LowCardinality(String) CODEC(ZSTD(3)), | ||
| KeyHash UInt64 CODEC(ZSTD(3)), | ||
|
|
||
| -- Field-level identity (root field for entity fetches; nested fields for value-type traversal) | ||
| FieldName LowCardinality(String) CODEC(ZSTD(3)), | ||
| FieldHash UInt64 CODEC(ZSTD(3)), | ||
| FieldPath Array(LowCardinality(String)) CODEC(ZSTD(3)), | ||
| EntityCount UInt32 CODEC(ZSTD(3)), | ||
| EntityUniqueKeys UInt32 CODEC(ZSTD(3)), | ||
|
|
||
| -- Read events (l1_read, l2_read) | ||
| Verdict LowCardinality(String) CODEC(ZSTD(3)), | ||
| ByteSize UInt32 CODEC(ZSTD(3)), | ||
| CacheAgeMs UInt32 CODEC(ZSTD(3)), | ||
|
|
||
| -- Write events (l1_write, l2_write) | ||
| TTLMs UInt32 CODEC(ZSTD(3)), | ||
| WriteReason LowCardinality(String) CODEC(ZSTD(3)), | ||
| Source LowCardinality(String) CODEC(ZSTD(3)), | ||
|
|
||
| -- Fetch timing | ||
| FetchSource LowCardinality(String) CODEC(ZSTD(3)), | ||
| DurationMs Float64 CODEC(ZSTD(3)), | ||
| TTFBMs Float64 CODEC(ZSTD(3)), | ||
| ItemCount UInt32 CODEC(ZSTD(3)), | ||
| IsEntityFetch Bool CODEC(ZSTD(3)), | ||
| HttpStatusCode UInt16 CODEC(ZSTD(3)), | ||
| ResponseBytes UInt32 CODEC(ZSTD(3)), | ||
|
|
||
| -- Errors (subgraph_error, cache_op_error) | ||
| ErrorMessage String CODEC(ZSTD(3)), | ||
| ErrorCode LowCardinality(String) CODEC(ZSTD(3)), | ||
| CacheOp LowCardinality(String) CODEC(ZSTD(3)), | ||
| CacheName LowCardinality(String) CODEC(ZSTD(3)), | ||
|
|
||
| -- Shadow + mutation share these columns | ||
| ShadowIsFresh Bool CODEC(ZSTD(3)), | ||
| CachedHash UInt64 CODEC(ZSTD(3)), | ||
| FreshHash UInt64 CODEC(ZSTD(3)), | ||
| CachedBytes UInt32 CODEC(ZSTD(3)), | ||
| FreshBytes UInt32 CODEC(ZSTD(3)), | ||
| ConfiguredTTLMs UInt32 CODEC(ZSTD(3)), | ||
|
|
||
| -- Mutation | ||
| MutationRootField LowCardinality(String) CODEC(ZSTD(3)), | ||
| HadCachedValue Bool CODEC(ZSTD(3)), | ||
| IsStale Bool CODEC(ZSTD(3)), | ||
|
|
||
| -- Header impact | ||
| BaseKeyHash UInt64 CODEC(ZSTD(3)), | ||
| HeaderHash UInt64 CODEC(ZSTD(3)), | ||
| ResponseHash UInt64 CODEC(ZSTD(3)), | ||
|
|
||
| INDEX idx_op_hash OperationHash TYPE bloom_filter(0.001) GRANULARITY 1, | ||
| INDEX idx_entity EntityType TYPE bloom_filter(0.01) GRANULARITY 1, | ||
| INDEX idx_subgraph SubgraphID TYPE bloom_filter(0.01) GRANULARITY 1, | ||
| INDEX idx_key_hash KeyHash TYPE bloom_filter(0.001) GRANULARITY 1 | ||
| ) | ||
| engine = MergeTree PARTITION BY toDate(Timestamp) | ||
| ORDER BY (OrganizationID, FederatedGraphID, EventType, OperationHash, EntityType, SubgraphID, toUnixTimestamp(Timestamp)) | ||
| TTL toDateTime(Timestamp) + toIntervalDay(7) | ||
| SETTINGS index_granularity = 8192, ttl_only_drop_parts = 1, non_replicated_deduplication_window = 1000; | ||
|
|
||
| CREATE TABLE IF NOT EXISTS gql_cache_events_5m_90d | ||
| ( | ||
| Timestamp DateTime('UTC') CODEC(Delta, ZSTD(3)), | ||
|
|
||
| OrganizationID LowCardinality(String) CODEC(ZSTD(3)), | ||
| FederatedGraphID LowCardinality(String) CODEC(ZSTD(3)), | ||
| RouterConfigVersion LowCardinality(String) CODEC(ZSTD(3)), | ||
|
|
||
| EventType LowCardinality(String) CODEC(ZSTD(3)), | ||
| OperationHash LowCardinality(String) CODEC(ZSTD(3)), | ||
| OperationName LowCardinality(String) CODEC(ZSTD(3)), | ||
| OperationType LowCardinality(String) CODEC(ZSTD(3)), | ||
| ClientName LowCardinality(String) CODEC(ZSTD(3)), | ||
| ClientVersion LowCardinality(String) CODEC(ZSTD(3)), | ||
| EntityType LowCardinality(String) CODEC(ZSTD(3)), | ||
| SubgraphID LowCardinality(String) CODEC(ZSTD(3)), | ||
| Verdict LowCardinality(String) CODEC(ZSTD(3)), | ||
| FieldName LowCardinality(String) CODEC(ZSTD(3)), | ||
| FetchSource LowCardinality(String) CODEC(ZSTD(3)), | ||
| IsShadow Bool CODEC(ZSTD(3)), | ||
|
|
||
| Events UInt64 CODEC(ZSTD(3)), | ||
| SumByteSize UInt64 CODEC(ZSTD(3)), | ||
| SumDurationMs Float64 CODEC(ZSTD(3)), | ||
| SumCacheAgeMs UInt64 CODEC(ZSTD(3)), | ||
| SumStale UInt64 CODEC(ZSTD(3)), | ||
| SumEntityCount UInt64 CODEC(ZSTD(3)) | ||
| ) | ||
| engine = SummingMergeTree PARTITION BY toDate(Timestamp) | ||
| ORDER BY (OrganizationID, FederatedGraphID, EventType, OperationHash, EntityType, SubgraphID, ClientName, ClientVersion, RouterConfigVersion, OperationName, OperationType, Verdict, FetchSource, IsShadow, toUnixTimestamp(Timestamp)) | ||
| TTL toDateTime(Timestamp) + toIntervalDay(90) | ||
| SETTINGS index_granularity = 8192, ttl_only_drop_parts = 1; | ||
|
|
||
| CREATE MATERIALIZED VIEW IF NOT EXISTS gql_cache_events_5m_90d_mv TO gql_cache_events_5m_90d AS | ||
| SELECT | ||
| toStartOfFiveMinute(Timestamp) as Timestamp, | ||
| toLowCardinality(OrganizationID) as OrganizationID, | ||
| toLowCardinality(FederatedGraphID) as FederatedGraphID, | ||
| toLowCardinality(RouterConfigVersion) as RouterConfigVersion, | ||
| toLowCardinality(EventType) as EventType, | ||
| toLowCardinality(OperationHash) as OperationHash, | ||
| toLowCardinality(OperationName) as OperationName, | ||
| toLowCardinality(OperationType) as OperationType, | ||
| toLowCardinality(ClientName) as ClientName, | ||
| toLowCardinality(ClientVersion) as ClientVersion, | ||
| toLowCardinality(EntityType) as EntityType, | ||
| toLowCardinality(SubgraphID) as SubgraphID, | ||
| toLowCardinality(Verdict) as Verdict, | ||
| toLowCardinality(FieldName) as FieldName, | ||
| toLowCardinality(FetchSource) as FetchSource, | ||
| IsShadow as IsShadow, | ||
| count() as Events, | ||
| sum(ByteSize) as SumByteSize, | ||
| sum(DurationMs) as SumDurationMs, | ||
| sum(CacheAgeMs) as SumCacheAgeMs, | ||
| sumIf(1, IsStale) as SumStale, | ||
| sum(EntityCount) as SumEntityCount | ||
| FROM gql_cache_events_raw | ||
| GROUP BY | ||
| Timestamp, | ||
| OrganizationID, | ||
| FederatedGraphID, | ||
| RouterConfigVersion, | ||
| EventType, | ||
| OperationHash, | ||
| OperationName, | ||
| OperationType, | ||
| ClientName, | ||
| ClientVersion, | ||
| EntityType, | ||
| SubgraphID, | ||
| Verdict, | ||
| FieldName, | ||
| FetchSource, | ||
| IsShadow | ||
| ORDER BY Timestamp; | ||
|
|
||
| CREATE TABLE IF NOT EXISTS gql_cache_events_1d_90d | ||
| ( | ||
| Timestamp DateTime('UTC') CODEC(Delta, ZSTD(3)), | ||
|
|
||
| OrganizationID LowCardinality(String) CODEC(ZSTD(3)), | ||
| FederatedGraphID LowCardinality(String) CODEC(ZSTD(3)), | ||
|
|
||
| EventType LowCardinality(String) CODEC(ZSTD(3)), | ||
| EntityType LowCardinality(String) CODEC(ZSTD(3)), | ||
| SubgraphID LowCardinality(String) CODEC(ZSTD(3)), | ||
| Verdict LowCardinality(String) CODEC(ZSTD(3)), | ||
|
|
||
| Events UInt64 CODEC(ZSTD(3)), | ||
| SumByteSize UInt64 CODEC(ZSTD(3)), | ||
| SumDurationMs Float64 CODEC(ZSTD(3)) | ||
| ) | ||
| engine = SummingMergeTree PARTITION BY toDate(Timestamp) | ||
| ORDER BY (OrganizationID, FederatedGraphID, EventType, EntityType, SubgraphID, Verdict, toUnixTimestamp(Timestamp)) | ||
| TTL toDateTime(Timestamp) + toIntervalDay(90) | ||
| SETTINGS index_granularity = 8192, ttl_only_drop_parts = 1; | ||
|
|
||
| CREATE MATERIALIZED VIEW IF NOT EXISTS gql_cache_events_1d_90d_mv TO gql_cache_events_1d_90d AS | ||
| SELECT | ||
| toStartOfDay(Timestamp) as Timestamp, | ||
| toLowCardinality(OrganizationID) as OrganizationID, | ||
| toLowCardinality(FederatedGraphID) as FederatedGraphID, | ||
| toLowCardinality(EventType) as EventType, | ||
| toLowCardinality(EntityType) as EntityType, | ||
| toLowCardinality(SubgraphID) as SubgraphID, | ||
| toLowCardinality(Verdict) as Verdict, | ||
| count() as Events, | ||
| sum(ByteSize) as SumByteSize, | ||
| sum(DurationMs) as SumDurationMs | ||
| FROM gql_cache_events_raw | ||
| GROUP BY | ||
| Timestamp, | ||
| OrganizationID, | ||
| FederatedGraphID, | ||
| EventType, | ||
| EntityType, | ||
| SubgraphID, | ||
| Verdict | ||
| ORDER BY Timestamp; | ||
|
|
||
| -- migrate:down | ||
|
|
||
| DROP VIEW IF EXISTS gql_cache_events_1d_90d_mv; | ||
| DROP TABLE IF EXISTS gql_cache_events_1d_90d; | ||
| DROP VIEW IF EXISTS gql_cache_events_5m_90d_mv; | ||
| DROP TABLE IF EXISTS gql_cache_events_5m_90d; | ||
| DROP TABLE IF EXISTS gql_cache_events_raw; |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,50 @@ | ||
| package cacheevents | ||
|
|
||
| import ( | ||
| "time" | ||
|
|
||
| cacheeventsv1 "github.com/wundergraph/cosmo/graphqlmetrics/gen/proto/wg/cosmo/cacheevents/v1" | ||
| utils "github.com/wundergraph/cosmo/graphqlmetrics/pkg/utils" | ||
| ) | ||
|
|
||
| // BatchItem is the unit of work pushed onto the cache-events batch processor. | ||
| // One BatchItem corresponds to one PublishEntityCacheEvents RPC call from | ||
| // a router; it carries the events and the JWT claims that authenticated | ||
| // the request. | ||
| type BatchItem struct { | ||
| Events []*cacheeventsv1.CacheEvent | ||
| Claims *utils.GraphAPITokenClaims | ||
| } | ||
|
|
||
| // ProcessorConfig carries the tunables for the cache-events batch processor. | ||
| // Defaults are set higher than the schema-usage processor because cache | ||
| // events are 10-100x request volume. | ||
| type ProcessorConfig struct { | ||
| MaxBatchSize int | ||
| MaxQueueSize int | ||
| MaxWorkers int | ||
| Interval time.Duration | ||
| } | ||
|
|
||
| // DefaultProcessorConfig returns the resource-isolated defaults used when no | ||
| // env-overrides are provided. These are intentionally separate from the | ||
| // schema-usage processor's defaults so a cache-events spike does not | ||
| // degrade schema-usage SLAs. | ||
| func DefaultProcessorConfig() ProcessorConfig { | ||
| return ProcessorConfig{ | ||
| MaxBatchSize: 8192, | ||
| MaxQueueSize: 131072, | ||
| MaxWorkers: 4, | ||
| Interval: 5 * time.Second, | ||
| } | ||
| } | ||
|
|
||
| // batchCost returns the number of events in the batch — used by the | ||
| // generic batchprocessor as the cost function. | ||
| func batchCost(items []BatchItem) int { | ||
| n := 0 | ||
| for _, it := range items { | ||
| n += len(it.Events) | ||
| } | ||
| return n | ||
| } |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,46 @@ | ||
| package cacheevents | ||
|
|
||
| import ( | ||
| "testing" | ||
| "time" | ||
|
|
||
| "github.com/stretchr/testify/require" | ||
| cacheeventsv1 "github.com/wundergraph/cosmo/graphqlmetrics/gen/proto/wg/cosmo/cacheevents/v1" | ||
| ) | ||
|
|
||
| func TestDefaultProcessorConfig(t *testing.T) { | ||
| t.Parallel() | ||
|
|
||
| cfg := DefaultProcessorConfig() | ||
| require.Equal(t, 8192, cfg.MaxBatchSize) | ||
| require.Equal(t, 131072, cfg.MaxQueueSize) | ||
| require.Equal(t, 4, cfg.MaxWorkers) | ||
| require.Equal(t, 5*time.Second, cfg.Interval) | ||
| } | ||
|
|
||
| func TestBatchCost(t *testing.T) { | ||
| t.Parallel() | ||
|
|
||
| t.Run("nil slice has zero cost", func(t *testing.T) { | ||
| require.Equal(t, 0, batchCost(nil)) | ||
| }) | ||
|
|
||
| t.Run("empty slice has zero cost", func(t *testing.T) { | ||
| require.Equal(t, 0, batchCost([]BatchItem{})) | ||
| }) | ||
|
|
||
| t.Run("sums event counts across items", func(t *testing.T) { | ||
| items := []BatchItem{ | ||
| {Events: []*cacheeventsv1.CacheEvent{{}, {}, {}}}, | ||
| {Events: []*cacheeventsv1.CacheEvent{{}}}, | ||
| {Events: nil}, | ||
| {Events: []*cacheeventsv1.CacheEvent{{}, {}}}, | ||
| } | ||
| require.Equal(t, 6, batchCost(items)) | ||
| }) | ||
|
|
||
| t.Run("item with nil events contributes zero", func(t *testing.T) { | ||
| items := []BatchItem{{Events: nil}, {Events: nil}} | ||
| require.Equal(t, 0, batchCost(items)) | ||
| }) | ||
| } |
Oops, something went wrong.
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.