Skip to content

Commit 2d9666b

Browse files
committed
eBPF correctness improvements for IPv6, ICMP and CGroup
- Walk IPv6 extension headers (HopByHop, Routing, Fragment, Destination) to find the real transport protocol instead of using nexthdr directly; update TCP/UDP/ICMPv6 header pointers to use the resolved transport ptr - Add IPv6 extension header protocol constants to counter.h - Fix ip6_send_skb kprobe to use sk->sk_protocol instead of ip6hdr->nexthdr so packets with extension headers are counted correctly - Fix process_udp_send to validate skb before reading udphdr fields - Populate cgroupid in process_cgroup_skb via bpf_get_current_cgroup_id() - Compute accurate ICMP/ICMPv6 error response sizes per RFC 792/4443 (8-byte header + IP header + 8 bytes of original data) instead of reusing the full triggering payload length - Skip update_val in icmp_rcv when msglen is zero - Log perf read errors in cGroupWatcher instead of silently sleeping - Return partial batch results alongside error in listMapBatch
1 parent 5f59819 commit 2d9666b

File tree

6 files changed

+85
-18
lines changed

6 files changed

+85
-18
lines changed

bpf/counter.bpf.c

Lines changed: 75 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -182,14 +182,42 @@ process_ip6(struct ipv6hdr *ip6, void *data_end, statkey *key) {
182182
return NOK;
183183
}
184184

185-
// IPv6 copy of source IP, destination IP and transport protocol
185+
// IPv6 copy of source IP and destination IP
186186
key->srcip = ip6->saddr;
187187
key->dstip = ip6->daddr;
188-
key->proto = ip6->nexthdr;
189188

190-
switch (ip6->nexthdr) {
189+
// Walk optional extension headers to find the actual transport protocol.
190+
// Variable-length extension headers (HopByHop, Routing, Destination):
191+
// total size = (hdr_ext_len + 1) * 8 bytes; next_header at offset 0.
192+
// Fragment header: fixed 8 bytes; next_header at offset 0.
193+
__u8 nexthdr = ip6->nexthdr;
194+
void *transport = (void *)ip6 + sizeof(*ip6);
195+
196+
#pragma unroll
197+
for (int i = 0; i < 6; i++) {
198+
if (nexthdr != IPPROTO_HOPOPTS && nexthdr != IPPROTO_ROUTING &&
199+
nexthdr != IPPROTO_FRAGMENT && nexthdr != IPPROTO_DSTOPTS) {
200+
break;
201+
}
202+
if (nexthdr == IPPROTO_FRAGMENT) {
203+
if (transport + 8 > data_end)
204+
return NOK;
205+
nexthdr = ((__u8 *)transport)[0];
206+
transport += 8;
207+
} else {
208+
if (transport + 2 > data_end)
209+
return NOK;
210+
__u8 hdrlen = ((__u8 *)transport)[1];
211+
nexthdr = ((__u8 *)transport)[0];
212+
transport += ((__u32)(hdrlen + 1) * 8);
213+
}
214+
}
215+
216+
key->proto = nexthdr;
217+
218+
switch (nexthdr) {
191219
case IPPROTO_TCP: {
192-
struct tcphdr *tcp = (void *)ip6 + sizeof(*ip6);
220+
struct tcphdr *tcp = transport;
193221

194222
// validate TCP size
195223
if ((void *)tcp + sizeof(*tcp) > data_end) {
@@ -202,7 +230,7 @@ process_ip6(struct ipv6hdr *ip6, void *data_end, statkey *key) {
202230
break;
203231
}
204232
case IPPROTO_UDP: {
205-
struct udphdr *udp = (void *)ip6 + sizeof(*ip6);
233+
struct udphdr *udp = transport;
206234

207235
// validate UDP size
208236
if ((void *)udp + sizeof(*udp) > data_end) {
@@ -215,7 +243,7 @@ process_ip6(struct ipv6hdr *ip6, void *data_end, statkey *key) {
215243
break;
216244
}
217245
case IPPROTO_ICMPV6: {
218-
struct icmp6hdr *icmp = (void *)ip6 + sizeof(*ip6);
246+
struct icmp6hdr *icmp = transport;
219247

220248
// validate ICMPv6 size
221249
if ((void *)icmp + sizeof(*icmp) > data_end) {
@@ -355,6 +383,8 @@ process_cgroup_skb(struct __sk_buff *skb) {
355383
__builtin_memcpy(key.comm, ski->comm, sizeof(key.comm));
356384
}
357385

386+
key.cgroupid = bpf_get_current_cgroup_id();
387+
358388
statvalue *val = (statvalue *)bpf_map_lookup_elem(&pkt_count, &key);
359389
if (val) {
360390
// atomic XADD, doesn't need bpf_spin_lock()
@@ -769,12 +799,12 @@ process_icmp6(struct sk_buff *skb, statkey *key, pid_t pid) {
769799
*/
770800
static inline __attribute__((always_inline)) size_t
771801
process_udp_send(struct sk_buff *skb, statkey *key, pid_t pid) {
802+
if (!process_udp_recv(false, skb, key, pid))
803+
return 0;
804+
772805
struct udphdr *udphdr =
773806
(struct udphdr *)(BPF_CORE_READ(skb, head) +
774807
BPF_CORE_READ(skb, transport_header));
775-
776-
if (!process_udp_recv(false, skb, key, pid))
777-
return 0;
778808
size_t msglen = bpf_ntohs(BPF_CORE_READ(udphdr, len));
779809

780810
return msglen;
@@ -1028,10 +1058,11 @@ int BPF_KPROBE(ip_send_skb, struct net *net, struct sk_buff *skb) {
10281058
*/
10291059
SEC("kprobe/ip6_send_skb")
10301060
int BPF_KPROBE(ip6_send_skb, struct sk_buff *skb) {
1031-
struct ipv6hdr *ip6hdr =
1032-
(struct ipv6hdr *)(BPF_CORE_READ(skb, head) +
1033-
BPF_CORE_READ(skb, network_header));
1034-
if (BPF_CORE_READ(ip6hdr, nexthdr) != IPPROTO_UDP) {
1061+
// Use sk_protocol from the socket rather than ip6hdr->nexthdr so that
1062+
// IPv6 packets with extension headers (where nexthdr != IPPROTO_UDP) are
1063+
// still accounted for correctly.
1064+
struct sock *sk = BPF_CORE_READ(skb, sk);
1065+
if (!sk || BPF_CORE_READ(sk, sk_protocol) != IPPROTO_UDP) {
10351066
return 0;
10361067
}
10371068

@@ -1120,7 +1151,10 @@ int BPF_KPROBE(__icmp_send, struct sk_buff *skb, __u8 type, __u8 code,
11201151

11211152
key.cgroupid = bpf_get_current_cgroup_id();
11221153

1123-
size_t msglen = process_icmp4(skb, &key, pid);
1154+
/* process_icmp4 populates src/dst IPs from the triggering packet's IP
1155+
* header; its returned msglen (triggering payload) is not used here. */
1156+
process_icmp4(skb, &key, pid);
1157+
11241158
/* __icmp_send is called with the triggering packet's skb, so src/dst IPs
11251159
* come from the original packet (src=sender, dst=us). Swap them to reflect
11261160
* the actual ICMP error direction (src=us, dst=original sender).
@@ -1131,6 +1165,20 @@ int BPF_KPROBE(__icmp_send, struct sk_buff *skb, __u8 type, __u8 code,
11311165
key.dstip = tmp_ip;
11321166
key.src_port = type;
11331167
key.dst_port = code;
1168+
1169+
/* Compute the actual ICMP error response size per RFC 792 / RFC 1812:
1170+
* 8-byte ICMP header + original IP header + first 8 bytes of original data.
1171+
* Using the triggering packet's tot_len (as process_icmp4 did) over-counts
1172+
* by including the full original payload instead of just the 8-byte excerpt.
1173+
*/
1174+
struct iphdr *iphdr = (struct iphdr *)(BPF_CORE_READ(skb, head) +
1175+
BPF_CORE_READ(skb, network_header));
1176+
__u16 tot_len = bpf_ntohs(BPF_CORE_READ(iphdr, tot_len));
1177+
__u16 ihl_bytes = (__u16)(BPF_CORE_READ_BITFIELD_PROBED(iphdr, ihl) * 4);
1178+
__u16 payload = (ihl_bytes <= tot_len) ? (tot_len - ihl_bytes) : 0;
1179+
size_t msglen =
1180+
sizeof(struct icmphdr) + ihl_bytes + (payload > 8 ? 8 : payload);
1181+
11341182
update_val(&key, msglen);
11351183

11361184
return 0;
@@ -1165,7 +1213,10 @@ int BPF_KPROBE(icmp6_send, struct sk_buff *skb, __u8 type, __u8 code,
11651213

11661214
key.cgroupid = bpf_get_current_cgroup_id();
11671215

1168-
size_t msglen = process_icmp6(skb, &key, pid);
1216+
/* process_icmp6 populates src/dst IPs from the triggering packet's IPv6
1217+
* header; its returned msglen (triggering payload_len) is not used here. */
1218+
process_icmp6(skb, &key, pid);
1219+
11691220
/* icmp6_send is called with the triggering packet's skb, so src/dst IPs
11701221
* come from the original packet (src=sender, dst=us). Swap them to reflect
11711222
* the actual ICMPv6 error direction (src=us, dst=original sender).
@@ -1176,6 +1227,13 @@ int BPF_KPROBE(icmp6_send, struct sk_buff *skb, __u8 type, __u8 code,
11761227
key.dstip = tmp_ip;
11771228
key.src_port = type;
11781229
key.dst_port = code;
1230+
1231+
/* Compute the actual ICMPv6 error response size per RFC 4443:
1232+
* 8-byte ICMPv6 header + fixed 40-byte IPv6 header + first 8 bytes of
1233+
* original data. The triggering payload_len (as process_icmp6 returned)
1234+
* over-counts by including the full original payload. */
1235+
size_t msglen = sizeof(struct icmp6hdr) + sizeof(struct ipv6hdr) + 8;
1236+
11791237
update_val(&key, msglen);
11801238

11811239
return 0;
@@ -1207,7 +1265,8 @@ int BPF_KPROBE(icmp_rcv, struct sk_buff *skb) {
12071265
key.cgroupid = bpf_get_current_cgroup_id();
12081266

12091267
size_t msglen = process_icmp4(skb, &key, pid);
1210-
update_val(&key, msglen);
1268+
if (msglen > 0)
1269+
update_val(&key, msglen);
12111270

12121271
return 0;
12131272
}

bpf/counter.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,3 +40,9 @@
4040

4141
#define TASK_COMM_LEN 16
4242
#define MAX_ENTRIES 131072
43+
44+
// IPv6 extension header protocol numbers
45+
#define IPPROTO_HOPOPTS 0 // Hop-by-Hop Options
46+
#define IPPROTO_ROUTING 43 // Routing Header
47+
#define IPPROTO_FRAGMENT 44 // Fragment Header
48+
#define IPPROTO_DSTOPTS 60 // Destination Options

cgroup.go

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ import (
2525
"encoding/binary"
2626
"errors"
2727
"io/fs"
28+
"log"
2829
"os"
2930
"path/filepath"
3031
"strconv"
@@ -198,7 +199,8 @@ func cGroupWatcher(objs cgroupObjects) (*perf.Reader, error) {
198199
return
199200
}
200201

201-
// avoid tight-spinning on persistent errors
202+
// log and avoid tight-spinning on persistent errors
203+
log.Printf("cGroupWatcher: perf read error: %v", err)
202204
time.Sleep(10 * time.Millisecond)
203205

204206
continue

counter_arm64_bpfel.o

13 KB
Binary file not shown.

counter_x86_bpfel.o

13 KB
Binary file not shown.

map.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -132,7 +132,7 @@ func listMapBatch(m *ebpf.Map, start time.Time) ([]statEntry, error) {
132132
break
133133
}
134134

135-
return nil, err
135+
return stats, err
136136
}
137137
}
138138

0 commit comments

Comments
 (0)