Skip to content

Commit 1ed5738

Browse files
authored
Merge pull request #10335 from tomastigera/tomas-bpf-ip-defrag
[BPF] Support for IPv4 fragmentation
2 parents 7e83f7e + 071e846 commit 1ed5738

File tree

22 files changed

+830
-45
lines changed

22 files changed

+830
-45
lines changed

felix/bpf-gpl/bpf.h

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -242,6 +242,7 @@ static CALI_BPF_INLINE __attribute__((noreturn)) void bpf_exit(int rc) {
242242
#define debug_ip(ip) (bpf_htonl((ip).d))
243243
#endif
244244
#define ip_is_dnf(ip) (true)
245+
#define ip_is_frag(ip) (false)
245246

246247
#else
247248

@@ -253,7 +254,9 @@ static CALI_BPF_INLINE __attribute__((noreturn)) void bpf_exit(int rc) {
253254
#endif
254255

255256
#define ip_is_dnf(ip) ((ip)->frag_off & bpf_htons(0x4000))
256-
#define ip_frag_no(ip) ((ip)->frag_off & bpf_htons(0x1fff))
257+
#define ip_is_frag(ip) ((ip)->frag_off & bpf_htons(0x3fff))
258+
#define ip_is_first_frag(ip) (((ip)->frag_off & bpf_htons(0x3fff)) == bpf_htons(0x2000))
259+
#define ip_is_last_frag(ip) (!((ip)->frag_off & bpf_htons(0x2000)))
257260
#endif
258261

259262
#ifndef IP_FMT

felix/bpf-gpl/counters.h

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77

88
#include "bpf.h"
99

10-
#define MAX_COUNTERS_SIZE 20
10+
#define MAX_COUNTERS_SIZE 22
1111

1212
typedef __u64 counters_t[MAX_COUNTERS_SIZE];
1313

@@ -25,6 +25,10 @@ CALI_MAP(cali_counters, 3,
2525
struct counters_key, counters_t, 20000,
2626
0)
2727

28+
CALI_MAP(cali_counters_scratch, 2,
29+
BPF_MAP_TYPE_PERCPU_ARRAY,
30+
__u32, counters_t, 1, 0)
31+
2832
static CALI_BPF_INLINE counters_t *counters_get(int ifindex)
2933
{
3034
struct counters_key key = {

felix/bpf-gpl/fib_co_re.h

Lines changed: 40 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -28,16 +28,30 @@ static CALI_BPF_INLINE int try_redirect_to_peer(struct cali_tc_ctx *ctx)
2828
return TC_ACT_UNSPEC;
2929
}
3030

31+
/* Log why a FIB lookup did not produce a usable result.
 *
 * rc is the value returned by bpf_fib_lookup(): negative values are logged
 * as bad input, anything else as a FIB problem.
 *
 * This helper only logs. rc is passed by value, so the caller must set the
 * resulting TC action itself (the callers assign TC_ACT_UNSPEC right after
 * calling this).
 */
static CALI_BPF_INLINE void fib_error_log(struct cali_tc_ctx *ctx, int rc)
{
	if (rc < 0) {
		CALI_DEBUG("FIB lookup failed (bad input): %d.", rc);
	} else {
		CALI_DEBUG("FIB lookup failed (FIB problem): %d.", rc);
	}
}
41+
3142
static CALI_BPF_INLINE int forward_or_drop(struct cali_tc_ctx *ctx)
3243
{
3344
int rc = ctx->fwd.res;
34-
enum calico_reason reason = ctx->fwd.reason;
3545
struct cali_tc_state *state = ctx->state;
3646

3747
if (rc == TC_ACT_SHOT) {
3848
goto deny;
3949
}
4050

51+
if (ctx->state->flags & CALI_ST_SKIP_REDIR_ONCE) {
52+
goto skip_fib;
53+
}
54+
4155
if (rc == CALI_RES_REDIR_BACK) {
4256
int redir_flags = 0;
4357
if (CALI_F_FROM_HOST) {
@@ -318,6 +332,27 @@ static CALI_BPF_INLINE int forward_or_drop(struct cali_tc_ctx *ctx)
318332
rc = bpf_fib_lookup(ctx->skb, fib_params(ctx), sizeof(struct bpf_fib_lookup),
319333
ctx->fwd.fib_flags | BPF_FIB_LOOKUP_SKIP_NEIGH);
320334
switch (rc) {
335+
case BPF_FIB_LKUP_RET_FRAG_NEEDED:
336+
/* We are not asking for an MTU check, but we may still get
337+
* BPF_FIB_LKUP_RET_FRAG_NEEDED if the device is not yet UP
338+
* despite the mtu being larger than the packet
339+
* https://github.com/torvalds/linux/blob/3349ada3cffdbe4579872a004360daa31938f683/include/linux/netdevice.h#L4242
340+
* This happens on wireguard device in FV test, but the device accepts
341+
* forwarded packets. It should be just a start up issue and not a
342+
* real issue in production.
343+
*
344+
* The irony is that if we did ask for MTU check, we would not get
345+
* BPF_FIB_LKUP_RET_NO_NEIGH and all would proceed as expected.
346+
* But we do not want to ask for an MTU check for various reasons
347+
* related to us growing the packets ourselves.
348+
* https://github.com/projectcalico/calico/commit/78c85f96b2aa4ae76acfaa04bb8823c2ad76f9bd
349+
*/
350+
CALI_DEBUG("mtu_result %d dev %d", fib_params(ctx)->mtu_result, fib_params(ctx)->ifindex);
351+
if (!skb_is_gso(ctx->skb) && fib_params(ctx)->mtu_result < bpf_htons(ctx->state->ip_size)) {
352+
fib_error_log(ctx, rc);
353+
rc = TC_ACT_UNSPEC;
354+
break;
355+
}
321356
case 0:
322357
case BPF_FIB_LKUP_RET_NO_NEIGH:
323358
#ifdef IPVER6
@@ -327,7 +362,7 @@ static CALI_BPF_INLINE int forward_or_drop(struct cali_tc_ctx *ctx)
327362
#endif
328363

329364
if (!fib_approve(ctx, fib_params(ctx)->ifindex)) {
330-
reason = CALI_REASON_WEP_NOT_READY;
365+
ctx->fwd.reason = CALI_REASON_WEP_NOT_READY;
331366
goto deny;
332367
}
333368

@@ -344,15 +379,8 @@ static CALI_BPF_INLINE int forward_or_drop(struct cali_tc_ctx *ctx)
344379
rc = bpf_redirect_neigh(fib_params(ctx)->ifindex, &nh_params, sizeof(nh_params), 0);
345380
break;
346381
default:
347-
if (rc < 0) {
348-
CALI_DEBUG("FIB lookup failed (bad input): %d.", rc);
349-
rc = TC_ACT_UNSPEC;
350-
} else {
351-
CALI_DEBUG("FIB lookup failed (FIB problem): %d.", rc);
352-
rc = TC_ACT_UNSPEC;
353-
}
354-
355-
break;
382+
fib_error_log(ctx, rc);
383+
rc = TC_ACT_UNSPEC;
356384
}
357385

358386
no_fib_redirect:
@@ -484,7 +512,7 @@ static CALI_BPF_INLINE int forward_or_drop(struct cali_tc_ctx *ctx)
484512

485513
if (rc == TC_ACT_SHOT) {
486514
CALI_INFO("Final result=DENY (%d). Program execution time: %lluns",
487-
reason, prog_end_time-state->prog_start_time);
515+
ctx->fwd.reason, prog_end_time-state->prog_start_time);
488516
} else {
489517
if (CALI_F_VXLAN && CALI_F_TO_HOST) {
490518
bpf_skb_change_type(ctx->skb, PACKET_HOST);

felix/bpf-gpl/ip_v4_fragment.h

Lines changed: 261 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,261 @@
1+
// Project Calico BPF dataplane programs.
2+
// Copyright (c) 2020-2022 Tigera, Inc. All rights reserved.
3+
// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
4+
5+
#ifndef __CALI_IP_V4_FRAGMENT_H__
6+
#define __CALI_IP_V4_FRAGMENT_H__
7+
8+
#include "ip_addr.h"
9+
10+
struct frags4_key {
11+
ipv4_addr_t src;
12+
ipv4_addr_t dst;
13+
__u16 id;
14+
__u16 offset;
15+
};
16+
17+
#define MAX_FRAG 1504 /* requires multiple of 8 */
18+
19+
struct frags4_value {
20+
__u16 more_frags:1;
21+
__u16 len;
22+
__u32 __pad;
23+
char data[MAX_FRAG];
24+
};
25+
26+
/* Stored fragment pieces, looked up by struct frags4_key. */
CALI_MAP(cali_v4_frags, 2,
		BPF_MAP_TYPE_LRU_HASH,
		struct frags4_key, struct frags4_value,
		10000, 0)

/* Single-slot per-CPU array used as a scratch struct frags4_value
 * (see frags4_get_scratch()).
 */
CALI_MAP(cali_v4_frgtmp, 2, BPF_MAP_TYPE_PERCPU_ARRAY, __u32, struct frags4_value, 1, 0)
32+
33+
CALI_MAP(cali_v4_frgfwd, 2, BPF_MAP_TYPE_LRU_HASH, struct frags4_fwd_key, __u32, 10000, 0)
34+
35+
struct frags4_fwd_key {
36+
ipv4_addr_t src;
37+
ipv4_addr_t dst;
38+
__u32 ifindex; /* The stream of fragments may be crossing multiple devices */
39+
__u16 id;
40+
__u16 __pad;
41+
};
42+
43+
static CALI_BPF_INLINE struct frags4_value *frags4_get_scratch()
44+
{
45+
__u32 key = 0;
46+
return cali_v4_frgtmp_lookup_elem(&key);
47+
}
48+
49+
static CALI_BPF_INLINE bool frags4_try_assemble(struct cali_tc_ctx *ctx)
50+
{
51+
struct frags4_key k = {
52+
.src = ip_hdr(ctx)->saddr,
53+
.dst = ip_hdr(ctx)->daddr,
54+
.id = ip_hdr(ctx)->id,
55+
};
56+
57+
int i, tot_len = 0;
58+
59+
for (i = 0; i < 10; i++) {
60+
struct frags4_value *v = cali_v4_frags_lookup_elem(&k);
61+
62+
if (!v) {
63+
CALI_DEBUG("Missing IP fragment at offset %d", k.offset);
64+
return false;
65+
}
66+
67+
tot_len += v->len;
68+
69+
if(!v->more_frags) {
70+
goto assemble;
71+
}
72+
73+
k.offset += v->len;
74+
}
75+
76+
return false;
77+
78+
assemble:
79+
CALI_DEBUG("IP FRAG: Found all fragments!");
80+
81+
int off = skb_l4hdr_offset(ctx);
82+
int err = bpf_skb_change_tail(ctx->skb, off + tot_len, 0);
83+
if (err) {
84+
CALI_DEBUG("IP FRAG: bpf_skb_change_tail (len=%d) failed (err=%d)", tot_len, err);
85+
goto out;
86+
}
87+
88+
k.offset = 0;
89+
90+
for (i = 0; i < 10; i++) {
91+
struct frags4_value *v = cali_v4_frags_lookup_elem(&k);
92+
93+
if (!v) {
94+
CALI_DEBUG("IP FRAG: Missing IP fragment at offset %d", k.offset);
95+
goto out;
96+
}
97+
98+
__u16 len = v->len;
99+
if (len == 0 || len > MAX_FRAG) {
100+
goto out;
101+
}
102+
CALI_DEBUG("IP FRAG: copy %d bytes to %d", len, off);
103+
if (bpf_skb_store_bytes(ctx->skb, off, v->data, len, 0)) {
104+
CALI_DEBUG("IP FRAG: Failed to copy bytes");
105+
goto out;
106+
}
107+
108+
bool last = !v->more_frags;
109+
cali_v4_frags_delete_elem(&k);
110+
111+
if(last) {
112+
break;
113+
}
114+
115+
k.offset += v->len;
116+
off += v->len;
117+
}
118+
119+
if (parse_packet_ip(ctx) != PARSING_OK) {
120+
goto out;
121+
}
122+
123+
/* recalculate IP csum of the restored IP header */
124+
ip_hdr(ctx)->check = 0;
125+
ip_hdr(ctx)->frag_off = 0;
126+
ip_hdr(ctx)->tot_len = bpf_htons(ip_hdr(ctx)->ihl*4 + tot_len);
127+
128+
__wsum ip_csum = bpf_csum_diff(0, 0, (__u32 *)ctx->ip_header, sizeof(struct iphdr), 0);
129+
int ret = bpf_l3_csum_replace(ctx->skb, skb_iphdr_offset(ctx) + offsetof(struct iphdr, check), 0, ip_csum, 0);
130+
if (ret) {
131+
CALI_DEBUG("IP FRAG: set L3 csum failed");
132+
goto out;
133+
}
134+
135+
/* No need to recalculate L4 csum as the concatenated data should be intact. In
136+
* case of TCP/UDP, the pseudo IP header used to calculate the checksum does not
137+
* change src/dst IP, protocol and UDP/TCP length stay the same.
138+
*/
139+
140+
if (parse_packet_ip(ctx) != PARSING_OK) {
141+
goto out;
142+
}
143+
144+
return true;
145+
out:
146+
return false;
147+
}
148+
149+
static CALI_BPF_INLINE bool frags4_handle(struct cali_tc_ctx *ctx)
150+
{
151+
struct frags4_value *v = frags4_get_scratch();
152+
153+
if (!v) {
154+
goto out;
155+
}
156+
157+
struct frags4_key k = {
158+
.src = ip_hdr(ctx)->saddr,
159+
.dst = ip_hdr(ctx)->daddr,
160+
.id = ip_hdr(ctx)->id,
161+
.offset = 8 * bpf_ntohs(ip_hdr(ctx)->frag_off) & 0x1fff,
162+
163+
};
164+
165+
int i;
166+
int r_off = skb_l4hdr_offset(ctx);
167+
bool more_frags = bpf_ntohs(ip_hdr(ctx)->frag_off) & 0x2000;
168+
169+
/* When we get a fragment, it may be large than the storage in the map.
170+
* We may need to break it into multiple fragments to be able to store
171+
* it.
172+
*/
173+
for (i = 0; i < 10; i++) {
174+
int sz = MAX_FRAG;
175+
if (r_off + sz >= ctx->skb->len) {
176+
sz = ctx->skb->len - r_off;
177+
}
178+
if (sz > MAX_FRAG) {
179+
sz = MAX_FRAG;
180+
}
181+
if (sz <= 0) {
182+
goto out;
183+
}
184+
185+
if (bpf_skb_load_bytes(ctx->skb, r_off, v->data, sz)) {
186+
CALI_DEBUG("IP FRAG: failed to read data");
187+
goto out;
188+
}
189+
v->len = (__u16)sz;
190+
v->more_frags = more_frags || r_off + sz < ctx->skb->len;
191+
CALI_DEBUG("IP FRAG: frg off %d", k.offset);
192+
CALI_DEBUG("IP FRAG: frg size %d r_off %d", sz, r_off);
193+
194+
if (cali_v4_frags_update_elem(&k, v, 0)) {
195+
CALI_DEBUG("IP FRAG: Failed to save IP fragment.");
196+
goto out;
197+
}
198+
199+
r_off += sz;
200+
k.offset += sz;
201+
if (r_off >= ctx->skb->len) {
202+
break;
203+
}
204+
}
205+
206+
if (!frags4_try_assemble(ctx)) {
207+
goto out;
208+
}
209+
210+
211+
return true;
212+
213+
out:
214+
return false;
215+
}
216+
217+
static CALI_BPF_INLINE void frags4_record_ct(struct cali_tc_ctx *ctx)
218+
{
219+
struct frags4_fwd_key k = {
220+
.src = ip_hdr(ctx)->saddr,
221+
.dst = ip_hdr(ctx)->daddr,
222+
.ifindex = ctx->skb->ifindex,
223+
.id = ip_hdr(ctx)->id,
224+
};
225+
226+
__u32 v = 0;
227+
228+
cali_v4_frgfwd_update_elem(&k, &v, 0);
229+
CALI_DEBUG("IP FRAG: created ct from " IP_FMT " to " IP_FMT,
230+
debug_ip(ctx->state->ip_src), debug_ip(ctx->state->ip_dst));
231+
}
232+
233+
static CALI_BPF_INLINE void frags4_remove_ct(struct cali_tc_ctx *ctx)
234+
{
235+
struct frags4_fwd_key k = {
236+
.src = ip_hdr(ctx)->saddr,
237+
.dst = ip_hdr(ctx)->daddr,
238+
.ifindex = ctx->skb->ifindex,
239+
.id = ip_hdr(ctx)->id,
240+
};
241+
242+
cali_v4_frgfwd_delete_elem(&k);
243+
CALI_DEBUG("IP FRAG: killed ct from " IP_FMT " to " IP_FMT,
244+
debug_ip(ctx->state->ip_src), debug_ip(ctx->state->ip_dst));
245+
}
246+
247+
static CALI_BPF_INLINE bool frags4_lookup_ct(struct cali_tc_ctx *ctx)
248+
{
249+
struct frags4_fwd_key k = {
250+
.src = ip_hdr(ctx)->saddr,
251+
.dst = ip_hdr(ctx)->daddr,
252+
.ifindex = ctx->skb->ifindex,
253+
.id = ip_hdr(ctx)->id,
254+
};
255+
256+
CALI_DEBUG("IP FRAG: lookup ct from " IP_FMT " to " IP_FMT,
257+
debug_ip(ctx->state->ip_src), debug_ip(ctx->state->ip_dst));
258+
return cali_v4_frgfwd_lookup_elem(&k) != NULL;
259+
}
260+
261+
#endif /* __CALI_IP_V4_FRAGMENT_H__ */

0 commit comments

Comments
 (0)