Skip to content

Commit ca4a2b4

Browse files
committed
[BPF] Support for IPv4 fragmentation
Incoming IP fragments are stored in an LRU hash map. They can arrive out of order. After each fragment, we check whether we have all fragments. If any fragment is missing, we drop the skb as we cannot let it through. Once we have all fragments, we use the current skb to assemble the whole packet, parse it again and let it be processed by the rest of the programs as if the packet had arrived as a single chunk. We need to defragment incoming packets because we would not be able to pass them through policies that match on more than IP. Also, we would not be able to match them to connections in conntrack. In fact, the payload of the fragments would be wrongly treated as L4 headers and misinterpreted. After a packet is reassembled, its fragments are deleted. If for any reason we never see all fragments, the LRU will eventually evict the stored fragments. There are some limitations: * a packet cannot have more than 10 fragments - 10 is an arbitrary number greater than a reasonable number of fragments in modern networks (2), plus we split the packet internally into chunks of about 1500 bytes in case the fragments were bigger than this - unlikely, but not impossible. However, there is no limit on fragmentation in any RFC except the smallest MTU of 576 bytes. * we can store up to 10k fragments - 10k is again arbitrary. If there is a higher fragmentation rate than this, the eBPF dataplane is probably not the right choice as performance would suffer and it is likely better to let generic Linux handle such cases. * defragmentation is meant to handle corner cases and is not meant to be performant.
1 parent f0964ff commit ca4a2b4

File tree

8 files changed

+501
-14
lines changed

8 files changed

+501
-14
lines changed

felix/bpf-gpl/bpf.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -242,6 +242,7 @@ static CALI_BPF_INLINE __attribute__((noreturn)) void bpf_exit(int rc) {
242242
#define debug_ip(ip) (bpf_htonl((ip).d))
243243
#endif
244244
#define ip_is_dnf(ip) (true)
245+
#define ip_is_frag(ip) (false)
245246

246247
#else
247248

@@ -253,7 +254,7 @@ static CALI_BPF_INLINE __attribute__((noreturn)) void bpf_exit(int rc) {
253254
#endif
254255

255256
#define ip_is_dnf(ip) ((ip)->frag_off & bpf_htons(0x4000))
256-
#define ip_frag_no(ip) ((ip)->frag_off & bpf_htons(0x1fff))
257+
#define ip_is_frag(ip) ((ip)->frag_off & bpf_htons(0x3fff))
257258
#endif
258259

259260
#ifndef IP_FMT

felix/bpf-gpl/ip_v4_fragment.h

Lines changed: 203 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,203 @@
1+
// Project Calico BPF dataplane programs.
2+
// Copyright (c) 2020-2022 Tigera, Inc. All rights reserved.
3+
// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
4+
5+
#ifndef __CALI_IP_V4_FRAGMENT_H__
6+
#define __CALI_IP_V4_FRAGMENT_H__
7+
8+
#include "ip_addr.h"
9+
10+
/* Key of one stored chunk in the cali_v4_frags map.
 *
 * src, dst and id are copied verbatim from the IPv4 header of the fragment
 * (saddr, daddr, id), so they stay in the byte order used on the wire.
 * offset is the position of the chunk within the datagram payload; the
 * assembler starts its walk at offset 0 and advances it by the stored length
 * of each chunk.
 */
struct frags4_key {
11+
ipv4_addr_t src;
12+
ipv4_addr_t dst;
13+
__u16 id;
14+
__u16 offset;
15+
};
16+
17+
#define MAX_FRAG 1504 /* requires multiple of 8 */
18+
19+
/* Value of one stored chunk in the cali_v4_frags map (also the layout of the
 * per-CPU scratch entry in cali_v4_frgtmp).
 *
 * more_frags is cleared only on the chunk that ends the datagram; the
 * assembler stops walking when it finds it cleared.
 * len is the number of valid bytes in data (1..MAX_FRAG).
 * __pad is presumably explicit padding to keep data aligned - TODO confirm.
 * data holds the bytes copied out of the skb for this chunk.
 */
struct frags4_value {
20+
__u16 more_frags:1;
21+
__u16 len;
22+
__u32 __pad;
23+
char data[MAX_FRAG];
24+
};
25+
26+
CALI_MAP(cali_v4_frags, 2, BPF_MAP_TYPE_LRU_HASH, struct frags4_key, struct frags4_value, 10000, 0)
27+
28+
CALI_MAP(cali_v4_frgtmp, 2,
29+
BPF_MAP_TYPE_PERCPU_ARRAY,
30+
__u32, struct frags4_value,
31+
1, 0)
32+
33+
static CALI_BPF_INLINE struct frags4_value *frags4_get_scratch()
34+
{
35+
__u32 key = 0;
36+
return cali_v4_frgtmp_lookup_elem(&key);
37+
}
38+
39+
/* frags4_try_assemble tries to rebuild the whole IPv4 packet inside the
 * current skb from the chunks stored in cali_v4_frags.
 *
 * The chunks are looked up by (saddr, daddr, id) of the current IP header,
 * starting at offset 0 and following each chunk's length to the next one.
 * At most 10 chunks are visited.
 *
 * Returns true when every chunk was found, the skb was resized and filled
 * with the concatenated payload, the IP header was rewritten as an
 * unfragmented packet and the packet was parsed again. Returns false if any
 * chunk is missing or any step fails; the caller must not let the skb through
 * in that case.
 */
static CALI_BPF_INLINE bool frags4_try_assemble(struct cali_tc_ctx *ctx)
{
	/* .offset is zero-initialised: the walk starts at the first chunk. */
	struct frags4_key k = {
		.src = ip_hdr(ctx)->saddr,
		.dst = ip_hdr(ctx)->daddr,
		.id = ip_hdr(ctx)->id,
	};

	int i, tot_len = 0;

	/* Pass 1: check that the chain of chunks is complete and add up the
	 * total payload length. Nothing is modified yet.
	 */
	for (i = 0; i < 10; i++) {
		struct frags4_value *v = cali_v4_frags_lookup_elem(&k);

		if (!v) {
			CALI_DEBUG("Missing IP fragment at offset %d", k.offset);
			goto out;
		}

		tot_len += v->len;

		if (!v->more_frags) {
			goto assemble;
		}

		k.offset += v->len;
	}

	/* More than 10 chunks - give up. */
	goto out;

assemble:
	CALI_DEBUG("IP FRAG: Found all fragments!");

	/* Resize the skb so that it can hold the whole payload after the IP header. */
	int off = skb_l4hdr_offset(ctx);
	int err = bpf_skb_change_tail(ctx->skb, off + tot_len, 0);
	if (err) {
		CALI_DEBUG("IP FRAG: bpf_skb_change_tail (len=%d) failed (err=%d)", tot_len, err);
		goto out;
	}

	k.offset = 0;

	/* Pass 2: copy each chunk into place and drop it from the map. */
	for (i = 0; i < 10; i++) {
		struct frags4_value *v = cali_v4_frags_lookup_elem(&k);

		if (!v) {
			CALI_DEBUG("IP FRAG: Missing IP fragment at offset %d", k.offset);
			goto out;
		}

		/* Snapshot and bound the length once; the bounded local is what the
		 * copy below uses.
		 */
		__u16 len = v->len;
		if (len == 0 || len > MAX_FRAG) {
			goto out;
		}
		CALI_DEBUG("IP FRAG: copy %d bytes to %d", len, off);
		if (bpf_skb_store_bytes(ctx->skb, off, v->data, len, 0)) {
			CALI_DEBUG("IP FRAG: Failed to copy bytes");
			goto out;
		}

		bool last = !v->more_frags;
		cali_v4_frags_delete_elem(&k);

		if (last) {
			break;
		}

		/* Do not touch *v after the delete: the LRU map may hand the element
		 * out again. Advance using the snapshot taken above.
		 */
		k.offset += len;
		off += len;
	}

	/* The skb was resized, so packet pointers must be re-established. */
	if (parse_packet_ip(ctx) != PARSING_OK) {
		goto out;
	}

	/* recalculate IP csum of the restored IP header */
	ip_hdr(ctx)->check = 0;
	ip_hdr(ctx)->frag_off = 0;
	ip_hdr(ctx)->tot_len = bpf_htons(ip_hdr(ctx)->ihl*4 + tot_len);

	/* NOTE(review): the checksum covers sizeof(struct iphdr) only, while
	 * tot_len above uses ihl*4 - confirm that headers with IP options never
	 * reach this point.
	 */
	__wsum ip_csum = bpf_csum_diff(0, 0, (__u32 *)ctx->ip_header, sizeof(struct iphdr), 0);
	int ret = bpf_l3_csum_replace(ctx->skb, skb_iphdr_offset(ctx) + offsetof(struct iphdr, check), 0, ip_csum, 0);
	if (ret) {
		CALI_DEBUG("IP FRAG: set L3 csum failed");
		goto out;
	}

	/* No need to recalculate L4 csum as the concatenated data should be intact. In
	 * case of TCP/UDP, the pseudo IP header used to calculate the checksum does not
	 * change src/dst IP, protocol and UDP/TCP length stay the same.
	 */

	/* Parse once more after the header was rewritten through a helper. */
	if (parse_packet_ip(ctx) != PARSING_OK) {
		goto out;
	}

	return true;
out:
	return false;
}
138+
139+
/* frags4_handle stores the payload of the current IPv4 fragment in
 * cali_v4_frags and then tries to reassemble the whole packet.
 *
 * The payload (everything from the L4 header offset to the end of the skb) is
 * split into chunks of at most MAX_FRAG bytes, at most 10 of them. Each chunk
 * is staged in the per-CPU scratch value and written to the map under
 * (saddr, daddr, id, byte offset within the datagram).
 *
 * Returns true only if, after storing this fragment, the complete packet was
 * reassembled into the current skb. Returns false if fragments are still
 * missing or on any error; the caller drops the skb in that case.
 */
static CALI_BPF_INLINE bool frags4_handle(struct cali_tc_ctx *ctx)
{
	struct frags4_value *v = frags4_get_scratch();

	if (!v) {
		goto out;
	}

	struct frags4_key k = {
		.src = ip_hdr(ctx)->saddr,
		.dst = ip_hdr(ctx)->daddr,
		.id = ip_hdr(ctx)->id,
		/* The low 13 bits of frag_off hold the offset in units of 8 bytes.
		 * The mask must be applied before scaling: '*' binds tighter than
		 * '&', so without the parentheses the upper offset bits were lost
		 * for fragments starting at or beyond 8192 bytes.
		 */
		.offset = 8 * (bpf_ntohs(ip_hdr(ctx)->frag_off) & 0x1fff),
	};

	int i;
	int r_off = skb_l4hdr_offset(ctx);
	/* 0x2000 is the "more fragments" flag of the IP header. */
	bool more_frags = bpf_ntohs(ip_hdr(ctx)->frag_off) & 0x2000;

	for (i = 0; i < 10; i++) {
		/* Take up to MAX_FRAG bytes, or whatever is left in the skb. */
		int sz = MAX_FRAG;
		if (r_off + sz >= ctx->skb->len) {
			sz = ctx->skb->len - r_off;
		}
		if (sz > MAX_FRAG) {
			sz = MAX_FRAG;
		}
		if (sz <= 0) {
			goto out;
		}

		if (bpf_skb_load_bytes(ctx->skb, r_off, v->data, sz)) {
			CALI_DEBUG("IP FRAG: failed to read data");
			goto out;
		}
		v->len = (__u16)sz;
		/* A chunk is the last one only if the IP header says so and no
		 * further data of this skb follows it.
		 */
		v->more_frags = more_frags || r_off + sz < ctx->skb->len;
		CALI_DEBUG("IP FRAG: frg off %d", k.offset);
		CALI_DEBUG("IP FRAG: frg size %d r_off %d", sz, r_off);

		if (cali_v4_frags_update_elem(&k, v, 0)) {
			CALI_DEBUG("IP FRAG: Failed to save IP fragment.");
			goto out;
		}

		r_off += sz;
		k.offset += sz;
		if (r_off >= ctx->skb->len) {
			break;
		}
	}

	if (!frags4_try_assemble(ctx)) {
		goto out;
	}

	return true;

out:
	return false;
}
202+
203+
#endif /* __CALI_IP_V4_FRAGMENT_H__ */

felix/bpf-gpl/tc.c

Lines changed: 20 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,10 @@
5656
#include "bpf_helpers.h"
5757
#include "rule_counters.h"
5858

59+
#ifndef IPVER6
60+
#include "ip_v4_fragment.h"
61+
#endif
62+
5963
#define HAS_HOST_CONFLICT_PROG CALI_F_TO_HEP
6064

6165
/* calico_tc_main is the main function used in all of the tc programs. It is specialised
@@ -197,11 +201,26 @@ int calico_tc_main(struct __sk_buff *skb)
197201
ctx->fwd.res = TC_ACT_SHOT;
198202
goto finalize;
199203
}
204+
205+
#ifndef IPVER6
206+
if (CALI_F_TO_HOST && ip_is_frag(ip_hdr(ctx))) {
207+
if (!frags4_handle(ctx)) {
208+
goto deny;
209+
}
210+
}
211+
#endif
212+
200213
return pre_policy_processing(ctx);
201214

202215
allow:
203216
finalize:
204217
return forward_or_drop(ctx);
218+
219+
#ifndef IPVER6
220+
deny:
221+
ctx->fwd.res = TC_ACT_SHOT;
222+
goto finalize;
223+
#endif
205224
}
206225

207226
static CALI_BPF_INLINE int pre_policy_processing(struct cali_tc_ctx *ctx)
@@ -1674,7 +1693,7 @@ static CALI_BPF_INLINE struct fwd calico_tc_skb_accepted(struct cali_tc_ctx *ctx
16741693
state->icmp_type = ICMPV6_TIME_EXCEED;
16751694
state->icmp_code = ICMPV6_EXC_HOPLIMIT;
16761695
#else
1677-
if (ip_frag_no(ip_hdr(ctx))) {
1696+
if (ip_is_frag(ip_hdr(ctx))) {
16781697
goto deny;
16791698
}
16801699
state->icmp_type = ICMP_TIME_EXCEEDED;

felix/bpf/ipfrags/map.go

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
// Copyright (c) 2025 Tigera, Inc. All rights reserved.
2+
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
15+
package ipfrags
16+
17+
import (
18+
"github.com/projectcalico/calico/felix/bpf/maps"
19+
)
20+
21+
// init registers the size of the fragments map under its versioned name.
func init() {
22+
maps.SetSize(MapParams.VersionedName(), MapParams.MaxEntries)
23+
}
24+
25+
// MapParams describes the cali_v4_frags LRU hash map that holds stored IPv4
// fragment chunks.
var MapParams = maps.MapParameters{
26+
Type: "lru_hash",
27+
KeySize: KeySize,
28+
ValueSize: ValueSize,
29+
MaxEntries: 10000, // max number of stored fragment chunks; mirrors the 10000 in CALI_MAP(cali_v4_frags, ...)
30+
Name: "cali_v4_frags",
31+
Version: 2,
32+
}
33+
34+
const (
35+
// KeySize mirrors struct frags4_key: src, dst, id (2 bytes), offset (2 bytes);
// presumably 4 bytes per IPv4 address - verify against ipv4_addr_t.
KeySize = 12
36+
// ValueSize mirrors struct frags4_value: the more_frags bitfield word,
// len, __pad and data[MAX_FRAG].
ValueSize = 2 + 2 + 4 + 1504
37+
)
38+
39+
// Map returns the pinned map described by MapParams.
func Map() maps.Map {
40+
return maps.NewPinnedMap(MapParams)
41+
}
42+
43+
// MapParameters describes the cali_v4_frgtmp per-CPU array with a single
// entry that has the same value size as the fragments map.
var MapParameters = maps.MapParameters{
44+
Type: "percpu_array",
45+
KeySize: 4,
46+
ValueSize: ValueSize,
47+
MaxEntries: 1,
48+
Name: "cali_v4_frgtmp",
49+
Version: 2,
50+
}
51+
52+
// MapTmp returns the pinned map described by MapParameters.
func MapTmp() maps.Map {
53+
return maps.NewPinnedMap(MapParameters)
54+
}

felix/bpf/ut/bpf_prog_test.go

Lines changed: 11 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@ import (
4545
"github.com/projectcalico/calico/felix/bpf/failsafes"
4646
"github.com/projectcalico/calico/felix/bpf/hook"
4747
"github.com/projectcalico/calico/felix/bpf/ifstate"
48+
"github.com/projectcalico/calico/felix/bpf/ipfrags"
4849
"github.com/projectcalico/calico/felix/bpf/ipsets"
4950
"github.com/projectcalico/calico/felix/bpf/jump"
5051
"github.com/projectcalico/calico/felix/bpf/libbpf"
@@ -576,12 +577,12 @@ func bpftool(args ...string) ([]byte, error) {
576577
var (
577578
mapInitOnce sync.Once
578579

579-
natMap, natBEMap, ctMap, rtMap, ipsMap, testStateMap, affinityMap, arpMap, fsafeMap maps.Map
580-
natMapV6, natBEMapV6, ctMapV6, rtMapV6, ipsMapV6, affinityMapV6, arpMapV6, fsafeMapV6 maps.Map
581-
stateMap, countersMap, ifstateMap, progMap, progMapXDP, policyJumpMap, policyJumpMapXDP maps.Map
582-
perfMap maps.Map
583-
profilingMap maps.Map
584-
allMaps []maps.Map
580+
natMap, natBEMap, ctMap, rtMap, ipsMap, testStateMap, affinityMap, arpMap, fsafeMap, ipfragsMap maps.Map
581+
natMapV6, natBEMapV6, ctMapV6, rtMapV6, ipsMapV6, affinityMapV6, arpMapV6, fsafeMapV6 maps.Map
582+
stateMap, countersMap, ifstateMap, progMap, progMapXDP, policyJumpMap, policyJumpMapXDP maps.Map
583+
perfMap maps.Map
584+
profilingMap, ipfragsMapTmp maps.Map
585+
allMaps []maps.Map
585586
)
586587

587588
func initMapsOnce() {
@@ -605,6 +606,8 @@ func initMapsOnce() {
605606
fsafeMap = failsafes.Map()
606607
fsafeMapV6 = failsafes.MapV6()
607608
countersMap = counters.Map()
609+
ipfragsMap = ipfrags.Map()
610+
ipfragsMapTmp = ipfrags.MapTmp()
608611
ifstateMap = ifstate.Map()
609612
policyJumpMap = jump.Map()
610613
policyJumpMapXDP = jump.XDPMap()
@@ -614,7 +617,7 @@ func initMapsOnce() {
614617

615618
allMaps = []maps.Map{natMap, natBEMap, natMapV6, natBEMapV6, ctMap, ctMapV6, rtMap, rtMapV6, ipsMap, ipsMapV6,
616619
stateMap, testStateMap, affinityMap, affinityMapV6, arpMap, arpMapV6, fsafeMap, fsafeMapV6,
617-
countersMap, ifstateMap, profilingMap,
620+
countersMap, ipfragsMap, ipfragsMapTmp, ifstateMap, profilingMap,
618621
policyJumpMap, policyJumpMapXDP}
619622
for _, m := range allMaps {
620623
err := m.EnsureExists()
@@ -639,7 +642,7 @@ func cleanUpMaps() {
639642
defer log.SetLevel(logLevel)
640643

641644
for _, m := range allMaps {
642-
if m == stateMap || m == testStateMap || m == progMap || m == countersMap {
645+
if m == stateMap || m == testStateMap || m == progMap || m == countersMap || m == ipfragsMapTmp {
643646
continue // Can't clean up array maps
644647
}
645648
log.WithField("map", m.GetName()).Info("Cleaning")

0 commit comments

Comments
 (0)