Skip to content

Commit faf7d30

Browse files
committed
feat(network-disruption): add DNS resolver control
Adds per-host DNS resolution strategy control so that users can specify whether hostnames are resolved using the pod's or the node's nameservers. This addresses issues with service mesh proxies (like the Istio DNS proxy) that intercept DNS queries and return VIP addresses (240.x.x.x) that don't work with tc traffic control rules. Users can now set the dnsResolver field on each host to one of the following strategies: "pod", "node", "pod-fallback-node" (default), or "node-fallback-pod". The default behavior maintains backward compatibility by trying pod DNS first, with automatic fallback to node DNS. Resolves: #882. Jira: CHAOSPLT-1359.
1 parent b48282a commit faf7d30

File tree

10 files changed

+274
-37
lines changed

10 files changed

+274
-37
lines changed

api/v1beta1/network_disruption.go

Lines changed: 19 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -105,6 +105,8 @@ type NetworkDisruptionHostSpec struct {
105105
Flow string `json:"flow,omitempty" chaos_validate:"omitempty,oneofci=ingress egress"`
106106
// +kubebuilder:validation:Enum=new;est;""
107107
ConnState string `json:"connState,omitempty" chaos_validate:"omitempty,oneofci=new est"`
108+
// +kubebuilder:validation:Enum=pod;node;pod-fallback-node;node-fallback-pod;""
109+
DNSResolver string `json:"dnsResolver,omitempty" chaos_validate:"omitempty,oneofci=pod node pod-fallback-node node-fallback-pod"`
108110
}
109111

110112
type NetworkDisruptionServiceSpec struct {
@@ -356,12 +358,12 @@ func (s *NetworkDisruptionSpec) GenerateArgs() []string {
356358

357359
// append hosts
358360
for _, host := range s.Hosts {
359-
args = append(args, "--hosts", fmt.Sprintf("%s;%d;%s;%s;%s", host.Host, host.Port, host.Protocol, host.Flow, host.ConnState))
361+
args = append(args, "--hosts", fmt.Sprintf("%s;%d;%s;%s;%s;%s", host.Host, host.Port, host.Protocol, host.Flow, host.ConnState, host.DNSResolver))
360362
}
361363

362364
// append allowed hosts
363365
for _, host := range s.AllowedHosts {
364-
args = append(args, "--allowed-hosts", fmt.Sprintf("%s;%d;%s;%s;%s", host.Host, host.Port, host.Protocol, host.Flow, host.ConnState))
366+
args = append(args, "--allowed-hosts", fmt.Sprintf("%s;%d;%s;%s;%s;%s", host.Host, host.Port, host.Protocol, host.Flow, host.ConnState, host.DNSResolver))
365367
}
366368

367369
// append services
@@ -597,7 +599,7 @@ func (s *NetworkDisruptionCloudSpec) Explain() []string {
597599
}
598600

599601
// NetworkDisruptionHostSpecFromString parses the given hosts to host specs
600-
// The expected format for hosts is <host>;<port>;<protocol>;<flow>;<connState>
602+
// The expected format for hosts is <host>;<port>;<protocol>;<flow>;<connState>;<dnsResolver>
601603
func NetworkDisruptionHostSpecFromString(hosts []string) ([]NetworkDisruptionHostSpec, error) {
602604
var err error
603605

@@ -609,9 +611,10 @@ func NetworkDisruptionHostSpecFromString(hosts []string) ([]NetworkDisruptionHos
609611
protocol := ""
610612
flow := ""
611613
connState := ""
614+
dnsResolver := ""
612615

613-
// parse host with format <host>;<port>;<protocol>;<flow>;<connState>
614-
parsedHost := strings.SplitN(host, ";", 5)
616+
// parse host with format <host>;<port>;<protocol>;<flow>;<connState>;<dnsResolver>
617+
parsedHost := strings.SplitN(host, ";", 6)
615618

616619
// cast port to int if specified
617620
if len(parsedHost) > 1 && parsedHost[1] != "" {
@@ -636,13 +639,19 @@ func NetworkDisruptionHostSpecFromString(hosts []string) ([]NetworkDisruptionHos
636639
connState = parsedHost[4]
637640
}
638641

642+
// get DNS resolver strategy if specified
643+
if len(parsedHost) > 5 && parsedHost[5] != "" {
644+
dnsResolver = parsedHost[5]
645+
}
646+
639647
// generate host spec
640648
parsedHosts = append(parsedHosts, NetworkDisruptionHostSpec{
641-
Host: parsedHost[0],
642-
Port: port,
643-
Protocol: protocol,
644-
Flow: flow,
645-
ConnState: connState,
649+
Host: parsedHost[0],
650+
Port: port,
651+
Protocol: protocol,
652+
Flow: flow,
653+
ConnState: connState,
654+
DNSResolver: dnsResolver,
646655
})
647656
}
648657

chart/templates/generated/chaos.datadoghq.com_disruptioncrons.yaml

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -267,6 +267,14 @@ spec:
267267
- est
268268
- ""
269269
type: string
270+
dnsResolver:
271+
enum:
272+
- pod
273+
- node
274+
- pod-fallback-node
275+
- node-fallback-pod
276+
- ""
277+
type: string
270278
flow:
271279
enum:
272280
- ingress
@@ -407,6 +415,14 @@ spec:
407415
- est
408416
- ""
409417
type: string
418+
dnsResolver:
419+
enum:
420+
- pod
421+
- node
422+
- pod-fallback-node
423+
- node-fallback-pod
424+
- ""
425+
type: string
410426
flow:
411427
enum:
412428
- ingress

chart/templates/generated/chaos.datadoghq.com_disruptionrollouts.yaml

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -268,6 +268,14 @@ spec:
268268
- est
269269
- ""
270270
type: string
271+
dnsResolver:
272+
enum:
273+
- pod
274+
- node
275+
- pod-fallback-node
276+
- node-fallback-pod
277+
- ""
278+
type: string
271279
flow:
272280
enum:
273281
- ingress
@@ -408,6 +416,14 @@ spec:
408416
- est
409417
- ""
410418
type: string
419+
dnsResolver:
420+
enum:
421+
- pod
422+
- node
423+
- pod-fallback-node
424+
- node-fallback-pod
425+
- ""
426+
type: string
411427
flow:
412428
enum:
413429
- ingress

chart/templates/generated/chaos.datadoghq.com_disruptions.yaml

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -258,6 +258,14 @@ spec:
258258
- est
259259
- ""
260260
type: string
261+
dnsResolver:
262+
enum:
263+
- pod
264+
- node
265+
- pod-fallback-node
266+
- node-fallback-pod
267+
- ""
268+
type: string
261269
flow:
262270
enum:
263271
- ingress
@@ -398,6 +406,14 @@ spec:
398406
- est
399407
- ""
400408
type: string
409+
dnsResolver:
410+
enum:
411+
- pod
412+
- node
413+
- pod-fallback-node
414+
- node-fallback-pod
415+
- ""
416+
type: string
401417
flow:
402418
enum:
403419
- ingress

docs/network_disruption.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ If your team has specific disruption requirements around what `protocol` to disr
3333

3434
* [How do I decide my traffic flow? (Ingress vs Egress)](/docs/network_disruption/flow.md)
3535
* [What should I specify in hosts vs services?](/docs/network_disruption/hosts-and-services.md)
36+
* [How do I control DNS resolution for hostnames (e.g., bypassing Istio DNS proxy)?](/docs/network_disruption/hosts-and-services.md#case-5-controlling-dns-resolution-with-dnsresolver)
3637
* [What are `prio` qdiscs and how does the chaos-controller use them?](/docs/network_disruption/prio.md)
3738
* [How are changes in destination pods and services filtered on handled by the chaos-controller?](/docs/changes_handling.md#network-disruption-dynamic-service-resolution)
3839

docs/network_disruption/hosts-and-services.md

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -223,6 +223,73 @@ If the `hosts` field contains a CIDR, the routing table is consulted. If the lis
223223
Instead of a CIDR block, hostnames can be provided in the `hosts` field. If the `chaos-controller` cannot parse a `hosts` entry as an IP address or a CIDR block, it treats the entry as a hostname and tries to resolve it against each resolver listed in `/etc/resolv.conf`, in order.
224224
Remember, this hostname must _not_ be a Kubernetes service's hostname.
225225

226+
### Case 5: Controlling DNS Resolution with `dnsResolver`
227+
228+
When specifying hostnames in the `hosts` field, you can control which DNS resolver is used to resolve the hostname to an IP address. This is particularly useful in environments with service mesh proxies (like Istio) that intercept DNS queries and return virtual IPs (VIPs) that may not work for traffic disruption.
229+
230+
The `dnsResolver` field supports the following strategies:
231+
232+
| Strategy | Description |
233+
|---------------------|------------------------------------------------------------------------------------|
234+
| `pod` | Uses only the pod's DNS configuration (`/etc/resolv.conf`) |
235+
| `node` | Uses only the node's DNS configuration (`/mnt/host/etc/resolv.conf`) |
236+
| `pod-fallback-node` | Tries pod DNS first, falls back to node DNS if resolution fails (default behavior) |
237+
| `node-fallback-pod` | Tries node DNS first, falls back to pod DNS if resolution fails |
238+
239+
**Example: Bypassing Istio DNS Proxy**
240+
241+
When the Istio DNS proxy is enabled, pod-level DNS lookups may return VIP addresses from the Class E range (240.0.0.0/4), which don't work for network disruptions. In this case, use `dnsResolver: node` to bypass the Istio proxy and get the actual service IPs:
242+
243+
```yaml
244+
apiVersion: chaos.datadoghq.com/v1beta1
245+
kind: Disruption
246+
metadata:
247+
name: network-disruption-istio
248+
spec:
249+
level: pod
250+
selector:
251+
app: my-service
252+
network:
253+
drop: 50
254+
hosts:
255+
- host: external-api.example.com
256+
port: 443
257+
protocol: tcp
258+
dnsResolver: node # Bypasses Istio DNS proxy, gets real IPs
259+
```
260+
261+
**Example: Multiple hosts with different DNS strategies**
262+
263+
```yaml
264+
network:
265+
drop: 50
266+
hosts:
267+
# External service - use node DNS to avoid service mesh VIPs
268+
- host: external-service.example.com
269+
port: 443
270+
protocol: tcp
271+
dnsResolver: node
272+
273+
# Internal cluster service - use pod DNS for cluster-internal resolution
274+
- host: internal-service.cluster.local
275+
port: 8080
276+
protocol: tcp
277+
dnsResolver: pod
278+
279+
# Public API - use default behavior (pod-fallback-node)
280+
- host: api.public.com
281+
port: 443
282+
protocol: tcp
283+
# dnsResolver not specified = uses default "pod-fallback-node"
284+
```
285+
286+
**When to use each strategy:**
287+
288+
- **`node`**: Use when working with service meshes (Istio, Linkerd) that proxy DNS, or when you need to resolve external hostnames using the node's DNS servers (e.g., corporate DNS servers)
289+
- **`pod`**: Use for cluster-internal services or when you specifically want to use the pod's DNS configuration
290+
- **`pod-fallback-node`** (default): Use when you want resilience: pod DNS is tried first, with a fallback to node DNS if pod resolution fails
291+
- **`node-fallback-pod`**: Use when node DNS is preferred but you want pod DNS as a backup
292+
226293
### Some special cases
227294

228295
Cluster IPs can also be specified to target the relevant pods.

injector/ipresolver.go

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,8 @@ import (
1515
// resolveHost tries to resolve the given host
1616
// it tries to resolve it as a CIDR, as a single IP, or as a hostname
1717
// it returns a list of IP or an error if it fails to resolve the hostname
18-
func resolveHost(client network.DNSClient, host string) ([]*net.IPNet, error) {
18+
// dnsStrategy specifies the DNS resolution strategy to use (empty string uses default)
19+
func resolveHost(client network.DNSClient, host string, dnsStrategy string) ([]*net.IPNet, error) {
1920
var ips []*net.IPNet
2021

2122
// return the wildcard 0.0.0.0/0 CIDR if the given host is an empty string
@@ -33,7 +34,12 @@ func resolveHost(client network.DNSClient, host string) ([]*net.IPNet, error) {
3334
if ip == nil {
3435
// if no IP has been parsed, fallback on a hostname
3536
// and try to resolve it by using the container resolv.conf file
36-
resolvedIPs, err := client.Resolve(host)
37+
var resolvedIPs []net.IP
38+
if dnsStrategy != "" {
39+
resolvedIPs, err = client.ResolveWithStrategy(host, dnsStrategy)
40+
} else {
41+
resolvedIPs, err = client.Resolve(host)
42+
}
3743
if err != nil {
3844
return nil, fmt.Errorf("can't resolve the given host with the configured dns resolver: %w", err)
3945
}

injector/network_disruption.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1114,7 +1114,7 @@ func (i *networkDisruptionInjector) watchHostChanges(ctx context.Context, interf
11141114

11151115
perHost:
11161116
for host, currentTcFilters := range hosts.hostFilterMap {
1117-
newIps, err := resolveHost(i.config.DNSClient, host.Host)
1117+
newIps, err := resolveHost(i.config.DNSClient, host.Host, host.DNSResolver)
11181118
if err != nil {
11191119
hostWatcherLog.Errorw("error resolving Host", tags.ErrorKey, err, tags.HostKey, host.Host)
11201120

@@ -1204,7 +1204,7 @@ func (i *networkDisruptionInjector) addFiltersForHosts(interfaces []string, host
12041204
}
12051205

12061206
// resolve given hosts if needed
1207-
ips, err := resolveHost(i.config.DNSClient, host.Host)
1207+
ips, err := resolveHost(i.config.DNSClient, host.Host, host.DNSResolver)
12081208
if err != nil {
12091209
return nil, fmt.Errorf("error resolving given host %s: %w", host.Host, err)
12101210
}

0 commit comments

Comments
 (0)