Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 27 additions & 10 deletions api/v1beta1/network_disruption.go
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,8 @@ type NetworkDisruptionHostSpec struct {
Flow string `json:"flow,omitempty" chaos_validate:"omitempty,oneofci=ingress egress"`
// +kubebuilder:validation:Enum=new;est;""
ConnState string `json:"connState,omitempty" chaos_validate:"omitempty,oneofci=new est"`
// +kubebuilder:validation:Enum=pod;node;pod-fallback-node;node-fallback-pod;""
DNSResolver string `json:"dnsResolver,omitempty" chaos_validate:"omitempty,oneofci=pod node pod-fallback-node node-fallback-pod"`
}

type NetworkDisruptionServiceSpec struct {
Expand Down Expand Up @@ -356,12 +358,20 @@ func (s *NetworkDisruptionSpec) GenerateArgs() []string {

// append hosts
for _, host := range s.Hosts {
args = append(args, "--hosts", fmt.Sprintf("%s;%d;%s;%s;%s", host.Host, host.Port, host.Protocol, host.Flow, host.ConnState))
if host.DNSResolver == "" {
args = append(args, "--hosts", fmt.Sprintf("%s;%d;%s;%s;%s", host.Host, host.Port, host.Protocol, host.Flow, host.ConnState))
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why not always append host.DNSResolver, even when it's empty? We already emit an empty string for the other fields when nothing is defined for them, so I don't think we need any `if` here.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Another option would be to set the default value beforehand, so that the field is never empty. (I can't remember whether that's something we usually do in the chaos-controller, so ignore this if we don't.)

} else {
args = append(args, "--hosts", fmt.Sprintf("%s;%d;%s;%s;%s;%s", host.Host, host.Port, host.Protocol, host.Flow, host.ConnState, host.DNSResolver))
}
}

// append allowed hosts
for _, host := range s.AllowedHosts {
args = append(args, "--allowed-hosts", fmt.Sprintf("%s;%d;%s;%s;%s", host.Host, host.Port, host.Protocol, host.Flow, host.ConnState))
if host.DNSResolver == "" {
args = append(args, "--allowed-hosts", fmt.Sprintf("%s;%d;%s;%s;%s", host.Host, host.Port, host.Protocol, host.Flow, host.ConnState))
} else {
args = append(args, "--allowed-hosts", fmt.Sprintf("%s;%d;%s;%s;%s;%s", host.Host, host.Port, host.Protocol, host.Flow, host.ConnState, host.DNSResolver))
}
}

// append services
Expand Down Expand Up @@ -597,7 +607,7 @@ func (s *NetworkDisruptionCloudSpec) Explain() []string {
}

// NetworkDisruptionHostSpecFromString parses the given hosts to host specs
// The expected format for hosts is <host>;<port>;<protocol>;<flow>;<connState>
// The expected format for hosts is <host>;<port>;<protocol>;<flow>;<connState>;<dnsResolver>
func NetworkDisruptionHostSpecFromString(hosts []string) ([]NetworkDisruptionHostSpec, error) {
var err error

Expand All @@ -609,9 +619,10 @@ func NetworkDisruptionHostSpecFromString(hosts []string) ([]NetworkDisruptionHos
protocol := ""
flow := ""
connState := ""
dnsResolver := ""

// parse host with format <host>;<port>;<protocol>;<flow>;<connState>
parsedHost := strings.SplitN(host, ";", 5)
// parse host with format <host>;<port>;<protocol>;<flow>;<connState>;<dnsResolver>
parsedHost := strings.SplitN(host, ";", 6)

// cast port to int if specified
if len(parsedHost) > 1 && parsedHost[1] != "" {
Expand All @@ -636,13 +647,19 @@ func NetworkDisruptionHostSpecFromString(hosts []string) ([]NetworkDisruptionHos
connState = parsedHost[4]
}

// get DNS resolver strategy if specified
if len(parsedHost) > 5 && parsedHost[5] != "" {
dnsResolver = parsedHost[5]
}

// generate host spec
parsedHosts = append(parsedHosts, NetworkDisruptionHostSpec{
Host: parsedHost[0],
Port: port,
Protocol: protocol,
Flow: flow,
ConnState: connState,
Host: parsedHost[0],
Port: port,
Protocol: protocol,
Flow: flow,
ConnState: connState,
DNSResolver: dnsResolver,
})
}

Expand Down
60 changes: 60 additions & 0 deletions api/v1beta1/network_disruption_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -691,6 +691,66 @@ var _ = Describe("NetworkDisruptionSpec", func() {

return expectedArgs
}(),
),
Entry("with DNSResolver set on hosts",
func() NetworkDisruptionSpec {
networkDisruption := defaultNetworkDisruption.DeepCopy()
networkDisruption.Hosts[0].DNSResolver = "pod"
networkDisruption.AllowedHosts[0].DNSResolver = "node"

return *networkDisruption
}(),
[]string{
"network-disruption",
"--corrupt",
"3",
"--drop",
"1",
"--duplicate",
"2",
"--delay",
"4",
"--delay-jitter",
"5",
"--bandwidth-limit",
"6",
"--hosts",
"lorem;8080;TCP;ingress;open;pod",
"--allowed-hosts",
"localhost;9090;UDP;egress;closed;node",
"--services",
"name;namespace;9191-default",
},
),
Entry("with DNSResolver empty (backward compatibility)",
func() NetworkDisruptionSpec {
networkDisruption := defaultNetworkDisruption.DeepCopy()
networkDisruption.Hosts[0].DNSResolver = ""
networkDisruption.AllowedHosts[0].DNSResolver = ""

return *networkDisruption
}(),
[]string{
"network-disruption",
"--corrupt",
"3",
"--drop",
"1",
"--duplicate",
"2",
"--delay",
"4",
"--delay-jitter",
"5",
"--bandwidth-limit",
"6",
"--hosts",
"lorem;8080;TCP;ingress;open",
"--allowed-hosts",
"localhost;9090;UDP;egress;closed",
"--services",
"name;namespace;9191-default",
},
))
})
})
Expand Down
16 changes: 16 additions & 0 deletions chart/templates/generated/chaos.datadoghq.com_disruptioncrons.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -267,6 +267,14 @@ spec:
- est
- ""
type: string
dnsResolver:
enum:
- pod
- node
- pod-fallback-node
- node-fallback-pod
- ""
type: string
flow:
enum:
- ingress
Expand Down Expand Up @@ -407,6 +415,14 @@ spec:
- est
- ""
type: string
dnsResolver:
enum:
- pod
- node
- pod-fallback-node
- node-fallback-pod
- ""
type: string
flow:
enum:
- ingress
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -268,6 +268,14 @@ spec:
- est
- ""
type: string
dnsResolver:
enum:
- pod
- node
- pod-fallback-node
- node-fallback-pod
- ""
type: string
flow:
enum:
- ingress
Expand Down Expand Up @@ -408,6 +416,14 @@ spec:
- est
- ""
type: string
dnsResolver:
enum:
- pod
- node
- pod-fallback-node
- node-fallback-pod
- ""
type: string
flow:
enum:
- ingress
Expand Down
16 changes: 16 additions & 0 deletions chart/templates/generated/chaos.datadoghq.com_disruptions.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -258,6 +258,14 @@ spec:
- est
- ""
type: string
dnsResolver:
enum:
- pod
- node
- pod-fallback-node
- node-fallback-pod
- ""
type: string
flow:
enum:
- ingress
Expand Down Expand Up @@ -398,6 +406,14 @@ spec:
- est
- ""
type: string
dnsResolver:
enum:
- pod
- node
- pod-fallback-node
- node-fallback-pod
- ""
type: string
flow:
enum:
- ingress
Expand Down
1 change: 1 addition & 0 deletions docs/network_disruption.md
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ If your team has specific disruption requirements around what `protocol` to disr

* [How do I decide my traffic flow? (Ingress vs Egress)](/docs/network_disruption/flow.md)
* [What should I specify in hosts vs services?](/docs/network_disruption/hosts-and-services.md)
* [How do I control DNS resolution for hostnames (e.g., bypassing Istio DNS proxy)?](/docs/network_disruption/hosts-and-services.md#case-5-controlling-dns-resolution-with-dnsresolver)
* [What are `prio` qdiscs and how does the chaos-controller use them?](/docs/network_disruption/prio.md)
* [How does the chaos-controller handle changes in the destination pods and services it filters on?](/docs/changes_handling.md#network-disruption-dynamic-service-resolution)

Expand Down
99 changes: 99 additions & 0 deletions docs/network_disruption/hosts-and-services.md
Original file line number Diff line number Diff line change
Expand Up @@ -223,6 +223,105 @@ If the `hosts` field contains a CIDR, the routing table is consulted. If the lis
Instead of a CIDR block, hostnames can be provided in the `hosts` field. If the `chaos-controller` fails to resolve the `hosts` field to an IP address or a CIDR block, it then tries to resolve the potential hostname on each resolver listed in `/etc/resolv.conf` in order.
Remember, this hostname must _not_ be a kubernetes service's hostname.

### Case 5: Controlling DNS Resolution with `dnsResolver`

When specifying hostnames in the `hosts` or `allowedHosts` fields, you can control which DNS resolver is used to resolve the hostname to an IP address. This is particularly useful in environments with service mesh proxies (like Istio) that intercept DNS queries and return virtual IPs (VIPs) that may not work for traffic disruption.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

To make it a little more clear?

Suggested change
When specifying hostnames in the `hosts` or `allowedHosts` fields, you can control which DNS resolver is used to resolve the hostname to an IP address. This is particularly useful in environments with service mesh proxies (like Istio) that intercept DNS queries and return virtual IPs (VIPs) that may not work for traffic disruption.
When specifying hostnames in the `hosts` or `allowedHosts` fields, you can control which DNS resolver is used to resolve the hostname to an IP address. This is particularly useful in environments with service mesh proxies (like Istio) that intercept DNS queries and return virtual IPs (VIPs) that may not work for traffic disruption. In these cases, you should rely on the node-level resolver to ensure the hostnames resolve to the actual destination IPs rather than the service mesh VIP.


The `dnsResolver` field supports the following strategies:

| Strategy | Description |
|---------------------|------------------------------------------------------------------------------------|
| `pod` | Uses only the pod's DNS configuration (`/etc/resolv.conf`) |
| `node` | Uses only the node's DNS configuration (`/mnt/host/etc/resolv.conf`) |
| `pod-fallback-node` | Tries pod DNS first, falls back to node DNS if resolution fails (default behavior) |
| `node-fallback-pod` | Tries node DNS first, falls back to pod DNS if resolution fails |

**Example: Bypassing Istio DNS Proxy in `hosts`**

When Istio DNS proxy is enabled, pod-level DNS lookups may return VIP addresses (Class E subnet: 240.0.0.0/4) that don't work for network disruptions. In this case, use `dnsResolver: node` to bypass the Istio proxy and get the actual service IPs:

```yaml
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

could we have those examples be defined in the examples/ dir and link them instead?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't really see the added value: the examples are mostly there to test the disruption effect, and in this case the host would resolve to the same addresses whether it is resolved by the node resolver or the pod resolver.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

as you want, but we still need to update the complete example to add your new field

apiVersion: chaos.datadoghq.com/v1beta1
kind: Disruption
metadata:
name: network-disruption-istio
spec:
level: pod
selector:
app: my-service
network:
drop: 50
hosts:
- host: external-api.example.com
port: 443
protocol: tcp
dnsResolver: node # Bypasses Istio DNS proxy, gets real IPs
```

**Example: Using `dnsResolver` with `allowedHosts`**

The `dnsResolver` field is also available for `allowedHosts`, which is useful when you want to exclude specific hosts from disruption but need to control how their hostnames are resolved:

```yaml
apiVersion: chaos.datadoghq.com/v1beta1
kind: Disruption
metadata:
name: network-disruption-with-exclusions
spec:
level: pod
selector:
app: my-service
network:
drop: 100
allowedHosts:
- host: critical-internal-service.cluster.local
port: 443
protocol: tcp
dnsResolver: pod # Use pod DNS for internal service
- host: monitoring.example.com
port: 443
protocol: tcp
dnsResolver: node # Use node DNS for external monitoring service
```

**Example: Multiple hosts with different DNS strategies**

```yaml
network:
drop: 50
hosts:
# External service - use node DNS to avoid service mesh VIPs
- host: external-service.example.com
port: 443
protocol: tcp
dnsResolver: node
# Internal cluster service - use pod DNS for cluster-internal resolution
- host: internal-service.cluster.local
port: 8080
protocol: tcp
dnsResolver: pod
# Public API - use default behavior (pod-fallback-node)
- host: api.public.com
port: 443
protocol: tcp
# dnsResolver not specified = uses default "pod-fallback-node"
allowedHosts:
# Critical dependency that should never be disrupted
- host: database.internal.example.com
port: 5432
protocol: tcp
dnsResolver: node # Ensure reliable resolution using node DNS
```

**When to use each strategy:**

- **`node`**: Use when working with service meshes (Istio, Linkerd) that proxy DNS, or when you need to resolve external hostnames using the node's DNS servers (e.g., corporate DNS servers)
- **`pod`**: Use for cluster-internal services or when you specifically want to use the pod's DNS configuration
- **`pod-fallback-node`** (default): Use when you want resilience: pod DNS is tried first, and node DNS is used as a fallback if pod resolution fails
- **`node-fallback-pod`**: Use when node DNS is preferred but you want pod DNS as a backup

### Some special cases

Cluster IPs can also be specified to target the relevant pods.
Expand Down
2 changes: 2 additions & 0 deletions examples/complete.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -57,12 +57,14 @@ spec:
protocol: tcp # optional, protocol to drop packets on (can be tcp or udp, defaults to both)
flow: ingress # optional, flow direction (egress: outgoing traffic, ingress: incoming traffic, defaults to egress)
connState: new # optional, connection state (new: new connections, est: established connections, defaults to all states)
dnsResolver: pod-fallback-node # optional, DNS resolution strategy (pod, node, pod-fallback-node, node-fallback-pod), defaults to pod-fallback-node. Use 'node' to bypass service mesh DNS proxies like Istio
allowedHosts: # optional, list of excluded hosts which would not be disrupted
- host: 10.0.0.1 # optional, IP, CIDR or hostname to filter on
port: 80 # optional, port to filter on
protocol: tcp # optional, protocol to filter on (can be tcp or udp, defaults to both)
flow: ingress # optional, flow direction (egress: outgoing traffic, ingress: incoming traffic, defaults to egress)
connState: new # optional, connection state (new: new connections, est: established connections, defaults to all states)
dnsResolver: pod-fallback-node # optional, DNS resolution strategy (pod, node, pod-fallback-node, node-fallback-pod), defaults to pod-fallback-node. Use 'node' to bypass service mesh DNS proxies like Istio
services: # optional, list of destination Kubernetes services to filter on. These must be in the same kubernetes cluster
- name: foo # service name
namespace: bar # service namespace
Expand Down
11 changes: 9 additions & 2 deletions injector/ipresolver.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,8 @@ import (
// resolveHost tries to resolve the given host
// it tries to resolve it as a CIDR, as a single IP, or as a hostname
// it returns a list of IP or an error if it fails to resolve the hostname
func resolveHost(client network.DNSClient, host string) ([]*net.IPNet, error) {
// dnsStrategy specifies the DNS resolution strategy to use (empty string uses default)
func resolveHost(client network.DNSClient, host string, dnsStrategy string) ([]*net.IPNet, error) {
var ips []*net.IPNet

// return the wildcard 0.0.0.0/0 CIDR if the given host is an empty string
Expand All @@ -33,7 +34,13 @@ func resolveHost(client network.DNSClient, host string) ([]*net.IPNet, error) {
if ip == nil {
// if no IP has been parsed, fallback on a hostname
// and try to resolve it by using the container resolv.conf file
resolvedIPs, err := client.Resolve(host)
var resolvedIPs []net.IP
if dnsStrategy != "" {
resolvedIPs, err = client.ResolveWithStrategy(host, dnsStrategy)
} else {
resolvedIPs, err = client.Resolve(host)
}

if err != nil {
return nil, fmt.Errorf("can't resolve the given host with the configured dns resolver: %w", err)
}
Expand Down
Loading