Skip to content

[ovn-controller] Change startup mechanism of ovs pods #423

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 1 commit into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion controllers/ovncontroller_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -612,7 +612,6 @@ func (r *OVNControllerReconciler) reconcileNormal(ctx context.Context, instance
instance.Status.DesiredNumberScheduled = dset.GetDaemonSet().Status.DesiredNumberScheduled
instance.Status.NumberReady = dset.GetDaemonSet().Status.NumberReady

// Define a new DaemonSet object for OVS (ovsdb-server + ovs-vswitchd)
ovsdset := daemonset.NewDaemonSet(
ovncontroller.CreateOVSDaemonSet(instance, inputHash, ovsServiceLabels, serviceAnnotations, topology),
time.Duration(5)*time.Second,
Expand Down Expand Up @@ -744,6 +743,11 @@ func (r *OVNControllerReconciler) generateServiceConfigMaps(
} else {
templateParameters["OVNEncapNIC"] = "eth0"
}
if instance.Spec.TLS.Enabled() {
templateParameters["TLS"] = "Enabled"
} else {
templateParameters["TLS"] = "Disabled"
}
cms := []util.Template{
// ScriptsConfigMap
{
Expand Down
41 changes: 37 additions & 4 deletions pkg/ovncontroller/daemonset.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ import (
appsv1 "k8s.io/api/apps/v1"
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/util/intstr"
"k8s.io/utils/ptr"
)

Expand Down Expand Up @@ -236,6 +237,27 @@ func CreateOVSDaemonSet(
envVars := map[string]env.Setter{}
envVars["CONFIG_HASH"] = env.SetValue(configHash)

volumes := []corev1.Volume{}
mounts := []corev1.VolumeMount{}

// add OVN dbs cert and CA
if instance.Spec.TLS.Enabled() {
svc := tls.Service{
SecretName: *instance.Spec.TLS.GenericService.SecretName,
CertMount: ptr.To(ovn_common.OVNDbCertPath),
KeyMount: ptr.To(ovn_common.OVNDbKeyPath),
CaMount: ptr.To(ovn_common.OVNDbCaCertPath),
}
volumes = append(volumes, svc.CreateVolume(ovnv1.ServiceNameOVS))
mounts = append(mounts, svc.CreateVolumeMounts(ovnv1.ServiceNameOVS)...)

// add CA bundle if defined
if instance.Spec.TLS.CaBundleSecretName != "" {
volumes = append(volumes, instance.Spec.TLS.CreateVolume())
mounts = append(mounts, instance.Spec.TLS.CreateVolumeMounts(nil)...)
}
}

initContainers := []corev1.Container{
{
Name: "ovsdb-server-init",
Expand All @@ -250,7 +272,7 @@ func CreateOVSDaemonSet(
Privileged: &privileged,
},
Env: env.MergeEnvs([]corev1.EnvVar{}, envVars),
VolumeMounts: GetOVSDbVolumeMounts(),
VolumeMounts: append(GetOVSDbVolumeMounts(), mounts...),
},
}

Expand All @@ -276,7 +298,7 @@ func CreateOVSDaemonSet(
Privileged: &privileged,
},
Env: env.MergeEnvs([]corev1.EnvVar{}, envVars),
VolumeMounts: GetOVSDbVolumeMounts(),
VolumeMounts: append(GetOVSDbVolumeMounts(), mounts...),
// TODO: consider the fact that resources are now double booked
Resources: instance.Spec.Resources,
LivenessProbe: ovsDbLivenessProbe,
Expand All @@ -303,7 +325,7 @@ func CreateOVSDaemonSet(
Privileged: &privileged,
},
Env: env.MergeEnvs([]corev1.EnvVar{}, envVars),
VolumeMounts: GetVswitchdVolumeMounts(),
VolumeMounts: append(GetVswitchdVolumeMounts(), mounts...),
// TODO: consider the fact that resources are now double booked
Resources: instance.Spec.Resources,
LivenessProbe: ovsVswitchdLivenessProbe,
Expand All @@ -312,6 +334,9 @@ func CreateOVSDaemonSet(
},
}

maxUnavailable := intstr.FromInt32(0)
maxSurge := intstr.FromInt32(1)

daemonset := &appsv1.DaemonSet{
ObjectMeta: metav1.ObjectMeta{
Name: ovnv1.ServiceNameOVS,
Expand All @@ -327,9 +352,17 @@ func CreateOVSDaemonSet(
},
Spec: corev1.PodSpec{
ServiceAccountName: instance.RbacResourceName(),
HostPID: true,
InitContainers: initContainers,
Containers: containers,
Volumes: GetOVSVolumes(instance.Name, instance.Namespace),
Volumes: append(GetOVSVolumes(instance.Name, instance.Namespace), volumes...),
},
},
UpdateStrategy: appsv1.DaemonSetUpdateStrategy{
Type: appsv1.RollingUpdateDaemonSetStrategyType,
RollingUpdate: &appsv1.RollingUpdateDaemonSet{
MaxUnavailable: &maxUnavailable,
MaxSurge: &maxSurge,
},
},
},
Expand Down
68 changes: 62 additions & 6 deletions templates/ovncontroller/bin/init-ovsdb-server.sh
Original file line number Diff line number Diff line change
Expand Up @@ -18,15 +18,71 @@ set -ex
source $(dirname $0)/functions
trap wait_for_db_creation EXIT

# init_ovsdb_server bootstraps the local OVS database.
# It runs one ovs-ctl start/stop cycle, records the "INIT" state in the
# already_executed marker when no marker exists yet, calls
# wait_for_db_creation (presumably provided by the sourced "functions" file)
# and finally clears the EXIT trap installed at the top of this script.
function init_ovsdb_server {
# Initialize or upgrade database if needed
# CTL_ARGS is expanded unquoted below so that it splits into two options.
CTL_ARGS="--system-id=random --no-ovs-vswitchd"
/usr/share/openvswitch/scripts/ovs-ctl start $CTL_ARGS
/usr/share/openvswitch/scripts/ovs-ctl stop $CTL_ARGS

if [ ! -f /var/lib/openvswitch/already_executed ]; then
# If file was not present, set status INIT
echo "INIT" > /var/lib/openvswitch/already_executed
fi

wait_for_db_creation
# The wait has already been done explicitly; drop the EXIT trap so it is
# not triggered again when the script exits.
trap - EXIT
}

# If db file is empty, remove it; otherwise service won't start.
# See https://issues.redhat.com/browse/FDP-689 for more details.
if ! [ -s ${DB_FILE} ]; then
rm -f ${DB_FILE}
fi
# Initialize or upgrade database if needed
CTL_ARGS="--system-id=random --no-ovs-vswitchd"
/usr/share/openvswitch/scripts/ovs-ctl start $CTL_ARGS
/usr/share/openvswitch/scripts/ovs-ctl stop $CTL_ARGS

wait_for_db_creation
trap - EXIT
# The state marker is only written by an earlier run of the OVS scripts, so
# its presence means this is not the first execution.
if [ -f /var/lib/openvswitch/already_executed ]; then
    # Pick the OVN northbound DB endpoint and client options for ovn-nbctl.
    if [ {{ .TLS }} == "Enabled" ]; then
        # TLS is used
        TLSOptions="--certificate=/etc/pki/tls/certs/ovndb.crt --private-key=/etc/pki/tls/private/ovndb.key --ca-cert=/etc/pki/tls/certs/ovndbca.crt"
        DBOptions="--db ssl:ovsdbserver-nb.openstack.svc.cluster.local:6641"
    else
        # Normal TCP is used
        TLSOptions=""
        DBOptions="--db tcp:ovsdbserver-nb.openstack.svc.cluster.local:6641"
    fi

    # The following steps need ovsdb-server to be running, so double check
    # its pid file. If it is missing (e.g. the previous pod was unhealthy and
    # got destroyed) fall back to a normal initialisation.
    if [ ! -f /run/openvswitch/ovsdb-server.pid ]; then
        # No PID file, start as normal
        echo "No PID file found, init ovsdb_server as it's the only pod"
        init_ovsdb_server
        exit 0
    fi

    # Marker exists and ovsdb-server is running: no need to run ovs-ctl.
    # Change state to "UPDATE"
    echo "UPDATE" > /var/lib/openvswitch/already_executed

    # Lower this chassis' gateway priority for the duration of the update.
    # First get the system-id
    chassis_id=$(ovs-vsctl get Open_Vswitch . external_ids:system-id)

    # This script runs under `set -e`: a failing command (or a failing command
    # substitution in a plain assignment) would terminate the script before a
    # separate `$?` test is reached. Testing the command directly inside
    # `if !` keeps the error message reachable.
    if ! nb_output=$(ovn-nbctl --no-leader-only $DBOptions $TLSOptions --columns=_uuid,priority find Gateway_Chassis chassis_name=$chassis_id); then
        echo "ERROR: ovn-nbctl find command failed"
        exit 1
    fi

    # NOTE(review): the parsing below assumes exactly one Gateway_Chassis row
    # matches this chassis; zero or several matching rows are not handled --
    # confirm against the deployments this is meant for.
    row_uuid=$(echo "$nb_output" | grep "_uuid" | cut -d':' -f2 | xargs)
    priority=$(echo "$nb_output" | grep "priority" | cut -d':' -f2 | xargs)

    # Save the priority so it can be restored later (the file is overwritten,
    # not appended to, hence no existence check).
    echo "$priority" > /var/lib/openvswitch/old_priority

    # Set lower priority (lowest value possible 0)
    if ! ovn-nbctl --no-leader-only $DBOptions $TLSOptions set Gateway_Chassis $row_uuid priority=0; then
        echo "ERROR: ovn-nbctl set command failed"
        exit 1
    fi
    exit 0
fi
41 changes: 40 additions & 1 deletion templates/ovncontroller/bin/start-ovsdb-server.sh
Original file line number Diff line number Diff line change
Expand Up @@ -17,12 +17,51 @@
set -ex
source $(dirname $0)/functions

echo "start ovsdb-server"

# Check state
# If the state marker says "UPDATE", the OVS services this container is taking
# over from are still running: stop them via their stop scripts before
# ovsdb-server is started again below.
if [ -f /var/lib/openvswitch/already_executed ]; then
    if [ "$(cat /var/lib/openvswitch/already_executed)" == "UPDATE" ]; then
        echo "In a middle of an upgrade"
        # Need to stop vswitchd and ovsdb-server, vswitchd first.
        # The vswitchd pid file is deliberately not read here: nothing below
        # needs the pid, and under `set -e` a missing pid file would abort
        # this script while the state is still "UPDATE".
        echo "stopping vswitchd"
        bash /usr/local/bin/container-scripts/stop-vswitchd.sh
        echo "Done, stopped vswitchd"

        # Wait until the state marker reports that vswitchd was stopped.
        while [ "$(cat /var/lib/openvswitch/already_executed)" != "RESTART_VSWITCHD" ]; do
            sleep 0.1
        done
        echo "Status is already RESTART_VSWITCHD"

        bash /usr/local/bin/container-scripts/stop-ovsdb-server.sh
        echo "Done, stopped ovsdb-server"

        # Both services are stopped. ovsdb-server still has to be started in
        # this new container (--overwrite-pidfile takes care of the stale pid
        # file), but the SIGTERM that OpenShift will send next must not stop
        # it again: create the marker that turns stop-ovsdb-server.sh into a
        # no-op for that one invocation.
        echo "setting flag to skip ovsdb-server stop"
        touch /var/lib/openvswitch/skip_stop_ovsdbserver
    fi
    # Any other state needs no handling here: it can happen when the
    # ovsdb-server or ovs-vswitchd container could not start correctly or
    # never reached the running state, so this script runs again with a
    # state different from "UPDATE".
fi

# Remove the obsolete semaphore file in case it still exists.
cleanup_ovsdb_server_semaphore

# Set state to "OVSDB_SERVER"
echo "OVSDB_SERVER" > /var/lib/openvswitch/already_executed

# Start the service
ovsdb-server ${DB_FILE} \
--pidfile \
--pidfile --overwrite-pidfile \
--remote=punix:/var/run/openvswitch/db.sock \
--private-key=db:Open_vSwitch,SSL,private_key \
--certificate=db:Open_vSwitch,SSL,certificate \
Expand Down
53 changes: 52 additions & 1 deletion templates/ovncontroller/bin/start-vswitchd.sh
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,33 @@
# License for the specific language governing permissions and limitations
# under the License.

# Select how ovn-nbctl reaches the OVN northbound DB.
# Start from the plain TCP settings (no certificate options) ...
TLSOptions=""
DBOptions="--db tcp:ovsdbserver-nb.openstack.svc.cluster.local:6641"
# ... and switch to the ssl endpoint with certificate, key and CA when the
# TLS template parameter is rendered as "Enabled".
if [ {{ .TLS }} == "Enabled" ]; then
    TLSOptions="--certificate=/etc/pki/tls/certs/ovndb.crt --private-key=/etc/pki/tls/private/ovndb.key --ca-cert=/etc/pki/tls/certs/ovndbca.crt"
    DBOptions="--db ssl:ovsdbserver-nb.openstack.svc.cluster.local:6641"
fi


# When a state marker exists, hold vswitchd startup until the marker reads
# either "OVSDB_SERVER" or "INIT", polling it every 0.1 seconds.
echo "Check if we're doing an update"
if [ -f /var/lib/openvswitch/already_executed ]; then
    until [ $(cat /var/lib/openvswitch/already_executed) == "OVSDB_SERVER" ] ||
          [ $(cat /var/lib/openvswitch/already_executed) == "INIT" ]; do
        sleep 0.1
    done
    echo "OVSDBSERVER Already up, start script"
fi

source $(dirname $0)/functions
wait_for_ovsdb_server

Expand All @@ -30,7 +57,7 @@ ovs-vsctl --no-wait set open_vswitch . other_config:flow-restore-wait=true

# It's safe to start vswitchd now. Do it.
# --detach to allow the execution to continue to restoring the flows.
/usr/sbin/ovs-vswitchd --pidfile --mlockall --detach
/usr/sbin/ovs-vswitchd --pidfile --overwrite-pidfile --mlockall --detach

# Restore saved flows.
if [ -f $FLOWS_RESTORE_SCRIPT ]; then
Expand All @@ -49,6 +76,30 @@ cleanup_flows_backup
# Now, inform vswitchd that we are done.
ovs-vsctl remove open_vswitch . other_config flow-restore-wait

# Restore the gateway chassis priority if it was changed during the update.
# The old_priority file is written by init-ovsdb-server.sh right before it
# sets the priority of this chassis to 0.
if [ -f /var/lib/openvswitch/old_priority ]; then
echo "Using DBOptions: $DBOptions"
echo "Using TLSOptions: $TLSOptions"
priority=$(cat /var/lib/openvswitch/old_priority)
echo "Restoring old priority, which was: $priority"
# Look up the Gateway_Chassis row of this chassis through its system-id.
chassis_id=$(ovs-vsctl get Open_Vswitch . external_ids:system-id)
nb_output=$(ovn-nbctl --no-leader-only $DBOptions $TLSOptions --columns=_uuid,priority find Gateway_Chassis chassis_name=$chassis_id)
err=$?
# A failed lookup is only reported; the restore below is still attempted.
# NOTE(review): if this script runs under `set -e`, a failing command
# substitution would exit before this check is reached -- confirm.
if [ $err -ne 0 ]; then
echo "Error while getting gateway chassis uuid $err"
fi
row_uuid=$(echo "$nb_output" | grep "_uuid" | cut -d':' -f2 | xargs)
# NOTE(review): the saved value is deleted before the restore is attempted,
# so a failed `set` below cannot be retried from this file -- confirm that
# this is intended.
rm /var/lib/openvswitch/old_priority
ovn-nbctl --no-leader-only $DBOptions $TLSOptions set Gateway_Chassis $row_uuid priority=$priority
err=$?
if [ $err -ne 0 ]; then
echo "Error while setting gateway chassis priority ($priority), error: $err"
fi
fi

# Set state to "RUNNING"
echo "RUNNING" > /var/lib/openvswitch/already_executed

# This is container command script. Block it from exiting, otherwise k8s will
# restart the container again.
sleep infinity
14 changes: 14 additions & 0 deletions templates/ovncontroller/bin/stop-ovsdb-server.sh
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,12 @@
# License for the specific language governing permissions and limitations
# under the License.

# One-shot skip marker: when it is present, consume it and leave without
# stopping ovsdb-server.
test -f /var/lib/openvswitch/skip_stop_ovsdbserver && {
    echo "Skipping stop script"
    rm /var/lib/openvswitch/skip_stop_ovsdbserver
    exit 0
}

set -ex
source $(dirname $0)/functions

Expand All @@ -26,5 +32,13 @@ while [ ! -f $SAFE_TO_STOP_OVSDB_SERVER_SEMAPHORE ]; do
done
cleanup_ovsdb_server_semaphore

# Wait for the vswitchd container to finish its stop process, which it
# signals by writing "RESTART_VSWITCHD" into the state marker.
# The marker lives under /var/lib/openvswitch (the same file this script
# updates further down), not under /var/log; reading the wrong path made the
# test fail immediately, so the loop never actually waited.
while [ "$(cat /var/lib/openvswitch/already_executed)" != "RESTART_VSWITCHD" ]; do
    sleep 0.1
done

# Now it's safe to stop db server. Do it.
/usr/share/openvswitch/scripts/ovs-ctl stop --no-ovs-vswitchd

# Update state to "RESTART_DBSERVER"
echo "RESTART_DBSERVER" > /var/lib/openvswitch/already_executed
16 changes: 16 additions & 0 deletions templates/ovncontroller/bin/stop-vswitchd.sh
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,11 @@
# License for the specific language governing permissions and limitations
# under the License.

# One-shot skip marker: when it is present, consume it and leave without
# stopping vswitchd.
test -f /var/lib/openvswitch/skip_stop_vswitchd && {
    echo "Skipping stop script"
    rm /var/lib/openvswitch/skip_stop_vswitchd
    exit 0
}
set -ex
source $(dirname $0)/functions

Expand All @@ -32,5 +37,16 @@ TMPDIR=$FLOWS_RESTORE_DIR /usr/share/openvswitch/scripts/ovs-save save-flows $br
# unlocks the db preStop script, working as a semaphore
touch $SAFE_TO_STOP_OVSDB_SERVER_SEMAPHORE

# When this run happens as part of an update (state "UPDATE"), another,
# termination-triggered run of this script is presumably still to come.
# Leave a marker file so that run is skipped and the stop sequence is not
# executed twice.
if [ -f /var/lib/openvswitch/already_executed ] && [ $(cat /var/lib/openvswitch/already_executed) == "UPDATE" ]; then
    touch /var/lib/openvswitch/skip_stop_vswitchd
fi
# Record that vswitchd is being stopped: state becomes "RESTART_VSWITCHD".
echo "RESTART_VSWITCHD" > /var/lib/openvswitch/already_executed

# Finally, stop vswitchd.
/usr/share/openvswitch/scripts/ovs-ctl stop --no-ovsdb-server