DevOps/Supervision/Cluster_orchestration/docker_swarm.docker.txt


                                  ┏━━━━━━━━━━━━━━━━━━┓
                                  ┃   DOCKER_SWARM   ┃
                                  ┗━━━━━━━━━━━━━━━━━━┛

SUMMARY ==>                       #Provides, for Docker containers:
                                  #  - load balancing
                                  #  - scaling (to a declared number of replicas, not automatic)
                                  #  - automatic failover
                                  #  - rolling updates
                                  #  - basic placement
                                  #Divides between managers (who elect a leader) and worker nodes
                                  #Concepts:
                                  #  - swarm: cluster of nodes
                                  #  - node: dockerd server
                                  #  - service: image + command + replication information
                                  #  - task: container execution
                                  #Swarm state is kept in a key-value store

                                  ┌───────────┐
                                  │   SWARM   │
                                  └───────────┘

DOCKER SWARM ==>                  #A former version exists ("Docker Swarm"), as a Docker image instead of being builtin inside Docker ("Swarm mode")
                                  #It is now deprecated because it is harder to manage and has fewer features

DOCKER SWARMKIT ==>               #Underlying project behind "Swarm mode"
                                  #Is lower-level, e.g. not integrated within Docker, can be used with other container technology


SWARM ==>                         #Cluster of nodes
                                  #Does: load balancing, high availability, basic orchestration


docker swarm init                 #Starts a new single-node swarm
                                  #Current node will be a leader manager
                                  #Fires docker swarm join-token worker
docker swarm init|join ...
--advertise-addr IP[:PORT]        #Each manager has an advertised IP for:
                                  #  - external, load-balanced communication to the swarm
                                  #  - communicating to manager for orchestration purpose
                                  #By def:
                                  #  - dockerd --swarm-default-advertise-addr IP
                                  #  - def: main IP address and port 2377
--listen-addr IP[:PORT]           #Used for inter-manager communication. Def: 0.0.0.0:2377

docker swarm init ...
--force-new-cluster               #Forces even if there is currently one
                                  #Useful when quorum was lost, and needs to reboot swarm without losing information like services, tasks, nodes
--cert-expiry [NUMh][NUMm][NUMs]  #Swarm communication is encrypted: validity period of certificates before they are rotated (def: 3 months)
--external-ca protocol=VAL,url=URL#Use external CA for certificates


docker swarm update               #Sets swarm options.
                                  #Same options as docker swarm init except:
                                  #  - no --advertise|listen-addr, --force-new-cluster


SWARM_TOKEN                       #Confidential token specific to a swarm, allowing nodes to join it
                                  #There is one for workers, and one for managers
                                  #Can be rotated


docker swarm join HOST:PORT       #Make current node a worker node of a swarm
                                  #HOST is leader's
                                  #PORT is usually 2377
--token SWARM_TOKEN               #


docker swarm join-token           
 worker|manager                   #Prints SWARM_TOKEN and instructions on how to use docker swarm join
-q
--quiet                           #Only prints token
--rotate                          #Changes SWARM_TOKEN


docker swarm leave                #Inverse of docker swarm join.
-f
--force                           #Even if manager. Unsafe, should demote first.


VOLUMEINFO|NETWORKINFO.Scope      #Whether a VOLUME|NETWORK is available in whole swarm ("global") or only on one node ("local", def)


NODEINFO|SERVICEINFO|TASKINFO.
 Version.Index                    #Version number, shared across the whole swarm, which increments every time NODE state changes
NODEINFO.Status.State             #Whether node information, shared across the whole swarm, is available.
NODEINFO.ManagerStatus.
 Reachability                     #Same for the manager of the swarm

                                  ┌────────────────┐
                                  │   SWARM NODE   │
                                  └────────────────┘

NODE ==>                          #Is a dockerd server
                                  #Can be:
                                  #  - manager: 
                                  #     - manages workers:
                                  #        - orchestration
                                  #        - assigning tasks to nodes
                                  #     - managers must be connected to each other, to be able to tell each other's state:
                                  #        - must use datastore for this, if it is multi-host
                                  #  - worker: 
                                  #     - execute a task
                                  #     - managers are also workers:
                                  #        - unless availability is "drained"
                                  #        - should use resource constraints to ensure tasks do not keep manager so busy it cannot do orchestration
                                  #Leader:
                                  #  - single manager that executes operations
                                  #     - other managers only forward to leader
                                  #  - election/quorum:
                                  #     - other managers' only purpose is to possibly be elected leader:
                                  #        - more managers increase fault tolerance
                                  #        - but more managers decrease performance
                                  #        - usually 3, 5 or 7
                                  #     - triggered when leader is down
                                  #        - uses Raft consensus to determine a new leader, i.e. requires a (n/2)+1 majority
                                  #        - for it to work, there should be an odd number of managers
                                  #     - also used on add|remove nodes or swarm updates
                                  #     - can be customized (e.g. heartbeat), but only through REST API
                                  #  - swarm is down when more than (n-1)/2 managers are lost, i.e. fewer than (n/2)+1 remain (because election cannot happen)
                                  #     - "down" means there is no leader, i.e.:
                                  #        - cannot manage workers (assigning tasks, orchestration)
                                  #        - but workers keep running
                                  #Those ports must be opened:
                                  #  - 2377 (TCP): orchestration
                                  #  - 7946 (TCP+UDP): inter-nodes orchestration
                                  #  - 4789 (TCP+UDP): overlay network


docker node promote|demote NODE...#Sets NODE to manager or worker

docker node update NODE           #Sets nodes properties
--role worker|manager             #Same as docker node promote|demote
SYSINFO.Swarm."Is Manager"        
NODEINFO.Spec.Role                
NODEINFO.ManagerStatus.Leader     #
SYSINFO.Swarm.Managers            #NUM

docker node update ...
--availability STR                #Can be:
NODEINFO.Spec.Availability        #  - active: running, can get new tasks
                                  #  - pause: running, cannot get new tasks
                                  #  - drain: 
                                  #     - not running, cannot get new tasks
                                  #     - when switching, currently running tasks are transferred to other nodes


docker node rm NODE...            #
--force                           #Forces even if availability is "active"


docker node ls                    #List available nodes
-q
--quiet                           #Only prints IDs
NODEINFO.ID                       #NODE_ID
SYSINFO.Swarm.NodeID              #
SYSINFO.Swarm.Nodes               #NUM
NODEINFO.CreatedAt|UpdatedAt      #"DATE"
-f VAR=VAL                        #Possible VAR:
--filter VAR=VAL                  #  - id NODE_ID
                                  #  - name NODE_NAME
                                  #  - label VAR[=VAL]


docker node inspect NODE|self...  #Print debug information, as NODEINFO
-f GOTMP
--format GOTMP                    #
--pretty                          #Tab spaced, instead of JSON

                                  ┌───────────────────┐
                                  │   SWARM SERVICE   │
                                  └───────────────────┘

SERVICE ==>                       #Template of a task, i.e. an IMAGE + COMMAND + configuration (e.g. number of containers)

SERVICE                           #Either SERVICE_ID (can be truncated) or SERVICE_NAME


docker service create IMAGE       
 [COMMAND [ARGS...]]              #Creates a swarm service, and launches containers on available nodes to support it (if replicas > 0)
-e
--env
-l
--label
--log-driver
--log-opt
--network
-p
--publish
-u
--user
-w
--workdir                         #Like docker create

--name SERVICE_NAME               #

--replicas NUM                    #Number of tasks (i.e. containers) (def: 0), running same IMAGE in parallel
SERVICEINFO.Spec.Mode.Replicated. #Will be enforced, e.g. new tasks will be started if one task terminates or fails healthchecks
 Replicas                         #Will be balanced:
                                  #  - try to maintain about the same number of tasks per node
                                  #  - only do so when starting new tasks, i.e. does not rebalance tasks when starting new nodes
TASKINFO.Slot                     #Index NUM, when there are several replicas
--mode replicated|global          #If global (def: replicated), --replicas NUM is "1 per node"
SERVICEINFO.Spec.Mode.Global      #Good for services that must be run on every node, usually not main task, e.g. monitoring
--endpoint-mode STR               #Internal load balancing configuration between TASKS, used for replicas
SERVICEINFO.Spec.EndpointSpec     #Can be:
SERVICEINFO.Endpoint.Spec         #  - vip (def): virtual IP load balancing, i.e. IP-based
                                  #  - dnsrr: DNS round robin, i.e. DNS based

--update-delay TIME               #Delay between updates (docker service update)
SERVICEINFO.Spec.UpdateConfig.    #I.e. time to wait after updating one batch of tasks before updating the next one (rolling update)
 Delay                            #TIME is [NUMh][NUMm][NUMs]
--update-parallelism NUM          
SERVICEINFO.Spec.UpdateConfig.
 Parallelism                      #Apply updates to NUM tasks at the same time (def: 1, i.e. serial; 0: all at once)
--update-failure-action
 pause|continue                   
SERVICEINFO.Spec.UpdateConfig.    #When an update on a single node fails, pauses (def) or continues updates on all nodes
 FailureAction                    #Running docker service update again will restart a paused update

--constraint "VAR !=|== STR"      #Only launch tasks on nodes that satisfy constraint. VAR:
TASKTMPINFO.Placement             #  - node.id NODE_ID
                                  #  - node.hostname
                                  #  - node.role manager|worker
                                  #  - node|engine.labels.VAR VAL

docker service rm SERVICE...      #Removes SERVICE


docker service update SERVICE     #Sets SERVICE options.
                                  #Same options as docker service create but:
                                  #  - --env|label|publish|container-label|mount -> --env|label|publish|container-label|constraint|mount-add|rm
                                  #  - IMAGE [COMMAND [ARGS...]] -> --image IMAGE, --args ...
                                  #  - no --network
                                  #It will restart containers:
                                  #  - for rolling updates, see --update-delay|parallelism
SERVICEINFO.UpdateStatus.
 StartedAt|CompletedAt            #"DATE"


docker service scale
 SERVICE=NUM...                   #Changes number of containers, e.g. to scale up|down or to stop
                                  #Same as updating --replicas


docker service ls                 #Prints all SERVICE: ID, Name, Replicas, Image, Command
-q
--quiet                           #Only print SERVICE_ID
-f VAR=VAL                        #Possible VAR:
--filter VAR=VAL                  #  - id SERVICE_ID
                                  #  - name SERVICE_NAME
                                  #  - label VAR[=VAL]
SERVICEINFO.ID                    #SERVICE_ID
SERVICEINFO.Spec.Name             #SERVICE_NAME
SERVICEINFO.CreatedAt|UpdatedAt   #"DATE"


docker service inspect SERVICE... #Gives debug information about SERVICE, as SERVICEINFO
SERVICEINFO                       #
-f GOTMP
--format GOTMP                    #
--pretty                          #Tab spaced, instead of JSON

                                  ┌────────────────┐
                                  │   SWARM TASK   │
                                  └────────────────┘

TASK ==>                          #Task:
                                  #  - a CONTAINER execution
                                  #  - once assigned, cannot move to another node
                                  #  - automatically load balanced between nodes

TASK                              #Either TASK_ID or TASK_NAME


docker inspect TASK               #Show debugging info, as TASKINFO
TASKINFO.Spec                     #TASKTMPINFO
SERVICEINFO.Spec.TaskTemplate     
TASKTMPINFO.ContainerSpec         #CONTAINERSPEC
-f GOTMP
--format GOTMP                    #
--type image|container|task       #If there is ambiguity

docker node ps NODE|self          #Show tasks for that SERVICE|NODE: TASK_ID, TASK_NAME, IMAGE, current|desired state, NODE, errors
docker service ps SERVICE         #Also show previously launched TASK of same SERVICE and same NODE
TASKINFO.ServiceID                #SERVICE_ID
TASKINFO.ID                       #TASK_ID
TASKINFO.NodeID                   #NODE_ID
TASKINFO.Status.State             #Current state
TASKINFO.DesiredState             #Desired state: running|shutdown|accepted
TASKINFO.Status.Message           #Log message about current state
TASKINFO.Status.Timestamp         #"DATE" of last status check
TASKINFO.CreatedAt|UpdatedAt      #"DATE"
--no-resolve                      #TASK_NAME has two forms: a human-friendly one (shown by default) or a machine one (shown with this option)
-f VAR=VAL                        #Possible VAR:
--filter VAR=VAL                  #  - id TASK_ID
                                  #  - name TASK_NAME
                                  #  - label VAR[=VAL]
                                  #  - desired-state STR
docker swarm ...
--task-history-limit NUM          #How many previously launched TASK to show (def: 5)

TASKINFO.Status.ContainerStatus.
 ContainerID                      #CONTAINER_ID
TASKINFO.Status.ContainerStatus.
 PID                              #