Question 25 - Etcd Backup and Restore
Question 25 | Etcd Snapshot Save and Restore
Use context: kubectl config use-context k8s-c3-CCC
Make a backup of etcd running on cluster3-controlplane1 and save it on the controlplane node at /tmp/etcd-backup.db.
Then create any kind of Pod in the cluster.
Finally restore the backup, confirm the cluster is still working and that the created Pod is no longer with us.
To backup etcd we'll use the etcdctl command inside the control-plane. We need to connect etcdctl by passing the certificate, endpoint and key.
kubectl config use-context k8s-c3-CCC
ssh cluster3-controlplane1
# To get etcd information, we can check the etcd manifest or the api-server manifest that communicates with etcd.
root@cluster3-controlplane1:~# cat /etc/kubernetes/manifests/etcd.yaml | grep -e "- --"
- --advertise-client-urls=https://192.168.100.31:2379
- --cert-file=/etc/kubernetes/pki/etcd/server.crt # use
- --client-cert-auth=true
- --data-dir=/var/lib/etcd
- --initial-advertise-peer-urls=https://192.168.100.31:2380
- --initial-cluster=cluster3-controlplane1=https://192.168.100.31:2380
- --key-file=/etc/kubernetes/pki/etcd/server.key # use
- --listen-client-urls=https://127.0.0.1:2379,https://192.168.100.31:2379 # use
- --listen-metrics-urls=http://127.0.0.1:2381
- --listen-peer-urls=https://192.168.100.31:2380
- --name=cluster3-controlplane1
- --peer-cert-file=/etc/kubernetes/pki/etcd/peer.crt
- --peer-client-cert-auth=true
- --peer-key-file=/etc/kubernetes/pki/etcd/peer.key
- --peer-trusted-ca-file=/etc/kubernetes/pki/etcd/ca.crt # use
- --snapshot-count=10000
- --trusted-ca-file=/etc/kubernetes/pki/etcd/ca.crt
root@cluster3-controlplane1:~# cat /etc/kubernetes/manifests/kube-apiserver.yaml | grep etcd
- --etcd-cafile=/etc/kubernetes/pki/etcd/ca.crt
- --etcd-certfile=/etc/kubernetes/pki/apiserver-etcd-client.crt
- --etcd-keyfile=/etc/kubernetes/pki/apiserver-etcd-client.key
- --etcd-servers=https://127.0.0.1:2379
# Now we run the command to save the snapshot in etcd. etcdctl should already be installed on the machine as we can't access outside to go to github and install etcd. If it's not there they need to provide installation instructions.
# --endpoints is usually optional since the default points at the local etcd, but because we pass TLS certificates the endpoint should explicitly use the https:// scheme. A different endpoint would only be needed if etcd were external, which probably won't happen in the exam.
root@cluster3-controlplane1:~# ETCDCTL_API=3 etcdctl snapshot save /tmp/etcd-backup.db --endpoints https://127.0.0.1:2379 --cacert /etc/kubernetes/pki/etcd/ca.crt --cert /etc/kubernetes/pki/etcd/server.crt --key /etc/kubernetes/pki/etcd/server.key
Snapshot saved at /tmp/etcd-backup.db
# This step is not necessary but in case something goes wrong it's good to make a larger backup to restore etcd if needed
tar -zcvf etcd.tar.gz /etc/kubernetes/pki/etcd
After backing up etcd we can create any pod and then do the restore to see if it will disappear.
kubectl run pod-test --image=nginx
kubectl get pods -l run=pod-test
NAME       READY   STATUS    RESTARTS   AGE
pod-test   1/1     Running   0          60s
# Now restore the backup: restore the snapshot into a new data directory and then point etcd at it,
# bringing the cluster back to the state it was in before the Pod was created.
# If we run the restore without specifying --data-dir, it fails because the default data directory already exists:
root@cluster3-controlplane1:~# ETCDCTL_API=3 etcdctl snapshot restore /tmp/etcd-backup.db \
--endpoints 127.0.0.1:2379 \
--cacert /etc/kubernetes/pki/etcd/ca.crt \
--cert /etc/kubernetes/pki/etcd/server.crt \
--key /etc/kubernetes/pki/etcd/server.key
#Error: data-dir "default.etcd" exists
# Let's do it pointing to another location, but we need to change the mount point in /etc/kubernetes/manifests/etcd.yaml
root@cluster3-controlplane1:~# ETCDCTL_API=3 etcdctl snapshot restore /tmp/etcd-backup.db --data-dir /var/lib/etcd-backup --endpoints 127.0.0.1:2379 --cacert /etc/kubernetes/pki/etcd/ca.crt --cert /etc/kubernetes/pki/etcd/server.crt --key /etc/kubernetes/pki/etcd/server.key
root@cluster3-controlplane1:~# ls -lha /var/lib/etcd-backup/
total 12K
drwx------ 3 root root 4.0K Apr 13 11:49 .
drwxr-xr-x 17 root root 4.0K Apr 13 11:49 ..
drwx------ 4 root root 4.0K Apr 13 11:49 member
# Now let's edit our etcd.yaml
vim /etc/kubernetes/manifests/etcd.yaml
Now we're going to edit etcd.yaml.
# /etc/kubernetes/manifests/etcd.yaml
apiVersion: v1
kind: Pod
metadata:
annotations:
kubeadm.kubernetes.io/etcd.advertise-client-urls: https://172.18.0.8:2379
creationTimestamp: null
labels:
component: etcd
tier: control-plane
name: etcd
namespace: kube-system
spec:
containers:
- command:
- etcd
- --advertise-client-urls=https://172.18.0.8:2379
- --cert-file=/etc/kubernetes/pki/etcd/server.crt
- --client-cert-auth=true
- --data-dir=/var/lib/etcd # THIS IS COMING FROM THE MOUNT POINT, NO NEED TO CHANGE
- --experimental-initial-corrupt-check=true
- --experimental-watch-progress-notify-interval=5s
- --initial-advertise-peer-urls=https://172.18.0.8:2380
- --initial-cluster=kind-cluster-ia-control-plane=https://172.18.0.8:2380
- --key-file=/etc/kubernetes/pki/etcd/server.key
- --listen-client-urls=https://127.0.0.1:2379,https://172.18.0.8:2379
- --listen-metrics-urls=http://127.0.0.1:2381
- --listen-peer-urls=https://172.18.0.8:2380
- --name=kind-cluster-ia-control-plane
- --peer-cert-file=/etc/kubernetes/pki/etcd/peer.crt
- --peer-client-cert-auth=true
- --peer-key-file=/etc/kubernetes/pki/etcd/peer.key
- --peer-trusted-ca-file=/etc/kubernetes/pki/etcd/ca.crt
- --snapshot-count=10000
- --trusted-ca-file=/etc/kubernetes/pki/etcd/ca.crt
image: registry.k8s.io/etcd:3.5.10-0
imagePullPolicy: IfNotPresent
livenessProbe:
failureThreshold: 8
httpGet:
host: 127.0.0.1
path: /health?exclude=NOSPACE&serializable=true
port: 2381
scheme: HTTP
initialDelaySeconds: 10
periodSeconds: 10
timeoutSeconds: 15
name: etcd
resources:
requests:
cpu: 100m
memory: 100Mi
startupProbe:
failureThreshold: 24
httpGet:
host: 127.0.0.1
path: /health?serializable=false
port: 2381
scheme: HTTP
initialDelaySeconds: 10
periodSeconds: 10
timeoutSeconds: 15
volumeMounts:
- mountPath: /var/lib/etcd
name: etcd-data
- mountPath: /etc/kubernetes/pki/etcd
name: etcd-certs
hostNetwork: true
priority: 2000001000
priorityClassName: system-node-critical
securityContext:
seccompProfile:
type: RuntimeDefault
volumes:
- hostPath:
path: /etc/kubernetes/pki/etcd
type: DirectoryOrCreate
name: etcd-certs
- hostPath:
#path: /var/lib/etcd # old
path: /var/lib/etcd-backup # CHANGE THE MOUNT POINT TO THE NEW ONE
type: DirectoryOrCreate
name: etcd-data
status: {}
After saving the manifest, the kubelet should detect the change and restart etcd automatically. If it doesn't, we can force the static pods to restart by moving their manifests out of the manifests directory and back in:
cd /etc/kubernetes/manifests
mv * ..
# check if there are no more containers running. If there are, you can kill them.
crictl ps
crictl rm <container-id> --force # remove the container if it is still running
mv ../*.yaml .
# to monitor the pods coming back.
watch crictl ps
To verify, run kubectl get pods and confirm that the pod-test Pod created after the backup no longer exists.