Question 25 - Etcd Backup and Restore
Question 25 | Etcd Snapshot Save and Restore
Use context: kubectl config use-context k8s-c3-CCC
Make a backup of etcd running on cluster3-controlplane1 and save it on the controlplane node at /tmp/etcd-backup.db.
Then create any kind of Pod in the cluster.
Finally restore the backup, confirm the cluster is still working and that the created Pod is no longer with us.
To backup etcd we'll use the etcdctl command inside the control-plane. We need to connect etcdctl by passing the certificate, endpoint and key.
kubectl config use-context k8s-c3-CCC
ssh cluster3-controlplane1
# To get etcd information, we can check the etcd manifest or the api-server manifest that communicates with etcd.
root@cluster3-controlplane1:~# cat /etc/kubernetes/manifests/etcd.yaml | grep -e "- --"
- --advertise-client-urls=https://192.168.100.31:2379
- --cert-file=/etc/kubernetes/pki/etcd/server.crt # use
- --client-cert-auth=true
- --data-dir=/var/lib/etcd
- --initial-advertise-peer-urls=https://192.168.100.31:2380
- --initial-cluster=cluster3-controlplane1=https://192.168.100.31:2380
- --key-file=/etc/kubernetes/pki/etcd/server.key # use
- --listen-client-urls=https://127.0.0.1:2379,https://192.168.100.31:2379 # use
- --listen-metrics-urls=http://127.0.0.1:2381
- --listen-peer-urls=https://192.168.100.31:2380
- --name=cluster3-controlplane1
- --peer-cert-file=/etc/kubernetes/pki/etcd/peer.crt
- --peer-client-cert-auth=true
- --peer-key-file=/etc/kubernetes/pki/etcd/peer.key
- --peer-trusted-ca-file=/etc/kubernetes/pki/etcd/ca.crt # use
- --snapshot-count=10000
- --trusted-ca-file=/etc/kubernetes/pki/etcd/ca.crt
root@cluster3-controlplane1:~# cat /etc/kubernetes/manifests/kube-apiserver.yaml | grep etcd
- --etcd-cafile=/etc/kubernetes/pki/etcd/ca.crt
- --etcd-certfile=/etc/kubernetes/pki/apiserver-etcd-client.crt
- --etcd-keyfile=/etc/kubernetes/pki/apiserver-etcd-client.key
- --etcd-servers=https://127.0.0.1:2379
# Now we run the command to save the snapshot in etcd. etcdctl should already be installed on the machine as we can't access outside to go to github and install etcd. If it's not there they need to provide installation instructions.
# --endpoints is usually optional since the default points at the local etcd, but because we pass TLS certificates the endpoint should explicitly use the https:// scheme. A different endpoint would only be needed if etcd were external, which probably won't happen in the exam.
root@cluster3-controlplane1:~# ETCDCTL_API=3 etcdctl snapshot save /tmp/etcd-backup.db --endpoints https://127.0.0.1:2379 --cacert /etc/kubernetes/pki/etcd/ca.crt --cert /etc/kubernetes/pki/etcd/server.crt --key /etc/kubernetes/pki/etcd/server.key
Snapshot saved at /tmp/etcd-backup.db
# This step is not necessary but in case something goes wrong it's good to make a larger backup to restore etcd if needed
tar -zcvf etcd.tar.gz /etc/kubernetes/pki/etcd
After backing up etcd we can create any pod and then do the restore to see if it will disappear.
kubectl run pod-test --image=nginx
kubectl get pods -l run=pod-test
NAME       READY   STATUS    RESTARTS   AGE
pod-test   1/1     Running   0          60s
# Now restore the backup: restore the snapshot into a new data directory and then point etcd at it,
# bringing the cluster back to the state it was in before the Pod was created.
# If we run the restore without specifying --data-dir, it fails because the default data directory already exists:
root@cluster3-controlplane1:~# ETCDCTL_API=3 etcdctl snapshot restore /tmp/etcd-backup.db \
--endpoints 127.0.0.1:2379 \
--cacert /etc/kubernetes/pki/etcd/ca.crt \
--cert /etc/kubernetes/pki/etcd/server.crt \
--key /etc/kubernetes/pki/etcd/server.key
#Error: data-dir "default.etcd" exists
# Let's do it pointing to another location, but we need to change the mount point in /etc/kubernetes/manifests/etcd.yaml
root@cluster3-controlplane1:~# ETCDCTL_API=3 etcdctl snapshot restore /tmp/etcd-backup.db --data-dir /var/lib/etcd-backup --endpoints 127.0.0.1:2379 --cacert /etc/kubernetes/pki/etcd/ca.crt --cert /etc/kubernetes/pki/etcd/server.crt --key /etc/kubernetes/pki/etcd/server.key
root@cluster3-controlplane1:~# ls -lha /var/lib/etcd-backup/
total 12K
drwx------ 3 root root 4.0K Apr 13 11:49 .
drwxr-xr-x 17 root root 4.0K Apr 13 11:49 ..
drwx------ 4 root root 4.0K Apr 13 11:49 member
# Now let's edit our etcd.yaml
vim /etc/kubernetes/manifests/etcd.yaml
Now we're going to edit etcd.yaml.
# /etc/kubernetes/manifests/etcd.yaml
apiVersion: v1
kind: Pod
metadata:
annotations:
kubeadm.kubernetes.io/etcd.advertise-client-urls: https://172.18.0.8:2379
creationTimestamp: null
labels:
component: etcd
tier: control-plane
name: etcd
namespace: kube-system
spec:
containers:
- command:
- etcd
- --advertise-client-urls=https://172.18.0.8:2379
- --cert-file=/etc/kubernetes/pki/etcd/server.crt
- --client-cert-auth=true
- --data-dir=/var/lib/etcd # THIS IS COMING FROM THE MOUNT POINT, NO NEED TO CHANGE
- --experimental-initial-corrupt-check=true
- --experimental-watch-progress-notify-interval=5s
- --initial-advertise-peer-urls=https://172.18.0.8:2380
- --initial-cluster=kind-cluster-ia-control-plane=https://172.18.0.8:2380
- --key-file=/etc/kubernetes/pki/etcd/server.key
- --listen-client-urls=https://127.0.0.1:2379,https://172.18.0.8:2379
- --listen-metrics-urls=http://127.0.0.1:2381
- --listen-peer-urls=https://172.18.0.8:2380
- --name=kind-cluster-ia-control-plane
- --peer-cert-file=/etc/kubernetes/pki/etcd/peer.crt
- --peer-client-cert-auth=true
- --peer-key-file=/etc/kubernetes/pki/etcd/peer.key
- --peer-trusted-ca-file=/etc/kubernetes/pki/etcd/ca.crt
- --snapshot-count=10000
- --trusted-ca-file=/etc/kubernetes/pki/etcd/ca.crt
image: registry.k8s.io/etcd:3.5.10-0
imagePullPolicy: IfNotPresent
livenessProbe:
failureThreshold: 8
httpGet:
host: 127.0.0.1
path: /health?exclude=NOSPACE&serializable=true
port: 2381
scheme: HTTP
initialDelaySeconds: 10
periodSeconds: 10
timeoutSeconds: 15
name: etcd
resources:
requests:
cpu: 100m
memory: 100Mi
startupProbe:
failureThreshold: 24
httpGet:
host: 127.0.0.1
path: /health?serializable=false
port: 2381
scheme: HTTP
initialDelaySeconds: 10
periodSeconds: 10
timeoutSeconds: 15
volumeMounts:
- mountPath: /var/lib/etcd
name: etcd-data
- mountPath: /etc/kubernetes/pki/etcd
name: etcd-certs
hostNetwork: true
priority: 2000001000
priorityClassName: system-node-critical
securityContext:
seccompProfile:
type: RuntimeDefault
volumes:
- hostPath:
path: /etc/kubernetes/pki/etcd
type: DirectoryOrCreate
name: etcd-certs
- hostPath:
#path: /var/lib/etcd # old
path: /var/lib/etcd-backup # CHANGE THE MOUNT POINT TO THE NEW ONE
type: DirectoryOrCreate
name: etcd-data
status: {}
After saving the manifest, the kubelet should detect the change and restart etcd automatically. If it doesn't, we can force the static pods to restart by moving their manifests out of the manifests directory and back in:
cd /etc/kubernetes/manifests
mv * ..
# check if there are no more containers running. If there are, you can kill them.
crictl ps
crictl rm <container-id> --force # remove the container if it is still running
mv ../*.yaml .
# to monitor the pods coming back.
watch crictl ps
To verify, run kubectl get pods and confirm that the pod-test Pod created after the backup no longer exists.