完整的记录 kubernetes 监控从部署到配置。

prometheus operator/statefulset

https://github.com/kubernetes/kubernetes/tree/master/cluster/addons/prometheus

# change namespace
sed -i s/kube-system/monitoring/g *

# dynamic provision storage class

kubectl create -f prometheus-configmap.yaml 
kubectl create -f prometheus-rbac.yaml 
kubectl create -f prometheus-statefulset.yaml 
kubectl create -f prometheus-service.yaml 

# kube-metrics-server
kubectl create -f kube-state-metrics-deployment.yaml
kubectl create -f kube-state-metrics-rbac.yaml
kubectl create -f kube-state-metrics-service.yaml
# node-exporter
kubectl create -f node-exporter-ds.yml
kubectl create -f node-exporter-service.yaml

/etc/systemd/system/kubelet.service.d/10-kubeadm.conf

[Service]
Environment="KUBELET_EXTRA_ARGS=--pod-infra-container-image=harbor.guahao-inc.com/kubernetes/pause-amd64:3.1 --hostname-override=172.27.32.165"
Environment="KUBELET_KUBECONFIG_ARGS=--bootstrap-kubeconfig=/etc/kubernetes/bootstrap-kubelet.conf --kubeconfig=/etc/kubernetes/kubelet.conf"
Environment="KUBELET_SYSTEM_PODS_ARGS=--pod-manifest-path=/etc/kubernetes/manifests --allow-privileged=true"
Environment="KUBELET_NETWORK_ARGS=--network-plugin=cni --cni-conf-dir=/etc/cni/net.d --cni-bin-dir=/opt/cni/bin"
Environment="KUBELET_DNS_ARGS=--cluster-dns=10.254.0.10 --cluster-domain=cluster.local"
Environment="KUBELET_AUTHZ_ARGS=--authorization-mode=Webhook --client-ca-file=/etc/kubernetes/pki/ca.crt"
Environment="KUBELET_CADVISOR_ARGS=--cadvisor-port=0"
Environment="KUBELET_CGROUP_ARGS=--cgroup-driver=systemd"
Environment="KUBELET_CERTIFICATE_ARGS=--rotate-certificates=true --cert-dir=/var/lib/kubelet/pki"
ExecStart=
ExecStart=/usr/bin/kubelet $KUBELET_KUBECONFIG_ARGS $KUBELET_SYSTEM_PODS_ARGS $KUBELET_NETWORK_ARGS $KUBELET_DNS_ARGS $KUBELET_AUTHZ_ARGS $KUBELET_CADVISOR_ARGS $KUBELET_CGROUP_ARGS $KUBELET_CERTIFICATE_ARGS $KUBELET_EXTRA_ARGS
# https://github.com/coreos/prometheus-operator/blob/master/Documentation/troubleshooting.md     
# all node
KUBEADM_SYSTEMD_CONF=/etc/systemd/system/kubelet.service.d/10-kubeadm.conf
sed -e "/cadvisor-port=0/d" -i "$KUBEADM_SYSTEMD_CONF"
if ! grep -q "authentication-token-webhook=true" "$KUBEADM_SYSTEMD_CONF"; then
  sed -e "s/--authorization-mode=Webhook/--authentication-token-webhook=true --authorization-mode=Webhook/" -i "$KUBEADM_SYSTEMD_CONF"
fi
systemctl daemon-reload
systemctl restart kubelet

# master
sed -e "s/- --address=127.0.0.1/- --address=0.0.0.0/" -i /etc/kubernetes/manifests/kube-controller-manager.yaml
sed -e "s/- --address=127.0.0.1/- --address=0.0.0.0/" -i /etc/kubernetes/manifests/kube-scheduler.yaml
[Service]
Environment="KUBELET_EXTRA_ARGS=--pod-infra-container-image=harbor.guahao-inc.com/kubernetes/pause-amd64:3.1 --hostname-override=172.27.32.165"
Environment="KUBELET_KUBECONFIG_ARGS=--bootstrap-kubeconfig=/etc/kubernetes/bootstrap-kubelet.conf --kubeconfig=/etc/kubernetes/kubelet.conf"
Environment="KUBELET_SYSTEM_PODS_ARGS=--pod-manifest-path=/etc/kubernetes/manifests --allow-privileged=true"
Environment="KUBELET_NETWORK_ARGS=--network-plugin=cni --cni-conf-dir=/etc/cni/net.d --cni-bin-dir=/opt/cni/bin"
Environment="KUBELET_DNS_ARGS=--cluster-dns=10.254.0.10 --cluster-domain=cluster.local"
Environment="KUBELET_AUTHZ_ARGS=--authentication-token-webhook=true --authorization-mode=Webhook --client-ca-file=/etc/kubernetes/pki/ca.crt"
Environment="KUBELET_CGROUP_ARGS=--cgroup-driver=systemd"
Environment="KUBELET_CERTIFICATE_ARGS=--rotate-certificates=true --cert-dir=/var/lib/kubelet/pki"
ExecStart=
ExecStart=/usr/bin/kubelet $KUBELET_KUBECONFIG_ARGS $KUBELET_SYSTEM_PODS_ARGS $KUBELET_NETWORK_ARGS $KUBELET_DNS_ARGS $KUBELET_AUTHZ_ARGS $KUBELET_CADVISOR_ARGS $KUBELET_CGROUP_ARGS $KUBELET_CERTIFICATE_ARGS $KUBELET_EXTRA_ARGS

grafana

docker run -d -p 3000:3000 --name grafana grafana:grafana

# install kubernetes plugin
wget https://grafana.com/api/plugins/grafana-kubernetes-app/versions/1.0.1/download
unzip grafana-kubernetes-app-31da38a.zip
docker cp grafana-kubernetes-app-31da38a/ grafana:/var/lib/grafana/plugins/
docker restart grafana

配置好 grafana 的 kubernetes 地址和证书相关配置,就能够看到 kubernetes 集群相关的监控图表信息了。

Q&A

https://github.com/prometheus/prometheus/wiki/FAQ#error-file-already-closed
prometheus /targets 页面所有监控都是 down 状态,报错:
WAL log samples: log series: write /data/wal/000003: file already closed
log series: no space left on device

原因为磁盘满:

/data/wal $ ls
000001  000005  000009  000013  000017  000021  000025  000029  000033  000037  000041  000045  000049  000053  000057  000061  000065  000069  000073  000077  000081  000085  000089  000093  000097
000002  000006  000010  000014  000018  000022  000026  000030  000034  000038  000042  000046  000050  000054  000058  000062  000066  000070  000074  000078  000082  000086  000090  000094  000098
000003  000007  000011  000015  000019  000023  000027  000031  000035  000039  000043  000047  000051  000055  000059  000063  000067  000071  000075  000079  000083  000087  000091  000095  000099
000004  000008  000012  000016  000020  000024  000028  000032  000036  000040  000044  000048  000052  000056  000060  000064  000068  000072  000076  000080  000084  000088  000092  000096  000100
/data/wal $ pwd

echo > *清理即可。
prometheus 需要设置合理的 retention 时间保证磁盘空间不会被占满。