Skip to content

Commit 02a3ec0

Browse files
authored
Merge pull request #2176 from Kaston-C/gc-healthz
nfd-gc: add healthz endpoint
2 parents 553b269 + d47c774 commit 02a3ec0

File tree

6 files changed

+74
-3
lines changed

6 files changed

+74
-3
lines changed

cmd/nfd-gc/main.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -84,7 +84,7 @@ func initFlags(flagset *flag.FlagSet) *nfdgarbagecollector.Args {
8484
flagset.StringVar(&args.Kubeconfig, "kubeconfig", "",
8585
"Kubeconfig to use")
8686
flagset.IntVar(&args.Port, "port", 8080,
87-
"Port on which to expose metrics.")
87+
"Port which metrics and healthz endpoints are served on")
8888

8989
klog.InitFlags(flagset)
9090

deployment/base/gc/gc.yaml

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,16 @@ spec:
1919
- name: nfd-gc
2020
image: gcr.io/k8s-staging-nfd/node-feature-discovery:master
2121
imagePullPolicy: Always
22+
livenessProbe:
23+
httpGet:
24+
path: /healthz
25+
port: http
26+
initialDelaySeconds: 10
27+
readinessProbe:
28+
httpGet:
29+
path: /healthz
30+
port: http
31+
initialDelaySeconds: 5
2232
resources:
2333
limits:
2434
cpu: 20m

deployment/helm/node-feature-discovery/templates/nfd-gc.yaml

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,41 @@ spec:
4444
- name: gc
4545
image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default .Chart.AppVersion }}"
4646
imagePullPolicy: "{{ .Values.image.pullPolicy }}"
47+
livenessProbe:
48+
httpGet:
49+
path: /healthz
50+
port: http
51+
{{- with .Values.gc.livenessProbe.initialDelaySeconds }}
52+
initialDelaySeconds: {{ . }}
53+
{{- end }}
54+
{{- with .Values.gc.livenessProbe.failureThreshold }}
55+
failureThreshold: {{ . }}
56+
{{- end }}
57+
{{- with .Values.gc.livenessProbe.periodSeconds }}
58+
periodSeconds: {{ . }}
59+
{{- end }}
60+
{{- with .Values.gc.livenessProbe.timeoutSeconds }}
61+
timeoutSeconds: {{ . }}
62+
{{- end }}
63+
readinessProbe:
64+
httpGet:
65+
path: /healthz
66+
port: http
67+
{{- with .Values.gc.readinessProbe.initialDelaySeconds }}
68+
initialDelaySeconds: {{ . }}
69+
{{- end }}
70+
{{- with .Values.gc.readinessProbe.failureThreshold }}
71+
failureThreshold: {{ . }}
72+
{{- end }}
73+
{{- with .Values.gc.readinessProbe.periodSeconds }}
74+
periodSeconds: {{ . }}
75+
{{- end }}
76+
{{- with .Values.gc.readinessProbe.timeoutSeconds }}
77+
timeoutSeconds: {{ . }}
78+
{{- end }}
79+
{{- with .Values.gc.readinessProbe.successThreshold }}
80+
successThreshold: {{ . }}
81+
{{- end }}
4782
env:
4883
- name: NODE_NAME
4984
valueFrom:

deployment/helm/node-feature-discovery/values.yaml

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -565,6 +565,18 @@ gc:
565565

566566
podSecurityContext: {}
567567

568+
livenessProbe:
569+
initialDelaySeconds: 10
570+
# failureThreshold: 3
571+
# periodSeconds: 10
572+
# timeoutSeconds: 1
573+
readinessProbe:
574+
initialDelaySeconds: 5
575+
# failureThreshold: 3
576+
# periodSeconds: 10
577+
# timeoutSeconds: 1
578+
# successThreshold: 1
579+
568580
resources:
569581
limits:
570582
memory: 1Gi

docs/deployment/helm.md

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -330,7 +330,7 @@ API's you need to install the prometheus operator in your cluster.
330330
| `gc.podDisruptionBudget.minAvailable` | integer | 1 | Specifies minAvailable for the podDisruptionBudget configuration for nfd-gc |
331331
| `gc.podDisruptionBudget.maxUnavailable` | integer | NULL | Specifies maxUnavailable for the podDisruptionBudget configuration for nfd-gc |
332332
| `gc.podDisruptionBudget.unhealthyPodEvictionPolicy` | string | AlwaysAllow | Specifies unhealthyPodEvictionPolicy for the podDisruptionBudget configuration for nfd-gc |
333-
| `gc.port` | integer | 8080 | Port on which to serve Prometheus metrics. |
333+
| `gc.port` | integer | 8080 | Port on which to serve HTTP for the metrics and health endpoints. |
334334
| `gc.nodeSelector` | dict | {} | Garbage collector pod [node selector][nodeselector] |
335335
| `gc.tolerations` | dict | {} | Garbage collector pod [node tolerations][toleration] |
336336
| `gc.annotations` | dict | {} | Garbage collector pod [annotations][annotations] |
@@ -339,6 +339,15 @@ API's you need to install the prometheus operator in your cluster.
339339
| `gc.extraArgs` | array | [] | Additional [command line arguments](../reference/gc-commandline-reference.md) to pass to nfd-gc |
340340
| `gc.extraEnvs` | array | [] | Additional environment variables to pass to nfd-gc |
341341
| `gc.revisionHistoryLimit` | integer | | Specify how many old ReplicaSets for this Deployment you want to retain. [revisionHistoryLimit][revisionhistorylimit] |
342+
| `gc.livenessProbe.initialDelaySeconds` | integer | 10 | Specifies the number of seconds after the container has started before liveness probes are initiated. |
343+
| `gc.livenessProbe.failureThreshold` | integer | 3 (by Kubernetes) | Specifies the number of consecutive failures of liveness probes before the container is considered unhealthy and restarted. |
344+
| `gc.livenessProbe.periodSeconds` | integer | 10 (by Kubernetes) | Specifies how often (in seconds) to perform the liveness probe. |
345+
| `gc.livenessProbe.timeoutSeconds` | integer | 1 (by Kubernetes) | Specifies the number of seconds after which the probe times out. |
346+
| `gc.readinessProbe.initialDelaySeconds` | integer | 5 | Specifies the number of seconds after the container has started before readiness probes are initiated. |
347+
| `gc.readinessProbe.failureThreshold` | integer | 3 (by Kubernetes) | Specifies the number of consecutive failures of readiness probes before considering the pod as not ready. |
348+
| `gc.readinessProbe.periodSeconds` | integer | 10 (by Kubernetes) | Specifies how often (in seconds) to perform the readiness probe. |
349+
| `gc.readinessProbe.timeoutSeconds` | integer | 1 (by Kubernetes) | Specifies the number of seconds after which the probe times out. |
350+
| `gc.readinessProbe.successThreshold` | integer | 1 (by Kubernetes) | Specifies the number of consecutive successes of readiness probes before considering the pod as ready. |
342351
| `gc.dnsPolicy` | string | ClusterFirstWithHostNet | Garbage collector pod [dnsPolicy][dnspolicy] |
343352

344353
<!-- Links -->

pkg/nfd-gc/nfd-gc.go

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,10 @@ func New(args *Args) (NfdGarbageCollector, error) {
8282
}, nil
8383
}
8484

85+
func (n *nfdGarbageCollector) Healthz(writer http.ResponseWriter, _ *http.Request) {
86+
writer.WriteHeader(http.StatusOK)
87+
}
88+
8589
func (n *nfdGarbageCollector) deleteNodeFeature(namespace, name string) {
8690
kind := "NodeFeature"
8791
if err := n.client.Resource(gvrNF).Namespace(namespace).Delete(context.TODO(), name, metav1.DeleteOptions{}); err != nil {
@@ -252,7 +256,8 @@ func (n *nfdGarbageCollector) Run() error {
252256
httpMux.Handle("/metrics", promhttp.HandlerFor(promRegistry, promhttp.HandlerOpts{}))
253257
registerVersion(version.Get())
254258

255-
// TODO: health probe endpoint could be added here
259+
// Register health endpoint (at this point we're "ready and live")
260+
httpMux.HandleFunc("/healthz", n.Healthz)
256261

257262
// Start HTTP server
258263
httpServer := http.Server{Addr: fmt.Sprintf(":%d", n.args.Port), Handler: httpMux}

0 commit comments

Comments
 (0)