kubernetes-sigs
diff --git a/‎cmd/nfd-gc/main.go
Lines changed: 1 addition & 1 deletion b/‎cmd/nfd-gc/main.go
Lines changed: 1 addition & 1 deletion
diff --git a/‎deployment/base/gc/gc.yaml
Lines changed: 10 additions & 0 deletions b/‎deployment/base/gc/gc.yaml
Lines changed: 10 additions & 0 deletions
diff --git a/‎deployment/helm/node-feature-discovery/templates/nfd-gc.yaml
Lines changed: 35 additions & 0 deletions b/‎deployment/helm/node-feature-discovery/templates/nfd-gc.yaml
Lines changed: 35 additions & 0 deletions
diff --git a/‎deployment/helm/node-feature-discovery/values.yaml
Lines changed: 12 additions & 0 deletions b/‎deployment/helm/node-feature-discovery/values.yaml
Lines changed: 12 additions & 0 deletions
diff --git a/‎docs/deployment/helm.md
Lines changed: 10 additions & 1 deletion b/‎docs/deployment/helm.md
Lines changed: 10 additions & 1 deletion
diff --git a/‎pkg/nfd-gc/nfd-gc.go
Lines changed: 6 additions & 1 deletion b/‎pkg/nfd-gc/nfd-gc.go
Lines changed: 6 additions & 1 deletion
@@ -84,7 +84,7 @@ func initFlags(flagset *flag.FlagSet) *nfdgarbagecollector.Args {
 	flagset.StringVar(&args.Kubeconfig, "kubeconfig", "",
 		"Kubeconfig to use")
 	flagset.IntVar(&args.Port, "port", 8080,
-		"Port on which to expose metrics.")
+		"Port which metrics and healthz endpoints are served on")
 
 	klog.InitFlags(flagset)
 
 
@@ -19,6 +19,16 @@ spec:
         - name: nfd-gc
           image: gcr.io/k8s-staging-nfd/node-feature-discovery:master
           imagePullPolicy: Always
+          livenessProbe:
+            httpGet:
+              path: /healthz
+              port: http
+            initialDelaySeconds: 10
+          readinessProbe:
+            httpGet:
+              path: /healthz
+              port: http
+            initialDelaySeconds: 5
           resources:
             limits:
               cpu: 20m
 
@@ -44,6 +44,41 @@ spec:
       - name: gc
         image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default .Chart.AppVersion }}"
         imagePullPolicy: "{{ .Values.image.pullPolicy }}"
+        livenessProbe:
+          httpGet:
+            path: /healthz
+            port: http
+        {{- with .Values.gc.livenessProbe.initialDelaySeconds }}
+          initialDelaySeconds: {{ . }}
+        {{- end }}
+        {{- with .Values.gc.livenessProbe.failureThreshold }}
+          failureThreshold: {{ . }}
+        {{- end }}
+        {{- with .Values.gc.livenessProbe.periodSeconds }}
+          periodSeconds: {{ . }}
+        {{- end }}
+        {{- with .Values.gc.livenessProbe.timeoutSeconds }}
+          timeoutSeconds: {{ . }}
+        {{- end }}
+        readinessProbe:
+          httpGet:
+            path: /healthz
+            port: http
+        {{- with .Values.gc.readinessProbe.initialDelaySeconds }}
+          initialDelaySeconds: {{ . }}
+        {{- end }}
+        {{- with .Values.gc.readinessProbe.failureThreshold }}
+          failureThreshold: {{ . }}
+        {{- end }}
+        {{- with .Values.gc.readinessProbe.periodSeconds }}
+          periodSeconds: {{ . }}
+        {{- end }}
+        {{- with .Values.gc.readinessProbe.timeoutSeconds }}
+          timeoutSeconds: {{ . }}
+        {{- end }}
+        {{- with .Values.gc.readinessProbe.successThreshold }}
+          successThreshold: {{ . }}
+        {{- end }}
         env:
         - name: NODE_NAME
           valueFrom:
 
@@ -565,6 +565,18 @@ gc:
 
   podSecurityContext: {}
 
+  livenessProbe:
+    initialDelaySeconds: 10
+    # failureThreshold: 3
+    # periodSeconds: 10
+    # timeoutSeconds: 1
+  readinessProbe:
+    initialDelaySeconds: 5
+    # failureThreshold: 3
+    # periodSeconds: 10
+    # timeoutSeconds: 1
+    # successThreshold: 1
+
   resources:
     limits:
       memory: 1Gi
 
@@ -330,7 +330,7 @@ API's you need to install the prometheus operator in your cluster.
 | `gc.podDisruptionBudget.minAvailable`               | integer | 1                         | Specifies minAvailable for the podDisruptionBudget configuration for nfd-gc                                                                                    |
 | `gc.podDisruptionBudget.maxUnavailable`             | integer | NULL                      | Specifies maxUnavailable for the podDisruptionBudget configuration for nfd-gc                                                                                  |
 | `gc.podDisruptionBudget.unhealthyPodEvictionPolicy` | string  | AlwaysAllow               | Specifies unhealthyPodEvictionPolicy for the podDisruptionBudget configuration for nfd-gc                                                                      |
-| `gc.port`                                           | integer | 8080                      | Port on which to serve Prometheus metrics.                                                                                                                     |
+| `gc.port`                                           | integer | 8080                      | Port on which to serve HTTP for metrics and health endpoints.                                                                                                  |
 | `gc.nodeSelector`                                   | dict    | {}                        | Garbage collector pod [node selector][nodeselector]                                                                                                            |
 | `gc.tolerations`                                    | dict    | {}                        | Garbage collector pod [node tolerations][toleration]                                                                                                           |
 | `gc.annotations`                                    | dict    | {}                        | Garbage collector pod [annotations][annotations]                                                                                                               |
@@ -339,6 +339,15 @@ API's you need to install the prometheus operator in your cluster.
 | `gc.extraArgs`                                      | array   | []                        | Additional [command line arguments](../reference/gc-commandline-reference.md) to pass to nfd-gc                                                                |
 | `gc.extraEnvs`                                      | array   | []                        | Additional environment variables to pass to nfd-gc                                                                                                             |
 | `gc.revisionHistoryLimit`                           | integer |                           | Specify how many old ReplicaSets for this Deployment you want to retain. [revisionHistoryLimit][revisionhistorylimit]                                          |
+| `gc.livenessProbe.initialDelaySeconds`              | integer | 10                        | Specifies the number of seconds after the container has started before liveness probes are initiated.                                                          |
+| `gc.livenessProbe.failureThreshold`                 | integer | 3 (by Kubernetes)         | Specifies the number of consecutive liveness probe failures after which the container will be restarted.                                                       |
+| `gc.livenessProbe.periodSeconds`                    | integer | 10 (by Kubernetes)        | Specifies how often (in seconds) to perform the liveness probe.                                                                                                |
+| `gc.livenessProbe.timeoutSeconds`                   | integer | 1 (by Kubernetes)         | Specifies the number of seconds after which the probe times out.                                                                                               |
+| `gc.readinessProbe.initialDelaySeconds`             | integer | 5                         | Specifies the number of seconds after the container has started before readiness probes are initiated.                                                         |
+| `gc.readinessProbe.failureThreshold`                | integer | 3 (by Kubernetes)         | Specifies the number of consecutive failures of readiness probes before considering the pod as not ready.                                                      |
+| `gc.readinessProbe.periodSeconds`                   | integer | 10 (by Kubernetes)        | Specifies how often (in seconds) to perform the readiness probe.                                                                                               |
+| `gc.readinessProbe.timeoutSeconds`                  | integer | 1 (by Kubernetes)         | Specifies the number of seconds after which the probe times out.                                                                                               |
+| `gc.readinessProbe.successThreshold`                | integer | 1 (by Kubernetes)         | Specifies the number of consecutive successes of readiness probes before considering the pod as ready.                                                         |
 | `gc.dnsPolicy`                                      | string  | ClusterFirstWithHostNet   | Garbage collector pod [dnsPolicy][dnspolicy]                                                                                                                   |
 
 <!-- Links -->
 
@@ -82,6 +82,10 @@ func New(args *Args) (NfdGarbageCollector, error) {
 	}, nil
 }
 
+func (n *nfdGarbageCollector) Healthz(writer http.ResponseWriter, _ *http.Request) {
+	writer.WriteHeader(http.StatusOK)
+}
+
 func (n *nfdGarbageCollector) deleteNodeFeature(namespace, name string) {
 	kind := "NodeFeature"
 	if err := n.client.Resource(gvrNF).Namespace(namespace).Delete(context.TODO(), name, metav1.DeleteOptions{}); err != nil {
@@ -252,7 +256,8 @@ func (n *nfdGarbageCollector) Run() error {
 	httpMux.Handle("/metrics", promhttp.HandlerFor(promRegistry, promhttp.HandlerOpts{}))
 	registerVersion(version.Get())
 
-	// TODO: health probe endpoint could be added here
+	// Register health endpoint (at this point we're "ready and live")
+	httpMux.HandleFunc("/healthz", n.Healthz)
 
 	// Start HTTP server
 	httpServer := http.Server{Addr: fmt.Sprintf(":%d", n.args.Port), Handler: httpMux}