Re-use the last desired patch on empty batch (#3453)

actions · May 17, 2024 · ab92e4e · ab92e4e
1 parent fa7a4f5
commit ab92e4e
Show file tree

Hide file tree

Showing 3 changed files with 376 additions and 38 deletions.
diff --git a/cmd/ghalistener/worker/worker.go b/cmd/ghalistener/worker/worker.go
@@ -38,20 +38,20 @@ type Config struct {
 // The Worker's role is to process the messages it receives from the listener.
 // It then initiates Kubernetes API requests to carry out the necessary actions.
 type Worker struct {
-	clientset   *kubernetes.Clientset
-	config      Config
-	lastPatch   int
-	lastPatchID int
-	logger      *logr.Logger
+	clientset *kubernetes.Clientset
+	config    Config
+	lastPatch int
+	patchSeq  int
+	logger    *logr.Logger
 }
 
 var _ listener.Handler = (*Worker)(nil)
 
 func New(config Config, options ...Option) (*Worker, error) {
 	w := &Worker{
-		config:      config,
-		lastPatch:   -1,
-		lastPatchID: -1,
+		config:    config,
+		lastPatch: -1,
+		patchSeq:  -1,
 	}
 
 	conf, err := rest.InClusterConfig()
@@ -163,27 +163,8 @@ func (w *Worker) HandleJobStarted(ctx context.Context, jobInfo *actions.JobStart
 // The function then scales the ephemeral runner set by applying the merge patch.
 // Finally, it logs the scaled ephemeral runner set details and returns nil if successful.
 // If any error occurs during the process, it returns an error with a descriptive message.
-func (w *Worker) HandleDesiredRunnerCount(ctx context.Context, count int, jobsCompleted int) (int, error) {
-	// Max runners should always be set by the resource builder either to the configured value,
-	// or the maximum int32 (resourcebuilder.newAutoScalingListener()).
-	targetRunnerCount := min(w.config.MinRunners+count, w.config.MaxRunners)
-
-	logValues := []any{
-		"assigned job", count,
-		"decision", targetRunnerCount,
-		"min", w.config.MinRunners,
-		"max", w.config.MaxRunners,
-		"currentRunnerCount", w.lastPatch,
-		"jobsCompleted", jobsCompleted,
-	}
-
-	if count == 0 && jobsCompleted == 0 {
-		w.lastPatchID = 0
-	} else {
-		w.lastPatchID++
-	}
-
-	w.lastPatch = targetRunnerCount
+func (w *Worker) HandleDesiredRunnerCount(ctx context.Context, count, jobsCompleted int) (int, error) {
+	patchID := w.setDesiredWorkerState(count, jobsCompleted)
 
 	original, err := json.Marshal(
 		&v1alpha1.EphemeralRunnerSet{
@@ -200,8 +181,8 @@ func (w *Worker) HandleDesiredRunnerCount(ctx context.Context, count int, jobsCo
 	patch, err := json.Marshal(
 		&v1alpha1.EphemeralRunnerSet{
 			Spec: v1alpha1.EphemeralRunnerSetSpec{
-				Replicas: targetRunnerCount,
-				PatchID:  w.lastPatchID,
+				Replicas: w.lastPatch,
+				PatchID:  patchID,
 			},
 		},
 	)
@@ -210,14 +191,13 @@ func (w *Worker) HandleDesiredRunnerCount(ctx context.Context, count int, jobsCo
 		return 0, err
 	}
 
+	w.logger.Info("Compare", "original", string(original), "patch", string(patch))
 	mergePatch, err := jsonpatch.CreateMergePatch(original, patch)
 	if err != nil {
 		return 0, fmt.Errorf("failed to create merge patch json for ephemeral runner set: %w", err)
 	}
 
-	w.logger.Info("Created merge patch json for EphemeralRunnerSet update", "json", string(mergePatch))
-
-	w.logger.Info("Scaling ephemeral runner set", logValues...)
+	w.logger.Info("Preparing EphemeralRunnerSet update", "json", string(mergePatch))
 
 	patchedEphemeralRunnerSet := &v1alpha1.EphemeralRunnerSet{}
 	err = w.clientset.RESTClient().
@@ -238,5 +218,40 @@ func (w *Worker) HandleDesiredRunnerCount(ctx context.Context, count int, jobsCo
 		"name", w.config.EphemeralRunnerSetName,
 		"replicas", patchedEphemeralRunnerSet.Spec.Replicas,
 	)
-	return targetRunnerCount, nil
+	return w.lastPatch, nil
+}
+
+// calculateDesiredState calculates the desired state of the worker based on the desired count and the the number of jobs completed.
+func (w *Worker) setDesiredWorkerState(count, jobsCompleted int) int {
+	// Max runners should always be set by the resource builder either to the configured value,
+	// or the maximum int32 (resourcebuilder.newAutoScalingListener()).
+	targetRunnerCount := min(w.config.MinRunners+count, w.config.MaxRunners)
+	w.patchSeq++
+	desiredPatchID := w.patchSeq
+
+	if count == 0 && jobsCompleted == 0 { // empty batch
+		targetRunnerCount = max(w.lastPatch, targetRunnerCount)
+		if targetRunnerCount == w.config.MinRunners {
+			// We have an empty batch, and the last patch was the min runners.
+			// Since this is an empty batch, and we are at the min runners, they should all be idle.
+			// If controller created few more pods on accident (during scale down events),
+			// this situation allows the controller to scale down to the min runners.
+			// However, it is important to keep the patch sequence increasing so we don't ignore one batch.
+			desiredPatchID = 0
+		}
+	}
+
+	w.lastPatch = targetRunnerCount
+
+	w.logger.Info(
+		"Calculated target runner count",
+		"assigned job", count,
+		"decision", targetRunnerCount,
+		"min", w.config.MinRunners,
+		"max", w.config.MaxRunners,
+		"currentRunnerCount", w.lastPatch,
+		"jobsCompleted", jobsCompleted,
+	)
+
+	return desiredPatchID
 }