Skip to content

Commit

Permalink
Merge pull request #121356 from mimowo/backoff-limit-per-index-beta
Browse files Browse the repository at this point in the history
Graduate BackoffLimitPerIndex to Beta
  • Loading branch information
k8s-ci-robot committed Oct 23, 2023
2 parents f9609e7 + 9be1b68 commit 8149ab3
Show file tree
Hide file tree
Showing 10 changed files with 49 additions and 52 deletions.
8 changes: 4 additions & 4 deletions api/openapi-spec/swagger.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

8 changes: 4 additions & 4 deletions api/openapi-spec/v3/apis__batch__v1_openapi.json
Original file line number Diff line number Diff line change
Expand Up @@ -331,7 +331,7 @@
"type": "integer"
},
"backoffLimitPerIndex": {
"description": "Specifies the limit for the number of retries within an index before marking this index as failed. When enabled the number of failures per index is kept in the pod's batch.kubernetes.io/job-index-failure-count annotation. It can only be set when Job's completionMode=Indexed, and the Pod's restart policy is Never. The field is immutable. This field is alpha-level. It can be used when the `JobBackoffLimitPerIndex` feature gate is enabled (disabled by default).",
"description": "Specifies the limit for the number of retries within an index before marking this index as failed. When enabled the number of failures per index is kept in the pod's batch.kubernetes.io/job-index-failure-count annotation. It can only be set when Job's completionMode=Indexed, and the Pod's restart policy is Never. The field is immutable. This field is beta-level. It can be used when the `JobBackoffLimitPerIndex` feature gate is enabled (enabled by default).",
"format": "int32",
"type": "integer"
},
Expand All @@ -349,7 +349,7 @@
"type": "boolean"
},
"maxFailedIndexes": {
"description": "Specifies the maximal number of failed indexes before marking the Job as failed, when backoffLimitPerIndex is set. Once the number of failed indexes exceeds this number the entire Job is marked as Failed and its execution is terminated. When left as null the job continues execution of all of its indexes and is marked with the `Complete` Job condition. It can only be specified when backoffLimitPerIndex is set. It can be null or up to completions. It is required and must be less than or equal to 10^4 when is completions greater than 10^5. This field is alpha-level. It can be used when the `JobBackoffLimitPerIndex` feature gate is enabled (disabled by default).",
"description": "Specifies the maximal number of failed indexes before marking the Job as failed, when backoffLimitPerIndex is set. Once the number of failed indexes exceeds this number the entire Job is marked as Failed and its execution is terminated. When left as null the job continues execution of all of its indexes and is marked with the `Complete` Job condition. It can only be specified when backoffLimitPerIndex is set. It can be null or up to completions. It is required and must be less than or equal to 10^4 when is completions greater than 10^5. This field is beta-level. It can be used when the `JobBackoffLimitPerIndex` feature gate is enabled (enabled by default).",
"format": "int32",
"type": "integer"
},
Expand Down Expand Up @@ -443,7 +443,7 @@
"type": "integer"
},
"failedIndexes": {
"description": "FailedIndexes holds the failed indexes when backoffLimitPerIndex=true. The indexes are represented in the text format analogous as for the `completedIndexes` field, ie. they are kept as decimal integers separated by commas. The numbers are listed in increasing order. Three or more consecutive numbers are compressed and represented by the first and last element of the series, separated by a hyphen. For example, if the failed indexes are 1, 3, 4, 5 and 7, they are represented as \"1,3-5,7\". This field is alpha-level. It can be used when the `JobBackoffLimitPerIndex` feature gate is enabled (disabled by default).",
"description": "FailedIndexes holds the failed indexes when backoffLimitPerIndex=true. The indexes are represented in the text format analogous as for the `completedIndexes` field, ie. they are kept as decimal integers separated by commas. The numbers are listed in increasing order. Three or more consecutive numbers are compressed and represented by the first and last element of the series, separated by a hyphen. For example, if the failed indexes are 1, 3, 4, 5 and 7, they are represented as \"1,3-5,7\". This field is beta-level. It can be used when the `JobBackoffLimitPerIndex` feature gate is enabled (enabled by default).",
"type": "string"
},
"ready": {
Expand Down Expand Up @@ -580,7 +580,7 @@
"properties": {
"action": {
"default": "",
"description": "Specifies the action taken on a pod failure when the requirements are satisfied. Possible values are:\n\n- FailJob: indicates that the pod's job is marked as Failed and all\n running pods are terminated.\n- FailIndex: indicates that the pod's index is marked as Failed and will\n not be restarted.\n This value is alpha-level. It can be used when the\n `JobBackoffLimitPerIndex` feature gate is enabled (disabled by default).\n- Ignore: indicates that the counter towards the .backoffLimit is not\n incremented and a replacement pod is created.\n- Count: indicates that the pod is handled in the default way - the\n counter towards the .backoffLimit is incremented.\nAdditional values are considered to be added in the future. Clients should react to an unknown action by skipping the rule.",
"description": "Specifies the action taken on a pod failure when the requirements are satisfied. Possible values are:\n\n- FailJob: indicates that the pod's job is marked as Failed and all\n running pods are terminated.\n- FailIndex: indicates that the pod's index is marked as Failed and will\n not be restarted.\n This value is beta-level. It can be used when the\n `JobBackoffLimitPerIndex` feature gate is enabled (enabled by default).\n- Ignore: indicates that the counter towards the .backoffLimit is not\n incremented and a replacement pod is created.\n- Count: indicates that the pod is handled in the default way - the\n counter towards the .backoffLimit is incremented.\nAdditional values are considered to be added in the future. Clients should react to an unknown action by skipping the rule.",
"type": "string"
},
"onExitCodes": {
Expand Down
12 changes: 6 additions & 6 deletions pkg/apis/batch/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -306,8 +306,8 @@ type JobSpec struct {
// batch.kubernetes.io/job-index-failure-count annotation. It can only
// be set when Job's completionMode=Indexed, and the Pod's restart
// policy is Never. The field is immutable.
// This field is alpha-level. It can be used when the `JobBackoffLimitPerIndex`
// feature gate is enabled (disabled by default).
// This field is beta-level. It can be used when the `JobBackoffLimitPerIndex`
// feature gate is enabled (enabled by default).
// +optional
BackoffLimitPerIndex *int32

Expand All @@ -319,8 +319,8 @@ type JobSpec struct {
// It can only be specified when backoffLimitPerIndex is set.
// It can be null or up to completions. It is required and must be
// less than or equal to 10^4 when completions is greater than 10^5.
// This field is alpha-level. It can be used when the `JobBackoffLimitPerIndex`
// feature gate is enabled (disabled by default).
// This field is beta-level. It can be used when the `JobBackoffLimitPerIndex`
// feature gate is enabled (enabled by default).
// +optional
MaxFailedIndexes *int32

Expand Down Expand Up @@ -481,8 +481,8 @@ type JobStatus struct {
// last element of the series, separated by a hyphen.
// For example, if the failed indexes are 1, 3, 4, 5 and 7, they are
// represented as "1,3-5,7".
// This field is alpha-level. It can be used when the `JobBackoffLimitPerIndex`
// feature gate is enabled (disabled by default).
// This field is beta-level. It can be used when the `JobBackoffLimitPerIndex`
// feature gate is enabled (enabled by default).
// +optional
FailedIndexes *string

Expand Down
13 changes: 2 additions & 11 deletions pkg/controller/job/job_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -79,15 +79,6 @@ var (
MaxPodCreateDeletePerSync = 500
)

const (
// jobReasonMaxFailedIndexesExceeded is the condition reason used when the
// Job is marked as Failed because the number of failed indexes exceeded
// the value of spec.maxFailedIndexes.
// https://kep.k8s.io/3850
// In Beta, this should be moved to staging as an API field.
jobReasonMaxFailedIndexesExceeded string = "MaxFailedIndexesExceeded"
// jobReasonFailedIndexes is the condition reason used when the Job is
// marked as Failed because it has at least one failed index and the failed
// and succeeded indexes together account for all completions.
jobReasonFailedIndexes string = "FailedIndexes"
)

// Controller ensures that all Job objects have corresponding pods to
// run their configured workload.
type Controller struct {
Expand Down Expand Up @@ -847,9 +838,9 @@ func (jm *Controller) syncJob(ctx context.Context, key string) (rErr error) {
jobCtx.failedIndexes = calculateFailedIndexes(logger, &job, pods)
if jobCtx.finishedCondition == nil {
if job.Spec.MaxFailedIndexes != nil && jobCtx.failedIndexes.total() > int(*job.Spec.MaxFailedIndexes) {
jobCtx.finishedCondition = newCondition(batch.JobFailed, v1.ConditionTrue, jobReasonMaxFailedIndexesExceeded, "Job has exceeded the specified maximal number of failed indexes", jm.clock.Now())
jobCtx.finishedCondition = newCondition(batch.JobFailed, v1.ConditionTrue, batch.JobReasonMaxFailedIndexesExceeded, "Job has exceeded the specified maximal number of failed indexes", jm.clock.Now())
} else if jobCtx.failedIndexes.total() > 0 && jobCtx.failedIndexes.total()+jobCtx.succeededIndexes.total() >= int(*job.Spec.Completions) {
jobCtx.finishedCondition = newCondition(batch.JobFailed, v1.ConditionTrue, jobReasonFailedIndexes, "Job has failed indexes", jm.clock.Now())
jobCtx.finishedCondition = newCondition(batch.JobFailed, v1.ConditionTrue, batch.JobReasonFailedIndexes, "Job has failed indexes", jm.clock.Now())
}
}
jobCtx.podsWithDelayedDeletionPerIndex = getPodsWithDelayedDeletionPerIndex(logger, jobCtx)
Expand Down
4 changes: 2 additions & 2 deletions pkg/controller/job/job_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -3761,7 +3761,7 @@ func TestSyncJobWithJobBackoffLimitPerIndex(t *testing.T) {
{
Type: batch.JobFailed,
Status: v1.ConditionTrue,
Reason: jobReasonFailedIndexes,
Reason: batch.JobReasonFailedIndexes,
Message: "Job has failed indexes",
},
},
Expand Down Expand Up @@ -3799,7 +3799,7 @@ func TestSyncJobWithJobBackoffLimitPerIndex(t *testing.T) {
{
Type: batch.JobFailed,
Status: v1.ConditionTrue,
Reason: jobReasonMaxFailedIndexesExceeded,
Reason: batch.JobReasonMaxFailedIndexesExceeded,
Message: "Job has exceeded the specified maximal number of failed indexes",
},
},
Expand Down
3 changes: 2 additions & 1 deletion pkg/features/kube_features.go
Original file line number Diff line number Diff line change
Expand Up @@ -370,6 +370,7 @@ const (
// owner: @mimowo
// kep: https://kep.k8s.io/3850
// alpha: v1.28
// beta: v1.29
//
// Allows users to specify counting of failed pods per index.
JobBackoffLimitPerIndex featuregate.Feature = "JobBackoffLimitPerIndex"
Expand Down Expand Up @@ -976,7 +977,7 @@ var defaultKubernetesFeatureGates = map[featuregate.Feature]featuregate.FeatureS

IPTablesOwnershipCleanup: {Default: true, PreRelease: featuregate.GA, LockToDefault: true}, // remove in 1.30

JobBackoffLimitPerIndex: {Default: false, PreRelease: featuregate.Alpha},
JobBackoffLimitPerIndex: {Default: true, PreRelease: featuregate.Beta},

JobPodFailurePolicy: {Default: true, PreRelease: featuregate.Beta},

Expand Down

0 comments on commit 8149ab3

Please sign in to comment.