From 05d7386665793b7f8398eb80b4e85adff5486035 Mon Sep 17 00:00:00 2001 From: Sebastiaan van Stijn Date: Fri, 20 Oct 2023 18:00:16 +0200 Subject: [PATCH] daemon: daemon.containerRestart: don't cancel restart on context cancel MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit def549c8f6716507ee9175fe9d798a42df89e27d passed through the context to the daemon.ContainerStart function. As a result, restarting containers no longer is an atomic operation, because a context cancellation could interrupt the restart (between "stopping" and "(re)starting"), resulting in the container being stopped, but not restarted. Restarting a container, or more factually; making a successful request on the `/containers/{id]/restart` endpoint, should be an atomic operation. This patch uses a context.WithoutCancel for restart requests. It's worth noting that daemon.containerStop already uses context.WithoutCancel, so in that function, we'll be wrapping the context twice, but this should likely not cause issues (just redundant for this code-path). Before this patch, starting a container that bind-mounts the docker socket, then restarting itself from within the container would cancel the restart operation. The container would be stopped, but not started after that: docker run -dit --name myself -v /var/run/docker.sock:/var/run/docker.sock docker:cli sh docker exec myself sh -c 'docker restart myself' docker ps -a CONTAINER ID IMAGE COMMAND CREATED STATUS PORTS NAMES 3a2a741c65ff docker:cli "docker-entrypoint.s…" 26 seconds ago Exited (128) 7 seconds ago myself With this patch: the stop still cancels the exec, but does not cancel the restart operation, and the container is started again: docker run -dit --name myself -v /var/run/docker.sock:/var/run/docker.sock docker:cli sh docker exec myself sh -c 'docker restart myself' docker ps CONTAINER ID IMAGE COMMAND CREATED STATUS PORTS NAMES 4393a01f7c75 docker:cli "docker-entrypoint.s…" About a minute ago Up 4 seconds myself Signed-off-by: Sebastiaan van Stijn (cherry picked from commit aeb8972281af3f86294e167bf45b04a92cf7a9ee) Signed-off-by: Sebastiaan van Stijn --- daemon/restart.go | 5 ++ integration/container/restart_test.go | 72 +++++++++++++++++++++++++++ 2 files changed, 77 insertions(+) diff --git a/daemon/restart.go b/daemon/restart.go index 38828ef8eaa09..d56ce080fc24a 100644 --- a/daemon/restart.go +++ b/daemon/restart.go @@ -6,6 +6,7 @@ import ( containertypes "github.com/docker/docker/api/types/container" "github.com/docker/docker/container" + "github.com/docker/docker/internal/compatcontext" ) // ContainerRestart stops and starts a container. It attempts to @@ -31,6 +32,10 @@ func (daemon *Daemon) ContainerRestart(ctx context.Context, name string, options // gracefully stop, before forcefully terminating the container. If // given a negative duration, wait forever for a graceful stop. func (daemon *Daemon) containerRestart(ctx context.Context, container *container.Container, options containertypes.StopOptions) error { + // Restarting is expected to be an atomic operation, and cancelling + // the request should not cancel the stop -> start sequence. + ctx = compatcontext.WithoutCancel(ctx) + // Determine isolation. If not specified in the hostconfig, use daemon default. actualIsolation := container.HostConfig.Isolation if containertypes.Isolation.IsDefault(actualIsolation) { diff --git a/integration/container/restart_test.go b/integration/container/restart_test.go index 3a6155003a59d..a7aab25cb30e9 100644 --- a/integration/container/restart_test.go +++ b/integration/container/restart_test.go @@ -3,15 +3,18 @@ package container // import "github.com/docker/docker/integration/container" import ( "context" "fmt" + "runtime" "testing" "time" "github.com/docker/docker/api/types" "github.com/docker/docker/api/types/container" + "github.com/docker/docker/api/types/filters" "github.com/docker/docker/client" testContainer "github.com/docker/docker/integration/internal/container" "github.com/docker/docker/testutil/daemon" "gotest.tools/v3/assert" + is "gotest.tools/v3/assert/cmp" "gotest.tools/v3/poll" "gotest.tools/v3/skip" ) @@ -208,3 +211,72 @@ func TestContainerWithAutoRemoveCanBeRestarted(t *testing.T) { }) } } + +// TestContainerRestartWithCancelledRequest verifies that cancelling a restart +// request does not cancel the restart operation, and still starts the container +// after it was stopped. +// +// Regression test for https://github.com/moby/moby/discussions/46682 +func TestContainerRestartWithCancelledRequest(t *testing.T) { + ctx := context.TODO() + defer setupTest(t)() + apiClient := testEnv.APIClient() + + // Create a container that ignores SIGTERM and doesn't stop immediately, + // giving us time to cancel the request. + // + // Restarting a container is "stop" (and, if needed, "kill"), then "start" + // the container. We're trying to create the scenario where the "stop" is + // handled, but the request was cancelled and therefore the "start" not + // taking place. + cID := testContainer.Run(ctx, t, apiClient, testContainer.WithCmd("sh", "-c", "trap 'echo received TERM' TERM; while true; do usleep 10; done")) + defer func() { + err := apiClient.ContainerRemove(ctx, cID, types.ContainerRemoveOptions{Force: true}) + if t.Failed() && err != nil { + t.Logf("Cleaning up test container failed with error: %v", err) + } + }() + + // Start listening for events. + messages, errs := apiClient.Events(ctx, types.EventsOptions{ + Filters: filters.NewArgs( + filters.Arg("container", cID), + filters.Arg("event", "restart"), + ), + }) + + // Make restart request, but cancel the request before the container + // is (forcibly) killed. + ctx2, cancel := context.WithTimeout(ctx, 100*time.Millisecond) + stopTimeout := 1 + err := apiClient.ContainerRestart(ctx2, cID, container.StopOptions{ + Timeout: &stopTimeout, + }) + assert.Check(t, is.ErrorIs(err, context.DeadlineExceeded)) + cancel() + + // Validate that the restart event occurred, which is emitted + // after the restart (stop (kill) start) finished. + // + // Note that we cannot use RestartCount for this, as that's only + // used for restart-policies. + restartTimeout := 2 * time.Second + if runtime.GOOS == "windows" { + // hcs can sometimes take a long time to stop container. + restartTimeout = StopContainerWindowsPollTimeout + } + select { + case m := <-messages: + assert.Check(t, is.Equal(m.Actor.ID, cID)) + assert.Check(t, is.Equal(m.Action, "restart")) + case err := <-errs: + assert.NilError(t, err) + case <-time.After(restartTimeout): + t.Errorf("timeout waiting for restart event") + } + + // Container should be restarted (running). + inspect, err := apiClient.ContainerInspect(ctx, cID) + assert.NilError(t, err) + assert.Check(t, is.Equal(inspect.State.Status, "running")) +}