Skip to content

Commit

Permalink
Merge pull request #9458 from dcantah/shim-socket-dial-1.7
Browse files Browse the repository at this point in the history
[release/1.7] runtime/v2: net.Dial gRPC shim sockets before trying grpc
  • Loading branch information
estesp committed Dec 4, 2023
2 parents a477e22 + 80f96cd commit 2206522
Showing 1 changed file with 22 additions and 2 deletions.
24 changes: 22 additions & 2 deletions runtime/v2/shim.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ import (
"errors"
"fmt"
"io"
"net"
"os"
"path/filepath"
"strings"
Expand Down Expand Up @@ -253,7 +254,7 @@ func makeConnection(ctx context.Context, params client.BootstrapParams, onClose
grpc.WithTransportCredentials(insecure.NewCredentials()),
grpc.WithBlock(),
}
return grpcDialContext(ctx, dialer.DialAddress(params.Address), onClose, gopts...)
return grpcDialContext(ctx, params.Address, onClose, gopts...)
default:
return nil, fmt.Errorf("unexpected protocol: %q", params.Protocol)
}
Expand All @@ -264,10 +265,29 @@ func makeConnection(ctx context.Context, params client.BootstrapParams, onClose
// a callback run when the connection is severed or explicitly closed.
func grpcDialContext(
ctx context.Context,
target string,
address string,
onClose func(),
gopts ...grpc.DialOption,
) (*grpcConn, error) {
// If grpc.WithBlock is specified in gopts this causes the connection to block waiting for
// a connection regardless of if the socket exists or has a listener when Dial begins. This
// specific behavior of WithBlock is mostly undesirable for shims, as if the socket isn't
// there when we go to load/connect there's likely an issue. However, getting rid of WithBlock is
// also undesirable as we don't want the background connection behavior, we want to ensure
// a connection before moving on. To bring this in line with the ttrpc connection behavior
// lets do an initial dial to ensure the shims socket is actually available. stat wouldn't suffice
// here as if the shim exited unexpectedly its socket may still be on the filesystem, but it'd return
// ECONNREFUSED which grpc.DialContext will happily trudge along through for the full timeout.
//
// This is especially helpful on restart of containerd as if the shim died while containerd
// was down, we end up waiting the full timeout.
conn, err := net.DialTimeout("unix", address, time.Second*10)
if err != nil {
return nil, err
}
conn.Close()

target := dialer.DialAddress(address)
client, err := grpc.DialContext(ctx, target, gopts...)
if err != nil {
return nil, fmt.Errorf("failed to create GRPC connection: %w", err)
Expand Down

0 comments on commit 2206522

Please sign in to comment.