Skip to content

Commit

Permalink
azure: check for primary interface when performing DHCP (canonical#4465)
Browse files Browse the repository at this point in the history
* azure: check for primary interface when performing DHCP

For Savable PPS we rely on a connectivity check to IMDS to determine
if the NIC is primary.  However, in some cases, connectivity may be
delayed and we incorrectly assume the NIC is not primary.

Instead of relying on connectivity, check the DHCP-provided route
configuration for the presence of Wireserver and/or IMDS IPs.

- Return bool from _setup_ephemeral_networking() indicating primary
NIC or not.

- Remove _check_if_nic_is_primary() used in Savable PPS.

- Add a relevant diagnostic to assist debugging failures when a
secondary NIC is chosen.

- Remove some tests that are redundant.

Signed-off-by: Chris Patterson <cpatterson@microsoft.com>
  • Loading branch information
cjp256 committed Oct 23, 2023
1 parent 14b76c4 commit afc7584
Show file tree
Hide file tree
Showing 2 changed files with 208 additions and 280 deletions.
101 changes: 63 additions & 38 deletions cloudinit/sources/DataSourceAzure.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,12 +19,13 @@

from cloudinit import net, sources, ssh_util, subp, util
from cloudinit.event import EventScope, EventType
from cloudinit.net import device_driver
from cloudinit.net.dhcp import (
NoDHCPLeaseError,
NoDHCPLeaseInterfaceError,
NoDHCPLeaseMissingDhclientError,
)
from cloudinit.net.ephemeral import EphemeralDHCPv4
from cloudinit.net.ephemeral import EphemeralDHCPv4, EphemeralIPv4Network
from cloudinit.reporting import events
from cloudinit.sources.azure import errors, identity, imds, kvp
from cloudinit.sources.helpers import netlink
Expand Down Expand Up @@ -364,14 +365,32 @@ def _get_subplatform(self):
subplatform_type = "seed-dir"
return "%s (%s)" % (subplatform_type, self.seed)

@azure_ds_telemetry_reporter
def _check_if_primary(self, ephipv4: EphemeralIPv4Network) -> bool:
if not ephipv4.static_routes:
# Primary nics must contain routes.
return False

routed_networks = [r[0] for r in ephipv4.static_routes]
wireserver_route = f"{self._wireserver_endpoint}/32"
primary = any(
n in routed_networks
for n in [
"169.254.169.254/32",
wireserver_route,
]
)

return primary

@azure_ds_telemetry_reporter
def _setup_ephemeral_networking(
self,
*,
iface: Optional[str] = None,
retry_sleep: int = 1,
timeout_minutes: int = 5,
) -> None:
) -> bool:
"""Setup ephemeral networking.
Keep retrying DHCP up to specified number of minutes. This does
Expand All @@ -381,13 +400,19 @@ def _setup_ephemeral_networking(
:param timeout_minutes: Number of minutes to keep retrying for.
:raises NoDHCPLeaseError: If unable to obtain DHCP lease.
:returns: True if NIC is determined to be primary.
"""
if self._ephemeral_dhcp_ctx is not None:
raise RuntimeError(
"Bringing up networking when already configured."
)

LOG.debug("Requested ephemeral networking (iface=%s)", iface)
report_diagnostic_event(
"Bringing up ephemeral networking with iface=%s: %r"
% (iface, net.get_interfaces()),
logger_func=LOG.debug,
)
self._ephemeral_dhcp_ctx = EphemeralDHCPv4(
self.distro,
iface=iface,
Expand Down Expand Up @@ -458,15 +483,37 @@ def _setup_ephemeral_networking(
if lease is None:
self._ephemeral_dhcp_ctx = None
raise NoDHCPLeaseError()
else:
# Ensure iface is set.
self._ephemeral_dhcp_ctx.iface = lease["interface"]

# Update wireserver IP from DHCP options.
if "unknown-245" in lease:
self._wireserver_endpoint = get_ip_from_lease_value(
lease["unknown-245"]
)
# Ensure iface is set.
iface = lease["interface"]
self._ephemeral_dhcp_ctx.iface = iface

# Update wireserver IP from DHCP options.
if "unknown-245" in lease:
self._wireserver_endpoint = get_ip_from_lease_value(
lease["unknown-245"]
)

driver = device_driver(iface)
ephipv4 = self._ephemeral_dhcp_ctx._ephipv4
if ephipv4 is None:
raise RuntimeError("dhcp context missing ephipv4")

primary = self._check_if_primary(ephipv4)
report_diagnostic_event(
"Obtained DHCP lease on interface %r "
"(primary=%r driver=%r router=%r routes=%r lease=%r)"
% (
iface,
primary,
driver,
ephipv4.router,
ephipv4.static_routes,
lease,
),
logger_func=LOG.debug,
)
return primary

@azure_ds_telemetry_reporter
def _teardown_ephemeral_networking(self) -> None:
Expand Down Expand Up @@ -1006,32 +1053,6 @@ def _report_ready_for_pps(
if create_marker:
self._create_report_ready_marker()

@azure_ds_telemetry_reporter
def _check_if_nic_is_primary(self, ifname: str) -> bool:
"""Check if a given interface is the primary nic or not."""
# For now, only a VM's primary NIC can contact IMDS and WireServer. If
# DHCP fails for a NIC, we have no mechanism to determine if the NIC is
# primary or secondary. In this case, retry DHCP until successful.
self._setup_ephemeral_networking(iface=ifname, timeout_minutes=20)

# Primary nic detection will be optimized in the future. The fact that
# primary nic is being attached first helps here. Otherwise each nic
# could add several seconds of delay.
imds_md = self.get_metadata_from_imds(report_failure=False)
if imds_md:
# Only primary NIC will get a response from IMDS.
LOG.info("%s is the primary nic", ifname)
return True

# If we are not the primary nic, then clean the dhcp context.
LOG.warning(
"Failed to fetch IMDS metadata using nic %s. "
"Assuming this is not the primary nic.",
ifname,
)
self._teardown_ephemeral_networking()
return False

@azure_ds_telemetry_reporter
def _wait_for_hot_attached_primary_nic(self, nl_sock):
"""Wait until the primary nic for the vm is hot-attached."""
Expand Down Expand Up @@ -1073,12 +1094,16 @@ def _wait_for_hot_attached_primary_nic(self, nl_sock):
# won't be in primary_nic_found = false state for long.
if not primary_nic_found:
LOG.info("Checking if %s is the primary nic", ifname)
primary_nic_found = self._check_if_nic_is_primary(ifname)
primary_nic_found = self._setup_ephemeral_networking(
iface=ifname, timeout_minutes=20
)

# Exit criteria: check if we've discovered primary nic
if primary_nic_found:
LOG.info("Found primary nic for this VM.")
break
else:
self._teardown_ephemeral_networking()

except AssertionError as error:
report_diagnostic_event(str(error), logger_func=LOG.error)
Expand Down

0 comments on commit afc7584

Please sign in to comment.