// FIXME(thaJeztah): remove once we are a module; the go:build directive prevents go from downgrading language version to go1.16:
//go:build go1.23

package osl

import (
	"bytes"
	"context"
	"errors"
	"fmt"
	"net"
	"os"
	"path/filepath"
	"slices"
	"sort"
	"strconv"
	"strings"
	"syscall"
	"time"

	"github.com/containerd/log"
	"github.com/docker/docker/internal/nlwrap"
	"github.com/docker/docker/libnetwork/internal/l2disco"
	"github.com/docker/docker/libnetwork/ns"
	"github.com/docker/docker/libnetwork/types"
	"github.com/vishvananda/netlink"
	"github.com/vishvananda/netns"
	"go.opentelemetry.io/otel"
	"go.opentelemetry.io/otel/attribute"
	"go.opentelemetry.io/otel/trace"
	"golang.org/x/sys/unix"
)

const (
	// AdvertiseAddrNMsgsMin defines the minimum number of ARP/NA messages sent when an
	// interface is configured.
	// Zero can be used to disable unsolicited ARP/NA.
	AdvertiseAddrNMsgsMin = 0
	// AdvertiseAddrNMsgsMax defines the maximum number of ARP/NA messages sent when an
	// interface is configured. It's three, to match RFC-5227 Section 1.1
	// ("PROBE_NUM=3") and RFC-4861 MAX_NEIGHBOR_ADVERTISEMENT.
	AdvertiseAddrNMsgsMax = 3
	// advertiseAddrNMsgsDefault is the default number of ARP/NA messages sent when
	// an interface is configured.
	advertiseAddrNMsgsDefault = 3

	// AdvertiseAddrIntervalMin defines the minimum interval between ARP/NA messages
	// sent when an interface is configured. The min defined here is nonstandard,
	// RFC-5227 PROBE_MIN and the default for RetransTimer in RFC-4861 are one
	// second. But, faster resends may be useful in a bridge network (where packets
	// are not transmitted on a real network).
	AdvertiseAddrIntervalMin = 100 * time.Millisecond
	// AdvertiseAddrIntervalMax defines the maximum interval between ARP/NA messages
	// sent when an interface is configured. The max of 2s matches RFC-5227
	// PROBE_MAX.
	AdvertiseAddrIntervalMax = 2 * time.Second
	// advertiseAddrIntervalDefault is the default interval between ARP/NA messages
	// sent when an interface is configured.
	// One second matches RFC-5227 PROBE_MIN and the default for RetransTimer in RFC-4861.
	advertiseAddrIntervalDefault = time.Second
)

// newInterface builds the Interface that describes link srcName for namespace
// ns. It starts from the default ARP/NA advertisement settings and then applies
// every non-nil option, in order.
//
// An error is returned as soon as an option fails, or when a master was
// requested but ns has no bridge interface whose source name matches it.
func newInterface(ns *Namespace, srcName, dstPrefix, dstName string, options ...IfaceOption) (*Interface, error) {
	iface := &Interface{
		ns:                    ns,
		srcName:               srcName,
		dstPrefix:             dstPrefix,
		dstName:               dstName,
		stopCh:                make(chan struct{}),
		advertiseAddrNMsgs:    advertiseAddrNMsgsDefault,
		advertiseAddrInterval: advertiseAddrIntervalDefault,
	}

	for _, applyOpt := range options {
		if applyOpt == nil {
			continue
		}
		// TODO(thaJeztah): use multi-error instead of returning early.
		if err := applyOpt(iface); err != nil {
			return nil, err
		}
	}

	// Without a master there is nothing left to resolve.
	if iface.master == "" {
		return iface, nil
	}

	// The master must already be known to the namespace, as a bridge.
	dstMaster := ns.findDst(iface.master, true)
	if dstMaster == "" {
		return nil, fmt.Errorf("could not find an appropriate master %q for %q", iface.master, iface.srcName)
	}
	iface.dstMaster = dstMaster
	return iface, nil
}

// Interface represents the settings and identity of a network device.
// It is used as a return type for Network.Link, and it is common practice
// for the caller to use this information when moving interface SrcName from
// host namespace to DstName in a different net namespace with the appropriate
// network settings.
//
// Field notes:
//   - srcName is the name used to look the link up before it is renamed;
//     dstName is the name it is given afterwards (generated from dstPrefix
//     when no explicit name was supplied).
//   - master is the source name of the requested master interface; dstMaster
//     is the destination name it resolved to when the Interface was created.
//   - bridge marks an interface that is created as a bridge inside the
//     namespace rather than looked up by srcName.
//   - createdInContainer, when set, skips the lookup of srcName via the host
//     netlink handle and the move into the target namespace.
//   - sysctls holds the sysctl settings handed to setSysctls when the
//     interface is configured.
//   - ns is the Namespace the Interface belongs to.
type Interface struct {
	stopCh      chan struct{} // stopCh is closed before the interface is deleted.
	srcName     string
	dstPrefix   string
	dstName     string
	master      string
	dstMaster   string
	mac         net.HardwareAddr
	address     *net.IPNet
	addressIPv6 *net.IPNet
	llAddrs     []*net.IPNet
	routes      []*net.IPNet
	bridge      bool
	sysctls     []string
	// advertiseAddrNMsgs is the number of unsolicited ARP/NA messages that will be sent to
	// advertise the interface's addresses. No messages will be sent if this is zero.
	advertiseAddrNMsgs int
	// advertiseAddrInterval is the interval between unsolicited ARP/NA messages sent to
	// advertise the interface's addresses.
	advertiseAddrInterval time.Duration
	createdInContainer    bool
	ns                    *Namespace
}

// SrcName returns the name of the interface in the origin network namespace,
// i.e. the name it has before being renamed to [Interface.DstName].
func (i *Interface) SrcName() string {
	return i.srcName
}

// DstName returns the final interface name in the target network namespace.
// It is either the explicit destination name passed to
// [Namespace.AddInterface], a name generated from the prefix passed to it, or
// (for the default namespace) the same as [Interface.SrcName].
func (i *Interface) DstName() string {
	return i.dstName
}

// DstMaster returns the destination name of the interface's master (bridge)
// interface, or an empty string if no master was configured.
func (i *Interface) DstMaster() string {
	return i.dstMaster
}

// Bridge returns true if the interface is a bridge, in which case the link is
// created inside the namespace instead of being moved into it.
func (i *Interface) Bridge() bool {
	return i.bridge
}

// MacAddress returns the MAC address configured for the interface. The value
// is passed through types.GetMacCopy rather than returned directly.
func (i *Interface) MacAddress() net.HardwareAddr {
	return types.GetMacCopy(i.mac)
}

// Address returns the IPv4 address for the interface. The value is passed
// through types.GetIPNetCopy rather than returned directly.
func (i *Interface) Address() *net.IPNet {
	return types.GetIPNetCopy(i.address)
}

// AddressIPv6 returns the IPv6 address for the interface. The value is passed
// through types.GetIPNetCopy rather than returned directly.
func (i *Interface) AddressIPv6() *net.IPNet {
	return types.GetIPNetCopy(i.addressIPv6)
}

// LinkLocalAddresses returns the link-local IP addresses assigned to the
// interface.
//
// The Interface's own slice is returned as-is (it is not copied), so callers
// should treat it as read-only.
func (i *Interface) LinkLocalAddresses() []*net.IPNet {
	return i.llAddrs
}

// Routes returns the IP routes configured for the interface. A new slice is
// built on every call, with each entry passed through types.GetIPNetCopy.
func (i *Interface) Routes() []*net.IPNet {
	out := make([]*net.IPNet, 0, len(i.routes))
	for _, r := range i.routes {
		out = append(out, types.GetIPNetCopy(r))
	}
	return out
}

// Remove takes the interface out of its sandbox. It delegates to
// [Namespace.RemoveInterface] on the namespace the interface belongs to, which
// renames the link back to its original name and moves it out of the sandbox.
func (i *Interface) Remove() error {
	return i.ns.RemoveInterface(i)
}

// Statistics looks the interface up by its destination name using the
// namespace's netlink handle and returns its byte, packet and drop counters
// for both directions.
//
// An error is returned if the link cannot be found, or if netlink reported no
// statistics for it.
func (i *Interface) Statistics() (*types.InterfaceStatistics, error) {
	name := i.DstName()
	link, err := i.ns.nlHandle.LinkByName(name)
	if err != nil {
		return nil, fmt.Errorf("failed to retrieve the statistics for %s in netns %s: %v", name, i.ns.path, err)
	}

	raw := link.Attrs().Statistics
	if raw == nil {
		return nil, errors.New("no statistics were returned")
	}

	out := types.InterfaceStatistics{
		RxBytes:   raw.RxBytes,
		RxPackets: raw.RxPackets,
		RxDropped: raw.RxDropped,
		TxBytes:   raw.TxBytes,
		TxPackets: raw.TxPackets,
		TxDropped: raw.TxDropped,
	}
	return &out, nil
}

// findDst returns the destination name of the first interface in the namespace
// whose source name is srcName. When isBridge is true, only bridge interfaces
// are considered. An empty string is returned if nothing matches.
//
// The namespace lock is held while searching.
func (n *Namespace) findDst(srcName string, isBridge bool) string {
	n.mu.Lock()
	defer n.mu.Unlock()

	for _, candidate := range n.iFaces {
		if candidate.SrcName() != srcName {
			continue
		}
		// When looking for a master, the match must be of type bridge.
		if isBridge && !candidate.Bridge() {
			continue
		}
		return candidate.DstName()
	}

	return ""
}

// moveLink moves link iface into the network namespace referred to by nsh,
// using the host netlink handle nlhHost. The operation is traced in its own
// span, tagged with the interface's destination name.
func moveLink(ctx context.Context, nlhHost nlwrap.Handle, iface netlink.Link, i *Interface, nsh netns.NsHandle) error {
	_, span := otel.Tracer("").Start(ctx, "libnetwork.osl.moveLink", trace.WithAttributes(
		attribute.String("ifaceName", i.DstName())))
	defer span.End()

	err := nlhHost.LinkSetNsFd(iface, int(nsh))
	if err == nil {
		return nil
	}
	return fmt.Errorf("failed to set namespace on link %q: %v", i.srcName, err)
}

// AddInterface creates an Interface that represents an existing network
// interface (except for bridge interfaces, which are created here).
//
// The network interface will be reconfigured according the options passed, and
// it'll be renamed from srcName into either dstName if it's not empty, or to
// an auto-generated dest name that combines the provided dstPrefix and a
// numeric suffix.
//
// If configuring the interface fails, the link is renamed back to srcName,
// moved back to the host namespace, and the Interface is dropped from the
// namespace's list again so that its name can be reused.
//
// Once configured, the link is set up (retrying a few times on failure), its
// routes are installed and, if the link is seen to be up, unsolicited ARP/NA
// messages are sent to advertise its addresses.
//
// It's safe to call concurrently.
func (n *Namespace) AddInterface(ctx context.Context, srcName, dstPrefix, dstName string, options ...IfaceOption) error {
	ctx, span := otel.Tracer("").Start(ctx, "libnetwork.osl.AddInterface", trace.WithAttributes(
		attribute.String("srcName", srcName),
		attribute.String("dstPrefix", dstPrefix)))
	defer span.End()

	newNs := netns.None()
	if !n.isDefault {
		var err error
		newNs, err = netns.GetFromPath(n.path)
		if err != nil {
			return fmt.Errorf("failed get network namespace %q: %v", n.path, err)
		}
		defer newNs.Close()
	}

	i, iface, err := n.createInterface(ctx, newNs, srcName, dstPrefix, dstName, options...)
	if err != nil {
		return err
	}

	// Configure the interface now this is moved in the proper namespace.
	if err := n.configureInterface(ctx, n.nlHandle, iface, i); err != nil {
		// If configuring the device fails move it back to the host namespace
		// and change the name back to the source name. This allows the caller
		// to properly cleanup the interface. Its important especially for
		// interfaces with global attributes, ex: vni id for vxlan interfaces.
		if nerr := n.nlHandle.LinkSetName(iface, i.SrcName()); nerr != nil {
			log.G(ctx).Errorf("renaming interface (%s->%s) failed, %v after config error %v", i.DstName(), i.SrcName(), nerr, err)
		}
		if nerr := n.nlHandle.LinkSetNsFd(iface, ns.ParseHandlerInt()); nerr != nil {
			log.G(ctx).Errorf("moving interface %s to host ns failed, %v, after config error %v", i.SrcName(), nerr, err)
		}
		// createInterface already registered the Interface. Forget it again,
		// otherwise the stale entry keeps its destination name reserved and
		// can still be returned by findDst.
		n.mu.Lock()
		n.removeInterface(i)
		n.mu.Unlock()
		return err
	}

	// Up the interface, retrying a few times if that fails.
	err = n.nlHandle.LinkSetUp(iface)
	for cnt := 0; err != nil && cnt < 3; cnt++ {
		// Each retry runs in a closure so that its span ends with the retry,
		// rather than staying open until AddInterface returns.
		err = func() error {
			ctx, retrySpan := otel.Tracer("").Start(ctx, "libnetwork.osl.retryingLinkUp", trace.WithAttributes(
				attribute.String("srcName", srcName),
				attribute.String("dstPrefix", dstPrefix)))
			defer retrySpan.End()
			log.G(ctx).Debugf("retrying link setup because of: %v", err)
			time.Sleep(10 * time.Millisecond)
			return n.nlHandle.LinkSetUp(iface)
		}()
	}
	if err != nil {
		return fmt.Errorf("failed to set link up: %v", err)
	}
	log.G(ctx).Debug("link has been set to up")

	// Set the routes on the interface. This can only be done when the interface is up.
	if err := setInterfaceRoutes(ctx, n.nlHandle, iface, i); err != nil {
		return fmt.Errorf("error setting interface %q routes to %q: %v", iface.Attrs().Name, i.Routes(), err)
	}

	// Wait for the interface to be up and running (or a timeout).
	up, err := waitForIfUpped(ctx, newNs, iface.Attrs().Index)
	if err != nil {
		return err
	}

	// If the interface is up, send unsolicited ARP/NA messages if necessary.
	if up {
		waitForBridgePort(ctx, ns.NlHandle(), iface)
		mcastRouteOk := waitForMcastRoute(ctx, iface.Attrs().Index, i, n.nlHandle)
		if err := n.advertiseAddrs(ctx, iface.Attrs().Index, i, n.nlHandle, mcastRouteOk); err != nil {
			return fmt.Errorf("failed to advertise addresses: %w", err)
		}
	}

	return nil
}

// createInterface builds a new Interface, gets the underlying link into the
// target network namespace (creating it there for bridges, moving it there
// otherwise, when needed), takes the link down, renames it, and records the
// Interface in [Namespace.iFaces].
//
// When dstName is empty, a unique name is generated from dstPrefix. In the
// default namespace the link keeps its source name.
//
// It's safe to call concurrently.
func (n *Namespace) createInterface(ctx context.Context, targetNs netns.NsHandle, srcName, dstPrefix, dstName string, options ...IfaceOption) (*Interface, netlink.Link, error) {
	newIface, err := newInterface(n, srcName, dstPrefix, dstName, options...)
	if err != nil {
		return nil, nil, err
	}

	// Name generation and registration in n.iFaces must happen atomically,
	// so hold the Namespace lock until the interface has been added.
	n.mu.Lock()
	defer n.mu.Unlock()

	switch {
	case n.isDefault:
		newIface.dstName = newIface.srcName
	case newIface.dstName == "":
		newIface.dstName = n.generateIfaceName(dstPrefix)
	}

	nlhHost := ns.NlHandle()

	switch {
	case newIface.bridge:
		// A bridge has to be created inside the namespace, so there is no
		// existing link named srcName to look up on the host.
		br := &netlink.Bridge{
			LinkAttrs: netlink.LinkAttrs{Name: newIface.srcName},
		}
		if err := n.nlHandle.LinkAdd(br); err != nil {
			return nil, nil, fmt.Errorf("failed to create bridge %q: %v", newIface.srcName, err)
		}
	case !newIface.createdInContainer:
		// Locate the link on the host by its source name.
		hostLink, err := nlhHost.LinkByName(newIface.srcName)
		if err != nil {
			return nil, nil, fmt.Errorf("failed to get link by name %q: %v", newIface.srcName, err)
		}

		// Only a non-default namespace needs the link moved into it.
		if !n.isDefault {
			if err := moveLink(ctx, nlhHost, hostLink, newIface, targetNs); err != nil {
				return nil, nil, err
			}
		}
	}

	// From here on, work with the link as seen from inside the namespace.
	link, err := n.nlHandle.LinkByName(newIface.srcName)
	if err != nil {
		return nil, nil, fmt.Errorf("failed to get link by name %q: %v", newIface.srcName, err)
	}

	// The link must be down while it is being (re)configured.
	if err := n.nlHandle.LinkSetDown(link); err != nil {
		return nil, nil, fmt.Errorf("failed to set link down: %v", err)
	}

	if err := setInterfaceName(ctx, n.nlHandle, link, newIface); err != nil {
		return nil, nil, fmt.Errorf("error renaming interface %q to %q: %w", link.Attrs().Name, newIface.DstName(), err)
	}

	n.iFaces = append(n.iFaces, newIface)

	return newIface, link, nil
}

// generateIfaceName returns prefix followed by the lowest non-negative number
// that isn't already used as a suffix by an interface in n.iFaces.
//
// Only suffixes in canonical decimal form (exactly what strconv.Itoa would
// produce) are considered to be taken. Names such as "<prefix>00",
// "<prefix>+1" or "<prefix>-0" can't collide with a generated name, and
// counting them would put duplicates in the list of used numbers, which could
// make the search below hand out a name that's already in use.
//
// The caller must hold n.mu.
func (n *Namespace) generateIfaceName(prefix string) string {
	var suffixes []int
	for _, i := range n.iFaces {
		s, ok := strings.CutPrefix(i.DstName(), prefix)
		if !ok {
			continue
		}
		// Ignore non-numerical, negative and non-canonical suffixes (they're
		// treated as a different prefix).
		if v, err := strconv.Atoi(s); err == nil && v >= 0 && strconv.Itoa(v) == s {
			suffixes = append(suffixes, v)
		}
	}

	sort.Ints(suffixes)

	// There may be gaps in the numbering; find the first unused number.
	//
	// An alternative implementation could be to look at the highest suffix,
	// and increment it. But, if that incremented number makes the interface
	// name overflow the IFNAMSIZ limit (= 16 chars), the kernel would reject
	// that interface name while there are other unused numbers. So, instead
	// use the lowest suffix available.
	for idx, used := range suffixes {
		if idx != used {
			return prefix + strconv.Itoa(idx)
		}
	}

	return prefix + strconv.Itoa(len(suffixes))
}

// waitForIfUpped subscribes to netlink link updates (in namespace ns, if that
// handle is open) and waits for up to five seconds for the link with index
// ifIndex to be reported with the IFF_UP flag set.
//
// It returns true if the link was seen up. It returns false with a nil error
// on timeout, or if the netlink subscription terminated early. An error is
// only returned if the subscription could not be set up.
func waitForIfUpped(ctx context.Context, ns netns.NsHandle, ifIndex int) (bool, error) {
	ctx, span := otel.Tracer("").Start(context.WithoutCancel(ctx), "libnetwork.osl.waitforIfUpped")
	defer span.End()

	update := make(chan netlink.LinkUpdate, 100)
	upped := make(chan struct{})
	opts := netlink.LinkSubscribeOptions{
		ListExisting: true, // in case the link is already up
		ErrorCallback: func(err error) {
			select {
			case <-upped:
				// Ignore errors sent after the upped channel is closed, the netlink
				// package sends an EAGAIN after it closes its netlink socket when it
				// sees this channel is closed. (No message is ever sent on upped.)
				return
			default:
			}
			log.G(ctx).WithFields(log.Fields{
				"ifi":   ifIndex,
				"error": err,
			}).Info("netlink error while waiting for interface up")
		},
	}
	if ns.IsOpen() {
		opts.Namespace = &ns
	}
	if err := nlwrap.LinkSubscribeWithOptions(update, upped, opts); err != nil {
		return false, fmt.Errorf("failed to subscribe to link updates: %w", err)
	}

	// When done (interface upped, or timeout), stop the LinkSubscribe and drain
	// the result channel. If the result channel isn't closed after a timeout,
	// log a warning to note the goroutine leak.
	defer func() {
		close(upped)
		drainTimerC := time.After(3 * time.Second)
		for {
			select {
			case _, ok := <-update:
				if !ok {
					return
				}
			case <-drainTimerC:
				log.G(ctx).Warn("timeout while waiting for LinkSubscribe to terminate")
				// Give up. The timer only fires once so, without this return,
				// the loop would block for as long as the channel stays open.
				return
			}
		}
	}()

	timerC := time.After(5 * time.Second)
	for {
		select {
		case <-timerC:
			log.G(ctx).Warnf("timeout in waitForIfUpped")
			return false, nil
		case u, ok := <-update:
			if !ok {
				// The netlink package failed to read from its netlink socket. It will
				// already have called the ErrorCallback, so the issue has been logged.
				return false, nil
			}
			// Updates for other links are not interesting.
			if u.Attrs().Index != ifIndex {
				continue
			}
			log.G(ctx).WithFields(log.Fields{
				"iface": u.Attrs().Name,
				"ifi":   u.Attrs().Index,
				"flags": deviceFlags(u.Flags),
			}).Debug("link update")
			if u.Flags&unix.IFF_UP == unix.IFF_UP {
				return true, nil
			}
		}
	}
}

// waitForBridgePort does nothing unless link iface is a veth whose other end
// is attached to a bridge. In that case it polls the bridge port's state, for
// up to maxWait, until the port is "forwarding".
//
// It does not wait when STP is enabled on the bridge. If the port is still not
// forwarding when this returns, at-least the first unsolicited ARP/NA packets
// may be dropped.
func waitForBridgePort(ctx context.Context, nlh nlwrap.Handle, iface netlink.Link) {
	if iface.Type() != "veth" {
		return
	}
	ctx, span := otel.Tracer("").Start(ctx, "libnetwork.osl.waitForBridgePort")
	defer span.End()
	ctx = log.WithLogger(ctx, log.G(ctx).WithField("veth", iface.Attrs().Name))

	// A veth's parent is its peer, the other end of the pair.
	peerIndex := iface.Attrs().ParentIndex
	if peerIndex <= 0 {
		log.G(ctx).Debug("veth has no parent index")
		return
	}
	peer, err := nlh.LinkByIndex(peerIndex)
	if err != nil {
		// The peer isn't in the host's netns, it's probably in a swarm load-balancer
		// sandbox, and we don't know where that is. But, swarm still uses IP-based MAC
		// addresses so the unsolicited ARPs aren't essential. If the first one goes
		// missing because the bridge's port isn't forwarding yet, it's ok.
		log.G(ctx).WithFields(log.Fields{"parentIndex": peerIndex, "error": err}).Debug("No parent interface")
		return
	}

	// A peer with a MasterIndex is attached to a bridge.
	masterIndex := peer.Attrs().MasterIndex
	if masterIndex <= 0 {
		log.G(ctx).Debug("veth is not connected to a bridge")
		return
	}
	bridge, err := nlh.LinkByIndex(masterIndex)
	if err != nil {
		log.G(ctx).WithFields(log.Fields{
			"parentIndex": peerIndex,
			"masterIndex": masterIndex,
			"error":       err,
		}).Warn("No parent bridge link")
		return
	}
	bridgeName := bridge.Attrs().Name

	// Ideally, the port state would be read via netlink (LinkGetProtinfo). But,
	// vishvananda/netlink needs a patch to include state in its response:
	// - type Protinfo needs a "State uint8"
	// - parseProtinfo() needs "case nl.IFLA_BRPORT_STATE: pi.State = uint8(info.Value[0])"
	// So, sysfs is used instead.

	// STP won't be enabled on a bridge network's own bridge. But, it could be on
	// a user-supplied bridge and, if it is, the port won't be forwarding within
	// the timeout here.
	if stpEnabled(ctx, bridgeName) {
		log.G(ctx).Info("STP is enabled, not waiting for port to be forwarding")
		return
	}

	// The port state lives in "/sys/class/net/<bridge>/brif/<veth>/state".
	statePath := filepath.Join("/sys/class/net", bridgeName, "brif", peer.Attrs().Name, "state")
	stateFile, err := os.Open(statePath)
	if err != nil {
		// In integration tests where the daemon is running in its own netns, the bridge
		// device isn't visible in "/sys/class/net". So, just wait for hopefully-long-enough
		// for the bridge's port to be ready.
		log.G(ctx).WithField("port", statePath).Warn("Failed to open port state file, waiting for 20ms")
		time.Sleep(20 * time.Millisecond)
		return
	}
	defer stateFile.Close()

	// By now the port should be forwarding. So, poll quickly, and not for long.
	const (
		pollInterval = 10 * time.Millisecond
		maxWait      = 200 * time.Millisecond
	)
	var state [2]byte
	for range int64(maxWait / pollInterval) {
		nRead, err := stateFile.ReadAt(state[:], 0)
		switch {
		case err != nil:
			log.G(ctx).WithFields(log.Fields{
				"filename": statePath,
				"error":    err,
			}).Warn("Failed to read bridge port state")
			return
		case nRead == 0:
			log.G(ctx).WithField("filename", statePath).Warn("Empty bridge port state file")
			return
		case state[0] == '3':
			// Forwarding is state '3'.
			// https://elixir.bootlin.com/linux/v6.13/source/include/uapi/linux/if_bridge.h#L49-L53
			log.G(ctx).Debug("Bridge port is forwarding")
			return
		}
		log.G(ctx).WithField("portState", state[0]).Debug("waiting for bridge port to be forwarding")
		time.Sleep(pollInterval)
	}
	log.G(ctx).WithFields(log.Fields{
		"portState": state[0],
		"waitTime":  maxWait,
	}).Warn("Bridge port not forwarding")
}

// stpEnabled reports whether "/sys/class/net/<name>/bridge/stp_state" could be
// read and its first byte is something other than '0'. A read failure is
// logged and reported as false, as is an empty file.
func stpEnabled(ctx context.Context, name string) bool {
	stpStateFilename := filepath.Join("/sys/class/net", name, "bridge/stp_state")
	content, err := os.ReadFile(stpStateFilename)
	if err != nil {
		log.G(ctx).WithError(err).Warnf("Failed to read stp_state file %q", stpStateFilename)
		return false
	}
	if len(content) == 0 {
		return false
	}
	return content[0] != '0'
}

// waitForMcastRoute waits for an interface to have a route from ::1 to the IPv6 LL all-nodes
// address (ff02::1), if that route is needed to send a neighbour advertisement for an IPv6
// interface address.
//
// It returns true immediately if the interface has no IPv6 address or unsolicited
// advertisements are disabled, and true as soon as a route lookup succeeds (or fails with
// EMSGSIZE, see below). It returns false if no route was found after polling for maxWait.
//
// After waiting, or a failure, if there is no route - no error is returned. The NA send may
// fail, but try it anyway.
//
// In CI, the NA send failed with "write ip ::1->ff02::1: sendmsg: network is unreachable".
// That error has not been seen since addition of the check that the veth's parent bridge port
// is forwarding, so that may have been the issue. But, in case it's a timing problem that's
// only less-likely because of delay caused by that check, make sure the route exists.
func waitForMcastRoute(ctx context.Context, ifIndex int, i *Interface, nlh nlwrap.Handle) bool {
	if i.addressIPv6 == nil || i.advertiseAddrNMsgs == 0 {
		return true
	}
	ctx, span := otel.Tracer("").Start(ctx, "libnetwork.osl.waitForMcastRoute")
	defer span.End()

	const pollInterval = 10 * time.Millisecond
	const maxWait = 200 * time.Millisecond
	for range int64(maxWait / pollInterval) {
		routes, err := nlh.RouteGetWithOptions(net.IPv6linklocalallnodes, &netlink.RouteGetOptions{
			IifIndex: ifIndex,
			SrcAddr:  net.IPv6loopback,
		})
		if errors.Is(err, unix.EMSGSIZE) {
			// FIXME(robmry) - if EMSGSIZE is returned (why?), it seems to be persistent.
			//  So, skip the delay and continue to the NA send as it seems to succeed.
			log.G(ctx).Info("Skipping check for route to send NA, EMSGSIZE")
			return true
		}
		if err != nil || len(routes) == 0 {
			log.G(ctx).WithFields(log.Fields{"error": err, "nroutes": len(routes)}).Info("Waiting for route to send NA")
			time.Sleep(pollInterval)
			continue
		}
		return true
	}
	// Log the wait under a named key (an empty key makes the field unusable
	// in structured logs).
	log.G(ctx).WithField("waitTime", maxWait).Warn("No route for neighbour advertisement")
	return false
}

// advertiseAddrs sends unsolicited ARP and Neighbour Advertisement messages,
// so that caches are updated with the MAC address currently associated with
// the interface's IP addresses.
//
// IP addresses are recycled quickly when endpoints are dropped on network
// disconnect or container stop. A new MAC address may have been generated, so
// this is necessary to avoid packets sent to the old MAC address getting dropped
// until the ARP/Neighbour cache entries expire.
//
// The first message is sent synchronously, and its failure is returned. Any
// further messages are sent from a background goroutine, one per
// advertiseAddrInterval, until the configured count is reached, a send fails,
// or the interface's stopCh is closed.
//
// Note that the kernel's arp_notify sysctl setting is not respected.
func (n *Namespace) advertiseAddrs(ctx context.Context, ifIndex int, i *Interface, nlh nlwrap.Handle, mcastRouteOk bool) error {
	ctx, span := otel.Tracer("").Start(ctx, "libnetwork.osl.advertiseAddrs.initial")
	defer span.End()

	hwAddr := i.MacAddress()
	ip4 := i.Address()
	ip6 := i.AddressIPv6()
	ctx = log.WithLogger(ctx, log.G(ctx).WithFields(log.Fields{
		"iface":        i.dstName,
		"ifi":          ifIndex,
		"mac":          hwAddr.String(),
		"ip4":          ip4,
		"ip6":          ip6,
		"mcastRouteOk": mcastRouteOk,
	}))

	switch {
	case ip4 == nil && ip6 == nil:
		// Nothing to do - for example, a bridge with no configured addresses.
		log.G(ctx).Debug("No IP addresses to advertise")
		return nil
	case hwAddr == nil:
		// Nothing to do - for example, a layer-3 ipvlan.
		log.G(ctx).Debug("No MAC address to advertise")
		return nil
	case i.advertiseAddrNMsgs == 0:
		log.G(ctx).Debug("Unsolicited ARP/NA is disabled")
		return nil
	}

	arp, na := n.prepAdvertiseAddrs(ctx, i, ifIndex)
	if arp == nil && na == nil {
		return nil
	}
	closeSenders := func() {
		if arp != nil {
			arp.Close()
		}
		if na != nil {
			na.Close()
		}
	}
	// The senders are closed on return, unless they've been handed over to
	// the background goroutine (which then closes them when it's done).
	handedOver := false
	defer func() {
		if handedOver {
			return
		}
		closeSenders()
	}()

	// sendOnce sends one ARP and/or NA, after checking that the link still has
	// the MAC address that is being advertised.
	sendOnce := func(ctx context.Context) error {
		link, err := nlh.LinkByIndex(ifIndex)
		if err != nil {
			return fmt.Errorf("failed to refresh link attributes: %w", err)
		}
		curMAC := link.Attrs().HardwareAddr
		if !bytes.Equal(curMAC, hwAddr) {
			log.G(ctx).WithFields(log.Fields{"newMAC": curMAC.String()}).Warn("MAC address changed")
			return fmt.Errorf("MAC address changed, got %s, expected %s", curMAC, hwAddr.String())
		}

		log.G(ctx).Debug("Sending unsolicited ARP/NA")
		var sendErrs []error
		if arp != nil {
			if err := arp.Send(); err != nil {
				log.G(ctx).WithError(err).Warn("Failed to send unsolicited ARP")
				sendErrs = append(sendErrs, err)
			}
		}
		if na != nil {
			if err := na.Send(); err != nil {
				log.G(ctx).WithError(err).Warn("Failed to send unsolicited NA")
				// If there was no multicast route and the network is unreachable, ignore the
				// error - this happens when a macvlan's parent interface is down.
				if mcastRouteOk || !errors.Is(err, unix.ENETUNREACH) {
					sendErrs = append(sendErrs, err)
				}
			}
		}
		return errors.Join(sendErrs...)
	}

	// The initial message is sent here. If that fails, there are no resends.
	if err := sendOnce(ctx); err != nil {
		return err
	}
	if i.advertiseAddrNMsgs == 1 {
		return nil
	}
	// More ARPs/NAs are to be sent, so the senders must outlive this function.
	handedOver = true

	// The remaining messages are sent in the background.
	go func() {
		defer closeSenders()
		detached := trace.ContextWithSpanContext(context.WithoutCancel(ctx), trace.SpanContext{})
		bgCtx, bgSpan := otel.Tracer("").Start(detached,
			"libnetwork.osl.advertiseAddrs.subsequent",
			trace.WithLinks(trace.LinkFromContext(ctx)))
		defer bgSpan.End()

		ticker := time.NewTicker(i.advertiseAddrInterval)
		defer ticker.Stop()

		resends := i.advertiseAddrNMsgs - 1
		for c := range resends {
			select {
			case <-i.stopCh:
				log.G(bgCtx).Debug("Unsolicited ARP/NA sends cancelled")
				return
			case <-ticker.C:
				sendCtx := log.WithLogger(bgCtx, log.G(bgCtx).WithFields(log.Fields{"n": c + 1}))
				if err := sendOnce(sendCtx); err != nil {
					return
				}
			}
		}
	}()

	return nil
}

// prepAdvertiseAddrs prepares the senders used for unsolicited ARP (when the
// interface has an IPv4 address) and unsolicited NA (when it has an IPv6
// address). The senders are constructed via n.InvokeFunc.
//
// Failures are logged, not returned. If InvokeFunc itself fails, both results
// are nil; otherwise each result is whatever its constructor returned.
func (n *Namespace) prepAdvertiseAddrs(ctx context.Context, i *Interface, ifIndex int) (*l2disco.UnsolARP, *l2disco.UnsolNA) {
	var (
		arp *l2disco.UnsolARP
		na  *l2disco.UnsolNA
	)
	prepare := func() {
		if ip4 := i.Address(); ip4 != nil {
			sender, err := l2disco.NewUnsolARP(ctx, ip4.IP, i.MacAddress(), ifIndex)
			if err != nil {
				log.G(ctx).WithError(err).Warn("Failed to prepare unsolicited ARP")
			}
			arp = sender
		}
		if ip6 := i.AddressIPv6(); ip6 != nil {
			sender, err := l2disco.NewUnsolNA(ctx, ip6.IP, i.MacAddress(), ifIndex)
			if err != nil {
				log.G(ctx).WithError(err).Warn("Failed to prepare unsolicited NA")
			}
			na = sender
		}
	}
	if err := n.InvokeFunc(prepare); err != nil {
		log.G(ctx).WithError(err).Warn("Failed to prepare unsolicited ARP/NA messages")
		return nil, nil
	}
	return arp, na
}

// RemoveInterface removes an interface from the namespace: the link is taken
// down and renamed back to its source name, then either deleted (bridges) or
// moved out of the sandbox (other links in a non-default namespace). Finally
// the Interface is dropped from the namespace's list.
//
// The interface's stopCh is closed first, which cancels any pending
// unsolicited ARP/NA sends.
func (n *Namespace) RemoveInterface(i *Interface) error {
	close(i.stopCh)

	// The link is currently known by its destination name.
	link, err := n.nlHandle.LinkByName(i.DstName())
	if err != nil {
		return err
	}

	// Take the link down before changing it.
	if err := n.nlHandle.LinkSetDown(link); err != nil {
		return err
	}

	// TODO(aker): Why are we doing this? This would fail if the initial interface set up failed before the "dest interface" was moved into its own namespace; see https://github.com/moby/moby/pull/46315/commits/108595c2fe852a5264b78e96f9e63cda284990a6#r1331253578
	if err := n.nlHandle.LinkSetName(link, i.SrcName()); err != nil {
		log.G(context.TODO()).Debugf("LinkSetName failed for interface %s: %v", i.SrcName(), err)
		return err
	}

	switch {
	case i.Bridge():
		// A bridge was created in the namespace, so it is simply deleted.
		if err := n.nlHandle.LinkDel(link); err != nil {
			return fmt.Errorf("failed deleting bridge %q: %v", i.SrcName(), err)
		}
	case !n.isDefault:
		// Move the network interface to caller namespace.
		// TODO(aker): What's this really doing? There are no calls to LinkDel in this package: is this code really used? (Interface.Remove() has 3 callers); see https://github.com/moby/moby/pull/46315/commits/108595c2fe852a5264b78e96f9e63cda284990a6#r1331265335
		if err := n.nlHandle.LinkSetNsFd(link, ns.ParseHandlerInt()); err != nil {
			log.G(context.TODO()).Debugf("LinkSetNsFd failed for interface %s: %v", i.SrcName(), err)
			return err
		}
	}

	n.mu.Lock()
	defer n.mu.Unlock()
	n.removeInterface(i)

	return nil
}

// removeInterface drops i (compared by pointer identity) from n.iFaces. It
// does no locking of its own; the callers in this file hold n.mu around it.
func (n *Namespace) removeInterface(i *Interface) {
	isTarget := func(candidate *Interface) bool { return candidate == i }
	n.iFaces = slices.DeleteFunc(n.iFaces, isTarget)
}

// configureInterface applies the settings held in i to link iface: MAC
// address, IPv4 and IPv6 addresses, master, and link-local addresses, in that
// order, followed by the interface's sysctls. It stops at the first failure
// and returns an error describing the step that failed.
func (n *Namespace) configureInterface(ctx context.Context, nlh nlwrap.Handle, iface netlink.Link, i *Interface) error {
	ifaceName := iface.Attrs().Name
	ctx, span := otel.Tracer("").Start(ctx, "libnetwork.osl.configureInterface", trace.WithAttributes(
		attribute.String("ifaceName", ifaceName)))
	defer span.End()

	type step struct {
		apply  func(context.Context, nlwrap.Handle, netlink.Link, *Interface) error
		errMsg string
	}
	steps := []step{
		{apply: setInterfaceMAC, errMsg: fmt.Sprintf("error setting interface %q MAC to %q", ifaceName, i.MacAddress())},
		{apply: setInterfaceIP, errMsg: fmt.Sprintf("error setting interface %q IP to %v", ifaceName, i.Address())},
		{apply: setInterfaceIPv6, errMsg: fmt.Sprintf("error setting interface %q IPv6 to %v", ifaceName, i.AddressIPv6())},
		{apply: setInterfaceMaster, errMsg: fmt.Sprintf("error setting interface %q master to %q", ifaceName, i.DstMaster())},
		{apply: setInterfaceLinkLocalIPs, errMsg: fmt.Sprintf("error setting interface %q link local IPs to %v", ifaceName, i.LinkLocalAddresses())},
	}
	for _, s := range steps {
		if err := s.apply(ctx, nlh, iface, i); err != nil {
			return fmt.Errorf("%s: %v", s.errMsg, err)
		}
	}

	return n.setSysctls(ctx, i.dstName, i.sysctls)
}

// setInterfaceMaster attaches link iface to the bridge named by
// i.DstMaster(). It is a no-op when the interface has no master.
func setInterfaceMaster(ctx context.Context, nlh nlwrap.Handle, iface netlink.Link, i *Interface) error {
	masterName := i.DstMaster()
	if masterName == "" {
		return nil
	}

	_, span := otel.Tracer("").Start(ctx, "libnetwork.osl.setInterfaceMaster", trace.WithAttributes(
		attribute.String("i.SrcName", i.SrcName()),
		attribute.String("i.DstName", i.DstName())))
	defer span.End()

	master := &netlink.Bridge{
		LinkAttrs: netlink.LinkAttrs{Name: masterName},
	}
	return nlh.LinkSetMaster(iface, master)
}

// setInterfaceMAC sets the hardware address of link iface to the MAC address
// configured in i. It is a no-op when no MAC address is configured.
func setInterfaceMAC(ctx context.Context, nlh nlwrap.Handle, iface netlink.Link, i *Interface) error {
	hwAddr := i.MacAddress()
	if hwAddr == nil {
		return nil
	}

	_, span := otel.Tracer("").Start(ctx, "libnetwork.osl.setInterfaceMAC", trace.WithAttributes(
		attribute.String("i.SrcName", i.SrcName()),
		attribute.String("i.DstName", i.DstName())))
	defer span.End()

	return nlh.LinkSetHardwareAddr(iface, hwAddr)
}

// setInterfaceIP adds the IPv4 address configured in i to link iface, after
// checking that it does not conflict with an existing route. It is a no-op
// when the interface has no IPv4 address.
func setInterfaceIP(ctx context.Context, nlh nlwrap.Handle, iface netlink.Link, i *Interface) error {
	if i.Address() == nil {
		return nil
	}

	_, span := otel.Tracer("").Start(ctx, "libnetwork.osl.setInterfaceIP", trace.WithAttributes(
		attribute.String("i.SrcName", i.SrcName()),
		attribute.String("i.DstName", i.DstName())))
	defer span.End()

	err := checkRouteConflict(nlh, i.Address(), netlink.FAMILY_V4)
	if err != nil {
		return err
	}
	return nlh.AddrAdd(iface, &netlink.Addr{
		IPNet: i.Address(),
		Label: "",
	})
}

// setInterfaceIPv6 configures IPv6 on iface. It first calls setIPv6 for the
// interface with a flag that is true only when i.AddressIPv6() is non-nil,
// then, if an address is present, checks it against existing IPv6 routes and
// adds it to iface with the IFA_F_NODAD flag set.
//
// It returns an error if setIPv6 fails (wrapped, so the cause remains
// available to errors.Is / errors.As), if the address conflicts with an
// existing route, or if adding the address fails.
func setInterfaceIPv6(ctx context.Context, nlh nlwrap.Handle, iface netlink.Link, i *Interface) error {
	addr := i.AddressIPv6()
	ctx, span := otel.Tracer("").Start(ctx, "libnetwork.osl.setInterfaceIPv6", trace.WithAttributes(
		attribute.String("i.SrcName", i.SrcName()),
		attribute.String("i.DstName", i.DstName()),
		attribute.String("i.AddressIPv6", addr.String())))
	defer span.End()

	// IPv6 must be enabled on the interface if and only if the network is
	// IPv6-enabled. For an interface on an IPv4-only network, if IPv6 isn't
	// disabled, the interface will be put into IPv6 multicast groups making
	// it unexpectedly susceptible to NDP cache poisoning, route injection, etc.
	// (At present, there will always be a pre-configured IPv6 address if the
	// network is IPv6-enabled.)
	if err := setIPv6(i.ns.path, i.DstName(), addr != nil); err != nil {
		return fmt.Errorf("failed to configure ipv6: %w", err)
	}
	if addr == nil {
		// No address to assign; the setIPv6 call above is all that is needed.
		return nil
	}
	if err := checkRouteConflict(nlh, addr, netlink.FAMILY_V6); err != nil {
		return err
	}
	nlAddr := &netlink.Addr{IPNet: addr, Label: "", Flags: syscall.IFA_F_NODAD}
	return nlh.AddrAdd(iface, nlAddr)
}

// setInterfaceLinkLocalIPs adds every address returned by
// i.LinkLocalAddresses() to iface, stopping at (and returning) the first
// error. Addresses added before a failure are left in place.
func setInterfaceLinkLocalIPs(ctx context.Context, nlh nlwrap.Handle, iface netlink.Link, i *Interface) error {
	spanAttrs := trace.WithAttributes(
		attribute.String("i.SrcName", i.SrcName()),
		attribute.String("i.DstName", i.DstName()),
	)
	ctx, span := otel.Tracer("").Start(ctx, "libnetwork.osl.setInterfaceLinkLocalIPs", spanAttrs)
	defer span.End()

	for _, linkLocal := range i.LinkLocalAddresses() {
		if err := nlh.AddrAdd(iface, &netlink.Addr{IPNet: linkLocal}); err != nil {
			return err
		}
	}
	return nil
}

// setSysctls applies per-interface sysctl settings for the interface ifName.
//
// Each entry of sysctls must have the form "net.X.Y.IFNAME.Z=value": a key of
// exactly five dot-separated components followed by "=" and the value. The
// fourth key component is replaced by ifName to build the path under
// /proc/sys. The file is read and written from a function passed to
// n.InvokeFunc, and is only written when its current (whitespace-trimmed)
// content differs from the requested value.
//
// Processing stops at the first entry that is malformed or cannot be applied,
// and that error is returned.
func (n *Namespace) setSysctls(ctx context.Context, ifName string, sysctls []string) error {
	ctx, span := otel.Tracer("").Start(ctx, "libnetwork.osl.setSysctls",
		trace.WithAttributes(attribute.String("ifName", ifName)))
	defer span.End()

	for _, sc := range sysctls {
		key, want, ok := strings.Cut(sc, "=")
		if !ok {
			return fmt.Errorf("expected sysctl '%s' to have format name=value", sc)
		}
		parts := strings.Split(key, ".")
		if len(parts) != 5 {
			return fmt.Errorf("expected sysctl '%s' to have format net.X.Y.IFNAME.Z", sc)
		}

		// parts[3] is the interface-name placeholder; substitute the real name.
		sysPath := filepath.Join("/proc/sys", parts[0], parts[1], parts[2], ifName, parts[4])

		// applyErr carries the result out of the closure, which has no return value.
		var applyErr error
		apply := func() {
			info, statErr := os.Stat(sysPath)
			if statErr != nil || !info.Mode().IsRegular() {
				applyErr = fmt.Errorf("%s is not a sysctl file", sysPath)
				return
			}
			current, readErr := os.ReadFile(sysPath)
			if readErr != nil {
				applyErr = fmt.Errorf("unable to read '%s': %w", sysPath, readErr)
				return
			}
			if strings.TrimSpace(string(current)) == want {
				// The value is already correct, don't try to write the file in case
				// "/proc/sys/net" is a read-only filesystem.
				return
			}
			if writeErr := os.WriteFile(sysPath, []byte(want), 0o644); writeErr != nil {
				applyErr = fmt.Errorf("unable to write to '%s': %w", sysPath, writeErr)
			}
		}

		if err := n.InvokeFunc(apply); err != nil {
			return fmt.Errorf("failed to run sysctl setter in network namespace: %w", err)
		}
		if applyErr != nil {
			return applyErr
		}
	}
	return nil
}

// setInterfaceName renames iface to i.DstName(). The span records the name
// the link has before the rename.
func setInterfaceName(ctx context.Context, nlh nlwrap.Handle, iface netlink.Link, i *Interface) error {
	spanAttrs := trace.WithAttributes(attribute.String("ifaceName", iface.Attrs().Name))
	ctx, span := otel.Tracer("").Start(ctx, "libnetwork.osl.setInterfaceName", spanAttrs)
	defer span.End()

	newName := i.DstName()
	return nlh.LinkSetName(iface, newName)
}

// setInterfaceRoutes adds a link-scoped route through iface for each
// destination returned by i.Routes(). Destinations whose IP is unspecified
// are skipped. It stops at, and returns, the first error from RouteAdd.
func setInterfaceRoutes(ctx context.Context, nlh nlwrap.Handle, iface netlink.Link, i *Interface) error {
	spanAttrs := trace.WithAttributes(
		attribute.String("i.SrcName", i.SrcName()),
		attribute.String("i.DstName", i.DstName()),
	)
	ctx, span := otel.Tracer("").Start(ctx, "libnetwork.osl.setInterfaceRoutes", spanAttrs)
	defer span.End()

	for _, dst := range i.Routes() {
		if dst.IP.IsUnspecified() {
			// Don't set up a default route now, it'll be set later if this interface is
			// selected as the default gateway.
			continue
		}
		linkRoute := netlink.Route{
			Scope:     netlink.SCOPE_LINK,
			LinkIndex: iface.Attrs().Index,
			Dst:       dst,
		}
		if err := nlh.RouteAdd(&linkRoute); err != nil {
			return err
		}
	}
	return nil
}

// checkRouteConflict reports whether address overlaps with a route of the
// given family already known to nlh. Routes with no destination, or whose
// destination IP is unspecified, are ignored. A conflict exists when either
// network contains the other's IP. It returns an error describing the first
// conflicting route, the error from listing routes, or nil.
func checkRouteConflict(nlh nlwrap.Handle, address *net.IPNet, family int) error {
	existing, err := nlh.RouteList(nil, family)
	if err != nil {
		return err
	}
	for _, r := range existing {
		dst := r.Dst
		if dst == nil || dst.IP.IsUnspecified() {
			continue
		}
		if dst.Contains(address.IP) || address.Contains(dst.IP) {
			return fmt.Errorf("cannot program address %v in sandbox interface because it conflicts with existing route %s",
				address, r)
		}
	}
	return nil
}
