package osl

import (
	"context"
	"errors"
	"fmt"
	"net"
	"os"
	"path/filepath"
	"runtime"
	"strconv"
	"strings"
	"sync"
	"syscall"
	"time"

	"github.com/containerd/log"
	"github.com/docker/docker/internal/unshare"
	"github.com/docker/docker/libnetwork/ns"
	"github.com/docker/docker/libnetwork/osl/kernel"
	"github.com/docker/docker/libnetwork/types"
	"github.com/vishvananda/netlink"
	"github.com/vishvananda/netlink/nl"
	"github.com/vishvananda/netns"
	"golang.org/x/sys/unix"
)

const defaultPrefix = "/var/run/docker"

func init() {
	// Lock main() to the initial thread to exclude the goroutines spawned
	// by func (*Namespace) InvokeFunc() or func setIPv6() below from
	// being scheduled onto that thread. Changes to the network namespace of
	// the initial thread alter /proc/self/ns/net, which would break any
	// code which (incorrectly) assumes that the file is the network
	// namespace for the thread it is currently executing on.
	runtime.LockOSThread()
}

var (
	once             sync.Once
	garbagePathMap   = make(map[string]bool)
	gpmLock          sync.Mutex
	gpmWg            sync.WaitGroup
	gpmCleanupPeriod = 60 * time.Second
	gpmChan          = make(chan chan struct{})
	netnsBasePath    = filepath.Join(defaultPrefix, "netns")
)

// SetBasePath sets the base url prefix for the ns path
func SetBasePath(path string) {
	netnsBasePath = filepath.Join(path, "netns")
}

func basePath() string {
	return netnsBasePath
}

func createBasePath() {
	err := os.MkdirAll(basePath(), 0o755)
	if err != nil {
		panic("Could not create net namespace path directory")
	}

	// Start the garbage collection go routine
	go removeUnusedPaths()
}

func removeUnusedPaths() {
	gpmLock.Lock()
	period := gpmCleanupPeriod
	gpmLock.Unlock()

	ticker := time.NewTicker(period)
	for {
		var (
			gc   chan struct{}
			gcOk bool
		)

		select {
		case <-ticker.C:
		case gc, gcOk = <-gpmChan:
		}

		gpmLock.Lock()
		pathList := make([]string, 0, len(garbagePathMap))
		for path := range garbagePathMap {
			pathList = append(pathList, path)
		}
		garbagePathMap = make(map[string]bool)
		gpmWg.Add(1)
		gpmLock.Unlock()

		for _, path := range pathList {
			os.Remove(path)
		}

		gpmWg.Done()
		if gcOk {
			close(gc)
		}
	}
}

func addToGarbagePaths(path string) {
	gpmLock.Lock()
	garbagePathMap[path] = true
	gpmLock.Unlock()
}

func removeFromGarbagePaths(path string) {
	gpmLock.Lock()
	delete(garbagePathMap, path)
	gpmLock.Unlock()
}

// GC triggers garbage collection of namespace path right away
// and waits for it.
func GC() {
	gpmLock.Lock()
	if len(garbagePathMap) == 0 {
		// No need for GC if map is empty
		gpmLock.Unlock()
		return
	}
	gpmLock.Unlock()

	// if content exists in the garbage paths
	// we can trigger GC to run, providing a
	// channel to be notified on completion
	waitGC := make(chan struct{})
	gpmChan <- waitGC
	// wait for GC completion
	<-waitGC
}

// GenerateKey generates a sandbox key based on the passed
// container id.
func GenerateKey(containerID string) string {
	maxLen := 12
	// Read sandbox key from host for overlay
	if strings.HasPrefix(containerID, "-") {
		var (
			index    int
			indexStr string
			tmpkey   string
		)
		dir, err := os.ReadDir(basePath())
		if err != nil {
			return ""
		}

		for _, v := range dir {
			id := v.Name()
			if strings.HasSuffix(id, containerID[:maxLen-1]) {
				indexStr = strings.TrimSuffix(id, containerID[:maxLen-1])
				tmpindex, err := strconv.Atoi(indexStr)
				if err != nil {
					return ""
				}
				if tmpindex > index {
					index = tmpindex
					tmpkey = id
				}
			}
		}
		containerID = tmpkey
		if containerID == "" {
			return ""
		}
	}

	if len(containerID) < maxLen {
		maxLen = len(containerID)
	}

	return basePath() + "/" + containerID[:maxLen]
}

// NewSandbox provides a new Namespace instance created in an os specific way
// provided a key which uniquely identifies the sandbox.
func NewSandbox(key string, osCreate, isRestore bool) (*Namespace, error) {
	if !isRestore {
		err := createNetworkNamespace(key, osCreate)
		if err != nil {
			return nil, err
		}
	} else {
		once.Do(createBasePath)
	}

	n := &Namespace{path: key, isDefault: !osCreate, nextIfIndex: make(map[string]int)}

	sboxNs, err := netns.GetFromPath(n.path)
	if err != nil {
		return nil, fmt.Errorf("failed get network namespace %q: %v", n.path, err)
	}
	defer sboxNs.Close()

	n.nlHandle, err = netlink.NewHandleAt(sboxNs, syscall.NETLINK_ROUTE)
	if err != nil {
		return nil, fmt.Errorf("failed to create a netlink handle: %v", err)
	}

	err = n.nlHandle.SetSocketTimeout(ns.NetlinkSocketsTimeout)
	if err != nil {
		log.G(context.TODO()).Warnf("Failed to set the timeout on the sandbox netlink handle sockets: %v", err)
	}

	if err = n.loopbackUp(); err != nil {
		n.nlHandle.Close()
		return nil, err
	}

	return n, nil
}

func mountNetworkNamespace(basePath string, lnPath string) error {
	err := syscall.Mount(basePath, lnPath, "bind", syscall.MS_BIND, "")
	if err != nil {
		return fmt.Errorf("bind-mount %s -> %s: %w", basePath, lnPath, err)
	}
	return nil
}

// GetSandboxForExternalKey returns sandbox object for the supplied path
func GetSandboxForExternalKey(basePath string, key string) (*Namespace, error) {
	if err := createNamespaceFile(key); err != nil {
		return nil, err
	}

	if err := mountNetworkNamespace(basePath, key); err != nil {
		return nil, err
	}
	n := &Namespace{path: key, nextIfIndex: make(map[string]int)}

	sboxNs, err := netns.GetFromPath(n.path)
	if err != nil {
		return nil, fmt.Errorf("failed get network namespace %q: %v", n.path, err)
	}
	defer sboxNs.Close()

	n.nlHandle, err = netlink.NewHandleAt(sboxNs, syscall.NETLINK_ROUTE)
	if err != nil {
		return nil, fmt.Errorf("failed to create a netlink handle: %v", err)
	}

	err = n.nlHandle.SetSocketTimeout(ns.NetlinkSocketsTimeout)
	if err != nil {
		log.G(context.TODO()).Warnf("Failed to set the timeout on the sandbox netlink handle sockets: %v", err)
	}

	if err = n.loopbackUp(); err != nil {
		n.nlHandle.Close()
		return nil, err
	}

	return n, nil
}

func createNetworkNamespace(path string, osCreate bool) error {
	if err := createNamespaceFile(path); err != nil {
		return err
	}

	do := func() error {
		return mountNetworkNamespace(fmt.Sprintf("/proc/self/task/%d/ns/net", unix.Gettid()), path)
	}
	if osCreate {
		return unshare.Go(unix.CLONE_NEWNET, do, nil)
	}
	return do()
}

func unmountNamespaceFile(path string) {
	if _, err := os.Stat(path); err != nil {
		// ignore when we cannot stat the path
		return
	}
	if err := syscall.Unmount(path, syscall.MNT_DETACH); err != nil && !errors.Is(err, unix.EINVAL) {
		log.G(context.TODO()).WithError(err).Error("Error unmounting namespace file")
	}
}

func createNamespaceFile(path string) error {
	once.Do(createBasePath)
	// Remove it from garbage collection list if present
	removeFromGarbagePaths(path)

	// If the path is there unmount it first
	unmountNamespaceFile(path)

	// wait for garbage collection to complete if it is in progress
	// before trying to create the file.
	//
	// TODO(aker): This garbage-collection was for a kernel bug in kernels 3.18-4.0.1: is this still needed on current kernels (and on kernel 3.10)? see https://github.com/moby/moby/pull/46315/commits/c0a6beba8e61d4019e1806d5241ba22007072ca2#r1331327103
	gpmWg.Wait()

	f, err := os.Create(path)
	if err != nil {
		return err
	}
	_ = f.Close()
	return nil
}

// Namespace represents a network sandbox. It represents a Linux network
// namespace, and moves an interface into it when called on method AddInterface
// or sets the gateway etc. It holds a list of Interfaces, routes etc., and more
// can be added dynamically.
type Namespace struct {
	path                string
	iFaces              []*Interface
	gw                  net.IP
	gwv6                net.IP
	staticRoutes        []*types.StaticRoute
	neighbors           []*neigh
	nextIfIndex         map[string]int
	isDefault           bool
	ipv6LoEnabledOnce   sync.Once
	ipv6LoEnabledCached bool
	nlHandle            *netlink.Handle
	mu                  sync.Mutex
}

// Interfaces returns the collection of Interface previously added with the AddInterface
// method. Note that this doesn't include network interfaces added in any
// other way (such as the default loopback interface which is automatically
// created on creation of a sandbox).
func (n *Namespace) Interfaces() []*Interface {
	ifaces := make([]*Interface, len(n.iFaces))
	copy(ifaces, n.iFaces)
	return ifaces
}

func (n *Namespace) loopbackUp() error {
	iface, err := n.nlHandle.LinkByName("lo")
	if err != nil {
		return err
	}
	return n.nlHandle.LinkSetUp(iface)
}

// GetLoopbackIfaceName returns the name of the loopback interface
func (n *Namespace) GetLoopbackIfaceName() string {
	return "lo"
}

// AddAliasIP adds the passed IP address to the named interface
func (n *Namespace) AddAliasIP(ifName string, ip *net.IPNet) error {
	iface, err := n.nlHandle.LinkByName(ifName)
	if err != nil {
		return err
	}
	return n.nlHandle.AddrAdd(iface, &netlink.Addr{IPNet: ip})
}

// RemoveAliasIP removes the passed IP address from the named interface
func (n *Namespace) RemoveAliasIP(ifName string, ip *net.IPNet) error {
	iface, err := n.nlHandle.LinkByName(ifName)
	if err != nil {
		return err
	}
	return n.nlHandle.AddrDel(iface, &netlink.Addr{IPNet: ip})
}

// DisableARPForVIP disables ARP replies and requests for VIP addresses
// on a particular interface.
func (n *Namespace) DisableARPForVIP(srcName string) (Err error) {
	dstName := ""
	for _, i := range n.Interfaces() {
		if i.SrcName() == srcName {
			dstName = i.DstName()
			break
		}
	}
	if dstName == "" {
		return fmt.Errorf("failed to find interface %s in sandbox", srcName)
	}

	err := n.InvokeFunc(func() {
		path := filepath.Join("/proc/sys/net/ipv4/conf", dstName, "arp_ignore")
		if err := os.WriteFile(path, []byte{'1', '\n'}, 0o644); err != nil {
			Err = fmt.Errorf("Failed to set %s to 1: %v", path, err)
			return
		}
		path = filepath.Join("/proc/sys/net/ipv4/conf", dstName, "arp_announce")
		if err := os.WriteFile(path, []byte{'2', '\n'}, 0o644); err != nil {
			Err = fmt.Errorf("Failed to set %s to 2: %v", path, err)
			return
		}
	})
	if err != nil {
		return err
	}
	return
}

// InvokeFunc invoke a function in the network namespace.
func (n *Namespace) InvokeFunc(f func()) error {
	path := n.nsPath()
	newNS, err := netns.GetFromPath(path)
	if err != nil {
		return fmt.Errorf("failed get network namespace %q: %w", path, err)
	}
	defer newNS.Close()

	done := make(chan error, 1)
	go func() {
		runtime.LockOSThread()
		// InvokeFunc() could have been called from a goroutine with
		// tampered thread state, e.g. from another InvokeFunc()
		// callback. The outer goroutine's thread state cannot be
		// trusted.
		origNS, err := netns.Get()
		if err != nil {
			runtime.UnlockOSThread()
			done <- fmt.Errorf("failed to get original network namespace: %w", err)
			return
		}
		defer origNS.Close()

		if err := netns.Set(newNS); err != nil {
			runtime.UnlockOSThread()
			done <- err
			return
		}
		defer func() {
			close(done)
			if err := netns.Set(origNS); err != nil {
				log.G(context.TODO()).WithError(err).Warn("failed to restore thread's network namespace")
				// Recover from the error by leaving this goroutine locked to
				// the thread. The runtime will terminate the thread and replace
				// it with a clean one when this goroutine returns.
			} else {
				runtime.UnlockOSThread()
			}
		}()
		f()
	}()
	return <-done
}

func (n *Namespace) nsPath() string {
	n.mu.Lock()
	defer n.mu.Unlock()

	return n.path
}

// Key returns the path where the network namespace is mounted.
func (n *Namespace) Key() string {
	return n.path
}

// Destroy destroys the sandbox.
func (n *Namespace) Destroy() error {
	if n.nlHandle != nil {
		n.nlHandle.Close()
	}
	// Assuming no running process is executing in this network namespace,
	// unmounting is sufficient to destroy it.
	if err := syscall.Unmount(n.path, syscall.MNT_DETACH); err != nil {
		return err
	}

	// Stash it into the garbage collection list
	addToGarbagePaths(n.path)
	return nil
}

// Restore restores the network namespace.
func (n *Namespace) Restore(interfaces map[Iface][]IfaceOption, routes []*types.StaticRoute, gw net.IP, gw6 net.IP) error {
	// restore interfaces
	for iface, opts := range interfaces {
		i, err := newInterface(n, iface.SrcName, iface.DstPrefix, opts...)
		if err != nil {
			return err
		}
		if n.isDefault {
			i.dstName = i.srcName
		} else {
			links, err := n.nlHandle.LinkList()
			if err != nil {
				return fmt.Errorf("failed to retrieve list of links in network namespace %q during restore", n.path)
			}
			// due to the docker network connect/disconnect, so the dstName should
			// restore from the namespace
			for _, link := range links {
				ifaceName := link.Attrs().Name
				if i.dstName == "vxlan" && strings.HasPrefix(ifaceName, "vxlan") {
					i.dstName = ifaceName
					break
				}
				// find the interface name by ip
				if i.address != nil {
					addresses, err := n.nlHandle.AddrList(link, netlink.FAMILY_V4)
					if err != nil {
						return err
					}
					for _, addr := range addresses {
						if addr.IPNet.String() == i.address.String() {
							i.dstName = ifaceName
							break
						}
					}
					if i.dstName == ifaceName {
						break
					}
				}
				// This is to find the interface name of the pair in overlay sandbox
				if i.master != "" && i.dstName == "veth" && strings.HasPrefix(ifaceName, "veth") {
					i.dstName = ifaceName
				}
			}

			var index int
			if idx := strings.TrimPrefix(i.dstName, iface.DstPrefix); idx != "" {
				index, err = strconv.Atoi(idx)
				if err != nil {
					return fmt.Errorf("failed to restore interface in network namespace %q: invalid dstName for interface: %s: %v", n.path, i.dstName, err)
				}
			}
			index++
			n.mu.Lock()
			if index > n.nextIfIndex[iface.DstPrefix] {
				n.nextIfIndex[iface.DstPrefix] = index
			}
			n.iFaces = append(n.iFaces, i)
			n.mu.Unlock()
		}
	}

	// restore routes and gateways
	n.mu.Lock()
	n.staticRoutes = append(n.staticRoutes, routes...)
	if len(gw) > 0 {
		n.gw = gw
	}
	if len(gw6) > 0 {
		n.gwv6 = gw6
	}
	n.mu.Unlock()
	return nil
}

// IPv6LoEnabled returns true if the loopback interface had an IPv6 address when
// last checked. It's always checked on the first call, and by RefreshIPv6LoEnabled.
// ('::1' is assigned by the kernel if IPv6 is enabled.)
func (n *Namespace) IPv6LoEnabled() bool {
	n.ipv6LoEnabledOnce.Do(func() {
		n.RefreshIPv6LoEnabled()
	})
	n.mu.Lock()
	defer n.mu.Unlock()
	return n.ipv6LoEnabledCached
}

// RefreshIPv6LoEnabled refreshes the cached result returned by IPv6LoEnabled.
func (n *Namespace) RefreshIPv6LoEnabled() {
	n.mu.Lock()
	defer n.mu.Unlock()

	// If anything goes wrong, assume no-IPv6.
	n.ipv6LoEnabledCached = false
	iface, err := n.nlHandle.LinkByName("lo")
	if err != nil {
		log.G(context.TODO()).WithError(err).Warn("Unable to find 'lo' to determine IPv6 support")
		return
	}
	addrs, err := n.nlHandle.AddrList(iface, nl.FAMILY_V6)
	if err != nil {
		log.G(context.TODO()).WithError(err).Warn("Unable to get 'lo' addresses to determine IPv6 support")
		return
	}
	n.ipv6LoEnabledCached = len(addrs) > 0
}

// ApplyOSTweaks applies operating system specific knobs on the sandbox.
func (n *Namespace) ApplyOSTweaks(types []SandboxType) {
	for _, t := range types {
		switch t {
		case SandboxTypeLoadBalancer, SandboxTypeIngress:
			kernel.ApplyOSTweaks(map[string]*kernel.OSValue{
				// disables any special handling on port reuse of existing IPVS connection table entries
				// more info: https://github.com/torvalds/linux/blame/v5.15/Documentation/networking/ipvs-sysctl.rst#L32
				"net.ipv4.vs.conn_reuse_mode": {Value: "0", CheckFn: nil},
				// expires connection from the IPVS connection table when the backend is not available
				// more info: https://github.com/torvalds/linux/blame/v5.15/Documentation/networking/ipvs-sysctl.rst#L133
				"net.ipv4.vs.expire_nodest_conn": {Value: "1", CheckFn: nil},
				// expires persistent connections to destination servers with weights set to 0
				// more info: https://github.com/torvalds/linux/blame/v5.15/Documentation/networking/ipvs-sysctl.rst#L151
				"net.ipv4.vs.expire_quiescent_template": {Value: "1", CheckFn: nil},
			})
		}
	}
}

func setIPv6(nspath, iface string, enable bool) error {
	errCh := make(chan error, 1)
	go func() {
		defer close(errCh)

		namespace, err := netns.GetFromPath(nspath)
		if err != nil {
			errCh <- fmt.Errorf("failed get network namespace %q: %w", nspath, err)
			return
		}
		defer namespace.Close()

		runtime.LockOSThread()

		origNS, err := netns.Get()
		if err != nil {
			runtime.UnlockOSThread()
			errCh <- fmt.Errorf("failed to get current network namespace: %w", err)
			return
		}
		defer origNS.Close()

		if err = netns.Set(namespace); err != nil {
			runtime.UnlockOSThread()
			errCh <- fmt.Errorf("setting into container netns %q failed: %w", nspath, err)
			return
		}
		defer func() {
			if err := netns.Set(origNS); err != nil {
				log.G(context.TODO()).WithError(err).Error("libnetwork: restoring thread network namespace failed")
				// The error is only fatal for the current thread. Keep this
				// goroutine locked to the thread to make the runtime replace it
				// with a clean thread once this goroutine returns.
			} else {
				runtime.UnlockOSThread()
			}
		}()

		path := "/proc/sys/net/ipv6/conf/" + iface + "/disable_ipv6"
		value := byte('1')
		if enable {
			value = '0'
		}

		if curVal, err := os.ReadFile(path); err != nil {
			if os.IsNotExist(err) {
				if enable {
					log.G(context.TODO()).WithError(err).Warn("Cannot enable IPv6 on container interface. Has IPv6 been disabled in this node's kernel?")
				} else {
					log.G(context.TODO()).WithError(err).Debug("Not disabling IPv6 on container interface. Has IPv6 been disabled in this node's kernel?")
				}
				return
			}
			errCh <- err
			return
		} else if len(curVal) > 0 && curVal[0] == value {
			// Nothing to do, the setting is already correct.
			return
		}

		if err = os.WriteFile(path, []byte{value, '\n'}, 0o644); err != nil || os.Getenv("DOCKER_TEST_RO_DISABLE_IPV6") != "" {
			logger := log.G(context.TODO()).WithFields(log.Fields{
				"error":     err,
				"interface": iface,
			})
			if enable {
				// The user asked for IPv6 on the interface, and we can't give it to them.
				// But, in line with the IsNotExist case above, just log.
				logger.Warn("Cannot enable IPv6 on container interface, continuing.")
			} else {
				logger.Error("Cannot disable IPv6 on container interface.")
				errCh <- errors.New("failed to disable IPv6 on container's interface " + iface)
			}
			return
		}
	}()
	return <-errCh
}
